Merge commit '1ad9253c9d34ccbce3e7e4ea5d87c266cbf93410'

deplib features commented out due to removal upstream; will add back as a localmod.

Conflicts:
	include/llvm/ADT/Triple.h
	include/llvm/MC/MCAssembler.h
	include/llvm/Target/TargetFrameLowering.h
	lib/CodeGen/AsmPrinter/DwarfDebug.cpp
	lib/CodeGen/AsmPrinter/DwarfDebug.h
	lib/CodeGen/BranchFolding.cpp
	lib/LLVMBuild.txt
	lib/Linker/LinkArchives.cpp
	lib/MC/MCAssembler.cpp
	lib/MC/MCELFStreamer.cpp
	lib/Makefile
	lib/Target/ARM/ARMExpandPseudoInsts.cpp
	lib/Target/ARM/ARMFrameLowering.cpp
	lib/Target/ARM/ARMISelLowering.cpp
	lib/Target/ARM/ARMSubtarget.h
	lib/Target/ARM/ARMTargetObjectFile.cpp
	lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
	lib/Target/Mips/MipsInstrFPU.td
	lib/Target/Mips/MipsInstrInfo.td
	lib/Target/X86/X86CodeEmitter.cpp
	lib/Target/X86/X86Subtarget.h
	lib/VMCore/Module.cpp
	test/MC/MachO/ARM/nop-armv4-padding.s
	tools/Makefile
	tools/llc/llc.cpp
	tools/lto/LTOModule.cpp
	tools/lto/lto.cpp
author: Derek Schuff <dschuff@chromium.org> 2013-01-09 16:55:43 -0800
committer: Derek Schuff <dschuff@chromium.org> 2013-01-11 13:47:37 -0800
commit: b770d0e0636a4b5ad61b1ca661caee67576c05fc (patch)
tree: c486ce032d41f97313c50629bd5b879f53e6ccbf /test/CodeGen
parent: b835840cf112a6178506d834b58aa625f59a8994 (diff)
parent: 1ad9253c9d34ccbce3e7e4ea5d87c266cbf93410 (diff)
81 files changed, 2389 insertions, 128 deletions
diff --git a/test/CodeGen/ARM/2010-12-15-elf-lcomm.ll b/test/CodeGen/ARM/2010-12-15-elf-lcomm.ll
index 5cfbb4f944..1272a25793 100644
--- a/test/CodeGen/ARM/2010-12-15-elf-lcomm.ll
+++ b/test/CodeGen/ARM/2010-12-15-elf-lcomm.ll
@@ -10,7 +10,8 @@
 @STRIDE = internal global i32 8
 
 ; ASM:          .type   array00,%object         @ @array00
-; ASM-NEXT:     .lcomm  array00,80
+; ASM-NEXT:     .local  array00
+; ASM-NEXT:     .comm   array00,80,1
 ; ASM-NEXT:     .type   _MergedGlobals,%object  @ @_MergedGlobals
 
 
diff --git a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
index 6e0ef96196..f563eeef01 100644
--- a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
+++ b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
@@ -1,13 +1,5 @@
 ; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s
 
-; Should trigger a NEON store.
-; CHECK: vstr
-define void @f_0_12(i8* nocapture %c) nounwind optsize {
-entry:
-  call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
-  ret void
-}
-
 ; Trigger multiple NEON stores.
 ; CHECK:      vst1.64
 ; CHECK-NEXT: vst1.64
diff --git a/test/CodeGen/ARM/2012-08-23-legalize-vmull.ll b/test/CodeGen/ARM/2012-08-23-legalize-vmull.ll
new file mode 100644
index 0000000000..2f55204aa4
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-23-legalize-vmull.ll
@@ -0,0 +1,150 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+; PR12281
+; Test generation of code for vmull instruction when multiplying 128-bit
+; vectors that were created by sign-extending smaller vector sizes.
+;
+; The vmull operation requires 64-bit vectors, so we must extend the original
+; vector size to 64 bits for vmull operation.
+; Previously failed with an assertion because the <4 x i8> vector was too small
+; for vmull.
+
+; Vector x Constant
+; v4i8
+;
+define void @sextload_v4i8_c(<4 x i8>* %v) nounwind {
+;CHECK: sextload_v4i8_c:
+entry:
+  %0 = load <4 x i8>* %v, align 8
+  %v0  = sext <4 x i8> %0 to <4 x i32>
+;CHECK: vmull
+  %v1 = mul <4 x i32>  %v0, <i32 3, i32 3, i32 3, i32 3>
+  store <4 x i32> %v1, <4 x i32>* undef, align 8
+  ret void;
+}
+
+; v2i8
+;
+define void @sextload_v2i8_c(<2 x i8>* %v) nounwind {
+;CHECK: sextload_v2i8_c:
+entry:
+  %0   = load <2 x i8>* %v, align 8
+  %v0  = sext <2 x i8>  %0 to <2 x i64>
+;CHECK: vmull
+  %v1  = mul <2 x i64>  %v0, <i64 3, i64 3>
+  store <2 x i64> %v1, <2 x i64>* undef, align 8
+  ret void;
+}
+
+; v2i16
+;
+define void @sextload_v2i16_c(<2 x i16>* %v) nounwind {
+;CHECK: sextload_v2i16_c:
+entry:
+  %0   = load <2 x i16>* %v, align 8
+  %v0  = sext <2 x i16>  %0 to <2 x i64>
+;CHECK: vmull
+  %v1  = mul <2 x i64>  %v0, <i64 3, i64 3>
+  store <2 x i64> %v1, <2 x i64>* undef, align 8
+  ret void;
+}
+
+
+; Vector x Vector
+; v4i8
+;
+define void @sextload_v4i8_v(<4 x i8>* %v, <4 x i8>* %p) nounwind {
+;CHECK: sextload_v4i8_v:
+entry:
+  %0 = load <4 x i8>* %v, align 8
+  %v0  = sext <4 x i8> %0 to <4 x i32>
+
+  %1  = load <4 x i8>* %p, align 8
+  %v2 = sext <4 x i8> %1 to <4 x i32>
+;CHECK: vmull
+  %v1 = mul <4 x i32>  %v0, %v2
+  store <4 x i32> %v1, <4 x i32>* undef, align 8
+  ret void;
+}
+
+; v2i8
+;
+define void @sextload_v2i8_v(<2 x i8>* %v, <2 x i8>* %p) nounwind {
+;CHECK: sextload_v2i8_v:
+entry:
+  %0 = load <2 x i8>* %v, align 8
+  %v0  = sext <2 x i8> %0 to <2 x i64>
+
+  %1  = load <2 x i8>* %p, align 8
+  %v2 = sext <2 x i8> %1 to <2 x i64>
+;CHECK: vmull
+  %v1 = mul <2 x i64>  %v0, %v2
+  store <2 x i64> %v1, <2 x i64>* undef, align 8
+  ret void;
+}
+
+; v2i16
+;
+define void @sextload_v2i16_v(<2 x i16>* %v, <2 x i16>* %p) nounwind {
+;CHECK: sextload_v2i16_v:
+entry:
+  %0 = load <2 x i16>* %v, align 8
+  %v0  = sext <2 x i16> %0 to <2 x i64>
+
+  %1  = load <2 x i16>* %p, align 8
+  %v2 = sext <2 x i16> %1 to <2 x i64>
+;CHECK: vmull
+  %v1 = mul <2 x i64>  %v0, %v2
+  store <2 x i64> %v1, <2 x i64>* undef, align 8
+  ret void;
+}
+
+
+; Vector(small) x Vector(big)
+; v4i8 x v4i16
+;
+define void @sextload_v4i8_vs(<4 x i8>* %v, <4 x i16>* %p) nounwind {
+;CHECK: sextload_v4i8_vs:
+entry:
+  %0 = load <4 x i8>* %v, align 8
+  %v0  = sext <4 x i8> %0 to <4 x i32>
+
+  %1  = load <4 x i16>* %p, align 8
+  %v2 = sext <4 x i16> %1 to <4 x i32>
+;CHECK: vmull
+  %v1 = mul <4 x i32>  %v0, %v2
+  store <4 x i32> %v1, <4 x i32>* undef, align 8
+  ret void;
+}
+
+; v2i8
+; v2i8 x v2i16
+define void @sextload_v2i8_vs(<2 x i8>* %v, <2 x i16>* %p) nounwind {
+;CHECK: sextload_v2i8_vs:
+entry:
+  %0 = load <2 x i8>* %v, align 8
+  %v0  = sext <2 x i8> %0 to <2 x i64>
+
+  %1  = load <2 x i16>* %p, align 8
+  %v2 = sext <2 x i16> %1 to <2 x i64>
+;CHECK: vmull
+  %v1 = mul <2 x i64>  %v0, %v2
+  store <2 x i64> %v1, <2 x i64>* undef, align 8
+  ret void;
+}
+
+; v2i16
+; v2i16 x v2i32
+define void @sextload_v2i16_vs(<2 x i16>* %v, <2 x i32>* %p) nounwind {
+;CHECK: sextload_v2i16_vs:
+entry:
+  %0 = load <2 x i16>* %v, align 8
+  %v0  = sext <2 x i16> %0 to <2 x i64>
+
+  %1  = load <2 x i32>* %p, align 8
+  %v2 = sext <2 x i32> %1 to <2 x i64>
+;CHECK: vmull
+  %v1 = mul <2 x i64>  %v0, %v2
+  store <2 x i64> %v1, <2 x i64>* undef, align 8
+  ret void;
+}
diff --git a/test/CodeGen/ARM/alloc-no-stack-realign.ll b/test/CodeGen/ARM/alloc-no-stack-realign.ll
new file mode 100644
index 0000000000..273041dee3
--- /dev/null
+++ b/test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios -O0 -realign-stack=0 | FileCheck %s -check-prefix=NO-REALIGN
+; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s
+
+; rdar://12713765
+; When realign-stack is set to false, make sure we are not creating stack
+; objects that are assumed to be 64-byte aligned.
+@T3_retval = common global <16 x float> zeroinitializer, align 16
+
+define void @test(<16 x float>* noalias sret %agg.result) nounwind ssp {
+entry:
+; CHECK: test
+; CHECK: bic sp, sp, #63
+; CHECK: orr [[R2:r[0-9]+]], [[R1:r[0-9]+]], #48
+; CHECK: vst1.64
+; CHECK: orr [[R2:r[0-9]+]], [[R1:r[0-9]+]], #32
+; CHECK: vst1.64
+; CHECK: orr [[R2:r[0-9]+]], [[R1:r[0-9]+]], #16
+; CHECK: vst1.64
+; CHECK: vst1.64
+; CHECK: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #48
+; CHECK: vst1.64
+; CHECK: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #32
+; CHECK: vst1.64
+; CHECK: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #16
+; CHECK: vst1.64
+; CHECK: vst1.64
+; NO-REALIGN: test
+; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #48
+; NO-REALIGN: vst1.64
+; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #32
+; NO-REALIGN: vst1.64
+; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #16
+; NO-REALIGN: vst1.64
+; NO-REALIGN: vst1.64
+; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #48
+; NO-REALIGN: vst1.64
+; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #32
+; NO-REALIGN: vst1.64
+; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #16
+; NO-REALIGN: vst1.64
+; NO-REALIGN: vst1.64
+ %retval = alloca <16 x float>, align 16
+ %0 = load <16 x float>* @T3_retval, align 16
+ store <16 x float> %0, <16 x float>* %retval
+ %1 = load <16 x float>* %retval
+ store <16 x float> %1, <16 x float>* %agg.result, align 16
+ ret void
+}
diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll
index be51e3c129..69da6221b7 100644
--- a/test/CodeGen/ARM/atomic-64bit.ll
+++ b/test/CodeGen/ARM/atomic-64bit.ll
@@ -126,3 +126,64 @@ define void @test9(i64* %ptr, i64 %val) {
   store atomic i64 %val, i64* %ptr seq_cst, align 8
   ret void
 }
+
+define i64 @test10(i64* %ptr, i64 %val) {
+; CHECK: test10:
+; CHECK: dmb ish
+; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
+; CHECK: subs {{[a-z0-9]+}}, [[REG1]], [[REG3:(r[0-9]?[02468])]]
+; CHECK: sbcs {{[a-z0-9]+}}, [[REG2]], [[REG4:(r[0-9]?[13579])]]
+; CHECK: ble
+; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
+; CHECK: cmp
+; CHECK: bne
+; CHECK: dmb ish
+  %r = atomicrmw min i64* %ptr, i64 %val seq_cst
+  ret i64 %r
+}
+
+define i64 @test11(i64* %ptr, i64 %val) {
+; CHECK: test11:
+; CHECK: dmb ish
+; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
+; CHECK: subs {{[a-z0-9]+}}, [[REG1]], [[REG3:(r[0-9]?[02468])]]
+; CHECK: sbcs {{[a-z0-9]+}}, [[REG2]], [[REG4:(r[0-9]?[13579])]]
+; CHECK: bls
+; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
+; CHECK: cmp
+; CHECK: bne
+; CHECK: dmb ish
+  %r = atomicrmw umin i64* %ptr, i64 %val seq_cst
+  ret i64 %r
+}
+
+define i64 @test12(i64* %ptr, i64 %val) {
+; CHECK: test12:
+; CHECK: dmb ish
+; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
+; CHECK: subs {{[a-z0-9]+}}, [[REG1]], [[REG3:(r[0-9]?[02468])]]
+; CHECK: sbcs {{[a-z0-9]+}}, [[REG2]], [[REG4:(r[0-9]?[13579])]]
+; CHECK: bge
+; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
+; CHECK: cmp
+; CHECK: bne
+; CHECK: dmb ish
+  %r = atomicrmw max i64* %ptr, i64 %val seq_cst
+  ret i64 %r
+}
+
+define i64 @test13(i64* %ptr, i64 %val) {
+; CHECK: test13:
+; CHECK: dmb ish
+; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
+; CHECK: subs {{[a-z0-9]+}}, [[REG1]], [[REG3:(r[0-9]?[02468])]]
+; CHECK: sbcs {{[a-z0-9]+}}, [[REG2]], [[REG4:(r[0-9]?[13579])]]
+; CHECK: bhs
+; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
+; CHECK: cmp
+; CHECK: bne
+; CHECK: dmb ish
+  %r = atomicrmw umax i64* %ptr, i64 %val seq_cst
+  ret i64 %r
+}
+
diff --git a/test/CodeGen/ARM/coalesce-subregs.ll b/test/CodeGen/ARM/coalesce-subregs.ll
index 5bdad1d838..e7bd5f41bb 100644
--- a/test/CodeGen/ARM/coalesce-subregs.ll
+++ b/test/CodeGen/ARM/coalesce-subregs.ll
@@ -317,3 +317,44 @@ if.end4:                                          ; preds = %if.else3, %if.then2
   store <2 x i64> %result.2, <2 x i64>* %agg.result, align 128
   ret void
 }
+
+; <rdar://problem/12758887>
+; RegisterCoalescer::updateRegDefsUses() could visit an instruction more than
+; once under rare circumstances. When widening a register from QPR to DTriple
+; with the original virtual register in dsub_1_dsub_2, the double rewrite would
+; produce an invalid sub-register.
+;
+; This is because dsub_1_dsub_2 is not an idempotent sub-register index.
+; It will translate %vr:dsub_0 -> %vr:dsub_1.
+define hidden fastcc void @radar12758887() nounwind optsize ssp {
+entry:
+  br i1 undef, label %for.body, label %for.end70
+
+for.body:                                         ; preds = %for.end, %entry
+  br i1 undef, label %for.body29, label %for.end
+
+for.body29:                                       ; preds = %for.body29, %for.body
+  %0 = load <2 x double>* null, align 1
+  %splat40 = shufflevector <2 x double> %0, <2 x double> undef, <2 x i32> zeroinitializer
+  %mul41 = fmul <2 x double> undef, %splat40
+  %add42 = fadd <2 x double> undef, %mul41
+  %splat44 = shufflevector <2 x double> %0, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  %mul45 = fmul <2 x double> undef, %splat44
+  %add46 = fadd <2 x double> undef, %mul45
+  br i1 undef, label %for.end, label %for.body29
+
+for.end:                                          ; preds = %for.body29, %for.body
+  %accumR2.0.lcssa = phi <2 x double> [ zeroinitializer, %for.body ], [ %add42, %for.body29 ]
+  %accumI2.0.lcssa = phi <2 x double> [ zeroinitializer, %for.body ], [ %add46, %for.body29 ]
+  %1 = shufflevector <2 x double> %accumI2.0.lcssa, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  %add58 = fadd <2 x double> undef, %1
+  %mul61 = fmul <2 x double> %add58, undef
+  %add63 = fadd <2 x double> undef, %mul61
+  %add64 = fadd <2 x double> undef, %add63
+  %add67 = fadd <2 x double> undef, %add64
+  store <2 x double> %add67, <2 x double>* undef, align 1
+  br i1 undef, label %for.end70, label %for.body
+
+for.end70:                                        ; preds = %for.end, %entry
+  ret void
+}
diff --git a/test/CodeGen/ARM/crash.ll b/test/CodeGen/ARM/crash.ll
index 0f6f33e044..4e3e2010b0 100644
--- a/test/CodeGen/ARM/crash.ll
+++ b/test/CodeGen/ARM/crash.ll
@@ -69,3 +69,26 @@ bb:
   store <4 x float> %tmp154, <4 x float>* undef, align 16
   ret void
 }
+
+; <rdar://problem/12721258>
+%A = type { %B }
+%B = type { i32 }
+
+define void @_Z3Foov() ssp {
+entry:
+  br i1 true, label %exit, label %false
+
+false:
+  invoke void undef(%A* undef)
+          to label %exit unwind label %lpad
+
+lpad:
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          catch i8* null
+  unreachable
+
+exit:
+  ret void
+}
+
+declare i32 @__gxx_personality_sj0(...)
diff --git a/test/CodeGen/ARM/debug-info-branch-folding.ll b/test/CodeGen/ARM/debug-info-branch-folding.ll
index 4f4ff8e817..a3a1fc0b2f 100644
--- a/test/CodeGen/ARM/debug-info-branch-folding.ll
+++ b/test/CodeGen/ARM/debug-info-branch-folding.ll
@@ -52,7 +52,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !6 = metadata !{i32 590083, metadata !2, metadata !"", metadata !2, i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_vector_type ]
 !7 = metadata !{i32 589860, metadata !2, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 589857, i64 0, i64 3}         ; [ DW_TAG_subrange_type ]
+!9 = metadata !{i32 589857, i64 0, i64 4}         ; [ DW_TAG_subrange_type ]
 !10 = metadata !{i32 589870, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"", metadata !1, i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**, i1)* @main, null} ; [ DW_TAG_subprogram ]
 !11 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
 !12 = metadata !{metadata !13}
diff --git a/test/CodeGen/ARM/debug-info-qreg.ll b/test/CodeGen/ARM/debug-info-qreg.ll
index 97c9c66c58..c6bfe3a25e 100644
--- a/test/CodeGen/ARM/debug-info-qreg.ll
+++ b/test/CodeGen/ARM/debug-info-qreg.ll
@@ -49,7 +49,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !6 = metadata !{i32 590083, metadata !2, metadata !"", metadata !2, i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_vector_type ]
 !7 = metadata !{i32 589860, metadata !2, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 589857, i64 0, i64 3}         ; [ DW_TAG_subrange_type ]
+!9 = metadata !{i32 589857, i64 0, i64 4}         ; [ DW_TAG_subrange_type ]
 !10 = metadata !{i32 589870, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"", metadata !1, i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**)* @main, null} ; [ DW_TAG_subprogram ]
 !11 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
 !12 = metadata !{metadata !13}
diff --git a/test/CodeGen/ARM/domain-conv-vmovs.ll b/test/CodeGen/ARM/domain-conv-vmovs.ll
index 0ebac94e13..b5586cc99f 100644
--- a/test/CodeGen/ARM/domain-conv-vmovs.ll
+++ b/test/CodeGen/ARM/domain-conv-vmovs.ll
@@ -78,7 +78,7 @@ define float @test_ineligible(float, float %in) {
   ; use-def chains would be messed up. Primarily a compile-test (we used to
   ; internal fault).
   call void @bar()
-; CHECL: bl bar
+; CHECK: bl bar
 ; CHECK: vext.32
 ; CHECK: vext.32
   ret float %val
diff --git a/test/CodeGen/ARM/elf-lcomm-align.ll b/test/CodeGen/ARM/elf-lcomm-align.ll
index 46792990e5..a98b3c06f5 100644
--- a/test/CodeGen/ARM/elf-lcomm-align.ll
+++ b/test/CodeGen/ARM/elf-lcomm-align.ll
@@ -4,8 +4,9 @@
 @c = internal global i8 0, align 1
 @x = internal global i32 0, align 4
 
-; CHECK: .lcomm c,1
-; .lcomm doesn't support alignment.
+; .lcomm doesn't support alignment, so we always use .local/.comm.
+; CHECK: .local c
+; CHECK-NEXT: .comm c,1,1
 ; CHECK: .local x
 ; CHECK-NEXT: .comm x,4,4
 
diff --git a/test/CodeGen/ARM/extload-knownzero.ll b/test/CodeGen/ARM/extload-knownzero.ll
new file mode 100644
index 0000000000..8fd6b6bd77
--- /dev/null
+++ b/test/CodeGen/ARM/extload-knownzero.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; rdar://12771555
+
+define void @foo(i16* %ptr, i32 %a) nounwind {
+entry:
+; CHECK: foo:
+  %tmp1 = icmp ult i32 %a, 100
+  br i1 %tmp1, label %bb1, label %bb2
+bb1:
+; CHECK: ldrh
+  %tmp2 = load i16* %ptr, align 2
+  br label %bb2
+bb2:
+; CHECK-NOT: uxth
+; CHECK: cmp
+  %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ]
+  %cmp = icmp ult i16 %tmp3, 24
+  br i1 %cmp, label %bb3, label %exit
+bb3:
+  call void @bar() nounwind
+  br label %exit
+exit:
+  ret void
+}
+
+declare void @bar () 
diff --git a/test/CodeGen/ARM/fast-isel-icmp.ll b/test/CodeGen/ARM/fast-isel-icmp.ll
index 8764bef7da..8357ed5c54 100644
--- a/test/CodeGen/ARM/fast-isel-icmp.ll
+++ b/test/CodeGen/ARM/fast-isel-icmp.ll
@@ -1,6 +1,21 @@
 ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
 ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
 
+define i32 @icmp_i16_signed(i16 %a, i16 %b) nounwind {
+entry:
+; ARM: icmp_i16_signed
+; ARM: sxth r0, r0
+; ARM: sxth r1, r1
+; ARM: cmp	r0, r1
+; THUMB: icmp_i16_signed
+; THUMB: sxth r0, r0
+; THUMB: sxth r1, r1
+; THUMB: cmp	r0, r1
+  %cmp = icmp slt i16 %a, %b
+  %conv2 = zext i1 %cmp to i32
+  ret i32 %conv2
+}
+
 define i32 @icmp_i16_unsigned(i16 %a, i16 %b) nounwind {
 entry:
 ; ARM: icmp_i16_unsigned
@@ -31,6 +46,21 @@ entry:
   ret i32 %conv2
 }
 
+define i32 @icmp_i8_unsigned(i8 %a, i8 %b) nounwind {
+entry:
+; ARM: icmp_i8_unsigned
+; ARM: uxtb r0, r0
+; ARM: uxtb r1, r1
+; ARM: cmp r0, r1
+; THUMB: icmp_i8_unsigned
+; THUMB: uxtb r0, r0
+; THUMB: uxtb r1, r1
+; THUMB: cmp r0, r1
+  %cmp = icmp ugt i8 %a, %b
+  %conv2 = zext i1 %cmp to i32
+  ret i32 %conv2
+}
+
 define i32 @icmp_i1_unsigned(i1 %a, i1 %b) nounwind {
 entry:
 ; ARM: icmp_i1_unsigned
diff --git a/test/CodeGen/ARM/fast-isel-indirectbr.ll b/test/CodeGen/ARM/fast-isel-indirectbr.ll
index be8035ec79..ebc0e8426d 100644
--- a/test/CodeGen/ARM/fast-isel-indirectbr.ll
+++ b/test/CodeGen/ARM/fast-isel-indirectbr.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
 
 define void @t1(i8* %x) {
 entry:
diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
index b73fceff6c..7d38cc2a7f 100644
--- a/test/CodeGen/ARM/fast-isel-intrinsic.ll
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll
@@ -35,7 +35,7 @@ define void @t1() nounwind ssp {
 ; THUMB-LONG: movt r3, :upper16:L_memset$non_lazy_ptr
 ; THUMB-LONG: ldr r3, [r3]
 ; THUMB-LONG: blx r3
-  call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @message1, i32 0, i32 5), i8 64, i32 10, i32 1, i1 false)
+  call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @message1, i32 0, i32 5), i8 64, i32 10, i32 4, i1 false)
   ret void
 }
 
@@ -73,7 +73,7 @@ define void @t2() nounwind ssp {
 ; THUMB-LONG: movt r3, :upper16:L_memcpy$non_lazy_ptr
 ; THUMB-LONG: ldr r3, [r3]
 ; THUMB-LONG: blx r3
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 17, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 17, i32 4, i1 false)
   ret void
 }
 
@@ -125,6 +125,7 @@ define void @t4() nounwind ssp {
 ; ARM: ldrh r1, [r0, #24]
 ; ARM: strh r1, [r0, #12]
 ; ARM: bx lr
+; THUMB: t4
 ; THUMB: movw r0, :lower16:L_temp$non_lazy_ptr
 ; THUMB: movt r0, :upper16:L_temp$non_lazy_ptr
 ; THUMB: ldr r0, [r0]
@@ -135,8 +136,98 @@ define void @t4() nounwind ssp {
 ; THUMB: ldrh r1, [r0, #24]
 ; THUMB: strh r1, [r0, #12]
 ; THUMB: bx lr
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 10, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 10, i32 4, i1 false)
   ret void
 }
 
 declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+define void @t5() nounwind ssp {
+; ARM: t5
+; ARM: movw r0, :lower16:L_temp$non_lazy_ptr
+; ARM: movt r0, :upper16:L_temp$non_lazy_ptr
+; ARM: ldr r0, [r0]
+; ARM: ldrh r1, [r0, #16]
+; ARM: strh r1, [r0, #4]
+; ARM: ldrh r1, [r0, #18]
+; ARM: strh r1, [r0, #6]
+; ARM: ldrh r1, [r0, #20]
+; ARM: strh r1, [r0, #8]
+; ARM: ldrh r1, [r0, #22]
+; ARM: strh r1, [r0, #10]
+; ARM: ldrh r1, [r0, #24]
+; ARM: strh r1, [r0, #12]
+; ARM: bx lr
+; THUMB: t5
+; THUMB: movw r0, :lower16:L_temp$non_lazy_ptr
+; THUMB: movt r0, :upper16:L_temp$non_lazy_ptr
+; THUMB: ldr r0, [r0]
+; THUMB: ldrh r1, [r0, #16]
+; THUMB: strh r1, [r0, #4]
+; THUMB: ldrh r1, [r0, #18]
+; THUMB: strh r1, [r0, #6]
+; THUMB: ldrh r1, [r0, #20]
+; THUMB: strh r1, [r0, #8]
+; THUMB: ldrh r1, [r0, #22]
+; THUMB: strh r1, [r0, #10]
+; THUMB: ldrh r1, [r0, #24]
+; THUMB: strh r1, [r0, #12]
+; THUMB: bx lr
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 10, i32 2, i1 false)
+  ret void
+}
+
+define void @t6() nounwind ssp {
+; ARM: t6
+; ARM: movw r0, :lower16:L_temp$non_lazy_ptr
+; ARM: movt r0, :upper16:L_temp$non_lazy_ptr
+; ARM: ldr r0, [r0]
+; ARM: ldrb r1, [r0, #16]
+; ARM: strb r1, [r0, #4]
+; ARM: ldrb r1, [r0, #17]
+; ARM: strb r1, [r0, #5]
+; ARM: ldrb r1, [r0, #18]
+; ARM: strb r1, [r0, #6]
+; ARM: ldrb r1, [r0, #19]
+; ARM: strb r1, [r0, #7]
+; ARM: ldrb r1, [r0, #20]
+; ARM: strb r1, [r0, #8]
+; ARM: ldrb r1, [r0, #21]
+; ARM: strb r1, [r0, #9]
+; ARM: ldrb r1, [r0, #22]
+; ARM: strb r1, [r0, #10]
+; ARM: ldrb r1, [r0, #23]
+; ARM: strb r1, [r0, #11]
+; ARM: ldrb r1, [r0, #24]
+; ARM: strb r1, [r0, #12]
+; ARM: ldrb r1, [r0, #25]
+; ARM: strb r1, [r0, #13]
+; ARM: bx lr
+; THUMB: t6
+; THUMB: movw r0, :lower16:L_temp$non_lazy_ptr
+; THUMB: movt r0, :upper16:L_temp$non_lazy_ptr
+; THUMB: ldr r0, [r0]
+; THUMB: ldrb r1, [r0, #16]
+; THUMB: strb r1, [r0, #4]
+; THUMB: ldrb r1, [r0, #17]
+; THUMB: strb r1, [r0, #5]
+; THUMB: ldrb r1, [r0, #18]
+; THUMB: strb r1, [r0, #6]
+; THUMB: ldrb r1, [r0, #19]
+; THUMB: strb r1, [r0, #7]
+; THUMB: ldrb r1, [r0, #20]
+; THUMB: strb r1, [r0, #8]
+; THUMB: ldrb r1, [r0, #21]
+; THUMB: strb r1, [r0, #9]
+; THUMB: ldrb r1, [r0, #22]
+; THUMB: strb r1, [r0, #10]
+; THUMB: ldrb r1, [r0, #23]
+; THUMB: strb r1, [r0, #11]
+; THUMB: ldrb r1, [r0, #24]
+; THUMB: strb r1, [r0, #12]
+; THUMB: ldrb r1, [r0, #25]
+; THUMB: strb r1, [r0, #13]
+; THUMB: bx lr
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 10, i32 1, i1 false)
+  ret void
+}
diff --git a/test/CodeGen/ARM/fast-isel-pred.ll b/test/CodeGen/ARM/fast-isel-pred.ll
index 8de54ad533..27731def1f 100644
--- a/test/CodeGen/ARM/fast-isel-pred.ll
+++ b/test/CodeGen/ARM/fast-isel-pred.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -mtriple=armv7-apple-darwin < %s
+; RUN: llc -O0 -verify-machineinstrs -mtriple=armv7-apple-darwin < %s
 
 define i32 @main() nounwind ssp {
 entry:
diff --git a/test/CodeGen/ARM/fast-isel-redefinition.ll b/test/CodeGen/ARM/fast-isel-redefinition.ll
index e50c3a4954..563880dab0 100644
--- a/test/CodeGen/ARM/fast-isel-redefinition.ll
+++ b/test/CodeGen/ARM/fast-isel-redefinition.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -optimize-regalloc -regalloc=basic < %s
+; RUN: llc -O0 -verify-machineinstrs -optimize-regalloc -regalloc=basic < %s
 ; This isn't exactly a useful set of command-line options, but check that it
 ; doesn't crash.  (It was crashing because a register was getting redefined.)
 
diff --git a/test/CodeGen/ARM/fast-isel-static.ll b/test/CodeGen/ARM/fast-isel-static.ll
index a86e3251f7..e8759a7fc4 100644
--- a/test/CodeGen/ARM/fast-isel-static.ll
+++ b/test/CodeGen/ARM/fast-isel-static.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -relocation-model=static -arm-long-calls | FileCheck -check-prefix=LONG %s
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -relocation-model=static | FileCheck -check-prefix=NORM %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -verify-machineinstrs -relocation-model=static -arm-long-calls | FileCheck -check-prefix=LONG %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -verify-machineinstrs -relocation-model=static | FileCheck -check-prefix=NORM %s
 
 define void @myadd(float* %sum, float* %addend) nounwind {
 entry:
diff --git a/test/CodeGen/ARM/machine-cse-cmp.ll b/test/CodeGen/ARM/machine-cse-cmp.ll
index 3ac7d77d6f..03abd762a2 100644
--- a/test/CodeGen/ARM/machine-cse-cmp.ll
+++ b/test/CodeGen/ARM/machine-cse-cmp.ll
@@ -45,3 +45,35 @@ for.cond1.preheader:                              ; preds = %entry
 }
 
 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+
+; rdar://12462006
+define i8* @f3(i8* %base, i32* nocapture %offset, i32 %size) nounwind {
+entry:
+; CHECK: f3:
+; CHECK-NOT: sub
+; CHECK: cmp
+; CHECK: blt
+%0 = load i32* %offset, align 4
+%cmp = icmp slt i32 %0, %size
+%s = sub nsw i32 %0, %size
+%size2 = sub nsw i32 %size, 0
+br i1 %cmp, label %return, label %if.end
+
+if.end:
+; Check that %sub here is CSE'd with %s from the entry block.
+%sub = sub nsw i32 %0, %size2
+%s2 = sub nsw i32 %s, %size
+%s3 = sub nsw i32 %sub, %s2
+; CHECK: sub [[R1:r[0-9]+]], [[R2:r[0-9]+]], r2
+; CHECK: sub [[R3:r[0-9]+]], [[R1]], r2
+; CHECK: sub [[R4:r[0-9]+]], [[R1]], [[R3]]
+; CHECK-NOT: sub
+; CHECK: str
+store i32 %s3, i32* %offset, align 4
+%add.ptr = getelementptr inbounds i8* %base, i32 %sub
+br label %return
+
+return:
+%retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
+ret i8* %retval.0
+}
diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll
index dc772827f2..d846e5cb26 100644
--- a/test/CodeGen/ARM/memcpy-inline.ll
+++ b/test/CodeGen/ARM/memcpy-inline.ll
@@ -1,18 +1,115 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -disable-post-ra | FileCheck %s
-
-; CHECK: ldrd
-; CHECK: strd
-; CHECK: ldrb
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s
 
 %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
 
 @src = external global %struct.x
 @dst = external global %struct.x
 
-define i32 @t() {
+@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1
+@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1
+@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1
+@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR  \00", align 1
+@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1
+@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1
+@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16
+
+define i32 @t0() {
 entry:
+; CHECK: t0:
+; CHECK: vldr [[REG1:d[0-9]+]],
+; CHECK: vstr [[REG1]], 
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds (%struct.x* @dst, i32 0, i32 0), i8* getelementptr inbounds (%struct.x* @src, i32 0, i32 0), i32 11, i32 8, i1 false)
   ret i32 0
 }
 
+define void @t1(i8* nocapture %C) nounwind {
+entry:
+; CHECK: t1:
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; CHECK: adds r0, #15
+; CHECK: adds r1, #15
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8]* @.str1, i64 0, i64 0), i64 31, i32 1, i1 false)
+  ret void
+}
+
+define void @t2(i8* nocapture %C) nounwind {
+entry:
+; CHECK: t2:
+; CHECK: ldr [[REG2:r[0-9]+]], [r1, #32]
+; CHECK: str [[REG2]], [r0, #32]
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; CHECK: adds r0, #16
+; CHECK: adds r1, #16
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false)
+  ret void
+}
+
+define void @t3(i8* nocapture %C) nounwind {
+entry:
+; CHECK: t3:
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; CHECK: adds r0, #16
+; CHECK: adds r1, #16
+; CHECK: vld1.8 {d{{[0-9]+}}}, [r1]
+; CHECK: vst1.8 {d{{[0-9]+}}}, [r0]
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false)
+  ret void
+}
+
+define void @t4(i8* nocapture %C) nounwind {
+entry:
+; CHECK: t4:
+; CHECK: vld1.8 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1]
+; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0]
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false)
+  ret void
+}
+
+define void @t5(i8* nocapture %C) nounwind {
+entry:
+; CHECK: t5:
+; CHECK: movs [[REG5:r[0-9]+]], #0
+; CHECK: strb [[REG5]], [r0, #6]
+; CHECK: movw [[REG6:r[0-9]+]], #21587
+; CHECK: strh [[REG6]], [r0, #4]
+; CHECK: ldr [[REG7:r[0-9]+]], 
+; CHECK: str [[REG7]]
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
+  ret void
+}
+
+define void @t6() nounwind {
+entry:
+; CHECK: t6:
+; CHECK: vld1.8 {[[REG8:d[0-9]+]]}, [r0]
+; CHECK: vstr [[REG8]], [r1]
+; CHECK: adds r1, #6
+; CHECK: adds r0, #6
+; CHECK: vld1.8
+; CHECK: vst1.16
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8]* @.str6, i64 0, i64 0), i64 14, i32 1, i1 false)
+  ret void
+}
+
+%struct.Foo = type { i32, i32, i32, i32 }
+
+define void @t7(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind {
+entry:
+; CHECK: t7
+; CHECK: vld1.32
+; CHECK: vst1.32
+  %0 = bitcast %struct.Foo* %a to i8*
+  %1 = bitcast %struct.Foo* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false)
+  ret void
+}
+
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll
new file mode 100644
index 0000000000..ee8c364338
--- /dev/null
+++ b/test/CodeGen/ARM/memset-inline.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s
+
+define void @t1(i8* nocapture %c) nounwind optsize {
+entry:
+; CHECK: t1:
+; CHECK: movs r1, #0
+; CHECK: str r1, [r0]
+; CHECK: str r1, [r0, #4]
+; CHECK: str r1, [r0, #8]
+  call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false) ; zero-fill 12 bytes at %c; the patterns above expect three word stores at offsets 0, 4 and 8
+  ret void
+}
+
+define void @t2() nounwind ssp {
+entry:
+; CHECK: t2:
+; CHECK: add.w r1, r0, #10
+; CHECK: vmov.i32 {{q[0-9]+}}, #0x0
+; CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+; CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+  %buf = alloca [26 x i8], align 1 ; 26-byte stack buffer, byte-aligned
+  %0 = getelementptr inbounds [26 x i8]* %buf, i32 0, i32 0 ; pointer to the first byte of %buf
+  call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false) ; zero all 26 bytes; NOTE(review): the expected output looks like two overlapping 16-byte vector stores at offsets 10 and 0 - confirm
+  call void @something(i8* %0) nounwind ; pass the buffer to an external function
+  ret void
+}
+
+declare void @something(i8*) nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/ARM/popcnt.ll b/test/CodeGen/ARM/popcnt.ll
new file mode 100644
index 0000000000..0b9c9467c2
--- /dev/null
+++ b/test/CodeGen/ARM/popcnt.ll
@@ -0,0 +1,191 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; Implement ctpop with vcnt
+
+define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
+;CHECK: vcnt8:
+;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %tmp1)
+	ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
+;CHECK: vcntQ8:
+;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp1)
+	ret <16 x i8> %tmp2
+}
+
+define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind {
+; CHECK: vcnt16:
+; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %tmp1)
+	ret <4 x i16> %tmp2
+}
+
+define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind {
+; CHECK: vcntQ16:
+; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %tmp1)
+	ret <8 x i16> %tmp2
+}
+
+define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind {
+; CHECK: vcnt32:
+; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
+; CHECK: vrev32.16 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: vuzp.16 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %tmp1)
+	ret <2 x i32> %tmp2
+}
+
+define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
+; CHECK: vcntQ32:
+; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
+; CHECK: vrev32.16 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK: vuzp.16 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %tmp1)
+	ret <4 x i32> %tmp2
+}
+
+declare <8 x i8>  @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
+
+define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
+;CHECK: vclz8:
+;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 0)
+	ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
+;CHECK: vclz16:
+;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 0)
+	ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
+;CHECK: vclz32:
+;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 0)
+	ret <2 x i32> %tmp2
+}
+
+define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
+;CHECK: vclzQ8:
+;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 0)
+	ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
+;CHECK: vclzQ16:
+;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 0)
+	ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
+;CHECK: vclzQ32:
+;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 0)
+	ret <4 x i32> %tmp2
+}
+
+declare <8 x i8>  @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
+declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
+
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
+
+define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
+;CHECK: vclss8:
+;CHECK: vcls.s8
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
+	ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
+;CHECK: vclss16:
+;CHECK: vcls.s16
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
+	ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
+;CHECK: vclss32:
+;CHECK: vcls.s32
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
+	ret <2 x i32> %tmp2
+}
+
+define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
+;CHECK: vclsQs8:
+;CHECK: vcls.s8
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
+	ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
+;CHECK: vclsQs16:
+;CHECK: vcls.s16
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
+	ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
+;CHECK: vclsQs32:
+;CHECK: vcls.s32
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
+	ret <4 x i32> %tmp2
+}
+
+declare <8 x i8>  @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/reg_asc_order.ll b/test/CodeGen/ARM/reg_asc_order.ll
deleted file mode 100644
index d1d0ee5f3e..0000000000
--- a/test/CodeGen/ARM/reg_asc_order.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
-; Check that memcpy gets lowered to ldm/stm, at least in this very smple case.
-
-%struct.Foo = type { i32, i32, i32, i32 }
-
-define void @_Z10CopyStructP3FooS0_(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind {
-entry:
-;CHECK: ldm
-;CHECK: stm
-  %0 = bitcast %struct.Foo* %a to i8*
-  %1 = bitcast %struct.Foo* %b to i8*
-  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false)
-  ret void
-}
-
-declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/ARM/ret_sret_vector.ll b/test/CodeGen/ARM/ret_sret_vector.ll
new file mode 100644
index 0000000000..9bb3519555
--- /dev/null
+++ b/test/CodeGen/ARM/ret_sret_vector.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+define <4 x double> @PR14337(<4 x double> %a, <4 x double> %b) {
+  %foo = fadd <4 x double>  %a, %b
+  ret <4 x double> %foo
+; CHECK: PR14337:
+; CHECK: vst1.64
+; CHECK: vst1.64
+}
diff --git a/test/CodeGen/ARM/subreg-remat.ll b/test/CodeGen/ARM/subreg-remat.ll
index 455bfce0f2..1bc0315354 100644
--- a/test/CodeGen/ARM/subreg-remat.ll
+++ b/test/CodeGen/ARM/subreg-remat.ll
@@ -12,7 +12,7 @@ target triple = "thumbv7-apple-ios"
 ;
 ; CHECK: f1
 ; CHECK: vmov    d0, r0, r0
-; CHECK: vldr s0, LCPI
+; CHECK: vldr s1, LCPI
 ; The vector must be spilled:
 ; CHECK: vstr d0,
 ; CHECK: asm clobber d0
@@ -20,8 +20,8 @@ target triple = "thumbv7-apple-ios"
 ; CHECK: vldr [[D16:d[0-9]+]],
 ; CHECK: vstr [[D16]], [r1]
 define void @f1(float %x, <2 x float>* %p) {
-  %v1 = insertelement <2 x float> undef, float %x, i32 1
-  %v2 = insertelement <2 x float> %v1, float 0x400921FB60000000, i32 0
+  %v1 = insertelement <2 x float> undef, float %x, i32 0
+  %v2 = insertelement <2 x float> %v1, float 0x400921FB60000000, i32 1
   %y = call double asm sideeffect "asm clobber $0", "=w,0,~{d1},~{d2},~{d3},~{d4},~{d5},~{d6},~{d7},~{d8},~{d9},~{d10},~{d11},~{d12},~{d13},~{d14},~{d15},~{d16},~{d17},~{d18},~{d19},~{d20},~{d21},~{d22},~{d23},~{d24},~{d25},~{d26},~{d27},~{d28},~{d29},~{d30},~{d31}"(<2 x float> %v2) nounwind
   store <2 x float> %v2, <2 x float>* %p, align 8
   ret void
diff --git a/test/CodeGen/Hexagon/args.ll b/test/CodeGen/Hexagon/args.ll
index 8a6efb620e..767a442612 100644
--- a/test/CodeGen/Hexagon/args.ll
+++ b/test/CodeGen/Hexagon/args.ll
@@ -1,6 +1,5 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s
-; CHECK: r[[T0:[0-9]+]] = #7
-; CHECK: memw(r29 + #0) = r[[T0]]
+; CHECK: memw(r29{{ *}}+{{ *}}#0){{ *}}={{ *}}#7
 ; CHECK: r5 = #6
 ; CHECK: r0 = #1
 ; CHECK: r1 = #2
diff --git a/test/CodeGen/Hexagon/dualstore.ll b/test/CodeGen/Hexagon/dualstore.ll
index 9b27dda52c..067499530f 100644
--- a/test/CodeGen/Hexagon/dualstore.ll
+++ b/test/CodeGen/Hexagon/dualstore.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
 ; Check that we generate dual stores in one packet in V4
 
-; CHECK: memw(r{{[0-9]+}} + #{{[0-9]+}}) = r{{[0-9]+}}
-; CHECK-NEXT: memw(r{{[0-9]+}} + #{{[0-9]+}}) = r{{[0-9]+}}
+; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#{{[0-9]+}}){{ *}}={{ *}}#100000
+; CHECK-NEXT: memw(r{{[0-9]+}}{{ *}}+{{ *}}#{{[0-9]+}}){{ *}}={{ *}}#500000
 ; CHECK-NEXT: }
 
 @Reg = global i32 0, align 4
diff --git a/test/CodeGen/Hexagon/postinc-load.ll b/test/CodeGen/Hexagon/postinc-load.ll
index 4b5ea67090..855a347d74 100644
--- a/test/CodeGen/Hexagon/postinc-load.ll
+++ b/test/CodeGen/Hexagon/postinc-load.ll
@@ -1,4 +1,4 @@
-; RUN: true || llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
 
 ; Check that post-increment load instructions are being generated.
 ; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memw(r{{[0-9]+}}{{ *}}++{{ *}}#4{{ *}})
diff --git a/test/CodeGen/NVPTX/tuple-literal.ll b/test/CodeGen/NVPTX/tuple-literal.ll
new file mode 100644
index 0000000000..5c0cb2c15c
--- /dev/null
+++ b/test/CodeGen/NVPTX/tuple-literal.ll
@@ -0,0 +1,5 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_13
+
+define ptx_device void @test_function({i8, i8}*) {
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/vector-compare.ll b/test/CodeGen/NVPTX/vector-compare.ll
new file mode 100644
index 0000000000..2180499952
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-compare.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20
+
+; This test makes sure that the results of vector compares are properly
+; scalarized.  If codegen fails, then the type legalizer incorrectly
+; tried to promote <2 x i1> to <2 x i8> and instruction selection failed.
+
+define void @foo(<2 x i32>* %a, <2 x i32>* %b, i32* %r1, i32* %r2) {
+  %aval = load <2 x i32>* %a
+  %bval = load <2 x i32>* %b
+  %res = icmp slt <2 x i32> %aval, %bval
+  %t1 = extractelement <2 x i1> %res, i32 0
+  %t2 = extractelement <2 x i1> %res, i32 1
+  %t1a = zext i1 %t1 to i32
+  %t2a = zext i1 %t2 to i32
+  store i32 %t1a, i32* %r1
+  store i32 %t2a, i32* %r2
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/vector-select.ll b/test/CodeGen/NVPTX/vector-select.ll
new file mode 100644
index 0000000000..11893df103
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-select.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20
+
+; This test makes sure that vector selects are scalarized by the type legalizer.
+; If not, type legalization will fail.
+
+define void @foo(<2 x i32> addrspace(1)* %def_a, <2 x i32> addrspace(1)* %def_b, <2 x i32> addrspace(1)* %def_c) {
+entry:
+  %tmp4 = load <2 x i32> addrspace(1)* %def_a
+  %tmp6 = load <2 x i32> addrspace(1)* %def_c
+  %tmp8 = load <2 x i32> addrspace(1)* %def_b
+  %0 = icmp sge <2 x i32> %tmp4, zeroinitializer
+  %cond = select <2 x i1> %0, <2 x i32> %tmp6, <2 x i32> %tmp8
+  store <2 x i32> %cond, <2 x i32> addrspace(1)* %def_c
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/buildvec_canonicalize.ll b/test/CodeGen/PowerPC/buildvec_canonicalize.ll
index 0454c584bc..e155a35c4d 100644
--- a/test/CodeGen/PowerPC/buildvec_canonicalize.ll
+++ b/test/CodeGen/PowerPC/buildvec_canonicalize.ll
@@ -1,10 +1,4 @@
-; There should be exactly one vxor here.
-; RUN: llc < %s -march=ppc32 -mcpu=g5 --enable-unsafe-fp-math | \
-; RUN:   grep vxor | count 1
-
-; There should be exactly one vsplti here.
-; RUN: llc < %s -march=ppc32 -mcpu=g5 --enable-unsafe-fp-math | \
-; RUN:   grep vsplti | count 1
+; RUN: llc < %s -march=ppc32 -mattr=+altivec --enable-unsafe-fp-math | FileCheck %s
 
 define void @VXOR(<4 x float>* %P1, <4 x i32>* %P2, <4 x float>* %P3) {
         %tmp = load <4 x float>* %P3            ; <<4 x float>> [#uses=1]
@@ -15,10 +9,16 @@ define void @VXOR(<4 x float>* %P1, <4 x i32>* %P2, <4 x float>* %P3) {
         store <4 x i32> zeroinitializer, <4 x i32>* %P2
         ret void
 }
+; The fmul will spill a vspltisw to create a -0.0 vector used as the addend
+; to vmaddfp (so it would be IEEE compliant with zero sign propagation).
+; CHECK: @VXOR
+; CHECK: vsplti
+; CHECK: vxor
 
 define void @VSPLTI(<4 x i32>* %P2, <8 x i16>* %P3) {
         store <4 x i32> bitcast (<16 x i8> < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > to <4 x i32>), <4 x i32>* %P2
         store <8 x i16> < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >, <8 x i16>* %P3
         ret void
 }
-
+; CHECK: @VSPLTI
+; CHECK: vsplti
diff --git a/test/CodeGen/PowerPC/mcm-1.ll b/test/CodeGen/PowerPC/mcm-1.ll
new file mode 100644
index 0000000000..62fe88c2b8
--- /dev/null
+++ b/test/CodeGen/PowerPC/mcm-1.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s
+
+; Test correct code generation for medium code model (32-bit TOC offsets)
+; for loading and storing an external variable.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@ei = external global i32
+
+define signext i32 @test_external() nounwind {
+entry:
+  %0 = load i32* @ei, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @ei, align 4
+  ret i32 %0
+}
+
+; CHECK: test_external:
+; CHECK: addis [[REG1:[0-9]+]], 2, .LC[[TOCNUM:[0-9]+]]@toc@ha
+; CHECK: ld [[REG2:[0-9]+]], .LC[[TOCNUM]]@toc@l([[REG1]])
+; CHECK: lwz {{[0-9]+}}, 0([[REG2]])
+; CHECK: stw {{[0-9]+}}, 0([[REG2]])
+; CHECK: .section .toc
+; CHECK: .LC[[TOCNUM]]:
+; CHECK: .tc {{[a-z0-9A-Z_.]+}}[TC],{{[a-z0-9A-Z_.]+}}
diff --git a/test/CodeGen/PowerPC/mcm-2.ll b/test/CodeGen/PowerPC/mcm-2.ll
new file mode 100644
index 0000000000..45df0ab14f
--- /dev/null
+++ b/test/CodeGen/PowerPC/mcm-2.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s
+
+; Test correct code generation for medium code model (32-bit TOC offsets)
+; for loading and storing a static variable scoped to a function.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@test_fn_static.si = internal global i32 0, align 4
+
+define signext i32 @test_fn_static() nounwind {
+entry:
+  %0 = load i32* @test_fn_static.si, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @test_fn_static.si, align 4
+  ret i32 %0
+}
+
+; CHECK: test_fn_static:
+; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l
+; CHECK: lwz {{[0-9]+}}, 0([[REG2]])
+; CHECK: stw {{[0-9]+}}, 0([[REG2]])
+; CHECK: .type [[VAR]],@object
+; CHECK: .local [[VAR]]
+; CHECK: .comm [[VAR]],4,4
diff --git a/test/CodeGen/PowerPC/mcm-3.ll b/test/CodeGen/PowerPC/mcm-3.ll
new file mode 100644
index 0000000000..0e7bbe798b
--- /dev/null
+++ b/test/CodeGen/PowerPC/mcm-3.ll
@@ -0,0 +1,28 @@
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s
+
+; Test correct code generation for medium code model (32-bit TOC offsets)
+; for loading and storing a file-scope static variable.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@gi = global i32 5, align 4
+
+define signext i32 @test_file_static() nounwind {
+entry:
+  %0 = load i32* @gi, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @gi, align 4
+  ret i32 %0
+}
+
+; CHECK: test_file_static:
+; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l
+; CHECK: lwz {{[0-9]+}}, 0([[REG2]])
+; CHECK: stw {{[0-9]+}}, 0([[REG2]])
+; CHECK: .type [[VAR]],@object
+; CHECK: .data
+; CHECK: .globl [[VAR]]
+; CHECK: [[VAR]]:
+; CHECK: .long 5
diff --git a/test/CodeGen/PowerPC/mcm-4.ll b/test/CodeGen/PowerPC/mcm-4.ll
new file mode 100644
index 0000000000..db36d0bcf7
--- /dev/null
+++ b/test/CodeGen/PowerPC/mcm-4.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s
+
+; Test correct code generation for medium code model (32-bit TOC offsets)
+; for loading a value from the constant pool (TOC-relative).
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define double @test_double_const() nounwind {
+entry:
+  ret double 0x3F4FD4920B498CF0 ; same bit pattern as decimal 4562098671269285104, the .quad value expected in the constant pool below
+}
+
+; CHECK: [[VAR:[a-z0-9A-Z_.]+]]:
+; CHECK: .quad 4562098671269285104
+; CHECK: test_double_const:
+; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha
+; CHECK: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l
+; CHECK: lfd {{[0-9]+}}, 0([[REG2]])
diff --git a/test/CodeGen/PowerPC/mcm-5.ll b/test/CodeGen/PowerPC/mcm-5.ll
new file mode 100644
index 0000000000..10d89f5215
--- /dev/null
+++ b/test/CodeGen/PowerPC/mcm-5.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s
+
+; Test correct code generation for medium code model (32-bit TOC offsets)
+; for loading the address of a jump table from the TOC.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define signext i32 @test_jump_table(i32 signext %i) nounwind {
+entry:
+  %i.addr = alloca i32, align 4
+  store i32 %i, i32* %i.addr, align 4
+  %0 = load i32* %i.addr, align 4
+  switch i32 %0, label %sw.default [
+    i32 3, label %sw.bb
+    i32 4, label %sw.bb1
+    i32 5, label %sw.bb2
+    i32 6, label %sw.bb3
+  ]
+
+sw.default:                                       ; preds = %entry
+  br label %sw.epilog
+
+sw.bb:                                            ; preds = %entry
+  %1 = load i32* %i.addr, align 4
+  %mul = mul nsw i32 %1, 7
+  store i32 %mul, i32* %i.addr, align 4
+  br label %sw.bb1
+
+sw.bb1:                                           ; preds = %entry, %sw.bb
+  %2 = load i32* %i.addr, align 4
+  %dec = add nsw i32 %2, -1
+  store i32 %dec, i32* %i.addr, align 4
+  br label %sw.bb2
+
+sw.bb2:                                           ; preds = %entry, %sw.bb1
+  %3 = load i32* %i.addr, align 4
+  %add = add nsw i32 %3, 3
+  store i32 %add, i32* %i.addr, align 4
+  br label %sw.bb3
+
+sw.bb3:                                           ; preds = %entry, %sw.bb2
+  %4 = load i32* %i.addr, align 4
+  %shl = shl i32 %4, 1
+  store i32 %shl, i32* %i.addr, align 4
+  br label %sw.epilog
+
+sw.epilog:                                        ; preds = %sw.bb3, %sw.default
+  %5 = load i32* %i.addr, align 4
+  ret i32 %5
+}
+
+; CHECK: test_jump_table:
+; CHECK: addis [[REG1:[0-9]+]], 2, .LC[[TOCNUM:[0-9]+]]@toc@ha
+; CHECK: ld [[REG2:[0-9]+]], .LC[[TOCNUM]]@toc@l([[REG1]])
+; CHECK: ldx {{[0-9]+}}, {{[0-9]+}}, [[REG2]]
+; CHECK: .section .toc
+; CHECK: .LC[[TOCNUM]]:
+; CHECK: .tc {{[a-z0-9A-Z_.]+}}[TC],{{[a-z0-9A-Z_.]+}}
diff --git a/test/CodeGen/PowerPC/mcm-6.ll b/test/CodeGen/PowerPC/mcm-6.ll
new file mode 100644
index 0000000000..0a7fa762d4
--- /dev/null
+++ b/test/CodeGen/PowerPC/mcm-6.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium < %s | FileCheck %s
+
+; Test correct code generation for medium code model (32-bit TOC offsets)
+; for loading and storing a tentatively defined variable.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@ti = common global i32 0, align 4
+
+define signext i32 @test_tentative() nounwind {
+entry:
+  %0 = load i32* @ti, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @ti, align 4
+  ret i32 %0
+}
+
+; CHECK: test_tentative:
+; CHECK: addis [[REG1:[0-9]+]], 2, .LC[[TOCNUM:[0-9]+]]@toc@ha
+; CHECK: ld [[REG2:[0-9]+]], .LC[[TOCNUM]]@toc@l([[REG1]])
+; CHECK: lwz {{[0-9]+}}, 0([[REG2]])
+; CHECK: stw {{[0-9]+}}, 0([[REG2]])
+; CHECK: .section .toc
+; CHECK: .LC[[TOCNUM]]:
+; CHECK: .tc [[VAR:[a-z0-9A-Z_.]+]][TC],{{[a-z0-9A-Z_.]+}}
+; CHECK: .comm [[VAR]],4,4
diff --git a/test/CodeGen/PowerPC/mcm-7.ll b/test/CodeGen/PowerPC/mcm-7.ll
new file mode 100644
index 0000000000..0e9fa2b38b
--- /dev/null
+++ b/test/CodeGen/PowerPC/mcm-7.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium < %s | FileCheck %s
+
+; Test correct code generation for medium code model (32-bit TOC offsets)
+; for loading a function address.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i8* @test_fnaddr() nounwind {
+entry:
+  %func = alloca i32 (i32)*, align 8
+  store i32 (i32)* @foo, i32 (i32)** %func, align 8
+  %0 = load i32 (i32)** %func, align 8
+  %1 = bitcast i32 (i32)* %0 to i8*
+  ret i8* %1
+}
+
+declare signext i32 @foo(i32 signext)
+
+; CHECK: test_fnaddr:
+; CHECK: addis [[REG1:[0-9]+]], 2, .LC[[TOCNUM:[0-9]+]]@toc@ha
+; CHECK: ld [[REG2:[0-9]+]], .LC[[TOCNUM]]@toc@l([[REG1]])
+; CHECK: .section .toc
+; CHECK: .LC[[TOCNUM]]:
+; CHECK: .tc {{[a-z0-9A-Z_.]+}}[TC],{{[a-z0-9A-Z_.]+}}
diff --git a/test/CodeGen/PowerPC/mcm-default.ll b/test/CodeGen/PowerPC/mcm-default.ll
new file mode 100644
index 0000000000..19de2536ae
--- /dev/null
+++ b/test/CodeGen/PowerPC/mcm-default.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mcpu=pwr7 -O0 <%s | FileCheck %s
+
+; Test that we generate code for the medium model as the default.
+; Use an external variable reference as an example.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@ei = external global i32
+
+define signext i32 @test_external() nounwind {
+entry:
+  %0 = load i32* @ei, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @ei, align 4
+  ret i32 %0
+}
+
+; CHECK: test_external:
+; CHECK: addis [[REG1:[0-9]+]], 2, .LC[[TOCNUM:[0-9]+]]@toc@ha
+; CHECK: ld [[REG2:[0-9]+]], .LC[[TOCNUM]]@toc@l([[REG1]])
+; CHECK: lwz {{[0-9]+}}, 0([[REG2]])
+; CHECK: stw {{[0-9]+}}, 0([[REG2]])
+; CHECK: .section .toc
+; CHECK: .LC[[TOCNUM]]:
+; CHECK: .tc {{[a-z0-9A-Z_.]+}}[TC],{{[a-z0-9A-Z_.]+}}
diff --git a/test/CodeGen/PowerPC/mcm-obj.ll b/test/CodeGen/PowerPC/mcm-obj.ll
new file mode 100644
index 0000000000..ec1b7b0084
--- /dev/null
+++ b/test/CodeGen/PowerPC/mcm-obj.ll
@@ -0,0 +1,193 @@
+; RUN: llc -O0 -mcpu=pwr7 -code-model=medium -filetype=obj %s -o - | \
+; RUN: elf-dump --dump-section-data | FileCheck %s
+
+; FIXME: When asm-parse is available, could make this an assembly test.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@ei = external global i32
+
+define signext i32 @test_external() nounwind {
+entry:
+  %0 = load i32* @ei, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @ei, align 4
+  ret i32 %0
+}
+
+; Verify generation of R_PPC64_TOC16_HA (0x32) and R_PPC64_TOC16_LO_DS (0x40) for
+; accessing external variable ei.
+;
+; CHECK:       '.rela.text'
+; CHECK:       Relocation 0
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1:[0-9]+]]
+; CHECK-NEXT:  'r_type', 0x00000032
+; CHECK:       Relocation 1
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1]]
+; CHECK-NEXT:  'r_type', 0x00000040
+
+@test_fn_static.si = internal global i32 0, align 4
+
+define signext i32 @test_fn_static() nounwind {
+entry:
+  %0 = load i32* @test_fn_static.si, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @test_fn_static.si, align 4
+  ret i32 %0
+}
+
+; Verify generation of R_PPC64_TOC16_HA (0x32) and R_PPC64_TOC16_LO (0x30) for
+; accessing function-scoped variable si.
+;
+; CHECK:       Relocation 2
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM2:[0-9]+]]
+; CHECK-NEXT:  'r_type', 0x00000032
+; CHECK:       Relocation 3
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM2]]
+; CHECK-NEXT:  'r_type', 0x00000030
+
+@gi = global i32 5, align 4
+
+define signext i32 @test_file_static() nounwind {
+entry:
+  %0 = load i32* @gi, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @gi, align 4
+  ret i32 %0
+}
+
+; Verify generation of R_PPC64_TOC16_HA (0x32) and R_PPC64_TOC16_LO (0x30) for
+; accessing file-scope variable gi.
+;
+; CHECK:       Relocation 4
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM3:[0-9]+]]
+; CHECK-NEXT:  'r_type', 0x00000032
+; CHECK:       Relocation 5
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM3]]
+; CHECK-NEXT:  'r_type', 0x00000030
+
+define double @test_double_const() nounwind {
+entry:
+  ret double 0x3F4FD4920B498CF0
+}
+
+; Verify generation of R_PPC64_TOC16_HA (0x32) and R_PPC64_TOC16_LO (0x30) for
+; accessing a constant.
+;
+; CHECK:       Relocation 6
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM4:[0-9]+]]
+; CHECK-NEXT:  'r_type', 0x00000032
+; CHECK:       Relocation 7
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM4]]
+; CHECK-NEXT:  'r_type', 0x00000030
+
+define signext i32 @test_jump_table(i32 signext %i) nounwind {
+entry:
+  %i.addr = alloca i32, align 4
+  store i32 %i, i32* %i.addr, align 4
+  %0 = load i32* %i.addr, align 4
+  switch i32 %0, label %sw.default [
+    i32 3, label %sw.bb
+    i32 4, label %sw.bb1
+    i32 5, label %sw.bb2
+    i32 6, label %sw.bb3
+  ]
+
+sw.default:                                       ; preds = %entry
+  br label %sw.epilog
+
+sw.bb:                                            ; preds = %entry
+  %1 = load i32* %i.addr, align 4
+  %mul = mul nsw i32 %1, 7
+  store i32 %mul, i32* %i.addr, align 4
+  br label %sw.bb1
+
+sw.bb1:                                           ; preds = %entry, %sw.bb
+  %2 = load i32* %i.addr, align 4
+  %dec = add nsw i32 %2, -1
+  store i32 %dec, i32* %i.addr, align 4
+  br label %sw.bb2
+
+sw.bb2:                                           ; preds = %entry, %sw.bb1
+  %3 = load i32* %i.addr, align 4
+  %add = add nsw i32 %3, 3
+  store i32 %add, i32* %i.addr, align 4
+  br label %sw.bb3
+
+sw.bb3:                                           ; preds = %entry, %sw.bb2
+  %4 = load i32* %i.addr, align 4
+  %shl = shl i32 %4, 1
+  store i32 %shl, i32* %i.addr, align 4
+  br label %sw.epilog
+
+sw.epilog:                                        ; preds = %sw.bb3, %sw.default
+  %5 = load i32* %i.addr, align 4
+  ret i32 %5
+}
+
+; Verify generation of R_PPC64_TOC16_HA (0x32) and R_PPC64_TOC16_LO_DS (0x40) for
+; accessing a jump table address.
+;
+; CHECK:       Relocation 8
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM5:[0-9]+]]
+; CHECK-NEXT:  'r_type', 0x00000032
+; CHECK:       Relocation 9
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM5]]
+; CHECK-NEXT:  'r_type', 0x00000040
+
+@ti = common global i32 0, align 4
+
+define signext i32 @test_tentative() nounwind {
+entry:
+  %0 = load i32* @ti, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @ti, align 4
+  ret i32 %0
+}
+
+; Verify generation of R_PPC64_TOC16_HA (0x32) and R_PPC64_TOC16_LO_DS (0x40) for
+; accessing tentatively defined variable ti.
+;
+; CHECK:       Relocation 10
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM6:[0-9]+]]
+; CHECK-NEXT:  'r_type', 0x00000032
+; CHECK:       Relocation 11
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM6]]
+; CHECK-NEXT:  'r_type', 0x00000040
+
+define i8* @test_fnaddr() nounwind {
+entry:
+  %func = alloca i32 (i32)*, align 8
+  store i32 (i32)* @foo, i32 (i32)** %func, align 8
+  %0 = load i32 (i32)** %func, align 8
+  %1 = bitcast i32 (i32)* %0 to i8*
+  ret i8* %1
+}
+
+declare signext i32 @foo(i32 signext)
+
+; Verify generation of R_PPC64_TOC16_HA (0x32) and R_PPC64_TOC16_LO_DS (0x40) for
+; accessing function address foo.
+;
+; CHECK:       Relocation 12
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM7:[0-9]+]]
+; CHECK-NEXT:  'r_type', 0x00000032
+; CHECK:       Relocation 13
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM7]]
+; CHECK-NEXT:  'r_type', 0x00000040
+
diff --git a/test/CodeGen/PowerPC/s000-alias-misched.ll b/test/CodeGen/PowerPC/s000-alias-misched.ll
new file mode 100644
index 0000000000..d03ee8738e
--- /dev/null
+++ b/test/CodeGen/PowerPC/s000-alias-misched.ll
@@ -0,0 +1,101 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+; RUN: llc < %s -enable-misched -march=ppc64 -mcpu=a2 | FileCheck %s
+; RUN: llc < %s -enable-misched -enable-aa-sched-mi -march=ppc64 -mcpu=a2 | FileCheck %s
+
+@aa = external global [256 x [256 x double]], align 32
+@bb = external global [256 x [256 x double]], align 32
+@cc = external global [256 x [256 x double]], align 32
+@.str1 = external hidden unnamed_addr constant [6 x i8], align 1
+@X = external global [16000 x double], align 32
+@Y = external global [16000 x double], align 32
+@Z = external global [16000 x double], align 32
+@U = external global [16000 x double], align 32
+@V = external global [16000 x double], align 32
+@.str137 = external hidden unnamed_addr constant [14 x i8], align 1
+
+declare void @check(i32 signext) nounwind
+
+declare signext i32 @printf(i8* nocapture, ...) nounwind
+
+declare signext i32 @init(i8*) nounwind
+
+define signext i32 @s000() nounwind {
+entry:
+  %call = tail call signext i32 @init(i8* getelementptr inbounds ([6 x i8]* @.str1, i64 0, i64 0))
+  %call1 = tail call i64 @clock() nounwind
+  br label %for.cond2.preheader
+
+; CHECK: @s000
+
+for.cond2.preheader:                              ; preds = %for.end, %entry
+  %nl.018 = phi i32 [ 0, %entry ], [ %inc9, %for.end ]
+  br label %for.body4
+
+for.body4:                                        ; preds = %for.body4, %for.cond2.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond2.preheader ], [ %indvars.iv.next.15, %for.body4 ]
+  %arrayidx = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv
+  %arrayidx6 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv
+  %0 = bitcast double* %arrayidx to <1 x double>*
+  %1 = load <1 x double>* %0, align 32, !tbaa !0
+  %add = fadd <1 x double> %1, <double 1.000000e+00>
+  %2 = bitcast double* %arrayidx6 to <1 x double>*
+  store <1 x double> %add, <1 x double>* %2, align 32, !tbaa !0
+  %indvars.iv.next.322 = or i64 %indvars.iv, 4
+  %arrayidx.4 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.322
+  %arrayidx6.4 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.322
+  %3 = bitcast double* %arrayidx.4 to <1 x double>*
+  %4 = load <1 x double>* %3, align 32, !tbaa !0
+  %add.4 = fadd <1 x double> %4, <double 1.000000e+00>
+  %5 = bitcast double* %arrayidx6.4 to <1 x double>*
+  store <1 x double> %add.4, <1 x double>* %5, align 32, !tbaa !0
+  %indvars.iv.next.726 = or i64 %indvars.iv, 8
+  %arrayidx.8 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.726
+  %arrayidx6.8 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.726
+  %6 = bitcast double* %arrayidx.8 to <1 x double>*
+  %7 = load <1 x double>* %6, align 32, !tbaa !0
+  %add.8 = fadd <1 x double> %7, <double 1.000000e+00>
+  %8 = bitcast double* %arrayidx6.8 to <1 x double>*
+  store <1 x double> %add.8, <1 x double>* %8, align 32, !tbaa !0
+  %indvars.iv.next.1130 = or i64 %indvars.iv, 12
+  %arrayidx.12 = getelementptr inbounds [16000 x double]* @Y, i64 0, i64 %indvars.iv.next.1130
+  %arrayidx6.12 = getelementptr inbounds [16000 x double]* @X, i64 0, i64 %indvars.iv.next.1130
+  %9 = bitcast double* %arrayidx.12 to <1 x double>*
+  %10 = load <1 x double>* %9, align 32, !tbaa !0
+  %add.12 = fadd <1 x double> %10, <double 1.000000e+00>
+  %11 = bitcast double* %arrayidx6.12 to <1 x double>*
+  store <1 x double> %add.12, <1 x double>* %11, align 32, !tbaa !0
+  %indvars.iv.next.15 = add i64 %indvars.iv, 16
+  %lftr.wideiv.15 = trunc i64 %indvars.iv.next.15 to i32
+  %exitcond.15 = icmp eq i32 %lftr.wideiv.15, 16000
+  br i1 %exitcond.15, label %for.end, label %for.body4
+
+; All of the loads should come before all of the stores.
+; CHECK: mtctr
+; CHECK: stfd
+; CHECK-NOT: lfd
+; CHECK: bdnz
+
+for.end:                                          ; preds = %for.body4
+  %call7 = tail call signext i32 @dummy(double* getelementptr inbounds ([16000 x double]* @X, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @Y, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @Z, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @U, i64 0, i64 0), double* getelementptr inbounds ([16000 x double]* @V, i64 0, i64 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @aa, i64 0, i64 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @bb, i64 0, i64 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @cc, i64 0, i64 0), double 0.000000e+00) nounwind
+  %inc9 = add nsw i32 %nl.018, 1
+  %exitcond = icmp eq i32 %inc9, 400000
+  br i1 %exitcond, label %for.end10, label %for.cond2.preheader
+
+for.end10:                                        ; preds = %for.end
+  %call11 = tail call i64 @clock() nounwind
+  %sub = sub nsw i64 %call11, %call1
+  %conv = sitofp i64 %sub to double
+  %div = fdiv double %conv, 1.000000e+06
+  %call12 = tail call signext i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([14 x i8]* @.str137, i64 0, i64 0), double %div) nounwind
+  tail call void @check(i32 signext 1)
+  ret i32 0
+}
+
+declare i64 @clock() nounwind
+
+declare signext i32 @dummy(double*, double*, double*, double*, double*, [256 x double]*, [256 x double]*, [256 x double]*, double)
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/PowerPC/tls-ie-obj.ll b/test/CodeGen/PowerPC/tls-ie-obj.ll
new file mode 100644
index 0000000000..5cc0b187f6
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls-ie-obj.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mcpu=pwr7 -O0 -filetype=obj %s -o - | \
+; RUN: elf-dump --dump-section-data | FileCheck %s
+
+; Test correct relocation generation for thread-local storage
+; using the initial-exec model and integrated assembly.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@a = external thread_local global i32
+
+define signext i32 @main() nounwind {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  %0 = load i32* @a, align 4
+  ret i32 %0
+}
+
+; Verify generation of R_PPC64_GOT_TPREL16_DS and R_PPC64_TLS for
+; accessing external variable a.
+;
+; CHECK:       '.rela.text'
+; CHECK:       Relocation 0
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1:[0-9a-f]+]]
+; CHECK-NEXT:  'r_type', 0x00000057
+; CHECK:       Relocation 1
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1]]
+; CHECK-NEXT:  'r_type', 0x00000043
+
diff --git a/test/CodeGen/PowerPC/tls-ie.ll b/test/CodeGen/PowerPC/tls-ie.ll
new file mode 100644
index 0000000000..cc6f084efb
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls-ie.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mcpu=pwr7 -O0 <%s | FileCheck %s
+
+; Test correct assembly code generation for thread-local storage
+; using the initial-exec model.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@a = external thread_local global i32
+
+define signext i32 @main() nounwind {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  %0 = load i32* @a, align 4
+  ret i32 %0
+}
+
+; CHECK: ld [[REG:[0-9]+]], a@got@tprel(2)
+; CHECK: add {{[0-9]+}}, [[REG]], a@tls
+
diff --git a/test/CodeGen/PowerPC/vec_mul.ll b/test/CodeGen/PowerPC/vec_mul.ll
index 80f4de4a17..53bc75dd10 100644
--- a/test/CodeGen/PowerPC/vec_mul.ll
+++ b/test/CodeGen/PowerPC/vec_mul.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=ppc32 -mcpu=g5 | not grep mullw
-; RUN: llc < %s -march=ppc32 -mcpu=g5 | grep vmsumuhm
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -march=ppc32 -mattr=+altivec | FileCheck %s
 
 define <4 x i32> @test_v4i32(<4 x i32>* %X, <4 x i32>* %Y) {
 	%tmp = load <4 x i32>* %X		; <<4 x i32>> [#uses=1]
@@ -7,6 +6,9 @@ define <4 x i32> @test_v4i32(<4 x i32>* %X, <4 x i32>* %Y) {
 	%tmp3 = mul <4 x i32> %tmp, %tmp2		; <<4 x i32>> [#uses=1]
 	ret <4 x i32> %tmp3
 }
+; CHECK: test_v4i32:
+; CHECK: vmsumuhm
+; CHECK-NOT: mullw
 
 define <8 x i16> @test_v8i16(<8 x i16>* %X, <8 x i16>* %Y) {
 	%tmp = load <8 x i16>* %X		; <<8 x i16>> [#uses=1]
@@ -14,6 +16,9 @@ define <8 x i16> @test_v8i16(<8 x i16>* %X, <8 x i16>* %Y) {
 	%tmp3 = mul <8 x i16> %tmp, %tmp2		; <<8 x i16>> [#uses=1]
 	ret <8 x i16> %tmp3
 }
+; CHECK: test_v8i16:
+; CHECK: vmladduhm
+; CHECK-NOT: mullw
 
 define <16 x i8> @test_v16i8(<16 x i8>* %X, <16 x i8>* %Y) {
 	%tmp = load <16 x i8>* %X		; <<16 x i8>> [#uses=1]
@@ -21,3 +26,21 @@ define <16 x i8> @test_v16i8(<16 x i8>* %X, <16 x i8>* %Y) {
 	%tmp3 = mul <16 x i8> %tmp, %tmp2		; <<16 x i8>> [#uses=1]
 	ret <16 x i8> %tmp3
 }
+; CHECK: test_v16i8:
+; CHECK: vmuloub
+; CHECK: vmuleub
+; CHECK-NOT: mullw
+
+define <4 x float> @test_float(<4 x float>* %X, <4 x float>* %Y) {
+	%tmp = load <4 x float>* %X
+	%tmp2 = load <4 x float>* %Y
+	%tmp3 = fmul <4 x float> %tmp, %tmp2
+	ret <4 x float> %tmp3
+}
+; Check the creation of a negative zero float vector by creating a vector of
+; all bits set and shifting it 31 bits to the left, resulting in a vector of
+; 4 x 0x80000000 (-0.0 as float).
+; CHECK: test_float:
+; CHECK: vspltisw [[ZNEG:[0-9]+]], -1
+; CHECK: vslw     {{[0-9]+}}, [[ZNEG]], [[ZNEG]]
+; CHECK: vmaddfp
diff --git a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
index 9f5a677ed3..498c78165e 100644
--- a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
+++ b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
@@ -61,7 +61,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !12 = metadata !{i32 524289, metadata !4, metadata !"", metadata !4, i32 0, i64 192, i64 32, i64 0, i32 0, metadata !13, metadata !14, i32 0, null} ; [ DW_TAG_array_type ]
 !13 = metadata !{i32 524324, metadata !4, metadata !"double", metadata !4, i32 0, i64 64, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
 !14 = metadata !{metadata !15}
-!15 = metadata !{i32 524321, i64 0, i64 2}        ; [ DW_TAG_subrange_type ]
+!15 = metadata !{i32 524321, i64 0, i64 3}        ; [ DW_TAG_subrange_type ]
 !16 = metadata !{i32 524334, i32 0, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", metadata !9, i32 72, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null} ; [ DW_TAG_subprogram ]
 !17 = metadata !{i32 524309, metadata !4, metadata !"", metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null} ; [ DW_TAG_subroutine_type ]
 !18 = metadata !{null, metadata !19, metadata !20}
diff --git a/test/CodeGen/Thumb2/thumb2-mul.ll b/test/CodeGen/Thumb2/thumb2-mul.ll
index ac059bdaf0..a8134e6308 100644
--- a/test/CodeGen/Thumb2/thumb2-mul.ll
+++ b/test/CodeGen/Thumb2/thumb2-mul.ll
@@ -15,7 +15,7 @@ entry:
 ; CHECK: t1:
 ; CHECK: mla     r0, r2, r0, r1
 ; CHECK: add.w   r0, r0, r0, lsl #3
-; CHECL: add.w   r0, r3, r0, lsl #2
+; CHECK: add.w   r0, r3, r0, lsl #2
   %mul = mul i32 %n, %i
   %add = add i32 %mul, %j
   %0 = ptrtoint %struct.CMPoint* %thePoints to i32
diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
index 0dca14d064..890fd0f067 100644
--- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
@@ -78,7 +78,7 @@ declare void @llvm.stackrestore(i8*) nounwind
 !9 = metadata !{i32 458767, metadata !2, metadata !"", metadata !2, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
 !10 = metadata !{i32 458753, metadata !2, metadata !"", metadata !2, i32 0, i64 8, i64 8, i64 0, i32 0, metadata !5, metadata !11, i32 0, null} ; [ DW_TAG_array_type ]
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 458785, i64 0, i64 0}        ; [ DW_TAG_subrange_type ]
+!12 = metadata !{i32 458785, i64 0, i64 1}        ; [ DW_TAG_subrange_type ]
 !13 = metadata !{i32 3, i32 0, metadata !14, null}
 !14 = metadata !{i32 458763, metadata !1, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{i32 4, i32 0, metadata !14, null}
diff --git a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
index 94075e78a2..c2d9d84d4c 100644
--- a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
+++ b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
@@ -6,15 +6,16 @@
 define void @t(i32 %count) ssp nounwind {
 entry:
 ; CHECK: t:
-; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip)
-; CHECK: movups L_str(%rip), %xmm0
+; CHECK: movups L_str+12(%rip), %xmm0
+; CHECK: movups L_str(%rip), %xmm1
   %tmp0 = alloca [60 x i8], align 1
   %tmp1 = getelementptr inbounds [60 x i8]* %tmp0, i64 0, i64 0
   br label %bb1
 
 bb1:
 ; CHECK: LBB0_1:
-; CHECK: movaps %xmm0, (%rsp)
+; CHECK: movups %xmm0, 12(%rsp)
+; CHECK: movaps %xmm1, (%rsp)
   %tmp2 = phi i32 [ %tmp3, %bb1 ], [ 0, %entry ]
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp1, i8* getelementptr inbounds ([28 x i8]* @str, i64 0, i64 0), i64 28, i32 1, i1 false)
   %tmp3 = add i32 %tmp2, 1
diff --git a/test/CodeGen/X86/2010-07-06-DbgCrash.ll b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
index edd6015b0d..208e93e098 100644
--- a/test/CodeGen/X86/2010-07-06-DbgCrash.ll
+++ b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
@@ -16,7 +16,7 @@
 !103 = metadata !{i32 524299, metadata !97, i32 73, i32 0} ; [ DW_TAG_lexical_block ]
 !104 = metadata !{i32 524289, metadata !38, metadata !"", metadata !38, i32 0, i64 85312, i64 64, i64 0, i32 0, metadata !46, metadata !105, i32 0, null} ; [ DW_TAG_array_type ]
 !105 = metadata !{metadata !106}
-!106 = metadata !{i32 524321, i64 0, i64 1332}    ; [ DW_TAG_subrange_type ]
+!106 = metadata !{i32 524321, i64 0, i64 1333}    ; [ DW_TAG_subrange_type ]
 !107 = metadata !{i32 73, i32 0, metadata !103, null}
 
 define i32 @main() nounwind ssp {
diff --git a/test/CodeGen/X86/2011-11-30-or.ll b/test/CodeGen/X86/2011-11-30-or.ll
index 0a949eb29b..f66248bc5a 100644
--- a/test/CodeGen/X86/2011-11-30-or.ll
+++ b/test/CodeGen/X86/2011-11-30-or.ll
@@ -11,12 +11,12 @@ target triple = "x86_64-apple-macosx10.6.6"
 define void @select_func() {
 entry:
   %c.lobit.i.i.i = ashr <8 x i16> <i16 17, i16 5, i16 1, i16 15, i16 19, i16 15, i16 4, i16 1> , <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
-  %a35 = bitcast <8 x i16> %c.lobit.i.i.i to <2 x i64>
   %and.i56.i.i.i = and <8 x i16> %c.lobit.i.i.i, <i16 25, i16 8, i16 65, i16 25, i16 8, i16 95, i16 15, i16 45>
   %and.i5.i.i.i = bitcast <8 x i16> %and.i56.i.i.i to <2 x i64>
-  %neg.i.i.i.i = xor <2 x i64> %a35, <i64 -1, i64 -1>
-  %and.i.i.i.i = and <2 x i64> zeroinitializer, %neg.i.i.i.i
-  %or.i.i.i.i = or <2 x i64> %and.i.i.i.i, %and.i5.i.i.i
+  %neg.i.i.i.i = xor <8 x i16> %c.lobit.i.i.i, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %and.i.i.i = and <8 x i16> %neg.i.i.i.i, <i16 45, i16 15, i16 95, i16 8, i16 25, i16 65, i16 8, i16 25>
+  %and.i2.i.i.i = bitcast <8 x i16> %and.i.i.i to <2 x i64>
+  %or.i.i.i.i = or <2 x i64> %and.i2.i.i.i, %and.i5.i.i.i
   %a37 = bitcast <2 x i64> %or.i.i.i.i to <8 x i16>
   store <8 x i16> %a37, <8 x i16> addrspace(1)* undef, align 4
   ret void
diff --git a/test/CodeGen/X86/2011-12-28-vselecti8.ll b/test/CodeGen/X86/2011-12-28-vselecti8.ll
index dbc122ac6e..1a9d46d1e2 100644
--- a/test/CodeGen/X86/2011-12-28-vselecti8.ll
+++ b/test/CodeGen/X86/2011-12-28-vselecti8.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-darwin11.2.0"
 
 ; CHECK: @foo8
 ; CHECK: psll
-; CHECK: psraw
+; CHECK-NOT: psraw
 ; CHECK: pblendvb
 ; CHECK: ret
 define void @foo8(float* nocapture %RET) nounwind {
diff --git a/test/CodeGen/Generic/2012-07-15-BuildVectorPromote.ll b/test/CodeGen/X86/2012-07-15-BuildVectorPromote.ll
index 6591c64d87..078f1b05c3 100644
--- a/test/CodeGen/Generic/2012-07-15-BuildVectorPromote.ll
+++ b/test/CodeGen/X86/2012-07-15-BuildVectorPromote.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 < %s
+; RUN: llc < %s -march=x86 -mcpu=corei7
 ; We don't care about the output, just that it doesn't crash
 
 define <1 x i1> @buildvec_promote() {
diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
index 3b7a8a7b87..2c7dfc8dfd 100644
--- a/test/CodeGen/X86/2012-07-15-broadcastfold.ll
+++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
@@ -3,7 +3,7 @@
 declare x86_fastcallcc i64 @barrier()
 
 ;CHECK: bcast_fold
-;CHECK: vmovaps %xmm{{[0-9]+}}, [[SPILLED:[^\)]+\)]]
+;CHECK: vmov{{[au]}}ps %xmm{{[0-9]+}}, [[SPILLED:[^\)]+\)]]
 ;CHECK: barrier
 ;CHECK: vbroadcastss [[SPILLED]], %ymm0
 ;CHECK: ret
diff --git a/test/CodeGen/X86/2012-11-28-merge-store-alias.ll b/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
new file mode 100644
index 0000000000..756e86e0f8
--- /dev/null
+++ b/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-pc-win64 | FileCheck %s
+
+; CHECK: merge_stores_can
+; CHECK: callq foo
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movups  %xmm0
+; CHECK: callq foo
+; CHECK: ret
+declare i32 @foo([10 x i32]* )
+
+define i32 @merge_stores_can() nounwind ssp {
+  %object1 = alloca [10 x i32]
+
+  %ret0 = call i32 @foo([10 x i32]* %object1) nounwind
+
+  %O1_1 = getelementptr [10 x i32]* %object1, i64 0, i32 1
+  %O1_2 = getelementptr [10 x i32]* %object1, i64 0, i32 2
+  %O1_3 = getelementptr [10 x i32]* %object1, i64 0, i32 3
+  %O1_4 = getelementptr [10 x i32]* %object1, i64 0, i32 4
+  %ld_ptr = getelementptr [10 x i32]* %object1, i64 0, i32 9
+
+  store i32 0, i32* %O1_1
+  store i32 0, i32* %O1_2
+  %ret = load  i32* %ld_ptr  ; <--- does not alias.
+  store i32 0, i32* %O1_3
+  store i32 0, i32* %O1_4
+
+  %ret1 = call i32 @foo([10 x i32]* %object1) nounwind
+
+  ret i32 %ret
+}
+
+; CHECK: merge_stores_cant
+; CHECK-NOT: xorps %xmm0, %xmm0
+; CHECK-NOT: movups  %xmm0
+; CHECK: ret
+define i32 @merge_stores_cant([10 x i32]* %in0, [10 x i32]* %in1) nounwind ssp {
+
+  %O1_1 = getelementptr [10 x i32]* %in1, i64 0, i32 1
+  %O1_2 = getelementptr [10 x i32]* %in1, i64 0, i32 2
+  %O1_3 = getelementptr [10 x i32]* %in1, i64 0, i32 3
+  %O1_4 = getelementptr [10 x i32]* %in1, i64 0, i32 4
+  %ld_ptr = getelementptr [10 x i32]* %in0, i64 0, i32 2
+
+  store i32 0, i32* %O1_1
+  store i32 0, i32* %O1_2
+  %ret = load  i32* %ld_ptr  ;  <--- may alias
+  store i32 0, i32* %O1_3
+  store i32 0, i32* %O1_4
+
+  ret i32 %ret
+}
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
new file mode 100644
index 0000000000..f149e4a11e
--- /dev/null
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \
+; RUN:          -verify-machineinstrs | FileCheck %s
+;
+; Test LiveInterval update handling of DBG_VALUE.
+; rdar://12777252.
+;
+; CHECK: %entry
+; CHECK: DEBUG_VALUE: hg
+; CHECK: je
+
+%struct.node.0.27 = type { i16, double, [3 x double], i32, i32 }
+%struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] }
+%struct.bnode.1.28 = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode.1.28*, %struct.bnode.1.28* }
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp {
+entry:
+  call void @llvm.dbg.declare(metadata !{%struct.hgstruct.2.29* %hg}, metadata !4)
+  %type = getelementptr inbounds %struct.node.0.27* %p, i64 0, i32 0
+  %0 = load i16* %type, align 2, !tbaa !8
+  %cmp = icmp eq i16 %0, 1
+  br i1 %cmp, label %return, label %for.cond.preheader
+
+for.cond.preheader:                               ; preds = %entry
+  %arrayidx6.1 = getelementptr inbounds %struct.hgstruct.2.29* %hg, i64 0, i32 1, i64 1
+  %cmp22 = fcmp olt double 0.000000e+00, %dsq
+  %conv24 = zext i1 %cmp22 to i16
+  br label %return
+
+return:                                           ; preds = %for.cond.preheader, %entry
+  %retval.0 = phi i16 [ %conv24, %for.cond.preheader ], [ 0, %entry ]
+  ret i16 %retval.0
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"MultiSource/Benchmarks/Olden/bh/newbh.c", metadata !"MultiSource/Benchmarks/Olden/bh", metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{null}
+!4 = metadata !{i32 786689, null, metadata !"hg", metadata !5, i32 67109589, metadata !6, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [hg] [line 725]
+!5 = metadata !{i32 786473, metadata !"MultiSource/Benchmarks/Olden/bh/newbh.c", metadata !"MultiSource/Benchmarks/Olden/bh", null} ; [ DW_TAG_file_type ]
+!6 = metadata !{i32 786454, null, metadata !"hgstruct", metadata !5, i32 492, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786451, null, metadata !"", metadata !5, i32 487, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, i32 0, i32 0} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [from ]
+!8 = metadata !{metadata !"short", metadata !9}
+!9 = metadata !{metadata !"omnipotent char", metadata !10}
+!10 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
new file mode 100644
index 0000000000..f171c16df3
--- /dev/null
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \
+; RUN:          -verify-machineinstrs | FileCheck %s
+;
+; Test MachineScheduler handling of DBG_VALUE.
+; rdar://12776937.
+;
+; CHECK: %if.else581
+; CHECK: DEBUG_VALUE: num1
+; CHECK: call
+
+%union.rec = type {}
+
+@.str15 = external hidden unnamed_addr constant [6 x i8], align 1
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define i32 @AttachGalley(%union.rec** nocapture %suspend_pt) nounwind uwtable ssp {
+entry:
+  %num14075 = alloca [20 x i8], align 16
+  br label %if.end33
+
+if.end33:                                         ; preds = %entry
+  %cmp1733 = icmp eq i32 undef, 0
+  br label %if.else581
+
+if.else581:                                       ; preds = %if.end33
+  %cmp586 = icmp eq i8 undef, -123
+  br i1 %cmp586, label %if.then588, label %if.else594
+
+if.then588:                                       ; preds = %if.else581
+  br label %for.cond1710.preheader
+
+if.else594:                                       ; preds = %if.else581
+  unreachable
+
+for.cond1710.preheader:                           ; preds = %if.then588
+  br label %for.cond1710
+
+for.cond1710:                                     ; preds = %for.cond1710, %for.cond1710.preheader
+  br i1 undef, label %for.cond1710, label %if.then3344
+
+if.then3344:
+  br label %if.then4073
+
+if.then4073:                                      ; preds = %if.then3344
+  call void @llvm.dbg.declare(metadata !{[20 x i8]* %num14075}, metadata !4)
+  %arraydecay4078 = getelementptr inbounds [20 x i8]* %num14075, i64 0, i64 0
+  %0 = load i32* undef, align 4
+  %add4093 = add nsw i32 %0, 0
+  %conv4094 = sitofp i32 %add4093 to float
+  %div4095 = fdiv float %conv4094, 5.670000e+02
+  %conv4096 = fpext float %div4095 to double
+  %call4097 = call i32 (i8*, i32, i64, i8*, ...)* @__sprintf_chk(i8* %arraydecay4078, i32 0, i64 20, i8* getelementptr inbounds ([6 x i8]* @.str15, i64 0, i64 0), double %conv4096) nounwind
+  br i1 %cmp1733, label %if.then4107, label %if.else4114
+
+if.then4107:                                      ; preds = %if.then4073
+  unreachable
+
+if.else4114:                                      ; preds = %if.then4073
+  unreachable
+}
+
+declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset", metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{}
+!4 = metadata !{i32 786688, metadata !5, metadata !"num1", metadata !14, i32 815, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [num1] [line 815]
+!5 = metadata !{i32 786443, metadata !6, i32 815, i32 0, metadata !14, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!6 = metadata !{i32 786443, metadata !7, i32 812, i32 0, metadata !14, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!7 = metadata !{i32 786443, metadata !8, i32 807, i32 0, metadata !14, i32 175} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!8 = metadata !{i32 786443, metadata !9, i32 440, i32 0, metadata !14, i32 94} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!9 = metadata !{i32 786443, metadata !10, i32 435, i32 0, metadata !14, i32 91} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!10 = metadata !{i32 786443, metadata !11, i32 434, i32 0, metadata !14, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!11 = metadata !{i32 786443, metadata !12, i32 250, i32 0, metadata !14, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!12 = metadata !{i32 786443, metadata !13, i32 249, i32 0, metadata !14, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!13 = metadata !{i32 786443, metadata !3, i32 221, i32 0, metadata !14, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!14 = metadata !{i32 786473, metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset", null} ; [ DW_TAG_file_type ]
+!15 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!16 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!17 = metadata !{metadata !18}
+!18 = metadata !{i32 786465, i64 0, i64 20}       ; [ DW_TAG_subrange_type ] [0, 19]
+
+; Test DebugValue uses visited by RegisterPressureTracker findUseBetween().
+;
+; CHECK: @main
+; CHECK: DEBUG_VALUE: X
+; CHECK: call
+
+%"class.__gnu_cxx::hash_map" = type { %"class.__gnu_cxx::hashtable" }
+%"class.__gnu_cxx::hashtable" = type { i64, i64, i64, i64, i64, i64 }
+
+define void @main() uwtable ssp {
+entry:
+  %X = alloca %"class.__gnu_cxx::hash_map", align 8
+  br i1 undef, label %cond.true, label %cond.end
+
+cond.true:                                        ; preds = %entry
+  unreachable
+
+cond.end:                                         ; preds = %entry
+  call void @llvm.dbg.declare(metadata !{%"class.__gnu_cxx::hash_map"* %X}, metadata !21)
+  %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5
+  invoke void @_Znwm()
+          to label %exit.i unwind label %lpad2.i.i.i.i
+
+exit.i:                                           ; preds = %cond.end
+  unreachable
+
+lpad2.i.i.i.i:                                    ; preds = %cond.end
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br i1 undef, label %lpad.body.i.i, label %if.then.i.i.i.i.i.i.i.i
+
+if.then.i.i.i.i.i.i.i.i:                          ; preds = %lpad2.i.i.i.i
+  unreachable
+
+lpad.body.i.i:                                    ; preds = %lpad2.i.i.i.i
+  resume { i8*, i32 } %0
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_Znwm()
+
+!llvm.dbg.cu = !{!20}
+
+!20 = metadata !{i32 786449, i32 0, i32 4, metadata !"SingleSource/Benchmarks/Shootout-C++/hash.cpp", metadata !"SingleSource/Benchmarks/Shootout-C++", metadata !"clang version 3.3 (trunk 169129) (llvm/trunk 169135)", i1 true, i1 true, metadata !"", i32 0, null, null, null, null} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus]
+!21 = metadata !{i32 786688, null, metadata !"X", null, i32 29, metadata !22, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [X] [line 29]
+!22 = metadata !{i32 786454, null, metadata !"HM", metadata !23, i32 28, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ]
+!23 = metadata !{i32 786473, metadata !"SingleSource/Benchmarks/Shootout-C++/hash.cpp", metadata !"SingleSource/Benchmarks/Shootout-C++", null} ; [ DW_TAG_file_type ]
diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
new file mode 100644
index 0000000000..d290d514cc
--- /dev/null
+++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \
+; RUN:          -verify-machineinstrs | FileCheck %s
+;
+; Test RegisterPressure handling of DBG_VALUE.
+;
+; CHECK: %entry
+; CHECK: DEBUG_VALUE: callback
+; CHECK: ret
+
+%struct.btCompoundLeafCallback = type { i32, i32 }
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define void @test() unnamed_addr uwtable ssp align 2 {
+entry:
+  %callback = alloca %struct.btCompoundLeafCallback, align 8
+  br i1 undef, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  unreachable
+
+if.end:                                           ; preds = %entry
+  call void @llvm.dbg.declare(metadata !{%struct.btCompoundLeafCallback* %callback}, metadata !3)
+  %m = getelementptr inbounds %struct.btCompoundLeafCallback* %callback, i64 0, i32 1
+  store i32 0, i32* undef, align 8
+  %cmp12447 = icmp sgt i32 undef, 0
+  br i1 %cmp12447, label %for.body.lr.ph, label %invoke.cont44
+
+for.body.lr.ph:                                   ; preds = %if.end
+  unreachable
+
+invoke.cont44:                                    ; preds = %if.end
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", metadata !"MultiSource/Benchmarks/Bullet", metadata !"clang version 3.3 (trunk 168984) (llvm/trunk 168983)", i1 true, i1 true, metadata !"", i32 0, metadata !1, null, null, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{null, null}
+!3 = metadata !{i32 786688, null, metadata !"callback", null, i32 214, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [callback] [line 214]
+!4 = metadata !{i32 786451, null, metadata !"btCompoundLeafCallback", metadata !5, i32 90, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [from ]
+!5 = metadata !{i32 786473, metadata !"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", metadata !"MultiSource/Benchmarks/Bullet", null} ; [ DW_TAG_file_type ]
diff --git a/test/CodeGen/X86/2012-12-06-python27-miscompile.ll b/test/CodeGen/X86/2012-12-06-python27-miscompile.ll
new file mode 100644
index 0000000000..d9effc92fa
--- /dev/null
+++ b/test/CodeGen/X86/2012-12-06-python27-miscompile.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mtriple=i686-pc-win32 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0" ; NOTE(review): the RUN line overrides this with -mtriple=i686-pc-win32; confirm the 64-bit triple/datalayout is intended
+
+; Make sure the two zero stores below are each emitted using xorl (one memory
+; location at a time) and are not both done through XMM registers.
+
+;CHECK: @foo
+;CHECK: xorl
+;CHECK-NOT: xmm
+;CHECK: ret
+define i32 @foo (i64* %so) nounwind uwtable ssp {
+entry:
+  %used = getelementptr inbounds i64* %so, i32 3 ; &so[3]
+  store i64 0, i64* %used, align 8 ; so[3] = 0
+  %fill = getelementptr inbounds i64* %so, i32 2 ; &so[2], adjacent to so[3]
+  %L = load i64* %fill, align 8 ; old so[2] is read between the two zero stores
+  store i64 0, i64* %fill, align 8 ; so[2] = 0
+  %cmp28 = icmp sgt i64 %L, 0 ; was the old so[2] positive?
+  %R = sext i1 %cmp28 to i32 ; -1 if positive, 0 otherwise
+  ret i32 %R
+}
diff --git a/test/CodeGen/X86/2012-12-1-merge-multiple.ll b/test/CodeGen/X86/2012-12-1-merge-multiple.ll
new file mode 100644
index 0000000000..5931c3d27b
--- /dev/null
+++ b/test/CodeGen/X86/2012-12-1-merge-multiple.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-pc-win64 | FileCheck %s
+
+; CHECK: multiple_stores_on_chain
+; CHECK: movabsq
+; CHECK: movq
+; CHECK: movabsq
+; CHECK: movq
+; CHECK: ret
+define void @multiple_stores_on_chain(i16 * %A) { ; writes the constants 0..7 to A[0..7]
+entry:
+  %a0 = getelementptr inbounds i16* %A, i64 0 ; &A[0]
+  %a1 = getelementptr inbounds i16* %A, i64 1
+  %a2 = getelementptr inbounds i16* %A, i64 2
+  %a3 = getelementptr inbounds i16* %A, i64 3
+  %a4 = getelementptr inbounds i16* %A, i64 4
+  %a5 = getelementptr inbounds i16* %A, i64 5
+  %a6 = getelementptr inbounds i16* %A, i64 6
+  %a7 = getelementptr inbounds i16* %A, i64 7 ; &A[7]
+
+  store i16 0, i16* %a0 ; eight consecutive i16 constant stores; the checks before this
+  store i16 1, i16* %a1 ; function expect two movabsq/movq pairs in the output
+  store i16 2, i16* %a2
+  store i16 3, i16* %a3
+  store i16 4, i16* %a4
+  store i16 5, i16* %a5
+  store i16 6, i16* %a6
+  store i16 7, i16* %a7
+
+  ret void
+}
+
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 88ecd5a5d3..0be83f648d 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -671,7 +671,9 @@ define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
   ; CHECK: test_x86_sse2_storeu_dq
   ; CHECK: movl
   ; CHECK: vmovdqu
-  call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a1)
+  ; add operation forces the execution domain.
+  %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
   ret void
 }
 declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
@@ -681,6 +683,7 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
   ; CHECK: test_x86_sse2_storeu_pd
   ; CHECK: movl
   ; CHECK: vmovupd
+  ; fadd operation forces the execution domain.
   %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
   call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
   ret void
@@ -2345,7 +2348,7 @@ declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind rea
 
 
 define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
-  ; CHECK: vpermilps
+  ; CHECK: vpshufd
   %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 904f048d1e..65685a3224 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -6,7 +6,7 @@ define <4 x float> @test1(<4 x float> %a) nounwind {
   ret <4 x float> %b
 ; CHECK: test1:
 ; CHECK: vshufps
-; CHECK: vpermilps
+; CHECK: vpshufd
 }
 
 ; rdar://10538417
@@ -106,7 +106,7 @@ define <4 x float> @test11(<4 x float> %a) nounwind  {
 
 define <4 x float> @test12(<4 x float>* %a) nounwind  {
 ; CHECK: test12
-; CHECK: vpermilps $27, (
+; CHECK: vpshufd
   %tmp0 = load <4 x float>* %a
   %tmp1 = shufflevector <4 x float> %tmp0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x float> %tmp1
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index 5ad75236e1..67e4b40810 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -84,7 +84,7 @@ define <8 x float> @funcF(i32 %val) nounwind {
   ret <8 x float> %tmp
 }
 
-; CHECK: vpermilps  $0
+; CHECK: vpshufd  $0
 ; CHECK-NEXT: vinsertf128  $1
 define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
 entry:
@@ -93,7 +93,7 @@ entry:
 }
 
 ; CHECK: vextractf128  $1
-; CHECK-NEXT: vpermilps  $85
+; CHECK-NEXT: vpshufd
 ; CHECK-NEXT: vinsertf128  $1
 define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
index a414e6880c..cf319cb7fe 100644
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -4,15 +4,62 @@
 ; The mask for the vpblendw instruction needs to be identical for both halves
 ; of the YMM. Need to use two vpblendw instructions.
 
-; CHECK: blendw1
-; CHECK: vpblendw
-; CHECK: vpblendw
+; CHECK: vpblendw_test1
+; mask = 10010110 = 150
+; CHECK: vpblendw  $150, %ymm
 ; CHECK: ret
-define <16 x i16> @blendw1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
-  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
+define <16 x i16> @vpblendw_test1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
+  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3,  i32 20, i32 5,  i32 6,  i32 23, 
+                                                               i32 8, i32 25, i32 26, i32 11, i32 28, i32 13, i32 14, i32 31>
   ret <16 x i16> %t
 }
 
+; CHECK: vpblendw_test2
+; mask1 = 00010110 = 22
+; mask2 = 10000000 = 128
+; CHECK: vpblendw  $128, %xmm
+; CHECK: vpblendw  $22, %xmm
+; CHECK: vinserti128
+; CHECK: ret
+define <16 x i16> @vpblendw_test2(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline { ; the two 128-bit halves take different lanes from %b
+  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7, ; low half: lanes 1, 2, 4 from %b
+                                                               i32 8, i32 9,  i32 10, i32 11, i32 12, i32 13, i32 14, i32 31> ; high half: only lane 7 from %b
+  ret <16 x i16> %t
+}
+
+; CHECK: blend_test1
+; CHECK: vpblendd
+; CHECK: ret
+define <8 x i32> @blend_test1(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline { ; per-lane select between %a and %b, no lane movement
+  %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7> ; lanes 1, 2, 4 from %b, the rest from %a
+  ret <8 x i32> %t
+}
+
+; CHECK: blend_test2
+; CHECK: vpblendd
+; CHECK: ret
+define <8 x i32> @blend_test2(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline { ; NOTE(review): types and mask are identical to blend_test1; confirm a different case was not intended
+  %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7> ; lanes 1, 2, 4 from %b, the rest from %a
+  ret <8 x i32> %t
+}
+
+
+; CHECK: blend_test3
+; CHECK: vblendps
+; CHECK: ret
+define <8 x float> @blend_test3(<8 x float> %a, <8 x float> %b) nounwind alwaysinline { ; float variant of the per-lane select
+  %t = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7> ; lanes 1, 2, 4 from %b, the rest from %a
+  ret <8 x float> %t
+}
+
+; CHECK: blend_test4
+; CHECK: vblendpd
+; CHECK: ret
+define <4 x i64> @blend_test4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline { ; 64-bit integer lanes; the check before this function expects vblendpd
+  %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3> ; lanes 1 and 2 from %b, lanes 0 and 3 from %a
+  ret <4 x i64> %t
+}
+
 ; CHECK: vpshufhw $27, %ymm
 define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
index 11f811f8cf..34445428ea 100644
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -28,7 +28,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
 ; reduce the mask in this case.
 ;CHECK: vsel_8xi16
 ;CHECK: psllw
-;CHECK: psraw
+;CHECK-NOT: psraw
 ;CHECK: pblendvb
 ;CHECK: ret
 define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index ec447e5e9c..39a784dec3 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -1,6 +1,4 @@
-; RUN: llc < %s -march=x86 | grep btl | count 28
-; RUN: llc < %s -march=x86 -mcpu=pentium4 | grep btl | not grep esp
-; RUN: llc < %s -march=x86 -mcpu=penryn   | grep btl | not grep esp
+; RUN: llc < %s -mtriple=i386-apple-macosx -mcpu=penryn | FileCheck %s
 ; PR3253
 
 ; The register+memory form of the BT instruction should be usable on
@@ -21,6 +19,9 @@
 
 define void @test2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: test2
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
 	%tmp4 = icmp eq i32 %tmp3, 0		; <i1> [#uses=1]
@@ -36,6 +37,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @test2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: test2b
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
 	%tmp4 = icmp eq i32 %tmp3, 0		; <i1> [#uses=1]
@@ -51,6 +55,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @atest2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: atest2
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
 	%tmp4 = icmp eq i32 %tmp3, 0		; <i1> [#uses=1]
@@ -66,6 +73,8 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @atest2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: atest2b
+; CHECK: btl %eax, %ecx
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
 	%tmp4 = icmp eq i32 %tmp3, 0		; <i1> [#uses=1]
@@ -81,6 +90,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @test3(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: test3
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
 	%tmp4 = icmp eq i32 %tmp3, 0		; <i1> [#uses=1]
@@ -96,6 +108,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @test3b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: test3b
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
 	%tmp4 = icmp eq i32 %tmp3, 0		; <i1> [#uses=1]
@@ -111,6 +126,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @testne2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: testne2
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
 	%tmp4 = icmp ne i32 %tmp3, 0		; <i1> [#uses=1]
@@ -126,6 +144,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @testne2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: testne2b
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
 	%tmp4 = icmp ne i32 %tmp3, 0		; <i1> [#uses=1]
@@ -141,6 +162,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @atestne2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: atestne2
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
 	%tmp4 = icmp ne i32 %tmp3, 0		; <i1> [#uses=1]
@@ -156,6 +180,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @atestne2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: atestne2b
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
 	%tmp4 = icmp ne i32 %tmp3, 0		; <i1> [#uses=1]
@@ -171,6 +198,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @testne3(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: testne3
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
 	%tmp4 = icmp ne i32 %tmp3, 0		; <i1> [#uses=1]
@@ -186,6 +216,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @testne3b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: testne3b
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
 	%tmp4 = icmp ne i32 %tmp3, 0		; <i1> [#uses=1]
@@ -201,6 +234,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @query2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: query2
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
 	%tmp4 = icmp eq i32 %tmp3, 1		; <i1> [#uses=1]
@@ -216,6 +252,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @query2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: query2b
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
 	%tmp4 = icmp eq i32 %tmp3, 1		; <i1> [#uses=1]
@@ -231,6 +270,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @aquery2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: aquery2
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
 	%tmp4 = icmp eq i32 %tmp3, 1		; <i1> [#uses=1]
@@ -246,6 +288,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @aquery2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: aquery2b
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
 	%tmp4 = icmp eq i32 %tmp3, 1		; <i1> [#uses=1]
@@ -261,6 +306,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @query3(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: query3
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
 	%tmp4 = icmp eq i32 %tmp3, %tmp29		; <i1> [#uses=1]
@@ -276,6 +324,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @query3b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: query3b
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
 	%tmp4 = icmp eq i32 %tmp3, %tmp29		; <i1> [#uses=1]
@@ -291,6 +342,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @query3x(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: query3x
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
 	%tmp4 = icmp eq i32 %tmp29, %tmp3		; <i1> [#uses=1]
@@ -306,6 +360,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @query3bx(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: query3bx
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
 	%tmp4 = icmp eq i32 %tmp29, %tmp3		; <i1> [#uses=1]
@@ -321,6 +378,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @queryne2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: queryne2
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
 	%tmp4 = icmp ne i32 %tmp3, 1		; <i1> [#uses=1]
@@ -336,6 +396,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @queryne2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: queryne2b
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
 	%tmp4 = icmp ne i32 %tmp3, 1		; <i1> [#uses=1]
@@ -351,6 +414,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @aqueryne2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: aqueryne2
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
 	%tmp4 = icmp ne i32 %tmp3, 1		; <i1> [#uses=1]
@@ -366,6 +432,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @aqueryne2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: aqueryne2b
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
 	%tmp4 = icmp ne i32 %tmp3, 1		; <i1> [#uses=1]
@@ -381,6 +450,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @queryne3(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: queryne3
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
 	%tmp4 = icmp ne i32 %tmp3, %tmp29		; <i1> [#uses=1]
@@ -396,6 +468,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @queryne3b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: queryne3b
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
 	%tmp4 = icmp ne i32 %tmp3, %tmp29		; <i1> [#uses=1]
@@ -411,6 +486,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @queryne3x(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: queryne3x
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
 	%tmp4 = icmp ne i32 %tmp29, %tmp3		; <i1> [#uses=1]
@@ -426,6 +504,9 @@ UnifiedReturnBlock:		; preds = %entry
 
 define void @queryne3bx(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: queryne3bx
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
 	%tmp4 = icmp ne i32 %tmp29, %tmp3		; <i1> [#uses=1]
@@ -440,3 +521,16 @@ UnifiedReturnBlock:		; preds = %entry
 }
 
 declare void @foo()
+
+; rdar://12755626
+define zeroext i1 @invert(i32 %flags, i32 %flag) nounwind { ; true when bit %flag of %flags is clear
+; CHECK: invert
+; CHECK: btl %eax, %ecx
+; CHECK: setae
+entry:
+  %neg = xor i32 %flags, -1 ; ~flags
+  %shl = shl i32 1, %flag ; single-bit mask 1 << flag
+  %and = and i32 %shl, %neg ; isolates bit %flag of ~flags
+  %tobool = icmp ne i32 %and, 0 ; nonzero iff that bit was 0 in %flags
+  ret i1 %tobool
+}
diff --git a/test/CodeGen/X86/dbg-at-specficiation.ll b/test/CodeGen/X86/dbg-at-specficiation.ll
index aa5e6efede..48b8202bd5 100644
--- a/test/CodeGen/X86/dbg-at-specficiation.ll
+++ b/test/CodeGen/X86/dbg-at-specficiation.ll
@@ -17,4 +17,4 @@
 !7 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 320, i64 32, i32 0, i32 0, metadata !8, metadata !9, i32 0, i32 0} ; [ DW_TAG_array_type ]
 !8 = metadata !{i32 720932, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 720929, i64 0, i64 9}        ; [ DW_TAG_subrange_type ]
+!10 = metadata !{i32 720929, i64 0, i64 10}        ; [ DW_TAG_subrange_type ]
diff --git a/test/CodeGen/X86/dbg-declare.ll b/test/CodeGen/X86/dbg-declare.ll
index 5d4cedc5c4..b73e310cc5 100644
--- a/test/CodeGen/X86/dbg-declare.ll
+++ b/test/CodeGen/X86/dbg-declare.ll
@@ -51,7 +51,7 @@ declare void @llvm.stackrestore(i8*) nounwind
 !19 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 0, i64 8, i32 0, i32 0, metadata !20, metadata !21, i32 0, i32 0} ; [ DW_TAG_array_type ]
 !20 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
 !21 = metadata !{metadata !22}
-!22 = metadata !{i32 786465, i64 1, i64 0}        ; [ DW_TAG_subrange_type ]
+!22 = metadata !{i32 786465, i64 0, i64 -1}        ; [ DW_TAG_subrange_type ]
 !23 = metadata !{i32 7, i32 8, metadata !17, null}
 !24 = metadata !{i32 9, i32 1, metadata !17, null}
 !25 = metadata !{i32 8, i32 3, metadata !17, null}
diff --git a/test/CodeGen/X86/dbg-subrange.ll b/test/CodeGen/X86/dbg-subrange.ll
index 788910c7fe..0efb50e9a9 100644
--- a/test/CodeGen/X86/dbg-subrange.ll
+++ b/test/CodeGen/X86/dbg-subrange.ll
@@ -31,7 +31,7 @@ entry:
 !14 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 34359738368, i64 8, i32 0, i32 0, metadata !15, metadata !16, i32 0, i32 0} ; [ DW_TAG_array_type ]
 !15 = metadata !{i32 720932, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
 !16 = metadata !{metadata !17}
-!17 = metadata !{i32 720929, i64 0, i64 4294967295} ; [ DW_TAG_subrange_type ]
+!17 = metadata !{i32 720929, i64 0, i64 4294967296} ; [ DW_TAG_subrange_type ]
 !18 = metadata !{i32 5, i32 3, metadata !19, null}
 !19 = metadata !{i32 720907, metadata !5, i32 4, i32 1, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
 !20 = metadata !{i32 6, i32 1, metadata !19, null}
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index 2fe1ecd40e..7a1a9ae461 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -63,6 +63,16 @@ define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x floa
 }
 declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
+; To test execution dependency: both multiplicands come from memory; one load is expected as vmovaps, the other as a memory operand of vfmaddps.
+define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) {
+  ; CHECK: vmovaps
+  ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
+  %x = load <4 x float>* %a0 ; first multiplicand, loaded from memory
+  %y = load <4 x float>* %a1 ; second multiplicand, loaded from memory
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2) ; <<4 x float>> [#uses=1]
+  ret < 4 x float > %res
+}
+
 define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
   ; CHECK: vfmaddpd
   %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
@@ -82,6 +92,16 @@ define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x do
 }
 declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
+; To test execution dependency: both multiplicands come from memory; one load is expected as vmovapd, the other as a memory operand of vfmaddpd.
+define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) {
+  ; CHECK: vmovapd
+  ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
+  %x = load <2 x double>* %a0 ; first multiplicand, loaded from memory
+  %y = load <2 x double>* %a1 ; second multiplicand, loaded from memory
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2) ; <<2 x double>> [#uses=1]
+  ret < 2 x double > %res
+}
+
 define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
   ; CHECK: vfmaddps
   ; CHECK: ymm
diff --git a/test/CodeGen/X86/fold-load-vec.ll b/test/CodeGen/X86/fold-load-vec.ll
new file mode 100644
index 0000000000..c1756d5e2e
--- /dev/null
+++ b/test/CodeGen/X86/fold-load-vec.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41 | FileCheck %s
+
+; rdar://12721174
+; We should not fold movss into pshufd since pshufd expects m128 while movss
+; loads from m32.
+define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind { ; copies lane 0 of *source into lane 1 of a zeroed <2 x float>, stores it to *dest, then passes both lanes to @ext
+; CHECK: sample_test
+; CHECK: movss
+; CHECK: pshufd
+entry:
+  %source.addr = alloca <4 x float>*, align 8 ; stack slot for %source
+  %dest.addr = alloca <2 x float>*, align 8 ; stack slot for %dest
+  %tmp = alloca <2 x float>, align 8 ; local <2 x float> temporary
+  store <4 x float>* %source, <4 x float>** %source.addr, align 8
+  store <2 x float>* %dest, <2 x float>** %dest.addr, align 8
+  store <2 x float> zeroinitializer, <2 x float>* %tmp, align 8 ; tmp = <0.0, 0.0>
+  %0 = load <4 x float>** %source.addr, align 8
+  %arrayidx = getelementptr inbounds <4 x float>* %0, i64 0 ; &source[0]
+  %1 = load <4 x float>* %arrayidx, align 16 ; full 128-bit vector load
+  %2 = extractelement <4 x float> %1, i32 0 ; only lane 0 of it is used
+  %3 = load <2 x float>* %tmp, align 8
+  %4 = insertelement <2 x float> %3, float %2, i32 1 ; tmp lane 1 = source lane 0
+  store <2 x float> %4, <2 x float>* %tmp, align 8
+  %5 = load <2 x float>* %tmp, align 8
+  %6 = load <2 x float>** %dest.addr, align 8
+  %arrayidx1 = getelementptr inbounds <2 x float>* %6, i64 0 ; &dest[0]
+  store <2 x float> %5, <2 x float>* %arrayidx1, align 8 ; dest[0] = tmp
+  %7 = load <2 x float>** %dest.addr, align 8
+  %arrayidx2 = getelementptr inbounds <2 x float>* %7, i64 0
+  %8 = load <2 x float>* %arrayidx2, align 8 ; reload dest[0]
+  %vecext = extractelement <2 x float> %8, i32 0 ; lane 0 of dest[0]
+  %9 = load <2 x float>** %dest.addr, align 8
+  %arrayidx3 = getelementptr inbounds <2 x float>* %9, i64 0
+  %10 = load <2 x float>* %arrayidx3, align 8 ; reload dest[0] again
+  %vecext4 = extractelement <2 x float> %10, i32 1 ; lane 1 of dest[0]
+  call void @ext(float %vecext, float %vecext4)
+  ret void
+}
+declare void @ext(float, float)
diff --git a/test/CodeGen/X86/fold-pcmpeqd-2.ll b/test/CodeGen/X86/fold-pcmpeqd-2.ll
index 9cf4607cf5..2bde76efd2 100644
--- a/test/CodeGen/X86/fold-pcmpeqd-2.ll
+++ b/test/CodeGen/X86/fold-pcmpeqd-2.ll
@@ -43,21 +43,21 @@ forbody:		; preds = %forcond
 	%mul171.i = fmul <4 x float> %add167.i, %sub140.i		; <<4 x float>> [#uses=1]
 	%add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 >		; <<4 x float>> [#uses=1]
 	%bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer		; <<4 x i32>> [#uses=1]
+	%andnps178.i = add <4 x i32> %bitcast176.i, <i32 1, i32 1, i32 1, i32 1>		; <<4 x i32>> [#uses=1]
 	%bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float>		; <<4 x float>> [#uses=1]
 	%mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer		; <<4 x float>> [#uses=1]
 	%bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer		; <<4 x i32>> [#uses=1]
+	%andnps192.i = add <4 x i32> %bitcast190.i, <i32 1, i32 1, i32 1, i32 1>		; <<4 x i32>> [#uses=1]
 	%xorps.i = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
-	%orps203.i = or <4 x i32> %andnps192.i, %xorps.i		; <<4 x i32>> [#uses=1]
+	%orps203.i = add <4 x i32> %andnps192.i, %xorps.i		; <<4 x i32>> [#uses=1]
 	%bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float>		; <<4 x float>> [#uses=1]
 	%mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer		; <<4 x float>> [#uses=2]
 	%mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer		; <<4 x float>> [#uses=1]
 	%cmpunord.i11 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i8 3) nounwind		; <<4 x float>> [#uses=1]
 	%bitcast6.i13 = bitcast <4 x float> %cmpunord.i11 to <4 x i32>		; <<4 x i32>> [#uses=2]
-	%andps.i14 = and <4 x i32> zeroinitializer, %bitcast6.i13		; <<4 x i32>> [#uses=1]
+	%andps.i14 = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %bitcast6.i13		; <<4 x i32>> [#uses=1]
 	%not.i16 = xor <4 x i32> %bitcast6.i13, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
-	%andnps.i17 = and <4 x i32> zeroinitializer, %not.i16		; <<4 x i32>> [#uses=1]
+	%andnps.i17 = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %not.i16		; <<4 x i32>> [#uses=1]
 	%orps.i18 = or <4 x i32> %andnps.i17, %andps.i14		; <<4 x i32>> [#uses=1]
 	%bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float>		; <<4 x float>> [#uses=1]
 	%tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll
index 7a2bbc4ef0..dcc8f0d268 100644
--- a/test/CodeGen/X86/memcpy-2.ll
+++ b/test/CodeGen/X86/memcpy-2.ll
@@ -10,18 +10,18 @@
 define void @t1(i32 %argc, i8** %argv) nounwind  {
 entry:
 ; SSE2: t1:
+; SSE2: movsd _.str+16, %xmm0
+; SSE2: movsd %xmm0, 16(%esp)
 ; SSE2: movaps _.str, %xmm0
 ; SSE2: movaps %xmm0
-; SSE2: movb $0
-; SSE2: movl $0
-; SSE2: movl $0
+; SSE2: movb $0, 24(%esp)
 
 ; SSE1: t1:
+; SSE1: fldl _.str+16
+; SSE1: fstpl 16(%esp)
 ; SSE1: movaps _.str, %xmm0
 ; SSE1: movaps %xmm0
-; SSE1: movb $0
-; SSE1: movl $0
-; SSE1: movl $0
+; SSE1: movb $0, 24(%esp)
 
 ; NOSSE: t1:
 ; NOSSE: movb $0
diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll
new file mode 100644
index 0000000000..f5566e5e5d
--- /dev/null
+++ b/test/CodeGen/X86/misched-matrix.ll
@@ -0,0 +1,195 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
+; RUN:          -misched-topdown -verify-machineinstrs \
+; RUN:     | FileCheck %s -check-prefix=TOPDOWN
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
+; RUN:          -misched=ilpmin -verify-machineinstrs \
+; RUN:     | FileCheck %s -check-prefix=ILPMIN
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
+; RUN:          -misched=ilpmax -verify-machineinstrs \
+; RUN:     | FileCheck %s -check-prefix=ILPMAX
+;
+; Verify that the MI scheduler minimizes register pressure for a
+; uniform set of bottom-up subtrees (unrolled matrix multiply).
+;
+; For current top-down heuristics, ensure that some folded imulls have
+; been reordered with the stores. This tests the scheduler's cheap
+; alias analysis ability (that doesn't require any AliasAnalysis pass).
+;
+; TOPDOWN: %for.body
+; TOPDOWN: movl %{{.*}}, (
+; TOPDOWN: imull {{[0-9]*}}(
+; TOPDOWN: movl %{{.*}}, 4(
+; TOPDOWN: imull {{[0-9]*}}(
+; TOPDOWN: movl %{{.*}}, 8(
+; TOPDOWN: movl %{{.*}}, 12(
+; TOPDOWN: %for.end
+;
+; For -misched=ilpmin, verify that each expression subtree is
+; scheduled independently, and that the imull/adds are interleaved.
+;
+; ILPMIN: %for.body
+; ILPMIN: movl %{{.*}}, (
+; ILPMIN: imull
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: movl %{{.*}}, 4(
+; ILPMIN: imull
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: movl %{{.*}}, 8(
+; ILPMIN: imull
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: imull
+; ILPMIN: addl
+; ILPMIN: movl %{{.*}}, 12(
+; ILPMIN: %for.end
+;
+; For -misched=ilpmax, verify that each expression subtree is
+; scheduled independently, and that the imull/adds are clustered.
+;
+; ILPMAX: %for.body
+; ILPMAX: movl %{{.*}}, (
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: movl %{{.*}}, 4(
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: movl %{{.*}}, 8(
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: imull
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: addl
+; ILPMAX: movl %{{.*}}, 12(
+; ILPMAX: %for.end
+
+define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
+[4 x i32]* noalias nocapture %m3) nounwind uwtable ssp { ; per iteration: row i of m3 = row i of m1 times the 4x4 matrix m2
+entry:
+  br label %for.body
+
+for.body:                              ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; row index i
+  %arrayidx8 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 0
+  %tmp = load i32* %arrayidx8, align 4, !tbaa !0 ; m1[i][0]
+  %arrayidx12 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 0
+  %tmp1 = load i32* %arrayidx12, align 4, !tbaa !0 ; m2[0][0]
+  %arrayidx8.1 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 1
+  %tmp2 = load i32* %arrayidx8.1, align 4, !tbaa !0 ; m1[i][1]
+  %arrayidx12.1 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 0
+  %tmp3 = load i32* %arrayidx12.1, align 4, !tbaa !0 ; m2[1][0]
+  %arrayidx8.2 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 2
+  %tmp4 = load i32* %arrayidx8.2, align 4, !tbaa !0 ; m1[i][2]
+  %arrayidx12.2 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 0
+  %tmp5 = load i32* %arrayidx12.2, align 4, !tbaa !0 ; m2[2][0]
+  %arrayidx8.3 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 3
+  %tmp6 = load i32* %arrayidx8.3, align 4, !tbaa !0 ; m1[i][3]
+  %arrayidx12.3 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 0
+  %tmp8 = load i32* %arrayidx8, align 4, !tbaa !0 ; m1[i][0] again: the m1 row is reloaded for each output column
+  %arrayidx12.137 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 1
+  %tmp9 = load i32* %arrayidx12.137, align 4, !tbaa !0
+  %tmp10 = load i32* %arrayidx8.1, align 4, !tbaa !0
+  %arrayidx12.1.1 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 1
+  %tmp11 = load i32* %arrayidx12.1.1, align 4, !tbaa !0
+  %tmp12 = load i32* %arrayidx8.2, align 4, !tbaa !0
+  %arrayidx12.2.1 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 1
+  %tmp13 = load i32* %arrayidx12.2.1, align 4, !tbaa !0
+  %tmp14 = load i32* %arrayidx8.3, align 4, !tbaa !0
+  %arrayidx12.3.1 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 1
+  %tmp15 = load i32* %arrayidx12.3.1, align 4, !tbaa !0
+  %tmp16 = load i32* %arrayidx8, align 4, !tbaa !0
+  %arrayidx12.239 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 2
+  %tmp17 = load i32* %arrayidx12.239, align 4, !tbaa !0
+  %tmp18 = load i32* %arrayidx8.1, align 4, !tbaa !0
+  %arrayidx12.1.2 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 2
+  %tmp19 = load i32* %arrayidx12.1.2, align 4, !tbaa !0
+  %tmp20 = load i32* %arrayidx8.2, align 4, !tbaa !0
+  %arrayidx12.2.2 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 2
+  %tmp21 = load i32* %arrayidx12.2.2, align 4, !tbaa !0
+  %tmp22 = load i32* %arrayidx8.3, align 4, !tbaa !0
+  %arrayidx12.3.2 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 2
+  %tmp23 = load i32* %arrayidx12.3.2, align 4, !tbaa !0
+  %tmp24 = load i32* %arrayidx8, align 4, !tbaa !0
+  %arrayidx12.341 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 3
+  %tmp25 = load i32* %arrayidx12.341, align 4, !tbaa !0
+  %tmp26 = load i32* %arrayidx8.1, align 4, !tbaa !0
+  %arrayidx12.1.3 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 3
+  %tmp27 = load i32* %arrayidx12.1.3, align 4, !tbaa !0
+  %tmp28 = load i32* %arrayidx8.2, align 4, !tbaa !0
+  %arrayidx12.2.3 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 3
+  %tmp29 = load i32* %arrayidx12.2.3, align 4, !tbaa !0
+  %tmp30 = load i32* %arrayidx8.3, align 4, !tbaa !0
+  %arrayidx12.3.3 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 3
+  %tmp31 = load i32* %arrayidx12.3.3, align 4, !tbaa !0
+  %tmp7 = load i32* %arrayidx12.3, align 4, !tbaa !0 ; m2[3][0]; placed after all the other loads in source order
+  %mul = mul nsw i32 %tmp1, %tmp ; products for output column 0
+  %mul.1 = mul nsw i32 %tmp3, %tmp2
+  %mul.2 = mul nsw i32 %tmp5, %tmp4
+  %mul.3 = mul nsw i32 %tmp7, %tmp6
+  %mul.138 = mul nsw i32 %tmp9, %tmp8 ; products for output column 1
+  %mul.1.1 = mul nsw i32 %tmp11, %tmp10
+  %mul.2.1 = mul nsw i32 %tmp13, %tmp12
+  %mul.3.1 = mul nsw i32 %tmp15, %tmp14
+  %mul.240 = mul nsw i32 %tmp17, %tmp16 ; products for output column 2
+  %mul.1.2 = mul nsw i32 %tmp19, %tmp18
+  %mul.2.2 = mul nsw i32 %tmp21, %tmp20
+  %mul.3.2 = mul nsw i32 %tmp23, %tmp22
+  %mul.342 = mul nsw i32 %tmp25, %tmp24 ; products for output column 3
+  %mul.1.3 = mul nsw i32 %tmp27, %tmp26
+  %mul.2.3 = mul nsw i32 %tmp29, %tmp28
+  %mul.3.3 = mul nsw i32 %tmp31, %tmp30
+  %add.1 = add nsw i32 %mul.1, %mul ; sum of the four column-0 products (chain of three adds)
+  %add.2 = add nsw i32 %mul.2, %add.1
+  %add.3 = add nsw i32 %mul.3, %add.2
+  %add.1.1 = add nsw i32 %mul.1.1, %mul.138 ; column 1
+  %add.2.1 = add nsw i32 %mul.2.1, %add.1.1
+  %add.3.1 = add nsw i32 %mul.3.1, %add.2.1
+  %add.1.2 = add nsw i32 %mul.1.2, %mul.240 ; column 2
+  %add.2.2 = add nsw i32 %mul.2.2, %add.1.2
+  %add.3.2 = add nsw i32 %mul.3.2, %add.2.2
+  %add.1.3 = add nsw i32 %mul.1.3, %mul.342 ; column 3
+  %add.2.3 = add nsw i32 %mul.2.3, %add.1.3
+  %add.3.3 = add nsw i32 %mul.3.3, %add.2.3
+  %arrayidx16 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 0
+  store i32 %add.3, i32* %arrayidx16, align 4, !tbaa !0 ; m3[i][0]
+  %arrayidx16.1 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 1
+  store i32 %add.3.1, i32* %arrayidx16.1, align 4, !tbaa !0 ; m3[i][1]
+  %arrayidx16.2 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 2
+  store i32 %add.3.2, i32* %arrayidx16.2, align 4, !tbaa !0 ; m3[i][2]
+  %arrayidx16.3 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 3
+  store i32 %add.3.3, i32* %arrayidx16.3, align 4, !tbaa !0 ; m3[i][3]
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 4 ; leave the loop after rows 0..3
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                        ; preds = %for.body
+  ret void
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/misched-new.ll b/test/CodeGen/X86/misched-new.ll
index cec04b534f..89e45b7cfc 100644
--- a/test/CodeGen/X86/misched-new.ll
+++ b/test/CodeGen/X86/misched-new.ll
@@ -1,6 +1,9 @@
 ; RUN: llc < %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
 ; RUN:          -misched=shuffle -misched-bottomup -verify-machineinstrs \
 ; RUN:     | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
+; RUN:          -misched=shuffle -misched-topdown -verify-machineinstrs \
+; RUN:     | FileCheck %s --check-prefix TOPDOWN
 ; REQUIRES: asserts
 ;
 ; Interesting MachineScheduler cases.
@@ -51,3 +54,56 @@ if.end:                                           ; preds = %if.then, %entry
 }
 
 declare void @bar(i32,i32)
+
+; Test that the DAG builder can handle an undef vreg on ExitSU.
+; CHECK: hasundef
+; CHECK: call
+
+%t0 = type { i32, i32, i8 }
+%t6 = type { i32 (...)**, %t7* }
+%t7 = type { i32 (...)** }
+
+define void @hasundef() unnamed_addr uwtable ssp align 2 {  ; takes no arguments; both branch conditions and the callee below are undef
+  %1 = alloca %t0, align 8                        ; the one defined value here; passed as the call's second argument
+  br i1 undef, label %3, label %2                 ; undef condition
+
+; <label>:2                                       ; preds = %0
+  unreachable
+
+; <label>:3                                       ; preds = %0
+  br i1 undef, label %4, label %5                 ; undef condition again
+
+; <label>:4                                       ; preds = %3
+  call void undef(%t6* undef, %t0* %1)            ; indirect call through an undef callee with an undef first argument
+  unreachable                                     ; no fall-through after the call
+
+; <label>:5                                       ; preds = %3
+  ret void
+}
+
+; Test top-down subregister liveness tracking. Self-verification
+; catches any pressure set underflow.
+; rdar://12797931.
+;
+; TOPDOWN: @testSubregTracking
+; TOPDOWN: divb
+; TOPDOWN: movzbl %al
+; TOPDOWN: ret
+define void @testSubregTracking() nounwind uwtable ssp align 2 {  ; no arguments; every load/store address below is undef
+  %tmp = load i8* undef, align 1
+  %tmp6 = sub i8 0, %tmp                          ; 0 - %tmp, i.e. the negated byte
+  %tmp7 = load i8* undef, align 1
+  %tmp8 = udiv i8 %tmp6, %tmp7                    ; 8-bit unsigned divide (the checks above look for divb)
+  %tmp9 = zext i8 %tmp8 to i64                    ; quotient widened to 64 bits
+  %tmp10 = load i8* undef, align 1
+  %tmp11 = zext i8 %tmp10 to i64
+  %tmp12 = mul i64 %tmp11, %tmp9
+  %tmp13 = urem i8 %tmp6, %tmp7                   ; remainder of the same operands as the udiv
+  %tmp14 = zext i8 %tmp13 to i32
+  %tmp15 = add nsw i32 %tmp14, 0                  ; adds zero
+  %tmp16 = add i32 %tmp15, 0                      ; adds zero
+  store i32 %tmp16, i32* undef, align 4           ; remainder-derived value stored as i32
+  %tmp17 = add i64 0, %tmp12                      ; adds zero
+  store i64 %tmp17, i64* undef, align 8           ; quotient-derived product stored as i64
+  ret void
+}
diff --git a/test/CodeGen/X86/rdrand.ll b/test/CodeGen/X86/rdrand.ll
index e2224a6196..98f4077763 100644
--- a/test/CodeGen/X86/rdrand.ll
+++ b/test/CodeGen/X86/rdrand.ll
@@ -39,7 +39,7 @@ define i32 @_rdrand64_step(i64* %random_val) {
   %isvalid = extractvalue {i64, i32} %call, 1
   ret i32 %isvalid
 ; CHECK: _rdrand64_step:
-; CHECK: rdrandq	%r[[T1:[[a-z]+]]
+; CHECK: rdrandq	%r[[T1:[a-z]+]]
 ; CHECK: movq	%r[[T1]], (%r[[A0]])
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%e[[T1]], %eax
diff --git a/test/CodeGen/X86/sext-load.ll b/test/CodeGen/X86/sext-load.ll
index c9b39d3a48..58c93229a2 100644
--- a/test/CodeGen/X86/sext-load.ll
+++ b/test/CodeGen/X86/sext-load.ll
@@ -1,9 +1,30 @@
-; RUN: llc < %s -march=x86 | grep movsbl
+; RUN: llc < %s -march=x86 | FileCheck %s
 
-define i32 @foo(i32 %X) nounwind  {
+; When doing sign extension, use the sext-load lowering to take advantage of
+; x86's sign extension during loads.
+;
+; CHECK: test1:
+; CHECK:      movsbl {{.*}}, %eax
+; CHECK-NEXT: ret
+define i32 @test1(i32 %X) nounwind  {
 entry:
 	%tmp12 = trunc i32 %X to i8		; <i8> [#uses=1]
 	%tmp123 = sext i8 %tmp12 to i32		; <i32> [#uses=1]
 	ret i32 %tmp123
 }
 
+; When using a sextload representation, ensure that the sign extension is
+; preserved even when removing shifted-out low bits.
+;
+; CHECK: test2:
+; CHECK:      movswl {{.*}}, %eax
+; CHECK-NEXT: ret
+define i32 @test2({i16, [6 x i8]}* %this) {  ; returns the top 16 bits of the struct's 6-byte field, sign-extended to i32
+entry:
+  %b48 = getelementptr inbounds { i16, [6 x i8] }* %this, i32 0, i32 1  ; address of the [6 x i8] member (struct index 1)
+  %cast = bitcast [6 x i8]* %b48 to i48*          ; view those six bytes as a single i48
+  %bf.load = load i48* %cast, align 2
+  %bf.ashr = ashr i48 %bf.load, 32                ; arithmetic shift: drops the low 32 bits, keeps the upper 16 with their sign
+  %bf.cast = trunc i48 %bf.ashr to i32            ; low 32 bits of the shifted value, i.e. the sign-extended 16-bit quantity
+  ret i32 %bf.cast
+}
diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll
index 2f4317bf29..67ce1be135 100644
--- a/test/CodeGen/X86/sse2-blend.ll
+++ b/test/CodeGen/X86/sse2-blend.ll
@@ -28,33 +28,31 @@ define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) {
 
 ; Without forcing instructions, fall back to the preferred PS domain.
 ; CHECK: vsel_i64
-; CHECK: xorps
-; CHECK: andps
 ; CHECK: andnps
+; CHECK: andps
 ; CHECK: orps
 ; CHECK: ret
 
-define void@vsel_i64(<4 x i64>* %v1, <4 x i64>* %v2) {
-  %A = load <4 x i64>* %v1
-  %B = load <4 x i64>* %v2
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> %A, <4 x i64> %B
-  store <4 x i64 > %vsel, <4 x i64>* %v1
+define void@vsel_i64(<2 x i64>* %v1, <2 x i64>* %v2) {  ; blends *%v1 and *%v2 and writes the result back to *%v1
+  %A = load <2 x i64>* %v1
+  %B = load <2 x i64>* %v2
+  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %A, <2 x i64> %B  ; constant mask: element 0 from %A, element 1 from %B
+  store <2 x i64 > %vsel, <2 x i64>* %v1  ; overwrites the first input
   ret void
 }
 
 ; Without forcing instructions, fall back to the preferred PS domain.
 ; CHECK: vsel_double
-; CHECK: xorps
-; CHECK: andps
 ; CHECK: andnps
+; CHECK: andps
 ; CHECK: orps
 ; CHECK: ret
 
-define void@vsel_double(<4 x double>* %v1, <4 x double>* %v2) {
-  %A = load <4 x double>* %v1
-  %B = load <4 x double>* %v2
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> %A, <4 x double> %B
-  store <4 x double > %vsel, <4 x double>* %v1
+define void@vsel_double(<2 x double>* %v1, <2 x double>* %v2) {  ; blends *%v1 and *%v2 and writes the result back to *%v1
+  %A = load <2 x double>* %v1
+  %B = load <2 x double>* %v2
+  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %A, <2 x double> %B  ; constant mask: element 0 from %A, element 1 from %B
+  store <2 x double > %vsel, <2 x double>* %v1  ; overwrites the first input
   ret void
 }
 
diff --git a/test/CodeGen/X86/tailcall-fastisel.ll b/test/CodeGen/X86/tailcall-fastisel.ll
index 7f92af4dca..842ed25439 100644
--- a/test/CodeGen/X86/tailcall-fastisel.ll
+++ b/test/CodeGen/X86/tailcall-fastisel.ll
@@ -1,12 +1,11 @@
-; RUN: llc < %s -march=x86-64 -tailcallopt -fast-isel | not grep TAILCALL
-
-; Fast-isel shouldn't attempt to cope with tail calls.
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -tailcallopt -fast-isel -fast-isel-abort | FileCheck %s
 
 %0 = type { i64, i32, i8* }
 
 define fastcc i8* @"visit_array_aux<`Reference>"(%0 %arg, i32 %arg1) nounwind {
 fail:                                             ; preds = %entry
   %tmp20 = tail call fastcc i8* @"visit_array_aux<`Reference>"(%0 %arg, i32 undef) ; <i8*> [#uses=1]
+; CHECK: jmp "_visit_array_aux<`Reference>" ## TAILCALL
   ret i8* %tmp20
 }
 
diff --git a/test/CodeGen/X86/vec_shuffle-20.ll b/test/CodeGen/X86/vec_shuffle-20.ll
index 976cd1835b..b6b8ba6f84 100644
--- a/test/CodeGen/X86/vec_shuffle-20.ll
+++ b/test/CodeGen/X86/vec_shuffle-20.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 3
+; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 2
 
 define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) nounwind  {
 entry:
diff --git a/test/CodeGen/X86/vec_zero.ll b/test/CodeGen/X86/vec_zero.ll
index 682a0dfca8..c3ea0ad202 100644
--- a/test/CodeGen/X86/vec_zero.ll
+++ b/test/CodeGen/X86/vec_zero.ll
@@ -13,7 +13,7 @@ define void @foo(<4 x float>* %P) {
 ; CHECK: pxor
 define void @bar(<4 x i32>* %P) {
         %T = load <4 x i32>* %P         ; <<4 x i32>> [#uses=1]
-        %S = add <4 x i32> zeroinitializer, %T          ; <<4 x i32>> [#uses=1]
+        %S = sub <4 x i32> zeroinitializer, %T          ; <<4 x i32>> [#uses=1]
         store <4 x i32> %S, <4 x i32>* %P
         ret void
 }
author	Derek Schuff <dschuff@chromium.org>	2013-01-09 16:55:43 -0800
committer	Derek Schuff <dschuff@chromium.org>	2013-01-11 13:47:37 -0800
commit	b770d0e0636a4b5ad61b1ca661caee67576c05fc (patch)
tree	c486ce032d41f97313c50629bd5b879f53e6ccbf /test/CodeGen
parent	b835840cf112a6178506d834b58aa625f59a8994 (diff)
parent	1ad9253c9d34ccbce3e7e4ea5d87c266cbf93410 (diff)