Diffstat (limited to 'test/CodeGen/X86')
53 files changed, 1754 insertions, 153 deletions
diff --git a/test/CodeGen/X86/2008-10-27-StackRealignment.ll b/test/CodeGen/X86/2008-10-27-StackRealignment.ll
deleted file mode 100644
index a57f7166ca..0000000000
--- a/test/CodeGen/X86/2008-10-27-StackRealignment.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; Linux doesn't support stack realignment for functions with allocas (PR2888).
-; Until it does, we shouldn't use movaps to access the stack. On targets with
-; sufficiently aligned stack (e.g. darwin) we should.
-; PR8969 - make 32-bit linux have a 16-byte aligned stack
-; RUN: llc < %s -mtriple=i386-pc-linux-gnu -mcpu=yonah | grep movaps | count 2
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mcpu=yonah | grep movaps | count 2
-
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i386-pc-linux-gnu"
-
-define void @foo(i32 %t) nounwind {
- %tmp1210 = alloca i8, i32 32, align 4
- call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 32, i32 4, i1 false)
- %x = alloca i8, i32 %t
- call void @dummy(i8* %x)
- ret void
-}
-
-declare void @dummy(i8*)
-
-declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
index 0dca14d064..890fd0f067 100644
--- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
@@ -78,7 +78,7 @@ declare void @llvm.stackrestore(i8*) nounwind
 !9 = metadata !{i32 458767, metadata !2, metadata !"", metadata !2, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
 !10 = metadata !{i32 458753, metadata !2, metadata !"", metadata !2, i32 0, i64 8, i64 8, i64 0, i32 0, metadata !5, metadata !11, i32 0, null} ; [ DW_TAG_array_type ]
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 458785, i64 0, i64 0} ; [ DW_TAG_subrange_type ]
+!12 = metadata !{i32 458785, i64 0, i64 1} ; [ DW_TAG_subrange_type ]
 !13 = metadata !{i32 3, i32 0, metadata !14, null}
 !14 = metadata !{i32 458763, metadata !1, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{i32 4, i32 0, metadata !14, null}
diff --git a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
index 94075e78a2..c2d9d84d4c 100644
--- a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
+++ b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
@@ -6,15 +6,16 @@ define void @t(i32 %count) ssp nounwind {
 entry:
 ; CHECK: t:
-; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip)
-; CHECK: movups L_str(%rip), %xmm0
+; CHECK: movups L_str+12(%rip), %xmm0
+; CHECK: movups L_str(%rip), %xmm1
 %tmp0 = alloca [60 x i8], align 1
 %tmp1 = getelementptr inbounds [60 x i8]* %tmp0, i64 0, i64 0
 br label %bb1

bb1:
 ; CHECK: LBB0_1:
-; CHECK: movaps %xmm0, (%rsp)
+; CHECK: movups %xmm0, 12(%rsp)
+; CHECK: movaps %xmm1, (%rsp)
 %tmp2 = phi i32 [ %tmp3, %bb1 ], [ 0, %entry ]
 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp1, i8* getelementptr inbounds ([28 x i8]* @str, i64 0, i64 0), i64 28, i32 1, i1 false)
 %tmp3 = add i32 %tmp2, 1
diff --git a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
index 7d1cda35a2..3d058bc289 100644
--- a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
+++ b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
@@ -10,10 +10,10 @@ entry:
 ; CHECK: movl ([[REG:%[a-z]+]]), %eax
 ; CHECK: movl 4([[REG]]), %edx
 ; CHECK: LBB0_1:
-; CHECK: movl $1
-; CHECK: addl
-; CHECK: movl $0
-; CHECK: adcl
+; CHECK: movl %eax, %ebx
+; CHECK: addl {{%[a-z]+}}, %ebx
+; CHECK: movl %edx, %ecx
+; CHECK: adcl {{%[a-z]+}}, %ecx
 ; CHECK: lock
 ; CHECK-NEXT: cmpxchg8b ([[REG]])
 ; CHECK-NEXT: jne
diff --git a/test/CodeGen/X86/2010-07-06-DbgCrash.ll b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
index edd6015b0d..208e93e098 100644
--- a/test/CodeGen/X86/2010-07-06-DbgCrash.ll
+++ b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
@@ -16,7 +16,7 @@
 !103 = metadata !{i32 524299, metadata !97, i32 73, i32 0} ; [ DW_TAG_lexical_block ]
 !104 = metadata !{i32 524289, metadata !38, metadata !"", metadata !38, i32 0, i64 85312, i64 64, i64 0, i32 0, metadata !46, metadata !105, i32 0, null} ; [ DW_TAG_array_type ]
 !105 = metadata !{metadata !106}
-!106 = metadata !{i32 524321, i64 0, i64 1332} ; [ DW_TAG_subrange_type ]
+!106 = metadata !{i32 524321, i64 0, i64 1333} ; [ DW_TAG_subrange_type ]
 !107 = metadata !{i32 73, i32 0, metadata !103, null}

 define i32 @main() nounwind ssp {
diff --git a/test/CodeGen/X86/2011-11-30-or.ll b/test/CodeGen/X86/2011-11-30-or.ll
index 0a949eb29b..f66248bc5a 100644
--- a/test/CodeGen/X86/2011-11-30-or.ll
+++ b/test/CodeGen/X86/2011-11-30-or.ll
@@ -11,12 +11,12 @@ target triple = "x86_64-apple-macosx10.6.6"
 define void @select_func() {
 entry:
 %c.lobit.i.i.i = ashr <8 x i16> <i16 17, i16 5, i16 1, i16 15, i16 19, i16 15, i16 4, i16 1> , <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
-  %a35 = bitcast <8 x i16> %c.lobit.i.i.i to <2 x i64>
 %and.i56.i.i.i = and <8 x i16> %c.lobit.i.i.i, <i16 25, i16 8, i16 65, i16 25, i16 8, i16 95, i16 15, i16 45>
 %and.i5.i.i.i = bitcast <8 x i16> %and.i56.i.i.i to <2 x i64>
-  %neg.i.i.i.i = xor <2 x i64> %a35, <i64 -1, i64 -1>
-  %and.i.i.i.i = and <2 x i64> zeroinitializer, %neg.i.i.i.i
-  %or.i.i.i.i = or <2 x i64> %and.i.i.i.i, %and.i5.i.i.i
+  %neg.i.i.i.i = xor <8 x i16> %c.lobit.i.i.i, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %and.i.i.i = and <8 x i16> %neg.i.i.i.i, <i16 45, i16 15, i16 95, i16 8, i16 25, i16 65, i16 8, i16 25>
+  %and.i2.i.i.i = bitcast <8 x i16> %and.i.i.i to <2 x i64>
+  %or.i.i.i.i = or <2 x i64> %and.i2.i.i.i, %and.i5.i.i.i
 %a37 = bitcast <2 x i64> %or.i.i.i.i to <8 x i16>
 store <8 x i16> %a37, <8 x i16> addrspace(1)* undef, align 4
 ret void
diff --git a/test/CodeGen/X86/2011-12-28-vselecti8.ll b/test/CodeGen/X86/2011-12-28-vselecti8.ll
index dbc122ac6e..1a9d46d1e2 100644
--- a/test/CodeGen/X86/2011-12-28-vselecti8.ll
+++ b/test/CodeGen/X86/2011-12-28-vselecti8.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-darwin11.2.0"
 ; CHECK: @foo8
 ; CHECK: psll
-; CHECK: psraw
+; CHECK-NOT: psraw
 ; CHECK: pblendvb
 ; CHECK: ret
 define void @foo8(float* nocapture %RET) nounwind {
diff --git a/test/CodeGen/X86/2012-07-15-BuildVectorPromote.ll b/test/CodeGen/X86/2012-07-15-BuildVectorPromote.ll
new file mode 100644
index 0000000000..078f1b05c3
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-15-BuildVectorPromote.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7
+; We don't care about the output, just that it doesn't crash
+
+define <1 x i1> @buildvec_promote() {
+ %cmp = icmp ule <1 x i32> undef, undef
+ %sel = select i1 undef, <1 x i1> undef, <1 x i1> %cmp
+ ret <1 x i1> %sel
+}
diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
index 3b7a8a7b87..2c7dfc8dfd 100644
--- a/test/CodeGen/X86/2012-07-15-broadcastfold.ll
+++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
@@ -3,7 +3,7 @@ declare x86_fastcallcc i64 @barrier()
 ;CHECK: bcast_fold
-;CHECK: vmovaps %xmm{{[0-9]+}}, [[SPILLED:[^\)]+\)]]
+;CHECK: vmov{{[au]}}ps %xmm{{[0-9]+}}, [[SPILLED:[^\)]+\)]]
 ;CHECK: barrier
 ;CHECK: vbroadcastss [[SPILLED]], %ymm0
 ;CHECK: ret
diff --git a/test/CodeGen/X86/2012-11-28-merge-store-alias.ll b/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
new file mode 100644
index 0000000000..756e86e0f8
--- /dev/null
+++ b/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-pc-win64 | FileCheck %s
+
+; CHECK: merge_stores_can
+; CHECK: callq foo
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movups %xmm0
+; CHECK: callq foo
+; CHECK: ret
+declare i32 @foo([10 x i32]* )
+
+define i32 @merge_stores_can() nounwind ssp {
+ %object1 = alloca [10 x i32]
+
+ %ret0 = call i32 @foo([10 x i32]* %object1) nounwind
+
+ %O1_1 = getelementptr [10 x i32]* %object1, i64 0, i32 1
+ %O1_2 = getelementptr [10 x i32]* %object1, i64 0, i32 2
+ %O1_3 = getelementptr [10 x i32]* %object1, i64 0, i32 3
+ %O1_4 = getelementptr [10 x i32]* %object1, i64 0, i32 4
+ %ld_ptr = getelementptr [10 x i32]* %object1, i64 0, i32 9
+
+ store i32 0, i32* %O1_1
+ store i32 0, i32* %O1_2
+ %ret = load i32* %ld_ptr ; <--- does not alias.
+ store i32 0, i32* %O1_3
+ store i32 0, i32* %O1_4
+
+ %ret1 = call i32 @foo([10 x i32]* %object1) nounwind
+
+ ret i32 %ret
+}
+
+; CHECK: merge_stores_cant
+; CHECK-NOT: xorps %xmm0, %xmm0
+; CHECK-NOT: movups %xmm0
+; CHECK: ret
+define i32 @merge_stores_cant([10 x i32]* %in0, [10 x i32]* %in1) nounwind ssp {
+
+ %O1_1 = getelementptr [10 x i32]* %in1, i64 0, i32 1
+ %O1_2 = getelementptr [10 x i32]* %in1, i64 0, i32 2
+ %O1_3 = getelementptr [10 x i32]* %in1, i64 0, i32 3
+ %O1_4 = getelementptr [10 x i32]* %in1, i64 0, i32 4
+ %ld_ptr = getelementptr [10 x i32]* %in0, i64 0, i32 2
+
+ store i32 0, i32* %O1_1
+ store i32 0, i32* %O1_2
+ %ret = load i32* %ld_ptr ; <--- may alias
+ store i32 0, i32* %O1_3
+ store i32 0, i32* %O1_4
+
+ ret i32 %ret
+}
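[Note: the transformation merge_stores_can above exercises, reduced to a standalone sketch. The function and value names here are hypothetical, and the expected win64/corei7 codegen in the comments assumes the same RUN line as the test. When alias analysis proves the intervening load disjoint, the four adjacent i32 zero-stores merge into one 16-byte store.]

    define void @merge_zero_sketch(i32* %p) nounwind {
      %p1 = getelementptr i32* %p, i64 1
      %p2 = getelementptr i32* %p, i64 2
      %p3 = getelementptr i32* %p, i64 3
      store i32 0, i32* %p              ; four adjacent, non-volatile stores of zero
      store i32 0, i32* %p1             ; with no aliasing access in between are
      store i32 0, i32* %p2             ; merged by the DAG combiner into:
      store i32 0, i32* %p3             ;   xorps  %xmm0, %xmm0
      ret void                          ;   movups %xmm0, (%rcx)
    }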
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
new file mode 100644
index 0000000000..f149e4a11e
--- /dev/null
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \
+; RUN:          -verify-machineinstrs | FileCheck %s
+;
+; Test LiveInterval update handling of DBG_VALUE.
+; rdar://12777252.
+;
+; CHECK: %entry
+; CHECK: DEBUG_VALUE: hg
+; CHECK: je
+
+%struct.node.0.27 = type { i16, double, [3 x double], i32, i32 }
+%struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] }
+%struct.bnode.1.28 = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode.1.28*, %struct.bnode.1.28* }
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp {
+entry:
+ call void @llvm.dbg.declare(metadata !{%struct.hgstruct.2.29* %hg}, metadata !4)
+ %type = getelementptr inbounds %struct.node.0.27* %p, i64 0, i32 0
+ %0 = load i16* %type, align 2, !tbaa !8
+ %cmp = icmp eq i16 %0, 1
+ br i1 %cmp, label %return, label %for.cond.preheader
+
+for.cond.preheader: ; preds = %entry
+ %arrayidx6.1 = getelementptr inbounds %struct.hgstruct.2.29* %hg, i64 0, i32 1, i64 1
+ %cmp22 = fcmp olt double 0.000000e+00, %dsq
+ %conv24 = zext i1 %cmp22 to i16
+ br label %return
+
+return: ; preds = %for.cond.preheader, %entry
+ %retval.0 = phi i16 [ %conv24, %for.cond.preheader ], [ 0, %entry ]
+ ret i16 %retval.0
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"MultiSource/Benchmarks/Olden/bh/newbh.c", metadata !"MultiSource/Benchmarks/Olden/bh", metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{null}
+!4 = metadata !{i32 786689, null, metadata !"hg", metadata !5, i32 67109589, metadata !6, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [hg] [line 725]
+!5 = metadata !{i32 786473, metadata !"MultiSource/Benchmarks/Olden/bh/newbh.c", metadata !"MultiSource/Benchmarks/Olden/bh", null} ; [ DW_TAG_file_type ]
+!6 = metadata !{i32 786454, null, metadata !"hgstruct", metadata !5, i32 492, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786451, null, metadata !"", metadata !5, i32 487, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, i32 0, i32 0} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [from ]
+!8 = metadata !{metadata !"short", metadata !9}
+!9 = metadata !{metadata !"omnipotent char", metadata !10}
+!10 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
new file mode 100644
index 0000000000..f171c16df3
--- /dev/null
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \
+; RUN:          -verify-machineinstrs | FileCheck %s
+;
+; Test MachineScheduler handling of DBG_VALUE.
+; rdar://12776937.
+;
+; CHECK: %if.else581
+; CHECK: DEBUG_VALUE: num1
+; CHECK: call
+
+%union.rec = type {}
+
+@.str15 = external hidden unnamed_addr constant [6 x i8], align 1
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define i32 @AttachGalley(%union.rec** nocapture %suspend_pt) nounwind uwtable ssp {
+entry:
+ %num14075 = alloca [20 x i8], align 16
+ br label %if.end33
+
+if.end33: ; preds = %entry
+ %cmp1733 = icmp eq i32 undef, 0
+ br label %if.else581
+
+if.else581: ; preds = %if.end33
+ %cmp586 = icmp eq i8 undef, -123
+ br i1 %cmp586, label %if.then588, label %if.else594
+
+if.then588: ; preds = %if.else581
+ br label %for.cond1710.preheader
+
+if.else594: ; preds = %if.else581
+ unreachable
+
+for.cond1710.preheader: ; preds = %if.then588
+ br label %for.cond1710
+
+for.cond1710: ; preds = %for.cond1710, %for.cond1710.preheader
+ br i1 undef, label %for.cond1710, label %if.then3344
+
+if.then3344:
+ br label %if.then4073
+
+if.then4073: ; preds = %if.then3344
+ call void @llvm.dbg.declare(metadata !{[20 x i8]* %num14075}, metadata !4)
+ %arraydecay4078 = getelementptr inbounds [20 x i8]* %num14075, i64 0, i64 0
+ %0 = load i32* undef, align 4
+ %add4093 = add nsw i32 %0, 0
+ %conv4094 = sitofp i32 %add4093 to float
+ %div4095 = fdiv float %conv4094, 5.670000e+02
+ %conv4096 = fpext float %div4095 to double
+ %call4097 = call i32 (i8*, i32, i64, i8*, ...)* @__sprintf_chk(i8* %arraydecay4078, i32 0, i64 20, i8* getelementptr inbounds ([6 x i8]* @.str15, i64 0, i64 0), double %conv4096) nounwind
+ br i1 %cmp1733, label %if.then4107, label %if.else4114
+
+if.then4107: ; preds = %if.then4073
+ unreachable
+
+if.else4114: ; preds = %if.then4073
+ unreachable
+}
+
+declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset", metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{}
+!4 = metadata !{i32 786688, metadata !5, metadata !"num1", metadata !14, i32 815, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [num1] [line 815]
+!5 = metadata !{i32 786443, metadata !6, i32 815, i32 0, metadata !14, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!6 = metadata !{i32 786443, metadata !7, i32 812, i32 0, metadata !14, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!7 = metadata !{i32 786443, metadata !8, i32 807, i32 0, metadata !14, i32 175} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!8 = metadata !{i32 786443, metadata !9, i32 440, i32 0, metadata !14, i32 94} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!9 = metadata !{i32 786443, metadata !10, i32 435, i32 0, metadata !14, i32 91} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!10 = metadata !{i32 786443, metadata !11, i32 434, i32 0, metadata !14, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!11 = metadata !{i32 786443, metadata !12, i32 250, i32 0, metadata !14, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!12 = metadata !{i32 786443, metadata !13, i32 249, i32 0, metadata !14, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!13 = metadata !{i32 786443, metadata !3, i32 221, i32 0, metadata !14, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!14 = metadata !{i32 786473, metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset", null} ; [ DW_TAG_file_type ]
+!15 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!16 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!17 = metadata !{metadata !18}
+!18 = metadata !{i32 786465, i64 0, i64 20} ; [ DW_TAG_subrange_type ] [0, 19]
+
+; Test DebugValue uses visited by RegisterPressureTracker findUseBetween().
+;
+; CHECK: @main
+; CHECK: DEBUG_VALUE: X
+; CHECK: call
+
+%"class.__gnu_cxx::hash_map" = type { %"class.__gnu_cxx::hashtable" }
+%"class.__gnu_cxx::hashtable" = type { i64, i64, i64, i64, i64, i64 }
+
+define void @main() uwtable ssp {
+entry:
+ %X = alloca %"class.__gnu_cxx::hash_map", align 8
+ br i1 undef, label %cond.true, label %cond.end
+
+cond.true: ; preds = %entry
+ unreachable
+
+cond.end: ; preds = %entry
+ call void @llvm.dbg.declare(metadata !{%"class.__gnu_cxx::hash_map"* %X}, metadata !21)
+ %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5
+ invoke void @_Znwm()
+ to label %exit.i unwind label %lpad2.i.i.i.i
+
+exit.i: ; preds = %cond.end
+ unreachable
+
+lpad2.i.i.i.i: ; preds = %cond.end
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ br i1 undef, label %lpad.body.i.i, label %if.then.i.i.i.i.i.i.i.i
+
+if.then.i.i.i.i.i.i.i.i: ; preds = %lpad2.i.i.i.i
+ unreachable
+
+lpad.body.i.i: ; preds = %lpad2.i.i.i.i
+ resume { i8*, i32 } %0
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_Znwm()
+
+!llvm.dbg.cu = !{!20}
+
+!20 = metadata !{i32 786449, i32 0, i32 4, metadata !"SingleSource/Benchmarks/Shootout-C++/hash.cpp", metadata !"SingleSource/Benchmarks/Shootout-C++", metadata !"clang version 3.3 (trunk 169129) (llvm/trunk 169135)", i1 true, i1 true, metadata !"", i32 0, null, null, null, null} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus]
+!21 = metadata !{i32 786688, null, metadata !"X", null, i32 29, metadata !22, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [X] [line 29]
+!22 = metadata !{i32 786454, null, metadata !"HM", metadata !23, i32 28, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ]
+!23 = metadata !{i32 786473, metadata !"SingleSource/Benchmarks/Shootout-C++/hash.cpp", metadata !"SingleSource/Benchmarks/Shootout-C++", null} ; [ DW_TAG_file_type ]
diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
new file mode 100644
index 0000000000..d290d514cc
--- /dev/null
+++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \
+; RUN:          -verify-machineinstrs | FileCheck %s
+;
+; Test RegisterPressure handling of DBG_VALUE.
+;
+; CHECK: %entry
+; CHECK: DEBUG_VALUE: callback
+; CHECK: ret
+
+%struct.btCompoundLeafCallback = type { i32, i32 }
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define void @test() unnamed_addr uwtable ssp align 2 {
+entry:
+ %callback = alloca %struct.btCompoundLeafCallback, align 8
+ br i1 undef, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ unreachable
+
+if.end: ; preds = %entry
+ call void @llvm.dbg.declare(metadata !{%struct.btCompoundLeafCallback* %callback}, metadata !3)
+ %m = getelementptr inbounds %struct.btCompoundLeafCallback* %callback, i64 0, i32 1
+ store i32 0, i32* undef, align 8
+ %cmp12447 = icmp sgt i32 undef, 0
+ br i1 %cmp12447, label %for.body.lr.ph, label %invoke.cont44
+
+for.body.lr.ph: ; preds = %if.end
+ unreachable
+
+invoke.cont44: ; preds = %if.end
+ ret void
+}
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", metadata !"MultiSource/Benchmarks/Bullet", metadata !"clang version 3.3 (trunk 168984) (llvm/trunk 168983)", i1 true, i1 true, metadata !"", i32 0, metadata !1, null, null, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{null, null}
+!3 = metadata !{i32 786688, null, metadata !"callback", null, i32 214, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [callback] [line 214]
+!4 = metadata !{i32 786451, null, metadata !"btCompoundLeafCallback", metadata !5, i32 90, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [from ]
+!5 = metadata !{i32 786473, metadata !"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", metadata !"MultiSource/Benchmarks/Bullet", null} ; [ DW_TAG_file_type ]
diff --git a/test/CodeGen/X86/2012-12-06-python27-miscompile.ll b/test/CodeGen/X86/2012-12-06-python27-miscompile.ll
new file mode 100644
index 0000000000..d9effc92fa
--- /dev/null
+++ b/test/CodeGen/X86/2012-12-06-python27-miscompile.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mtriple=i686-pc-win32 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Make sure that we are zeroing one memory location at a time using xorl and
+; not both using XMM registers.
+
+;CHECK: @foo
+;CHECK: xorl
+;CHECK-NOT: xmm
+;CHECK: ret
+define i32 @foo (i64* %so) nounwind uwtable ssp {
+entry:
+ %used = getelementptr inbounds i64* %so, i32 3
+ store i64 0, i64* %used, align 8
+ %fill = getelementptr inbounds i64* %so, i32 2
+ %L = load i64* %fill, align 8
+ store i64 0, i64* %fill, align 8
+ %cmp28 = icmp sgt i64 %L, 0
+ %R = sext i1 %cmp28 to i32
+ ret i32 %R
+}
diff --git a/test/CodeGen/X86/2012-12-1-merge-multiple.ll b/test/CodeGen/X86/2012-12-1-merge-multiple.ll
new file mode 100644
index 0000000000..5931c3d27b
--- /dev/null
+++ b/test/CodeGen/X86/2012-12-1-merge-multiple.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-pc-win64 | FileCheck %s
+
+; CHECK: multiple_stores_on_chain
+; CHECK: movabsq
+; CHECK: movq
+; CHECK: movabsq
+; CHECK: movq
+; CHECK: ret
+define void @multiple_stores_on_chain(i16 * %A) {
+entry:
+ %a0 = getelementptr inbounds i16* %A, i64 0
+ %a1 = getelementptr inbounds i16* %A, i64 1
+ %a2 = getelementptr inbounds i16* %A, i64 2
+ %a3 = getelementptr inbounds i16* %A, i64 3
+ %a4 = getelementptr inbounds i16* %A, i64 4
+ %a5 = getelementptr inbounds i16* %A, i64 5
+ %a6 = getelementptr inbounds i16* %A, i64 6
+ %a7 = getelementptr inbounds i16* %A, i64 7
+
+ store i16 0, i16* %a0
+ store i16 1, i16* %a1
+ store i16 2, i16* %a2
+ store i16 3, i16* %a3
+ store i16 4, i16* %a4
+ store i16 5, i16* %a5
+ store i16 6, i16* %a6
+ store i16 7, i16* %a7
+
+ ret void
+}
+
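[Note: for the test above, the eight i16 constants fold into two 64-bit immediates packed little-endian, which is where the movabsq/movq pairs in the CHECK lines come from. A sketch of plausible output — registers are illustrative; %rcx holds %A under the win64 convention:]

    movabsq $0x0003000200010000, %rax   ; i16 0,1,2,3 at byte offsets 0,2,4,6
    movq    %rax, (%rcx)
    movabsq $0x0007000600050004, %rax   ; i16 4,5,6,7 at byte offsets 8,10,12,14
    movq    %rax, 8(%rcx)
    retq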
diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
index 1446b36a0f..0fec9658d6 100644
--- a/test/CodeGen/X86/avx-intel-ocl.ll
+++ b/test/CodeGen/X86/avx-intel-ocl.ll
@@ -1,9 +1,12 @@
-; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=X32 %s
 ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN64 %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=NOT_WIN %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=X64 %s
 
 declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
 declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+declare i32 @func_int(i32, i32)
+
 ; WIN64: testf16_inp
 ; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
 ; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
@@ -11,19 +14,19 @@ declare <16 x float> @func_float16(<16 x float>, <16 x float>)
 ; WIN64: call
 ; WIN64: ret
 
-; WIN32: testf16_inp
-; WIN32: movl %eax, (%esp)
-; WIN32: vaddps {{.*}}, {{%ymm[0-1]}}
-; WIN32: vaddps {{.*}}, {{%ymm[0-1]}}
-; WIN32: call
-; WIN32: ret
+; X32: testf16_inp
+; X32: movl %eax, (%esp)
+; X32: vaddps {{.*}}, {{%ymm[0-1]}}
+; X32: vaddps {{.*}}, {{%ymm[0-1]}}
+; X32: call
+; X32: ret
 
-; NOT_WIN: testf16_inp
-; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}}
-; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}}
-; NOT_WIN: leaq {{.*}}(%rsp), %rdi
-; NOT_WIN: call
-; NOT_WIN: ret
+; X64: testf16_inp
+; X64: vaddps {{.*}}, {{%ymm[0-1]}}
+; X64: vaddps {{.*}}, {{%ymm[0-1]}}
+; X64: leaq {{.*}}(%rsp), %rdi
+; X64: call
+; X64: ret
 
 ;test calling conventions - input parameters
 define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
@@ -45,11 +48,11 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
 ; WIN64: ret
 
 ; preserved ymm8-ymm15
-; NOT_WIN: testf16_regs
-; NOT_WIN: call
-; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm0, %ymm0
-; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm1, %ymm1
-; NOT_WIN: ret
+; X64: testf16_regs
+; X64: call
+; X64: vaddps {{%ymm[8-9]}}, %ymm0, %ymm0
+; X64: vaddps {{%ymm[8-9]}}, %ymm1, %ymm1
+; X64: ret
 
 define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 %y = alloca <16 x float>, align 16
@@ -84,24 +87,43 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 ; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
 ; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
 
-; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
-; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
-; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
-; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
-; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
-; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
-; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
-; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
-; NOT_WIN: call
-; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
-; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
-; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
-; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
-; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
-; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
-; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
-; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
+; X64: call
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
 define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
 %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
 ret <16 x float> %c
 }
+
+; test functions with integer parameters
+; pass parameters on stack for 32-bit platform
+; X32: movl {{.*}}, 4(%esp)
+; X32: movl {{.*}}, (%esp)
+; X32: call
+; X32: addl {{.*}}, %eax
+
+; pass parameters in registers for 64-bit platform
+; X64: leal {{.*}}, %edi
+; X64: movl {{.*}}, %esi
+; X64: call
+; X64: addl {{.*}}, %eax
+define i32 @test_int(i32 %a, i32 %b) nounwind {
+ %c1 = add i32 %a, %b
+ %c2 = call intel_ocl_bicc i32 @func_int(i32 %c1, i32 %a)
+ %c = add i32 %c2, %b
+ ret i32 %c
+}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index c44beb4bc2..0be83f648d 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -671,7 +671,9 @@ define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
 ; CHECK: test_x86_sse2_storeu_dq
 ; CHECK: movl
 ; CHECK: vmovdqu
-  call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a1)
+  ; add operation forces the execution domain.
+  %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
 ret void
 }
 declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
@@ -681,6 +683,7 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
 ; CHECK: test_x86_sse2_storeu_pd
 ; CHECK: movl
 ; CHECK: vmovupd
+  ; fadd operation forces the execution domain.
 %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
 call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
 ret void
@@ -1140,9 +1143,9 @@ declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) noun
 define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
-  ; CHECK: movl
-  ; CHECK: movl
-  ; CHECK: vpcmpestri
+  ; CHECK: movl $7
+  ; CHECK: movl $7
+  ; CHECK: vpcmpestri $7
   ; CHECK: movl
 %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
 ret i32 %res
@@ -1150,6 +1153,18 @@ define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
 declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
 
+define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
+  ; CHECK: movl $7
+  ; CHECK: movl $7
+  ; CHECK: vpcmpestri $7, (
+  ; CHECK: movl
+  %1 = load <16 x i8>* %a0
+  %2 = load <16 x i8>* %a2
+  %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+
+
 define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
 ; CHECK: movl
 ; CHECK: movl
@@ -1216,8 +1231,19 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
 declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
 
+define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
+  ; CHECK: movl $7
+  ; CHECK: movl $7
+  ; CHECK: vpcmpestrm $7,
+  ; CHECK-NOT: vmov
+  %1 = load <16 x i8>* %a2
+  %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+
 define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcmpistri
+  ; CHECK: vpcmpistri $7
   ; CHECK: movl
 %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
 ret i32 %res
@@ -1225,6 +1251,16 @@ define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
 declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
+define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
+  ; CHECK: vpcmpistri $7, (
+  ; CHECK: movl
+  %1 = load <16 x i8>* %a0
+  %2 = load <16 x i8>* %a1
+  %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+
+
 define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK: vpcmpistri
 ; CHECK: seta
@@ -1271,7 +1307,7 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
 define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcmpistrm
+  ; CHECK: vpcmpistrm $7
   ; CHECK-NOT: vmov
 %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
 ret <16 x i8> %res
@@ -1279,6 +1315,15 @@ define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
 declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
+define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
+  ; CHECK: vpcmpistrm $7, (
+  ; CHECK-NOT: vmov
+  %1 = load <16 x i8>* %a1
+  %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+
 define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK: vaddss
 %res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
@@ -2303,7 +2348,7 @@ declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind rea
 define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
-  ; CHECK: vpermilps
+  ; CHECK: vpshufd
 %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
 ret <4 x float> %res
 }
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index ec11654b35..65685a3224 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -6,7 +6,7 @@ define <4 x float> @test1(<4 x float> %a) nounwind {
 ret <4 x float> %b
 ; CHECK: test1:
 ; CHECK: vshufps
-; CHECK: vpermilps
+; CHECK: vpshufd
 }
 
 ; rdar://10538417
@@ -106,7 +106,7 @@ define <4 x float> @test11(<4 x float> %a) nounwind {
 
 define <4 x float> @test12(<4 x float>* %a) nounwind {
 ; CHECK: test12
-; CHECK: vpermilps $27, (
+; CHECK: vpshufd
 %tmp0 = load <4 x float>* %a
 %tmp1 = shufflevector <4 x float> %tmp0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ret <4 x float> %tmp1
@@ -246,3 +246,54 @@ define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
 ret <8 x float>%S
 }
 
+; rdar://12684358
+; Make sure loads happen before stores.
+; CHECK: swap8doubles
+; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
+; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
+; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
+; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
+; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
+; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
+; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
+; CHECK: vextractf128
+; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
+; CHECK: vextractf128
+; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
+; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
+define void @swap8doubles(double* nocapture %A, double* nocapture %C) nounwind uwtable ssp {
+entry:
+ %add.ptr = getelementptr inbounds double* %A, i64 2
+ %v.i = bitcast double* %A to <2 x double>*
+ %0 = load <2 x double>* %v.i, align 1
+ %shuffle.i.i = shufflevector <2 x double> %0, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ %v1.i = bitcast double* %add.ptr to <2 x double>*
+ %1 = load <2 x double>* %v1.i, align 1
+ %2 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i, <2 x double> %1, i8 1) nounwind
+ %add.ptr1 = getelementptr inbounds double* %A, i64 6
+ %add.ptr2 = getelementptr inbounds double* %A, i64 4
+ %v.i27 = bitcast double* %add.ptr2 to <2 x double>*
+ %3 = load <2 x double>* %v.i27, align 1
+ %shuffle.i.i28 = shufflevector <2 x double> %3, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ %v1.i29 = bitcast double* %add.ptr1 to <2 x double>*
+ %4 = load <2 x double>* %v1.i29, align 1
+ %5 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i28, <2 x double> %4, i8 1) nounwind
+ %6 = bitcast double* %C to <4 x double>*
+ %7 = load <4 x double>* %6, align 32
+ %add.ptr5 = getelementptr inbounds double* %C, i64 4
+ %8 = bitcast double* %add.ptr5 to <4 x double>*
+ %9 = load <4 x double>* %8, align 32
+ %shuffle.i26 = shufflevector <4 x double> %7, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+ %10 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %7, i8 1)
+ %shuffle.i = shufflevector <4 x double> %9, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+ %11 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %9, i8 1)
+ store <2 x double> %shuffle.i26, <2 x double>* %v.i, align 16
+ store <2 x double> %10, <2 x double>* %v1.i, align 16
+ store <2 x double> %shuffle.i, <2 x double>* %v.i27, align 16
+ store <2 x double> %11, <2 x double>* %v1.i29, align 16
+ store <4 x double> %2, <4 x double>* %6, align 32
+ store <4 x double> %5, <4 x double>* %8, align 32
+ ret void
+}
+declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index 94bcddd975..67e4b40810 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -47,9 +47,9 @@ entry:
 ; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
 ; To:
 ; shuffle (vload ptr)), undef, <1, 1, 1, 1>
-; CHECK: vmovaps
+; CHECK: vmovdqa
+; CHECK-NEXT: vpshufd $-1
 ; CHECK-NEXT: vinsertf128 $1
-; CHECK-NEXT: vpermilps $-1
 define <8 x float> @funcE() nounwind {
 allocas:
 %udx495 = alloca [18 x [18 x float]], align 32
@@ -75,8 +75,8 @@ __load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_ex
 ret <8 x float> %load_broadcast12281250
 }
 
-; CHECK: vinsertf128 $1
-; CHECK-NEXT: vpermilps $0
+; CHECK: vpshufd $0
+; CHECK-NEXT: vinsertf128 $1
 define <8 x float> @funcF(i32 %val) nounwind {
 %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
 %ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
 %tmp = bitcast <8 x i32> %ret7 to <8 x float>
 ret <8 x float> %tmp
 }
 
@@ -84,8 +84,8 @@ define <8 x float> @funcF(i32 %val) nounwind {
-; CHECK: vinsertf128 $1
-; CHECK-NEXT: vpermilps $0
+; CHECK: vpshufd $0
+; CHECK-NEXT: vinsertf128 $1
 define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
 entry:
 %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 ret <8 x float> %shuffle
 }
 
 ; CHECK: vextractf128 $1
+; CHECK-NEXT: vpshufd
 ; CHECK-NEXT: vinsertf128 $1
-; CHECK-NEXT: vpermilps $85
 define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
 entry:
 %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
index a414e6880c..cf319cb7fe 100644
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -4,15 +4,62 @@
 ; The mask for the vpblendw instruction needs to be identical for both halves
 ; of the YMM. Need to use two vpblendw instructions.
-; CHECK: blendw1
-; CHECK: vpblendw
-; CHECK: vpblendw
+; CHECK: vpblendw_test1
+; mask = 10010110,b = 150,d
+; CHECK: vpblendw $150, %ymm
 ; CHECK: ret
-define <16 x i16> @blendw1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
-  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
+define <16 x i16> @vpblendw_test1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
+  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 23,
+                                                               i32 8, i32 25, i32 26, i32 11, i32 28, i32 13, i32 14, i32 31>
 ret <16 x i16> %t
 }
 
+; CHECK: vpblendw_test2
+; mask1 = 00010110 = 22
+; mask2 = 10000000 = 128
+; CHECK: vpblendw $128, %xmm
+; CHECK: vpblendw $22, %xmm
+; CHECK: vinserti128
+; CHECK: ret
+define <16 x i16> @vpblendw_test2(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
+  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7,
+                                                               i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
+  ret <16 x i16> %t
+}
+
+; CHECK: blend_test1
+; CHECK: vpblendd
+; CHECK: ret
+define <8 x i32> @blend_test1(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline {
+  %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
+  ret <8 x i32> %t
+}
+
+; CHECK: blend_test2
+; CHECK: vpblendd
+; CHECK: ret
+define <8 x i32> @blend_test2(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline {
+  %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
+  ret <8 x i32> %t
+}
+
+
+; CHECK: blend_test3
+; CHECK: vblendps
+; CHECK: ret
+define <8 x float> @blend_test3(<8 x float> %a, <8 x float> %b) nounwind alwaysinline {
+  %t = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
+  ret <8 x float> %t
+}
+
+; CHECK: blend_test4
+; CHECK: vblendpd
+; CHECK: ret
+define <4 x i64> @blend_test4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
+ %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x i64> %t
+}
+
 ; CHECK: vpshufhw $27, %ymm
 define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp {
 entry:
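[Note: a worked derivation of the vpblendw masks above, assuming the usual convention that bit i of the immediate selects element i from the second source, per 128-bit half:]

    ; vpblendw_test1, low half:  a0 b1 b2 a3 b4 a5 a6 b7  -> b at bits 1,2,4,7
    ;   mask = 10010110b = 150d; the high half selects the same positions,
    ;   so a single ymm "vpblendw $150" suffices.
    ; vpblendw_test2, low half:  b at bits 1,2,4  -> 00010110b = 22
    ;               high half:   b at bit 7 only -> 10000000b = 128
    ;   the halves differ, so two xmm blends plus a vinserti128 are needed.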
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
index 11f811f8cf..34445428ea 100644
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -28,7 +28,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
 ; reduce the mask in this case.
 ;CHECK: vsel_8xi16
 ;CHECK: psllw
-;CHECK: psraw
+;CHECK-NOT: psraw
 ;CHECK: pblendvb
 ;CHECK: ret
 define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index ec447e5e9c..39a784dec3 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -1,6 +1,4 @@
-; RUN: llc < %s -march=x86 | grep btl | count 28
-; RUN: llc < %s -march=x86 -mcpu=pentium4 | grep btl | not grep esp
-; RUN: llc < %s -march=x86 -mcpu=penryn | grep btl | not grep esp
+; RUN: llc < %s -mtriple=i386-apple-macosx -mcpu=penryn | FileCheck %s
 ; PR3253
 
 ; The register+memory form of the BT instruction should be usable on
@@ -21,6 +19,9 @@
 define void @test2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: test2
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
 %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -36,6 +37,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @test2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: test2b
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 1, %tmp29
 %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -51,6 +55,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @atest2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: atest2
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
 %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -66,6 +73,8 @@ UnifiedReturnBlock: ; preds = %entry
 define void @atest2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: atest2b
+; CHECK: btl %eax, %ecx
 %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 1, %tmp29
 %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -81,6 +90,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @test3(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: test3
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
 %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -96,6 +108,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @test3b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: test3b
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %x, %tmp29
 %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -111,6 +126,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @testne2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: testne2
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
 %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -126,6 +144,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @testne2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: testne2b
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 1, %tmp29
 %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -141,6 +162,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @atestne2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: atestne2
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
 %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -156,6 +180,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @atestne2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: atestne2b
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 1, %tmp29
 %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -171,6 +198,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @testne3(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: testne3
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
 %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -186,6 +216,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @testne3b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: testne3b
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %x, %tmp29
 %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -201,6 +234,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @query2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: query2
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
 %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
@@ -216,6 +252,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @query2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: query2b
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 1, %tmp29
 %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
@@ -231,6 +270,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @aquery2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: aquery2
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
 %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
@@ -246,6 +288,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @aquery2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: aquery2b
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 1, %tmp29
 %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
@@ -261,6 +306,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @query3(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: query3
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
 %tmp4 = icmp eq i32 %tmp3, %tmp29 ; <i1> [#uses=1]
@@ -276,6 +324,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @query3b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: query3b
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %x, %tmp29
 %tmp4 = icmp eq i32 %tmp3, %tmp29 ; <i1> [#uses=1]
@@ -291,6 +342,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @query3x(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: query3x
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
 %tmp4 = icmp eq i32 %tmp29, %tmp3 ; <i1> [#uses=1]
@@ -306,6 +360,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @query3bx(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: query3bx
+; CHECK: btl %eax, %ecx
+; CHECK: jae
 %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %x, %tmp29
 %tmp4 = icmp eq i32 %tmp29, %tmp3 ; <i1> [#uses=1]
@@ -321,6 +378,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @queryne2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: queryne2
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
 %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
@@ -336,6 +396,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @queryne2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: queryne2b
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 1, %tmp29
 %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
@@ -351,6 +414,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @aqueryne2(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: aqueryne2
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
 %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
@@ -366,6 +432,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @aqueryne2b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: aqueryne2b
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
 %tmp3 = and i32 1, %tmp29
 %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
@@ -381,6 +450,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @queryne3(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: queryne3
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
 %tmp4 = icmp ne i32 %tmp3, %tmp29 ; <i1> [#uses=1]
@@ -396,6 +468,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @queryne3b(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: queryne3b
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %x, %tmp29
 %tmp4 = icmp ne i32 %tmp3, %tmp29 ; <i1> [#uses=1]
@@ -411,6 +486,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @queryne3x(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: queryne3x
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
 %tmp4 = icmp ne i32 %tmp29, %tmp3 ; <i1> [#uses=1]
@@ -426,6 +504,9 @@ UnifiedReturnBlock: ; preds = %entry
 define void @queryne3bx(i32 %x, i32 %n) nounwind {
 entry:
+; CHECK: queryne3bx
+; CHECK: btl %eax, %ecx
+; CHECK: jb
 %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
 %tmp3 = and i32 %x, %tmp29
 %tmp4 = icmp ne i32 %tmp29, %tmp3 ; <i1> [#uses=1]
@@ -440,3 +521,16 @@ UnifiedReturnBlock: ; preds = %entry
 }
 
 declare void @foo()
+
+; rdar://12755626
+define zeroext i1 @invert(i32 %flags, i32 %flag) nounwind {
+; CHECK: invert
+; CHECK: btl %eax, %ecx
+; CHECK: setae
+entry:
+ %neg = xor i32 %flags, -1
+ %shl = shl i32 1, %flag
+ %and = and i32 %shl, %neg
+ %tobool = icmp ne i32 %and, 0
+ ret i1 %tobool
+}
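[Note: the shape every function in bt.ll above exercises, reduced to one standalone case. The function name is hypothetical, and the asm comment assumes the same penryn RUN line; register allocation may differ. The instruction selector folds the shift-and-mask into a single bit test:]

    declare void @foo()

    define void @bt_sketch(i32 %x, i32 %n) nounwind {
    entry:
      %shr = lshr i32 %x, %n        ; (x >> n) & 1 ...
      %bit = and i32 %shr, 1
      %cmp = icmp eq i32 %bit, 0    ; ... compared against zero ...
      br i1 %cmp, label %clear, label %set
    set:                            ; ... selects to: btl %eax, %ecx / jb
      tail call void @foo() nounwind
      ret void
    clear:
      ret void
    }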
diff --git a/test/CodeGen/X86/byval2.ll b/test/CodeGen/X86/byval2.ll
index 196efe58e6..c5187db6de 100644
--- a/test/CodeGen/X86/byval2.ll
+++ b/test/CodeGen/X86/byval2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s -check-prefix=X64
 ; X64-NOT: movsq
 ; X64: rep
 ; X64-NOT: rep
@@ -12,7 +12,7 @@
 ; Win64 has not supported byval yet.
 
-; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -march=x86 -mattr=-avx | FileCheck %s -check-prefix=X32
 ; X32-NOT: movsl
 ; X32: rep
 ; X32-NOT: rep
diff --git a/test/CodeGen/X86/byval3.ll b/test/CodeGen/X86/byval3.ll
index f3b125c6e3..d06fd8898e 100644
--- a/test/CodeGen/X86/byval3.ll
+++ b/test/CodeGen/X86/byval3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s -check-prefix=X64
 ; X64-NOT: movsq
 ; X64: rep
 ; X64-NOT: rep
@@ -12,7 +12,7 @@
 ; Win64 has not supported byval yet.
 
-; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -march=x86 -mattr=-avx | FileCheck %s -check-prefix=X32
 ; X32-NOT: movsl
 ; X32: rep
 ; X32-NOT: rep
diff --git a/test/CodeGen/X86/byval4.ll b/test/CodeGen/X86/byval4.ll
index b7a4aa3f9b..4711e45111 100644
--- a/test/CodeGen/X86/byval4.ll
+++ b/test/CodeGen/X86/byval4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s -check-prefix=X64
 ; X64-NOT: movsq
 ; X64: rep
 ; X64-NOT: rep
@@ -12,7 +12,7 @@
 ; Win64 has not supported byval yet.
 
-; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -march=x86 -mattr=-avx | FileCheck %s -check-prefix=X32
 ; X32-NOT: movsl
 ; X32: rep
 ; X32-NOT: rep
diff --git a/test/CodeGen/X86/byval5.ll b/test/CodeGen/X86/byval5.ll
index dca0936022..f24a5f9aa3 100644
--- a/test/CodeGen/X86/byval5.ll
+++ b/test/CodeGen/X86/byval5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s -check-prefix=X64
 ; X64-NOT: movsq
 ; X64: rep
 ; X64-NOT: rep
@@ -12,7 +12,7 @@
 ; Win64 has not supported byval yet.
 
-; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -march=x86 -mattr=-avx | FileCheck %s -check-prefix=X32
 ; X32-NOT: movsl
 ; X32: rep
 ; X32-NOT: rep
diff --git a/test/CodeGen/X86/dbg-at-specficiation.ll b/test/CodeGen/X86/dbg-at-specficiation.ll
index aa5e6efede..48b8202bd5 100644
--- a/test/CodeGen/X86/dbg-at-specficiation.ll
+++ b/test/CodeGen/X86/dbg-at-specficiation.ll
@@ -17,4 +17,4 @@
 !7 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 320, i64 32, i32 0, i32 0, metadata !8, metadata !9, i32 0, i32 0} ; [ DW_TAG_array_type ]
 !8 = metadata !{i32 720932, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 720929, i64 0, i64 9} ; [ DW_TAG_subrange_type ]
+!10 = metadata !{i32 720929, i64 0, i64 10} ; [ DW_TAG_subrange_type ]
diff --git a/test/CodeGen/X86/dbg-declare.ll b/test/CodeGen/X86/dbg-declare.ll
index 5d4cedc5c4..b73e310cc5 100644
--- a/test/CodeGen/X86/dbg-declare.ll
+++ b/test/CodeGen/X86/dbg-declare.ll
@@ -51,7 +51,7 @@ declare void @llvm.stackrestore(i8*) nounwind
 !19 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 0, i64 8, i32 0, i32 0, metadata !20, metadata !21, i32 0, i32 0} ; [ DW_TAG_array_type ]
 !20 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
 !21 = metadata !{metadata !22}
-!22 = metadata !{i32 786465, i64 1, i64 0} ; [ DW_TAG_subrange_type ]
+!22 = metadata !{i32 786465, i64 0, i64 -1} ; [ DW_TAG_subrange_type ]
 !23 = metadata !{i32 7, i32 8, metadata !17, null}
 !24 = metadata !{i32 9, i32 1, metadata !17, null}
 !25 = metadata !{i32 8, i32 3, metadata !17, null}
diff --git a/test/CodeGen/X86/dbg-subrange.ll b/test/CodeGen/X86/dbg-subrange.ll
index 788910c7fe..0efb50e9a9 100644
--- a/test/CodeGen/X86/dbg-subrange.ll
+++ b/test/CodeGen/X86/dbg-subrange.ll
@@ -31,7 +31,7 @@ entry:
 !14 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 34359738368, i64 8, i32 0, i32 0, metadata !15, metadata !16, i32 0, i32 0} ; [ DW_TAG_array_type ]
 !15 = metadata !{i32 720932, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
 !16 = metadata !{metadata !17}
-!17 = metadata !{i32 720929, i64 0, i64 4294967295} ; [ DW_TAG_subrange_type ]
+!17 = metadata !{i32 720929, i64 0, i64 4294967296} ; [ DW_TAG_subrange_type ]
 !18 = metadata !{i32 5, i32 3, metadata !19, null}
 !19 = metadata !{i32 720907, metadata !5, i32 4, i32 1, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
 !20 = metadata !{i32 6, i32 1, metadata !19, null}
diff --git a/test/CodeGen/X86/dynamic-allocas-VLAs.ll b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
index c5e47facf3..9405f76cbe 100644
--- a/test/CodeGen/X86/dynamic-allocas-VLAs.ll
+++ b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
@@ -103,7 +103,7 @@ entry:
 
 declare void @t4_helper(i32*, i32*, <8 x float>*)
 
-; Dynamic realignment + Spill
+; Spilling an AVX register shouldn't cause dynamic realignment
 define i32 @t5(float* nocapture %f) nounwind uwtable ssp {
 entry:
 %a = alloca i32, align 4
@@ -116,21 +116,15 @@ entry:
 ret i32 %add
 
 ; CHECK: _t5
-; CHECK: pushq %rbp
-; CHECK: movq %rsp, %rbp
-; CHECK: andq $-32, %rsp
 ; CHECK: subq ${{[0-9]+}}, %rsp
 ;
 ; CHECK: vmovaps (%rdi), [[AVXREG:%ymm[0-9]+]]
-; CHECK: vmovaps [[AVXREG]], (%rsp)
+; CHECK: vmovups [[AVXREG]], (%rsp)
 ; CHECK: leaq {{[0-9]+}}(%rsp), %rdi
 ; CHECK: callq _t5_helper1
-; CHECK: vmovaps (%rsp), %ymm0
+; CHECK: vmovups (%rsp), %ymm0
 ; CHECK: callq _t5_helper2
callq _t5_helper2 ; CHECK: movl {{[0-9]+}}(%rsp), %eax -; -; CHECK: movq %rbp, %rsp -; CHECK: popq %rbp } declare void @t5_helper1(i32*) diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll index 2fe1ecd40e..7a1a9ae461 100644 --- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll +++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll @@ -63,6 +63,16 @@ define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x floa } declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone +; To test execution dependency +define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) { + ; CHECK: vmovaps + ; CHECK: vfmaddps %{{.*}}, (%{{.*}}) + %x = load <4 x float>* %a0 + %y = load <4 x float>* %a1 + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1] + ret < 4 x float > %res +} + define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { ; CHECK: vfmaddpd %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] @@ -82,6 +92,16 @@ define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x do } declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone +; To test execution dependency +define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) { + ; CHECK: vmovapd + ; CHECK: vfmaddpd %{{.*}}, (%{{.*}}) + %x = load <2 x double>* %a0 + %y = load <2 x double>* %a1 + %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1] + ret < 2 x double > %res +} + define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { ; CHECK: vfmaddps ; CHECK: ymm diff --git a/test/CodeGen/X86/fold-load-vec.ll b/test/CodeGen/X86/fold-load-vec.ll new file mode 100644 index 0000000000..c1756d5e2e --- /dev/null +++ b/test/CodeGen/X86/fold-load-vec.ll @@ -0,0 +1,39 @@ +; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41 | FileCheck %s + +; rdar://12721174 +; We should not fold movss into pshufd since pshufd expects m128 while movss +; loads from m32. 
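; Put differently: movss performs a 4-byte (m32) access, while the
; memory form of pshufd reads a full 16 bytes (m128), so folding the
; load would widen the access past the object. A hedged sketch of the
; shape that must stay a separate load plus shuffle (names hypothetical):
;
;   %f = load float* %p                ; m32 access, selected as movss
;   %v = insertelement <4 x float> undef, float %f, i32 0
;   %s = shufflevector <4 x float> %v, <4 x float> undef,
;                      <4 x i32> <i32 0, i32 0, i32 0, i32 0>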
+define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind { +; CHECK: sample_test +; CHECK: movss +; CHECK: pshufd +entry: + %source.addr = alloca <4 x float>*, align 8 + %dest.addr = alloca <2 x float>*, align 8 + %tmp = alloca <2 x float>, align 8 + store <4 x float>* %source, <4 x float>** %source.addr, align 8 + store <2 x float>* %dest, <2 x float>** %dest.addr, align 8 + store <2 x float> zeroinitializer, <2 x float>* %tmp, align 8 + %0 = load <4 x float>** %source.addr, align 8 + %arrayidx = getelementptr inbounds <4 x float>* %0, i64 0 + %1 = load <4 x float>* %arrayidx, align 16 + %2 = extractelement <4 x float> %1, i32 0 + %3 = load <2 x float>* %tmp, align 8 + %4 = insertelement <2 x float> %3, float %2, i32 1 + store <2 x float> %4, <2 x float>* %tmp, align 8 + %5 = load <2 x float>* %tmp, align 8 + %6 = load <2 x float>** %dest.addr, align 8 + %arrayidx1 = getelementptr inbounds <2 x float>* %6, i64 0 + store <2 x float> %5, <2 x float>* %arrayidx1, align 8 + %7 = load <2 x float>** %dest.addr, align 8 + %arrayidx2 = getelementptr inbounds <2 x float>* %7, i64 0 + %8 = load <2 x float>* %arrayidx2, align 8 + %vecext = extractelement <2 x float> %8, i32 0 + %9 = load <2 x float>** %dest.addr, align 8 + %arrayidx3 = getelementptr inbounds <2 x float>* %9, i64 0 + %10 = load <2 x float>* %arrayidx3, align 8 + %vecext4 = extractelement <2 x float> %10, i32 1 + call void @ext(float %vecext, float %vecext4) + ret void +} +declare void @ext(float, float) diff --git a/test/CodeGen/X86/fold-pcmpeqd-2.ll b/test/CodeGen/X86/fold-pcmpeqd-2.ll index 9cf4607cf5..2bde76efd2 100644 --- a/test/CodeGen/X86/fold-pcmpeqd-2.ll +++ b/test/CodeGen/X86/fold-pcmpeqd-2.ll @@ -43,21 +43,21 @@ forbody: ; preds = %forcond %mul171.i = fmul <4 x float> %add167.i, %sub140.i ; <<4 x float>> [#uses=1] %add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1] %bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer ; <<4 x i32>> [#uses=1] + %andnps178.i = add <4 x i32> %bitcast176.i, <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1] %bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float> ; <<4 x float>> [#uses=1] %mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer ; <<4 x float>> [#uses=1] %bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer ; <<4 x i32>> [#uses=1] + %andnps192.i = add <4 x i32> %bitcast190.i, <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1] %xorps.i = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1] - %orps203.i = or <4 x i32> %andnps192.i, %xorps.i ; <<4 x i32>> [#uses=1] + %orps203.i = add <4 x i32> %andnps192.i, %xorps.i ; <<4 x i32>> [#uses=1] %bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float> ; <<4 x float>> [#uses=1] %mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer ; <<4 x float>> [#uses=2] %mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer ; <<4 x float>> [#uses=1] %cmpunord.i11 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i8 3) nounwind ; <<4 x float>> [#uses=1] %bitcast6.i13 = bitcast <4 x float> %cmpunord.i11 to <4 x i32> ; <<4 x i32>> [#uses=2] - %andps.i14 = and <4 x i32> zeroinitializer, %bitcast6.i13 ; <<4 x i32>> [#uses=1] + 
%andps.i14 = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %bitcast6.i13 ; <<4 x i32>> [#uses=1] %not.i16 = xor <4 x i32> %bitcast6.i13, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1] - %andnps.i17 = and <4 x i32> zeroinitializer, %not.i16 ; <<4 x i32>> [#uses=1] + %andnps.i17 = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %not.i16 ; <<4 x i32>> [#uses=1] %orps.i18 = or <4 x i32> %andnps.i17, %andps.i14 ; <<4 x i32>> [#uses=1] %bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float> ; <<4 x float>> [#uses=1] %tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1] diff --git a/test/CodeGen/X86/hipe-cc.ll b/test/CodeGen/X86/hipe-cc.ll new file mode 100644 index 0000000000..76d17a09d5 --- /dev/null +++ b/test/CodeGen/X86/hipe-cc.ll @@ -0,0 +1,77 @@ +; RUN: llc < %s -tailcallopt -code-model=medium -stack-alignment=4 -mtriple=i686-linux-gnu -mcpu=pentium | FileCheck %s + +; Check the HiPE calling convention works (x86-32) + +define void @zap(i32 %a, i32 %b) nounwind { +entry: + ; CHECK: movl 40(%esp), %eax + ; CHECK-NEXT: movl 44(%esp), %edx + ; CHECK-NEXT: movl $8, %ecx + ; CHECK-NEXT: calll addfour + %0 = call cc 11 {i32, i32, i32} @addfour(i32 undef, i32 undef, i32 %a, i32 %b, i32 8) + %res = extractvalue {i32, i32, i32} %0, 2 + + ; CHECK: movl %eax, 16(%esp) + ; CHECK-NEXT: movl $2, 12(%esp) + ; CHECK-NEXT: movl $1, 8(%esp) + ; CHECK: calll foo + tail call void @foo(i32 undef, i32 undef, i32 1, i32 2, i32 %res) nounwind + ret void +} + +define cc 11 {i32, i32, i32} @addfour(i32 %hp, i32 %p, i32 %x, i32 %y, i32 %z) nounwind { +entry: + ; CHECK: addl %edx, %eax + ; CHECK-NEXT: addl %ecx, %eax + %0 = add i32 %x, %y + %1 = add i32 %0, %z + + ; CHECK: ret + %res = insertvalue {i32, i32, i32} undef, i32 %1, 2 + ret {i32, i32, i32} %res +} + +define cc 11 void @foo(i32 %hp, i32 %p, i32 %arg0, i32 %arg1, i32 %arg2) nounwind { +entry: + ; CHECK: movl %esi, 16(%esp) + ; CHECK-NEXT: movl %ebp, 12(%esp) + ; CHECK-NEXT: movl %eax, 8(%esp) + ; CHECK-NEXT: movl %edx, 4(%esp) + ; CHECK-NEXT: movl %ecx, (%esp) + %hp_var = alloca i32 + %p_var = alloca i32 + %arg0_var = alloca i32 + %arg1_var = alloca i32 + %arg2_var = alloca i32 + store i32 %hp, i32* %hp_var + store i32 %p, i32* %p_var + store i32 %arg0, i32* %arg0_var + store i32 %arg1, i32* %arg1_var + store i32 %arg2, i32* %arg2_var + + ; CHECK: movl 4(%esp), %edx + ; CHECK-NEXT: movl 8(%esp), %eax + ; CHECK-NEXT: movl 12(%esp), %ebp + ; CHECK-NEXT: movl 16(%esp), %esi + %0 = load i32* %hp_var + %1 = load i32* %p_var + %2 = load i32* %arg0_var + %3 = load i32* %arg1_var + %4 = load i32* %arg2_var + ; CHECK: jmp bar + tail call cc 11 void @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4) nounwind + ret void +} + +define cc 11 void @baz() nounwind { + %tmp_clos = load i32* @clos + %tmp_clos2 = inttoptr i32 %tmp_clos to i32* + %indirect_call = bitcast i32* %tmp_clos2 to void (i32, i32, i32)* + ; CHECK: movl $42, %eax + ; CHECK-NEXT: jmpl *clos + tail call cc 11 void %indirect_call(i32 undef, i32 undef, i32 42) nounwind + ret void +} + +@clos = external constant i32 +declare cc 11 void @bar(i32, i32, i32, i32, i32) diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll new file mode 100644 index 0000000000..5dbb5a25cb --- /dev/null +++ b/test/CodeGen/X86/hipe-cc64.ll @@ -0,0 +1,87 @@ +; RUN: llc < %s -tailcallopt -code-model=medium -stack-alignment=8 -mtriple=x86_64-linux-gnu -mcpu=opteron | FileCheck %s + +; Check the HiPE calling convention works (x86-64) 
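; The register assignment checked below can be read off the CHECK lines:
; the two implicit HiPE parameters %hp and %p arrive in %r15 and %rbp,
; and the remaining fixed arguments in %rsi, %rdx, %rcx and %r8. A
; one-line sketch of a HiPE call site, with a hypothetical callee:
;
;   %r = call cc 11 {i64, i64, i64} @hipe_fn(i64 undef, i64 undef, i64 42)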
+ +define void @zap(i64 %a, i64 %b) nounwind { +entry: + ; CHECK: movq %rsi, %rax + ; CHECK-NEXT: movq %rdi, %rsi + ; CHECK-NEXT: movq %rax, %rdx + ; CHECK-NEXT: movl $8, %ecx + ; CHECK-NEXT: movl $9, %r8d + ; CHECK-NEXT: callq addfour + %0 = call cc 11 {i64, i64, i64} @addfour(i64 undef, i64 undef, i64 %a, i64 %b, i64 8, i64 9) + %res = extractvalue {i64, i64, i64} %0, 2 + + ; CHECK: movl $1, %edx + ; CHECK-NEXT: movl $2, %ecx + ; CHECK-NEXT: movl $3, %r8d + ; CHECK-NEXT: movq %rax, %r9 + ; CHECK: callq foo + tail call void @foo(i64 undef, i64 undef, i64 1, i64 2, i64 3, i64 %res) nounwind + ret void +} + +define cc 11 {i64, i64, i64} @addfour(i64 %hp, i64 %p, i64 %x, i64 %y, i64 %z, i64 %w) nounwind { +entry: + ; CHECK: leaq (%rsi,%rdx), %rax + ; CHECK-NEXT: addq %rcx, %rax + ; CHECK-NEXT: addq %r8, %rax + %0 = add i64 %x, %y + %1 = add i64 %0, %z + %2 = add i64 %1, %w + + ; CHECK: ret + %res = insertvalue {i64, i64, i64} undef, i64 %2, 2 + ret {i64, i64, i64} %res +} + +define cc 11 void @foo(i64 %hp, i64 %p, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3) nounwind { +entry: + ; CHECK: movq %r15, 40(%rsp) + ; CHECK-NEXT: movq %rbp, 32(%rsp) + ; CHECK-NEXT: movq %rsi, 24(%rsp) + ; CHECK-NEXT: movq %rdx, 16(%rsp) + ; CHECK-NEXT: movq %rcx, 8(%rsp) + ; CHECK-NEXT: movq %r8, (%rsp) + %hp_var = alloca i64 + %p_var = alloca i64 + %arg0_var = alloca i64 + %arg1_var = alloca i64 + %arg2_var = alloca i64 + %arg3_var = alloca i64 + store i64 %hp, i64* %hp_var + store i64 %p, i64* %p_var + store i64 %arg0, i64* %arg0_var + store i64 %arg1, i64* %arg1_var + store i64 %arg2, i64* %arg2_var + store i64 %arg3, i64* %arg3_var + + ; CHECK: movq 8(%rsp), %rcx + ; CHECK-NEXT: movq 16(%rsp), %rdx + ; CHECK-NEXT: movq 24(%rsp), %rsi + ; CHECK-NEXT: movq 32(%rsp), %rbp + ; CHECK-NEXT: movq 40(%rsp), %r15 + %0 = load i64* %hp_var + %1 = load i64* %p_var + %2 = load i64* %arg0_var + %3 = load i64* %arg1_var + %4 = load i64* %arg2_var + %5 = load i64* %arg3_var + ; CHECK: jmp bar + tail call cc 11 void @bar(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind + ret void +} + +define cc 11 void @baz() nounwind { + %tmp_clos = load i64* @clos + %tmp_clos2 = inttoptr i64 %tmp_clos to i64* + %indirect_call = bitcast i64* %tmp_clos2 to void (i64, i64, i64)* + ; CHECK: movl $42, %esi + ; CHECK-NEXT: jmpq *(%rax) + tail call cc 11 void %indirect_call(i64 undef, i64 undef, i64 42) nounwind + ret void +} + +@clos = external constant i64 +declare cc 11 void @bar(i64, i64, i64, i64, i64, i64) diff --git a/test/CodeGen/X86/inline-asm.ll b/test/CodeGen/X86/inline-asm.ll index e6eb9efd8c..d201ebdc85 100644 --- a/test/CodeGen/X86/inline-asm.ll +++ b/test/CodeGen/X86/inline-asm.ll @@ -52,3 +52,10 @@ entry: %0 = call { i32, i32, i32, i32, i32 } asm sideeffect "", "=&r,=&r,=&r,=&r,=&q,r,~{ecx},~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %h) nounwind ret void } + +; Mix normal and EC defs of the same register. 
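; ("EC" is short for early-clobber: the asm below pairs a normal def of
; the register via "={ax}" with "~{eax}"/"~{rax}" clobbers, which are
; modeled as early-clobber defs of the same physical register.)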
+define i32 @pr14376() nounwind noinline { +entry: + %asm = tail call i32 asm sideeffect "", "={ax},i,~{eax},~{flags},~{rax}"(i64 61) nounwind + ret i32 %asm +} diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll index eae2e70834..dcc8f0d268 100644 --- a/test/CodeGen/X86/memcpy-2.ll +++ b/test/CodeGen/X86/memcpy-2.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2 +; RUN: llc < %s -mattr=+sse2 -mtriple=i686-pc-mingw32 -mcpu=core2 | FileCheck %s -check-prefix=SSE2 ; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1 ; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64 @@ -9,18 +10,18 @@ define void @t1(i32 %argc, i8** %argv) nounwind { entry: ; SSE2: t1: +; SSE2: movsd _.str+16, %xmm0 +; SSE2: movsd %xmm0, 16(%esp) ; SSE2: movaps _.str, %xmm0 ; SSE2: movaps %xmm0 -; SSE2: movb $0 -; SSE2: movl $0 -; SSE2: movl $0 +; SSE2: movb $0, 24(%esp) ; SSE1: t1: +; SSE1: fldl _.str+16 +; SSE1: fstpl 16(%esp) ; SSE1: movaps _.str, %xmm0 ; SSE1: movaps %xmm0 -; SSE1: movb $0 -; SSE1: movl $0 -; SSE1: movl $0 +; SSE1: movb $0, 24(%esp) ; NOSSE: t1: ; NOSSE: movb $0 diff --git a/test/CodeGen/X86/memset-sse-stack-realignment.ll b/test/CodeGen/X86/memset-sse-stack-realignment.ll new file mode 100644 index 0000000000..df9de5dfaf --- /dev/null +++ b/test/CodeGen/X86/memset-sse-stack-realignment.ll @@ -0,0 +1,77 @@ +; Make sure that we realign the stack. Mingw32 uses 4 byte stack alignment, we +; need 16 bytes for SSE and 32 bytes for AVX. + +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium2 | FileCheck %s -check-prefix=NOSSE +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium3 | FileCheck %s -check-prefix=SSE1 +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=yonah | FileCheck %s -check-prefix=SSE2 +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX1 +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core-avx2 | FileCheck %s -check-prefix=AVX2 + +define void @test1(i32 %t) nounwind { + %tmp1210 = alloca i8, i32 32, align 4 + call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 32, i32 4, i1 false) + %x = alloca i8, i32 %t + call void @dummy(i8* %x) + ret void + +; NOSSE: test1: +; NOSSE-NOT: and +; NOSSE: movl $0 + +; SSE1: test1: +; SSE1: andl $-16 +; SSE1: movl %esp, %esi +; SSE1: movaps + +; SSE2: test1: +; SSE2: andl $-16 +; SSE2: movl %esp, %esi +; SSE2: movaps + +; AVX1: test1: +; AVX1: andl $-32 +; AVX1: movl %esp, %esi +; AVX1: vmovaps %ymm + +; AVX2: test1: +; AVX2: andl $-32 +; AVX2: movl %esp, %esi +; AVX2: vmovaps %ymm + +} + +define void @test2(i32 %t) nounwind { + %tmp1210 = alloca i8, i32 16, align 4 + call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 16, i32 4, i1 false) + %x = alloca i8, i32 %t + call void @dummy(i8* %x) + ret void + +; NOSSE: test2: +; NOSSE-NOT: and +; NOSSE: movl $0 + +; SSE1: test2: +; SSE1: andl $-16 +; SSE1: movl %esp, %esi +; SSE1: movaps + +; SSE2: test2: +; SSE2: andl $-16 +; SSE2: movl %esp, %esi +; SSE2: movaps + +; AVX1: test2: +; AVX1: andl $-16 +; AVX1: movl %esp, %esi +; AVX1: vmovaps %xmm + +; AVX2: test2: +; AVX2: andl $-16 +; AVX2: movl %esp, %esi +; AVX2: vmovaps %xmm +} + +declare void @dummy(i8*) + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind diff --git a/test/CodeGen/X86/memset.ll 
b/test/CodeGen/X86/memset.ll index 72b3e0fa3d..b35f2615d0 100644 --- a/test/CodeGen/X86/memset.ll +++ b/test/CodeGen/X86/memset.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -march=x86 -mattr=-sse -mtriple=i686-apple-darwin8.8.0 | grep mov | count 9 -; RUN: llc < %s -march=x86 -mattr=+sse -mtriple=i686-apple-darwin8.8.0 | grep mov | count 3 +; RUN: llc < %s -march=x86 -mcpu=pentium2 -mtriple=i686-apple-darwin8.8.0 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -march=x86 -mcpu=pentium3 -mtriple=i686-apple-darwin8.8.0 | FileCheck %s --check-prefix=XMM +; RUN: llc < %s -march=x86 -mcpu=bdver1 -mtriple=i686-apple-darwin8.8.0 | FileCheck %s --check-prefix=YMM %struct.x = type { i16, i16 } @@ -8,7 +9,27 @@ entry: %up_mvd = alloca [8 x %struct.x] ; <[8 x %struct.x]*> [#uses=2] %up_mvd116 = getelementptr [8 x %struct.x]* %up_mvd, i32 0, i32 0 ; <%struct.x*> [#uses=1] %tmp110117 = bitcast [8 x %struct.x]* %up_mvd to i8* ; <i8*> [#uses=1] + call void @llvm.memset.p0i8.i64(i8* %tmp110117, i8 0, i64 32, i32 8, i1 false) +; X86: movl $0, +; X86: movl $0, +; X86: movl $0, +; X86: movl $0, +; X86: movl $0, +; X86: movl $0, +; X86: movl $0, +; X86: movl $0, +; X86-NOT: movl $0, + +; XMM: xorps %xmm{{[0-9]+}}, [[Z:%xmm[0-9]+]] +; XMM: movaps [[Z]], +; XMM: movaps [[Z]], +; XMM-NOT: movaps + +; YMM: vxorps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, [[Z:%ymm[0-9]+]] +; YMM: vmovaps [[Z]], +; YMM-NOT: movaps + call void @foo( %struct.x* %up_mvd116 ) nounwind ret void } diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll index e20fce172f..8cfa032797 100644 --- a/test/CodeGen/X86/memset64-on-x86-32.ll +++ b/test/CodeGen/X86/memset64-on-x86-32.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20 +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core2 | grep movl | count 20 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10 define void @bork() nounwind { diff --git a/test/CodeGen/X86/misched-balance.ll b/test/CodeGen/X86/misched-balance.ll new file mode 100644 index 0000000000..2184d9e960 --- /dev/null +++ b/test/CodeGen/X86/misched-balance.ll @@ -0,0 +1,230 @@ +; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ +; RUN: -verify-machineinstrs | FileCheck %s +; +; Verify that misched resource/latency balance heuristics are sane. + +define void @unrolled_mmult1(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94, + i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99, + i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104) + nounwind uwtable ssp { +entry: + br label %for.body + +; imull folded loads should be in order and interleaved with addl, never +; adjacent. Also check that we have no spilling. +; +; Since mmult1 IR is already in good order, this effectively ensures +; the scheduler maintains source order.
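; (In the CHECK lines that follow, "imull 4", "imull 8", ... match the
; imull instructions with loads folded at successive 4-byte offsets,
; and each CHECK-NOT: {{imull|rsp}} in between asserts both the
; interleaving with addl and the absence of stack traffic, i.e. no
; spills or reloads inside the loop.)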
+; +; CHECK: %for.body +; CHECK-NOT: %rsp +; CHECK: imull 4 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 8 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 12 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 16 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 20 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 24 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 28 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 32 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 36 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK-NOT: {{imull|rsp}} +; CHECK: %end +for.body: + %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ] + %tmp57 = load i32* %tmp56, align 4 + %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i + %tmp58 = load i32* %arrayidx12.us.i61, align 4 + %mul.us.i = mul nsw i32 %tmp58, %tmp57 + %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1 + %tmp59 = load i32* %arrayidx8.us.i.1, align 4 + %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i + %tmp60 = load i32* %arrayidx12.us.i61.1, align 4 + %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59 + %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i + %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2 + %tmp61 = load i32* %arrayidx8.us.i.2, align 4 + %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i + %tmp62 = load i32* %arrayidx12.us.i61.2, align 4 + %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61 + %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1 + %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3 + %tmp63 = load i32* %arrayidx8.us.i.3, align 4 + %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i + %tmp64 = load i32* %arrayidx12.us.i61.3, align 4 + %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63 + %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2 + %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4 + %tmp65 = load i32* %arrayidx8.us.i.4, align 4 + %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i + %tmp66 = load i32* %arrayidx12.us.i61.4, align 4 + %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65 + %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3 + %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5 + %tmp67 = load i32* %arrayidx8.us.i.5, align 4 + %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i + %tmp68 = load i32* %arrayidx12.us.i61.5, align 4 + %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67 + %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4 + %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6 + %tmp69 = load i32* %arrayidx8.us.i.6, align 4 + %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i + %tmp70 = load i32* %arrayidx12.us.i61.6, align 4 + %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69 + %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5 + %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7 + %tmp71 = load i32* %arrayidx8.us.i.7, align 4 + %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i + %tmp72 = load i32* %arrayidx12.us.i61.7, align 4 + %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71 + %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6 + %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8 + %tmp73 = load i32* %arrayidx8.us.i.8, align 4 + %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i + %tmp74 = load i32* %arrayidx12.us.i61.8, 
align 4 + %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73 + %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7 + %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9 + %tmp75 = load i32* %arrayidx8.us.i.9, align 4 + %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i + %tmp76 = load i32* %arrayidx12.us.i61.9, align 4 + %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75 + %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8 + %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i + store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4 + %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 10 + br i1 %exitcond, label %end, label %for.body + +end: + ret void +} + +; Unlike the above loop, this IR starts out bad and must be +; rescheduled. +; +; CHECK: %for.body +; CHECK-NOT: %rsp +; CHECK: imull 4 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 8 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 12 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 16 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 20 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 24 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 28 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 32 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 36 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK-NOT: {{imull|rsp}} +; CHECK: %end +define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94, + i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99, + i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104) + nounwind uwtable ssp { +entry: + br label %for.body +for.body: + %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ] + %tmp57 = load i32* %tmp56, align 4 + %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i + %tmp58 = load i32* %arrayidx12.us.i61, align 4 + %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1 + %tmp59 = load i32* %arrayidx8.us.i.1, align 4 + %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i + %tmp60 = load i32* %arrayidx12.us.i61.1, align 4 + %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2 + %tmp61 = load i32* %arrayidx8.us.i.2, align 4 + %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i + %tmp62 = load i32* %arrayidx12.us.i61.2, align 4 + %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3 + %tmp63 = load i32* %arrayidx8.us.i.3, align 4 + %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i + %tmp64 = load i32* %arrayidx12.us.i61.3, align 4 + %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4 + %tmp65 = load i32* %arrayidx8.us.i.4, align 4 + %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i + %tmp66 = load i32* %arrayidx12.us.i61.4, align 4 + %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5 + %tmp67 = load i32* %arrayidx8.us.i.5, align 4 + %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i + %tmp68 = load i32* %arrayidx12.us.i61.5, align 4 + %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6 + %tmp69 = load i32* %arrayidx8.us.i.6, align 4 + %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i + %tmp70 = load i32* %arrayidx12.us.i61.6, align 4 + %mul.us.i = mul nsw i32 %tmp58, %tmp57 + 
%arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7 + %tmp71 = load i32* %arrayidx8.us.i.7, align 4 + %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i + %tmp72 = load i32* %arrayidx12.us.i61.7, align 4 + %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8 + %tmp73 = load i32* %arrayidx8.us.i.8, align 4 + %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i + %tmp74 = load i32* %arrayidx12.us.i61.8, align 4 + %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9 + %tmp75 = load i32* %arrayidx8.us.i.9, align 4 + %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i + %tmp76 = load i32* %arrayidx12.us.i61.9, align 4 + %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59 + %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i + %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61 + %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1 + %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63 + %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2 + %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65 + %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3 + %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67 + %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4 + %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69 + %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5 + %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71 + %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6 + %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73 + %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7 + %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75 + %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8 + %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i + store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4 + %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 10 + br i1 %exitcond, label %end, label %for.body + +end: + ret void +} diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll new file mode 100644 index 0000000000..f5566e5e5d --- /dev/null +++ b/test/CodeGen/X86/misched-matrix.ll @@ -0,0 +1,195 @@ +; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ +; RUN: -misched-topdown -verify-machineinstrs \ +; RUN: | FileCheck %s -check-prefix=TOPDOWN +; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ +; RUN: -misched=ilpmin -verify-machineinstrs \ +; RUN: | FileCheck %s -check-prefix=ILPMIN +; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ +; RUN: -misched=ilpmax -verify-machineinstrs \ +; RUN: | FileCheck %s -check-prefix=ILPMAX +; +; Verify that the MI scheduler minimizes register pressure for a +; uniform set of bottom-up subtrees (unrolled matrix multiply). +; +; For current top-down heuristics, ensure that some folded imulls have +; been reordered with the stores. This tests the scheduler's cheap +; alias analysis ability (that doesn't require any AliasAnalysis pass). +; +; TOPDOWN: %for.body +; TOPDOWN: movl %{{.*}}, ( +; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN: movl %{{.*}}, 4( +; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN: movl %{{.*}}, 8( +; TOPDOWN: movl %{{.*}}, 12( +; TOPDOWN: %for.end +; +; For -misched=ilpmin, verify that each expression subtree is +; scheduled independently, and that the imull/adds are interleaved. 
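; (Each expression subtree here is one output element of the 4x4
; multiply below; schematically:
;
;   m3[i][j] = m1[i][0]*m2[0][j] + m1[i][1]*m2[1][j]
;            + m1[i][2]*m2[2][j] + m1[i][3]*m2[3][j]
;
; so the four stores per iteration root four independent subtrees.)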
+; +; ILPMIN: %for.body +; ILPMIN: movl %{{.*}}, ( +; ILPMIN: imull +; ILPMIN: imull +; ILPMIN: addl +; ILPMIN: imull +; ILPMIN: addl +; ILPMIN: imull +; ILPMIN: addl +; ILPMIN: movl %{{.*}}, 4( +; ILPMIN: imull +; ILPMIN: imull +; ILPMIN: addl +; ILPMIN: imull +; ILPMIN: addl +; ILPMIN: imull +; ILPMIN: addl +; ILPMIN: movl %{{.*}}, 8( +; ILPMIN: imull +; ILPMIN: imull +; ILPMIN: addl +; ILPMIN: imull +; ILPMIN: addl +; ILPMIN: imull +; ILPMIN: addl +; ILPMIN: movl %{{.*}}, 12( +; ILPMIN: %for.end +; +; For -misched=ilpmax, verify that each expression subtree is +; scheduled independently, and that the imull/adds are clustered. +; +; ILPMAX: %for.body +; ILPMAX: movl %{{.*}}, ( +; ILPMAX: imull +; ILPMAX: imull +; ILPMAX: imull +; ILPMAX: imull +; ILPMAX: addl +; ILPMAX: addl +; ILPMAX: addl +; ILPMAX: movl %{{.*}}, 4( +; ILPMAX: imull +; ILPMAX: imull +; ILPMAX: imull +; ILPMAX: imull +; ILPMAX: addl +; ILPMAX: addl +; ILPMAX: addl +; ILPMAX: movl %{{.*}}, 8( +; ILPMAX: imull +; ILPMAX: imull +; ILPMAX: imull +; ILPMAX: imull +; ILPMAX: addl +; ILPMAX: addl +; ILPMAX: addl +; ILPMAX: movl %{{.*}}, 12( +; ILPMAX: %for.end + +define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2, +[4 x i32]* noalias nocapture %m3) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx8 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 0 + %tmp = load i32* %arrayidx8, align 4, !tbaa !0 + %arrayidx12 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 0 + %tmp1 = load i32* %arrayidx12, align 4, !tbaa !0 + %arrayidx8.1 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 1 + %tmp2 = load i32* %arrayidx8.1, align 4, !tbaa !0 + %arrayidx12.1 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 0 + %tmp3 = load i32* %arrayidx12.1, align 4, !tbaa !0 + %arrayidx8.2 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 2 + %tmp4 = load i32* %arrayidx8.2, align 4, !tbaa !0 + %arrayidx12.2 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 0 + %tmp5 = load i32* %arrayidx12.2, align 4, !tbaa !0 + %arrayidx8.3 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 3 + %tmp6 = load i32* %arrayidx8.3, align 4, !tbaa !0 + %arrayidx12.3 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 0 + %tmp8 = load i32* %arrayidx8, align 4, !tbaa !0 + %arrayidx12.137 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 1 + %tmp9 = load i32* %arrayidx12.137, align 4, !tbaa !0 + %tmp10 = load i32* %arrayidx8.1, align 4, !tbaa !0 + %arrayidx12.1.1 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 1 + %tmp11 = load i32* %arrayidx12.1.1, align 4, !tbaa !0 + %tmp12 = load i32* %arrayidx8.2, align 4, !tbaa !0 + %arrayidx12.2.1 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 1 + %tmp13 = load i32* %arrayidx12.2.1, align 4, !tbaa !0 + %tmp14 = load i32* %arrayidx8.3, align 4, !tbaa !0 + %arrayidx12.3.1 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 1 + %tmp15 = load i32* %arrayidx12.3.1, align 4, !tbaa !0 + %tmp16 = load i32* %arrayidx8, align 4, !tbaa !0 + %arrayidx12.239 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 2 + %tmp17 = load i32* %arrayidx12.239, align 4, !tbaa !0 + %tmp18 = load i32* %arrayidx8.1, align 4, !tbaa !0 + %arrayidx12.1.2 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 2 + %tmp19 = load i32* %arrayidx12.1.2, align 4, !tbaa !0 + %tmp20 = load i32* %arrayidx8.2, align 4, !tbaa !0 + %arrayidx12.2.2 = getelementptr 
inbounds [4 x i32]* %m2, i64 2, i64 2 + %tmp21 = load i32* %arrayidx12.2.2, align 4, !tbaa !0 + %tmp22 = load i32* %arrayidx8.3, align 4, !tbaa !0 + %arrayidx12.3.2 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 2 + %tmp23 = load i32* %arrayidx12.3.2, align 4, !tbaa !0 + %tmp24 = load i32* %arrayidx8, align 4, !tbaa !0 + %arrayidx12.341 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 3 + %tmp25 = load i32* %arrayidx12.341, align 4, !tbaa !0 + %tmp26 = load i32* %arrayidx8.1, align 4, !tbaa !0 + %arrayidx12.1.3 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 3 + %tmp27 = load i32* %arrayidx12.1.3, align 4, !tbaa !0 + %tmp28 = load i32* %arrayidx8.2, align 4, !tbaa !0 + %arrayidx12.2.3 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 3 + %tmp29 = load i32* %arrayidx12.2.3, align 4, !tbaa !0 + %tmp30 = load i32* %arrayidx8.3, align 4, !tbaa !0 + %arrayidx12.3.3 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 3 + %tmp31 = load i32* %arrayidx12.3.3, align 4, !tbaa !0 + %tmp7 = load i32* %arrayidx12.3, align 4, !tbaa !0 + %mul = mul nsw i32 %tmp1, %tmp + %mul.1 = mul nsw i32 %tmp3, %tmp2 + %mul.2 = mul nsw i32 %tmp5, %tmp4 + %mul.3 = mul nsw i32 %tmp7, %tmp6 + %mul.138 = mul nsw i32 %tmp9, %tmp8 + %mul.1.1 = mul nsw i32 %tmp11, %tmp10 + %mul.2.1 = mul nsw i32 %tmp13, %tmp12 + %mul.3.1 = mul nsw i32 %tmp15, %tmp14 + %mul.240 = mul nsw i32 %tmp17, %tmp16 + %mul.1.2 = mul nsw i32 %tmp19, %tmp18 + %mul.2.2 = mul nsw i32 %tmp21, %tmp20 + %mul.3.2 = mul nsw i32 %tmp23, %tmp22 + %mul.342 = mul nsw i32 %tmp25, %tmp24 + %mul.1.3 = mul nsw i32 %tmp27, %tmp26 + %mul.2.3 = mul nsw i32 %tmp29, %tmp28 + %mul.3.3 = mul nsw i32 %tmp31, %tmp30 + %add.1 = add nsw i32 %mul.1, %mul + %add.2 = add nsw i32 %mul.2, %add.1 + %add.3 = add nsw i32 %mul.3, %add.2 + %add.1.1 = add nsw i32 %mul.1.1, %mul.138 + %add.2.1 = add nsw i32 %mul.2.1, %add.1.1 + %add.3.1 = add nsw i32 %mul.3.1, %add.2.1 + %add.1.2 = add nsw i32 %mul.1.2, %mul.240 + %add.2.2 = add nsw i32 %mul.2.2, %add.1.2 + %add.3.2 = add nsw i32 %mul.3.2, %add.2.2 + %add.1.3 = add nsw i32 %mul.1.3, %mul.342 + %add.2.3 = add nsw i32 %mul.2.3, %add.1.3 + %add.3.3 = add nsw i32 %mul.3.3, %add.2.3 + %arrayidx16 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 0 + store i32 %add.3, i32* %arrayidx16, align 4, !tbaa !0 + %arrayidx16.1 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 1 + store i32 %add.3.1, i32* %arrayidx16.1, align 4, !tbaa !0 + %arrayidx16.2 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 2 + store i32 %add.3.2, i32* %arrayidx16.2, align 4, !tbaa !0 + %arrayidx16.3 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 3 + store i32 %add.3.3, i32* %arrayidx16.3, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 4 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/CodeGen/X86/misched-new.ll b/test/CodeGen/X86/misched-new.ll index cec04b534f..89e45b7cfc 100644 --- a/test/CodeGen/X86/misched-new.ll +++ b/test/CodeGen/X86/misched-new.ll @@ -1,6 +1,9 @@ ; RUN: llc < %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \ ; RUN: -misched=shuffle -misched-bottomup -verify-machineinstrs \ ; RUN: | FileCheck %s +; RUN: llc < %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt 
-enable-misched \ +; RUN: -misched=shuffle -misched-topdown -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix TOPDOWN ; REQUIRES: asserts ; ; Interesting MachineScheduler cases. @@ -51,3 +54,56 @@ if.end: ; preds = %if.then, %entry } declare void @bar(i32,i32) + +; Test that the DAG builder can handle an undef vreg on ExitSU. +; CHECK: hasundef +; CHECK: call + +%t0 = type { i32, i32, i8 } +%t6 = type { i32 (...)**, %t7* } +%t7 = type { i32 (...)** } + +define void @hasundef() unnamed_addr uwtable ssp align 2 { + %1 = alloca %t0, align 8 + br i1 undef, label %3, label %2 + +; <label>:2 ; preds = %0 + unreachable + +; <label>:3 ; preds = %0 + br i1 undef, label %4, label %5 + +; <label>:4 ; preds = %3 + call void undef(%t6* undef, %t0* %1) + unreachable + +; <label>:5 ; preds = %3 + ret void +} + +; Test top-down subregister liveness tracking. Self-verification +; catches any pressure set underflow. +; rdar://12797931. +; +; TOPDOWN: @testSubregTracking +; TOPDOWN: divb +; TOPDOWN: movzbl %al +; TOPDOWN: ret +define void @testSubregTracking() nounwind uwtable ssp align 2 { + %tmp = load i8* undef, align 1 + %tmp6 = sub i8 0, %tmp + %tmp7 = load i8* undef, align 1 + %tmp8 = udiv i8 %tmp6, %tmp7 + %tmp9 = zext i8 %tmp8 to i64 + %tmp10 = load i8* undef, align 1 + %tmp11 = zext i8 %tmp10 to i64 + %tmp12 = mul i64 %tmp11, %tmp9 + %tmp13 = urem i8 %tmp6, %tmp7 + %tmp14 = zext i8 %tmp13 to i32 + %tmp15 = add nsw i32 %tmp14, 0 + %tmp16 = add i32 %tmp15, 0 + store i32 %tmp16, i32* undef, align 4 + %tmp17 = add i64 0, %tmp12 + store i64 %tmp17, i64* undef, align 8 + ret void +} diff --git a/test/CodeGen/X86/pr14314.ll b/test/CodeGen/X86/pr14314.ll new file mode 100644 index 0000000000..0832702244 --- /dev/null +++ b/test/CodeGen/X86/pr14314.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 | FileCheck %s + +define i64 @atomicSub(i64* %a, i64 %b) nounwind { +entry: + %0 = atomicrmw sub i64* %a, i64 %b seq_cst + ret i64 %0 +; CHECK: atomicSub +; CHECK: movl %eax, %ebx +; CHECK: subl {{%[a-z]+}}, %ebx +; CHECK: movl %edx, %ecx +; CHECK: sbbl {{%[a-z]+}}, %ecx +; CHECK: ret +} diff --git a/test/CodeGen/X86/pr14333.ll b/test/CodeGen/X86/pr14333.ll new file mode 100644 index 0000000000..86c12ef6b5 --- /dev/null +++ b/test/CodeGen/X86/pr14333.ll @@ -0,0 +1,12 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown < %s +%foo = type { i64, i64 } +define void @bar(%foo* %zed) { + %tmp = getelementptr inbounds %foo* %zed, i64 0, i32 0 + store i64 0, i64* %tmp, align 8 + %tmp2 = getelementptr inbounds %foo* %zed, i64 0, i32 1 + store i64 0, i64* %tmp2, align 8 + %tmp3 = bitcast %foo* %zed to i8* + call void @llvm.memset.p0i8.i64(i8* %tmp3, i8 0, i64 16, i32 8, i1 false) + ret void +} +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind diff --git a/test/CodeGen/X86/rdrand.ll b/test/CodeGen/X86/rdrand.ll index e2224a6196..98f4077763 100644 --- a/test/CodeGen/X86/rdrand.ll +++ b/test/CodeGen/X86/rdrand.ll @@ -39,7 +39,7 @@ define i32 @_rdrand64_step(i64* %random_val) { %isvalid = extractvalue {i64, i32} %call, 1 ret i32 %isvalid ; CHECK: _rdrand64_step: -; CHECK: rdrandq %r[[T1:[[a-z]+]] +; CHECK: rdrandq %r[[T1:[a-z]+]] ; CHECK: movq %r[[T1]], (%r[[A0]]) ; CHECK: movl $1, %eax ; CHECK: cmovael %e[[T1]], %eax diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll new file mode 100644 index 0000000000..76eb9514f0 --- /dev/null +++ b/test/CodeGen/X86/rtm.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mattr=+rtm -mtriple=x86_64-unknown-unknown | FileCheck %s + 
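; For context: xbegin starts a transaction and, on abort, resumes at its
; fallback label with an abort status in %eax (which is what the
; [[LABEL]] check below pins down); xend commits, and xabort aborts with
; an 8-bit immediate code. A hedged sketch of typical use (labels
; hypothetical):
;
;   %status = call i32 @llvm.x86.xbegin()
;   %ok = icmp eq i32 %status, -1   ; -1 (_XBEGIN_STARTED) means we are in the transaction
;   br i1 %ok, label %txn, label %fallback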
+declare i32 @llvm.x86.xbegin() nounwind +declare void @llvm.x86.xend() nounwind +declare void @llvm.x86.xabort(i8) noreturn nounwind + +define i32 @test_xbegin() nounwind uwtable { +entry: + %0 = tail call i32 @llvm.x86.xbegin() nounwind + ret i32 %0 +; CHECK: test_xbegin +; CHECK: xbegin [[LABEL:.*BB.*]] +; CHECK: [[LABEL]]: +} + +define void @test_xend() nounwind uwtable { +entry: + tail call void @llvm.x86.xend() nounwind + ret void +; CHECK: test_xend +; CHECK: xend +} + +define void @test_xabort() nounwind uwtable { +entry: + tail call void @llvm.x86.xabort(i8 2) + unreachable +; CHECK: test_xabort +; CHECK: xabort $2 +} diff --git a/test/CodeGen/X86/sext-load.ll b/test/CodeGen/X86/sext-load.ll index c9b39d3a48..58c93229a2 100644 --- a/test/CodeGen/X86/sext-load.ll +++ b/test/CodeGen/X86/sext-load.ll @@ -1,9 +1,30 @@ -; RUN: llc < %s -march=x86 | grep movsbl +; RUN: llc < %s -march=x86 | FileCheck %s -define i32 @foo(i32 %X) nounwind { +; When doing sign extension, use the sext-load lowering to take advantage of +; x86's sign extension during loads. +; +; CHECK: test1: +; CHECK: movsbl {{.*}}, %eax +; CHECK-NEXT: ret +define i32 @test1(i32 %X) nounwind { entry: %tmp12 = trunc i32 %X to i8 ; <i8> [#uses=1] %tmp123 = sext i8 %tmp12 to i32 ; <i32> [#uses=1] ret i32 %tmp123 } +; When using a sextload representation, ensure that the sign extension is +; preserved even when removing shifted-out low bits. +; +; CHECK: test2: +; CHECK: movswl {{.*}}, %eax +; CHECK-NEXT: ret +define i32 @test2({i16, [6 x i8]}* %this) { +entry: + %b48 = getelementptr inbounds { i16, [6 x i8] }* %this, i32 0, i32 1 + %cast = bitcast [6 x i8]* %b48 to i48* + %bf.load = load i48* %cast, align 2 + %bf.ashr = ashr i48 %bf.load, 32 + %bf.cast = trunc i48 %bf.ashr to i32 + ret i32 %bf.cast +} diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll index 2f4317bf29..67ce1be135 100644 --- a/test/CodeGen/X86/sse2-blend.ll +++ b/test/CodeGen/X86/sse2-blend.ll @@ -28,33 +28,31 @@ define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) { ; Without forcing instructions, fall back to the preferred PS domain. ; CHECK: vsel_i64 -; CHECK: xorps -; CHECK: andps ; CHECK: andnps +; CHECK: andps ; CHECK: orps ; CHECK: ret -define void@vsel_i64(<4 x i64>* %v1, <4 x i64>* %v2) { - %A = load <4 x i64>* %v1 - %B = load <4 x i64>* %v2 - %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> %A, <4 x i64> %B - store <4 x i64 > %vsel, <4 x i64>* %v1 +define void@vsel_i64(<2 x i64>* %v1, <2 x i64>* %v2) { + %A = load <2 x i64>* %v1 + %B = load <2 x i64>* %v2 + %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %A, <2 x i64> %B + store <2 x i64 > %vsel, <2 x i64>* %v1 ret void } ; Without forcing instructions, fall back to the preferred PS domain. 
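; (The andnps/andps/orps triple checked here is the classic bitwise
; select, r = (mask & A) | (~mask & B), kept entirely in the
; floating-point PS domain.)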
; CHECK: vsel_double -; CHECK: xorps -; CHECK: andps ; CHECK: andnps +; CHECK: andps ; CHECK: orps ; CHECK: ret -define void@vsel_double(<4 x double>* %v1, <4 x double>* %v2) { - %A = load <4 x double>* %v1 - %B = load <4 x double>* %v2 - %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> %A, <4 x double> %B - store <4 x double > %vsel, <4 x double>* %v1 +define void@vsel_double(<2 x double>* %v1, <2 x double>* %v2) { + %A = load <2 x double>* %v1 + %B = load <2 x double>* %v2 + %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %A, <2 x double> %B + store <2 x double > %vsel, <2 x double>* %v1 ret void } diff --git a/test/CodeGen/X86/tailcall-fastisel.ll b/test/CodeGen/X86/tailcall-fastisel.ll index 7f92af4dca..842ed25439 100644 --- a/test/CodeGen/X86/tailcall-fastisel.ll +++ b/test/CodeGen/X86/tailcall-fastisel.ll @@ -1,12 +1,11 @@ -; RUN: llc < %s -march=x86-64 -tailcallopt -fast-isel | not grep TAILCALL - -; Fast-isel shouldn't attempt to cope with tail calls. +; RUN: llc < %s -mtriple=x86_64-apple-darwin -tailcallopt -fast-isel -fast-isel-abort | FileCheck %s %0 = type { i64, i32, i8* } define fastcc i8* @"visit_array_aux<`Reference>"(%0 %arg, i32 %arg1) nounwind { fail: ; preds = %entry %tmp20 = tail call fastcc i8* @"visit_array_aux<`Reference>"(%0 %arg, i32 undef) ; <i8*> [#uses=1] +; CHECK: jmp "_visit_array_aux<`Reference>" ## TAILCALL ret i8* %tmp20 } diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll index 5e0160bd28..4db68bd182 100644 --- a/test/CodeGen/X86/vec_floor.ll +++ b/test/CodeGen/X86/vec_floor.ll @@ -36,3 +36,147 @@ define <8 x float> @floor_v8f32(<8 x float> %p) ret <8 x float> %t } declare <8 x float> @llvm.floor.v8f32(<8 x float> %p) + +define <2 x double> @ceil_v2f64(<2 x double> %p) +{ + ; CHECK: ceil_v2f64 + ; CHECK: vroundpd + %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p) + ret <2 x double> %t +} +declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p) + +define <4 x float> @ceil_v4f32(<4 x float> %p) +{ + ; CHECK: ceil_v4f32 + ; CHECK: vroundps + %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p) + ret <4 x float> %t +} +declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p) + +define <4 x double> @ceil_v4f64(<4 x double> %p) +{ + ; CHECK: ceil_v4f64 + ; CHECK: vroundpd + %t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p) + ret <4 x double> %t +} +declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p) + +define <8 x float> @ceil_v8f32(<8 x float> %p) +{ + ; CHECK: ceil_v8f32 + ; CHECK: vroundps + %t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p) + ret <8 x float> %t +} +declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p) + +define <2 x double> @trunc_v2f64(<2 x double> %p) +{ + ; CHECK: trunc_v2f64 + ; CHECK: vroundpd + %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p) + ret <2 x double> %t +} +declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p) + +define <4 x float> @trunc_v4f32(<4 x float> %p) +{ + ; CHECK: trunc_v4f32 + ; CHECK: vroundps + %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p) + ret <4 x float> %t +} +declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p) + +define <4 x double> @trunc_v4f64(<4 x double> %p) +{ + ; CHECK: trunc_v4f64 + ; CHECK: vroundpd + %t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p) + ret <4 x double> %t +} +declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p) + +define <8 x float> @trunc_v8f32(<8 x float> %p) +{ + ; CHECK: trunc_v8f32 + ; CHECK: vroundps + %t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p) + 
ret <8 x float> %t +} +declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p) + +define <2 x double> @rint_v2f64(<2 x double> %p) +{ + ; CHECK: rint_v2f64 + ; CHECK: vroundpd + %t = call <2 x double> @llvm.rint.v2f64(<2 x double> %p) + ret <2 x double> %t +} +declare <2 x double> @llvm.rint.v2f64(<2 x double> %p) + +define <4 x float> @rint_v4f32(<4 x float> %p) +{ + ; CHECK: rint_v4f32 + ; CHECK: vroundps + %t = call <4 x float> @llvm.rint.v4f32(<4 x float> %p) + ret <4 x float> %t +} +declare <4 x float> @llvm.rint.v4f32(<4 x float> %p) + +define <4 x double> @rint_v4f64(<4 x double> %p) +{ + ; CHECK: rint_v4f64 + ; CHECK: vroundpd + %t = call <4 x double> @llvm.rint.v4f64(<4 x double> %p) + ret <4 x double> %t +} +declare <4 x double> @llvm.rint.v4f64(<4 x double> %p) + +define <8 x float> @rint_v8f32(<8 x float> %p) +{ + ; CHECK: rint_v8f32 + ; CHECK: vroundps + %t = call <8 x float> @llvm.rint.v8f32(<8 x float> %p) + ret <8 x float> %t +} +declare <8 x float> @llvm.rint.v8f32(<8 x float> %p) + +define <2 x double> @nearbyint_v2f64(<2 x double> %p) +{ + ; CHECK: nearbyint_v2f64 + ; CHECK: vroundpd + %t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p) + ret <2 x double> %t +} +declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p) + +define <4 x float> @nearbyint_v4f32(<4 x float> %p) +{ + ; CHECK: nearbyint_v4f32 + ; CHECK: vroundps + %t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p) + ret <4 x float> %t +} +declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p) + +define <4 x double> @nearbyint_v4f64(<4 x double> %p) +{ + ; CHECK: nearbyint_v4f64 + ; CHECK: vroundpd + %t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p) + ret <4 x double> %t +} +declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p) + +define <8 x float> @nearbyint_v8f32(<8 x float> %p) +{ + ; CHECK: nearbyint_v8f32 + ; CHECK: vroundps + %t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p) + ret <8 x float> %t +} +declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p) diff --git a/test/CodeGen/X86/vec_shuffle-20.ll b/test/CodeGen/X86/vec_shuffle-20.ll index 976cd1835b..b6b8ba6f84 100644 --- a/test/CodeGen/X86/vec_shuffle-20.ll +++ b/test/CodeGen/X86/vec_shuffle-20.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 3 +; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 2 define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) nounwind { entry: diff --git a/test/CodeGen/X86/vec_zero.ll b/test/CodeGen/X86/vec_zero.ll index 682a0dfca8..c3ea0ad202 100644 --- a/test/CodeGen/X86/vec_zero.ll +++ b/test/CodeGen/X86/vec_zero.ll @@ -13,7 +13,7 @@ define void @foo(<4 x float>* %P) { ; CHECK: pxor define void @bar(<4 x i32>* %P) { %T = load <4 x i32>* %P ; <<4 x i32>> [#uses=1] - %S = add <4 x i32> zeroinitializer, %T ; <<4 x i32>> [#uses=1] + %S = sub <4 x i32> zeroinitializer, %T ; <<4 x i32>> [#uses=1] store <4 x i32> %S, <4 x i32>* %P ret void } |
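; (Presumably the add -> sub change above keeps this test meaningful:
; "add <4 x i32> zeroinitializer, %T" folds away to plain %T, whereas
; "sub <4 x i32> zeroinitializer, %T" is a genuine negation, so an
; all-zeros register must still be materialized and the pxor CHECK
; continues to exercise something.)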