20 files changed, 732 insertions, 93 deletions
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 9a66b670c7..04659522d3 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,7 +5,7 @@
 ; It's hard to test for the ISEL condition because CodeGen optimizes
 ; away the bugpointed code. Just ensure the basics are still there.
 ;CHECK: func:
-;CHECK: vpxor
+;CHECK: vxorps
 ;CHECK: vinsertf128
 ;CHECK: vpshufd
 ;CHECK: vpshufd
diff --git a/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
new file mode 100644
index 0000000000..6ebbb2e97d
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -enable-unsafe-fp-math
+; <rdar://problem/12180135>
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+define i32 @foo(float %mean) nounwind readnone ssp align 2 {
+entry:
+  %cmp = fcmp olt float %mean, -3.000000e+00
+  %f.0 = select i1 %cmp, float -3.000000e+00, float %mean
+  %cmp2 = fcmp ult float %f.0, 3.000000e+00
+  %f.1 = select i1 %cmp2, float %f.0, float 0x4007EB8520000000
+  %add = fadd float %f.1, 3.000000e+00
+  %div = fdiv float %add, 2.343750e-02
+  %0 = fpext float %div to double
+  %conv = select i1 undef, double 2.550000e+02, double %0
+  %add8 = fadd double %conv, 5.000000e-01
+  %conv9 = fptosi double %add8 to i32
+  %.conv9 = select i1 undef, i32 255, i32 %conv9
+  ret i32 %.conv9
+}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
new file mode 100644
index 0000000000..26ed0ecc0c
--- /dev/null
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -0,0 +1,272 @@
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR
+; RUN: llc -mcpu=corei7 -no-stack-coloring=true  < %s | FileCheck %s --check-prefix=NOCOLOR
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;YESCOLOR: subq  $136, %rsp
+;NOCOLOR: subq  $264, %rsp
+
+
+define i32 @myCall_w2(i32 %in) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+}
+
+
+;YESCOLOR: subq  $272, %rsp
+;NOCOLOR: subq  $272, %rsp
+
+define i32 @myCall2_no_merge(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  ret i32 %t7
+bb3:
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  ret i32 0
+}
+
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
+
+define i32 @myCall2_w2(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+;YESCOLOR: subq  $208, %rsp
+;NOCOLOR: subq  $400, %rsp
+
+
+
+
+define i32 @myCall_w4(i32 %in) {
+entry:
+  %a1 = alloca [14 x i8*], align 8
+  %a2 = alloca [13 x i8*], align 8
+  %a3 = alloca [12 x i8*], align 8
+  %a4 = alloca [11 x i8*], align 8
+  %b1 = bitcast [14 x i8*]* %a1 to i8*
+  %b2 = bitcast [13 x i8*]* %a2 to i8*
+  %b3 = bitcast [12 x i8*]* %a3 to i8*
+  %b4 = bitcast [11 x i8*]* %a4 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b4)
+  call void @llvm.lifetime.start(i64 -1, i8* %b1)
+  %t1 = call i32 @foo(i32 %in, i8* %b1)
+  %t2 = call i32 @foo(i32 %in, i8* %b1)
+  call void @llvm.lifetime.end(i64 -1, i8* %b1)
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t9 = call i32 @foo(i32 %in, i8* %b2)
+  %t8 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start(i64 -1, i8* %b3)
+  %t3 = call i32 @foo(i32 %in, i8* %b3)
+  %t4 = call i32 @foo(i32 %in, i8* %b3)
+  call void @llvm.lifetime.end(i64 -1, i8* %b3)
+  %t11 = call i32 @foo(i32 %in, i8* %b4)
+  call void @llvm.lifetime.end(i64 -1, i8* %b4)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+}
+
+;YESCOLOR: subq  $112, %rsp
+;NOCOLOR: subq  $400, %rsp
+
+define i32 @myCall2_w4(i32 %in) {
+entry:
+  %a1 = alloca [14 x i8*], align 8
+  %a2 = alloca [13 x i8*], align 8
+  %a3 = alloca [12 x i8*], align 8
+  %a4 = alloca [11 x i8*], align 8
+  %b1 = bitcast [14 x i8*]* %a1 to i8*
+  %b2 = bitcast [13 x i8*]* %a2 to i8*
+  %b3 = bitcast [12 x i8*]* %a3 to i8*
+  %b4 = bitcast [11 x i8*]* %a4 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b1)
+  %t1 = call i32 @foo(i32 %in, i8* %b1)
+  %t2 = call i32 @foo(i32 %in, i8* %b1)
+  call void @llvm.lifetime.end(i64 -1, i8* %b1)
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t9 = call i32 @foo(i32 %in, i8* %b2)
+  %t8 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start(i64 -1, i8* %b3)
+  %t3 = call i32 @foo(i32 %in, i8* %b3)
+  %t4 = call i32 @foo(i32 %in, i8* %b3)
+  call void @llvm.lifetime.end(i64 -1, i8* %b3)
+  br i1 undef, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b4)
+  %t11 = call i32 @foo(i32 %in, i8* %b4)
+  call void @llvm.lifetime.end(i64 -1, i8* %b4)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
+
+
+define i32 @myCall2_noend(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
+define i32 @myCall2_noend2(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
+define i32 @myCall2_nostart(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+; Adopt the test from Transforms/Inline/array_merge.ll'
+;YESCOLOR: subq  $816, %rsp
+;NOCOLOR: subq  $1616, %rsp
+define void @array_merge() nounwind ssp {
+entry:
+  %A.i1 = alloca [100 x i32], align 4
+  %B.i2 = alloca [100 x i32], align 4
+  %A.i = alloca [100 x i32], align 4
+  %B.i = alloca [100 x i32], align 4
+  %0 = bitcast [100 x i32]* %A.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+  %1 = bitcast [100 x i32]* %B.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+  call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+  %2 = bitcast [100 x i32]* %A.i1 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+  %3 = bitcast [100 x i32]* %B.i2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+  call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+  ret void
+}
+
+declare void @bar([100 x i32]* , [100 x i32]*) nounwind
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+ declare i32 @foo(i32, i8*)
+
diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll
new file mode 100644
index 0000000000..e7c9605d3e
--- /dev/null
+++ b/test/CodeGen/X86/atom-bypass-slow-division.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux  | FileCheck %s
+
+define i32 @test_get_quotient(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+  %result = sdiv i32 %a, %b
+  ret i32 %result
+}
+
+define i32 @test_get_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+  %result = srem i32 %a, %b
+  ret i32 %result
+}
+
+define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient_and_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+; CEECK-NOT: idivl
+; CHECK-NOT: divb
+  %resultdiv = sdiv i32 %a, %b
+  %resultrem = srem i32 %a, %b
+  %result = add i32 %resultdiv, %resultrem
+  ret i32 %result
+}
+
+define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
+; CHECK: test_use_div_and_idiv
+; CHECK: idivl
+; CHECK: divb
+; CHECK: divl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+  %resultidiv = sdiv i32 %a, %b
+  %resultdiv = udiv i32 %a, %b
+  %result = add i32 %resultidiv, %resultdiv
+  ret i32 %result
+}
+
+define i32 @test_use_div_imm_imm() nounwind {
+; CHECK: test_use_div_imm_imm
+; CHECK: movl $64
+  %resultdiv = sdiv i32 256, 4
+  ret i32 %resultdiv
+}
+
+define i32 @test_use_div_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_div_reg_imm
+; CEHCK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+  %resultdiv = sdiv i32 %a, 33
+  ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_rem_reg_imm
+; CEHCK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+  %resultrem = srem i32 %a, 33
+  ret i32 %resultrem
+}
+
+define i32 @test_use_divrem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_divrem_reg_imm
+; CEHCK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+  %resultdiv = sdiv i32 %a, 33
+  %resultrem = srem i32 %a, 33
+  %result = add i32 %resultdiv, %resultrem
+  ret i32 %result
+}
+
+define i32 @test_use_div_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_div_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+  %resultdiv = sdiv i32 4, %a
+  ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_rem_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+  %resultdiv = sdiv i32 4, %a
+  ret i32 %resultdiv
+}
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 9b41709a3b..ec11654b35 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -229,9 +229,8 @@ define   <8 x float> @test17(<4 x float> %y) {
 }
 
 ; CHECK: test18
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovshdup
+; CHECK: vblendps
 ; CHECK: ret
 define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
   %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -239,9 +238,8 @@ define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
 }
 
 ; CHECK: test19
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovsldup
+; CHECK: vblendps
 ; CHECK: ret
 define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
   %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
diff --git a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll
index fe0f6caed3..ff56a45499 100644
--- a/test/CodeGen/X86/avx-vextractf128.ll
+++ b/test/CodeGen/X86/avx-vextractf128.ll
@@ -19,12 +19,12 @@ entry:
 }
 
 ; CHECK: @t0
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
 define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
 entry:
-  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 1)
   %1 = bitcast float* %addr to <4 x float>*
   store <4 x float> %0, <4 x float>* %1, align 16
   ret void
@@ -32,27 +32,13 @@ entry:
 
 declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
 
-; CHECK: @t1
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t1(float* %addr, <8 x float> %a) nounwind uwtable ssp {
-entry:
-  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
-  %1 = bitcast float* %addr to i8*
-  tail call void @llvm.x86.sse.storeu.ps(i8* %1, <4 x float> %0)
-  ret void
-}
-
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
 ; CHECK: @t2
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
 define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
 entry:
-  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 1)
   %1 = bitcast double* %addr to <2 x double>*
   store <2 x double> %0, <2 x double>* %1, align 16
   ret void
@@ -60,28 +46,14 @@ entry:
 
 declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
 
-; CHECK: @t3
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t3(double* %addr, <4 x double> %a) nounwind uwtable ssp {
-entry:
-  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
-  %1 = bitcast double* %addr to i8*
-  tail call void @llvm.x86.sse2.storeu.pd(i8* %1, <2 x double> %0)
-  ret void
-}
-
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
 ; CHECK: @t4
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
 define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
 entry:
   %0 = bitcast <4 x i64> %a to <8 x i32>
-  %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+  %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 1)
   %2 = bitcast <4 x i32> %1 to <2 x i64>
   store <2 x i64> %2, <2 x i64>* %addr, align 16
   ret void
@@ -90,17 +62,43 @@ entry:
 declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
 
 ; CHECK: @t5
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovdqu %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t5(<2 x i64>* %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+entry:
+  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+  %1 = bitcast float* %addr to <4 x float>*
+  store <4 x float> %0, <4 x float>* %1, align 16
+  ret void
+}
+
+; CHECK: @t6
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+entry:
+  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+  %1 = bitcast double* %addr to <2 x double>*
+  store <2 x double> %0, <2 x double>* %1, align 16
+  ret void
+}
+
+; CHECK: @t7
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
 entry:
   %0 = bitcast <4 x i64> %a to <8 x i32>
   %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
-  %2 = bitcast <2 x i64>* %addr to i8*
-  %3 = bitcast <4 x i32> %1 to <16 x i8>
-  tail call void @llvm.x86.sse2.storeu.dq(i8* %2, <16 x i8> %3)
+  %2 = bitcast <4 x i32> %1 to <2 x i64>
+  store <2 x i64> %2, <2 x i64>* %addr, align 16
   ret void
 }
 
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+; CHECK: @t8
+; CHECK: vmovups %xmm0, (%rdi)
+define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+entry:
+  %0 = bitcast <4 x i64> %a to <8 x i32>
+  %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+  %2 = bitcast <4 x i32> %1 to <2 x i64>
+  store <2 x i64> %2, <2 x i64>* %addr, align 1
+  ret void
+}
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
index c5899fa274..8af9373e50 100644
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -26,3 +26,14 @@ entry:
   %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i16> %shuffle.i
 }
+
+; CHECK: vpshufb_test
+; CHECK; vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
+  %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
+                                                                i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,  
+                                                                i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
+                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+  ret <32 x i8>%S
+}
+\ No newline at end of file
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index d8f4663c94..85a70aad75 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s  -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
@@ -197,6 +198,11 @@ block2:
 ; CHECK: cvtsi2sdq {{.*}} %xmm0
 ; CHECK: movb $1, %al
 ; CHECK: callq _test16callee
+
+; AVX: movabsq $1
+; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
+; AVX: movb $1, %al
+; AVX: callq _test16callee
   call void (...)* @test16callee(double 1.000000e+00)
   ret void
 }
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index b0c1d0a0dd..bd3514cc3f 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,11 +1,13 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=+fma  | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=i386-apple-darwin10               | FileCheck %s --check-prefix=CHECK-FMA-CALL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10             | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=+fma,-fma4  | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=-fma,-fma4  | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10  -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4  | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
 
 ; CHECK: test_f32
 ; CHECK-FMA-INST: vfmadd213ss
-; CHECK-FMA-CALL: _fmaf
+; CHECK-FMA-CALL: fmaf
 
 define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
 entry:
@@ -15,7 +17,7 @@ entry:
 
 ; CHECK: test_f64
 ; CHECK-FMA-INST: vfmadd213sd
-; CHECK-FMA-CALL: _fma
+; CHECK-FMA-CALL: fma
 
 define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
 entry:
@@ -24,7 +26,7 @@ entry:
 }
 
 ; CHECK: test_f80
-; CHECK: _fmal
+; CHECK: fmal
 
 define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
index 90529e09d7..e3910a6935 100755
--- a/test/CodeGen/X86/fma3-intrinsics.ll
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
+; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
 
 define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   ; CHECK: fmadd213ss %xmm
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index fd414b346e..2fe1ecd40e 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
 
 ; VFMADD
 define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 5d97a87b3b..6d98d59b38 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1,8 +1,13 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
 
 ; CHECK: test_x86_fmadd_ps
-; CHECK: vfmadd213ps     %xmm2, %xmm0, %xmm1
+; CHECK: vfmadd213ps     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps
+; CHECK_FMA4: vfmaddps     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   %x = fmul <4 x float> %a0, %a1
   %res = fadd <4 x float> %x, %a2
@@ -10,8 +15,11 @@ define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
 }
 
 ; CHECK: test_x86_fmsub_ps
-; CHECK: fmsub213ps     %xmm2, %xmm0, %xmm1
+; CHECK: fmsub213ps     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps
+; CHECK_FMA4: vfmsubps     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   %x = fmul <4 x float> %a0, %a1
   %res = fsub <4 x float> %x, %a2
@@ -19,8 +27,11 @@ define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
 }
 
 ; CHECK: test_x86_fnmadd_ps
-; CHECK: fnmadd213ps     %xmm2, %xmm0, %xmm1
+; CHECK: fnmadd213ps     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps
+; CHECK_FMA4: vfnmaddps     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   %x = fmul <4 x float> %a0, %a1
   %res = fsub <4 x float> %a2, %x
@@ -28,8 +39,11 @@ define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
 }
 
 ; CHECK: test_x86_fnmsub_ps
-; CHECK: fnmsub213ps     %xmm2, %xmm0, %xmm1
+; CHECK: fnmsub213ps     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ps
+; CHECK_FMA4: fnmsubps     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   %x = fmul <4 x float> %a0, %a1
   %y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
@@ -38,8 +52,11 @@ define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
 }
 
 ; CHECK: test_x86_fmadd_ps_y
-; CHECK: vfmadd213ps     %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213ps     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_y
+; CHECK_FMA4: vfmaddps     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
   %x = fmul <8 x float> %a0, %a1
   %res = fadd <8 x float> %x, %a2
@@ -47,8 +64,11 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
 }
 
 ; CHECK: test_x86_fmsub_ps_y
-; CHECK: vfmsub213ps     %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213ps     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_y
+; CHECK_FMA4: vfmsubps     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
   %x = fmul <8 x float> %a0, %a1
   %res = fsub <8 x float> %x, %a2
@@ -56,8 +76,11 @@ define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
 }
 
 ; CHECK: test_x86_fnmadd_ps_y
-; CHECK: vfnmadd213ps     %ymm2, %ymm0, %ymm1
+; CHECK: vfnmadd213ps     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps_y
+; CHECK_FMA4: vfnmaddps     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
   %x = fmul <8 x float> %a0, %a1
   %res = fsub <8 x float> %a2, %x
@@ -65,7 +88,7 @@ define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
 }
 
 ; CHECK: test_x86_fnmsub_ps_y
-; CHECK: vfnmsub213ps     %ymm2, %ymm0, %ymm1
+; CHECK: vfnmsub213ps     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
 define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
   %x = fmul <8 x float> %a0, %a1
@@ -75,8 +98,11 @@ define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
 }
 
 ; CHECK: test_x86_fmadd_pd_y
-; CHECK: vfmadd213pd     %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213pd     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_pd_y
+; CHECK_FMA4: vfmaddpd     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
   %x = fmul <4 x double> %a0, %a1
   %res = fadd <4 x double> %x, %a2
@@ -84,8 +110,11 @@ define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4
 }
 
 ; CHECK: test_x86_fmsub_pd_y
-; CHECK: vfmsub213pd     %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213pd     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd_y
+; CHECK_FMA4: vfmsubpd     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
   %x = fmul <4 x double> %a0, %a1
   %res = fsub <4 x double> %x, %a2
@@ -93,8 +122,11 @@ define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4
 }
 
 ; CHECK: test_x86_fmsub_pd