aboutsummaryrefslogtreecommitdiff
path: root/test/CodeGen/X86
diff options
context:
space:
mode:
Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--test/CodeGen/X86/2012-04-26-sdglue.ll2
-rw-r--r--test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll20
-rw-r--r--test/CodeGen/X86/StackColoring.ll272
-rw-r--r--test/CodeGen/X86/atom-bypass-slow-division.ll112
-rw-r--r--test/CodeGen/X86/avx-shuffle.ll10
-rw-r--r--test/CodeGen/X86/avx-vextractf128.ll88
-rw-r--r--test/CodeGen/X86/avx2-shuffle.ll11
-rw-r--r--test/CodeGen/X86/fast-isel-x86-64.ll8
-rw-r--r--test/CodeGen/X86/fma.ll16
-rwxr-xr-xtest/CodeGen/X86/fma3-intrinsics.ll4
-rw-r--r--test/CodeGen/X86/fma4-intrinsics-x86_64.ll1
-rw-r--r--test/CodeGen/X86/fma_patterns.ll103
-rw-r--r--test/CodeGen/X86/fp-fast.ll37
-rw-r--r--test/CodeGen/X86/inline-asm-tied.ll9
-rw-r--r--test/CodeGen/X86/phys_subreg_coalesce-3.ll4
-rw-r--r--test/CodeGen/X86/pr12312.ll48
-rw-r--r--test/CodeGen/X86/pr12359.ll10
-rw-r--r--test/CodeGen/X86/tls-pic.ll12
-rw-r--r--test/CodeGen/X86/vec_shuffle-26.ll45
-rw-r--r--test/CodeGen/X86/widen_load-1.ll13
20 files changed, 732 insertions, 93 deletions
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 9a66b670c7..04659522d3 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,7 +5,7 @@
; It's hard to test for the ISEL condition because CodeGen optimizes
; away the bugpointed code. Just ensure the basics are still there.
;CHECK: func:
-;CHECK: vpxor
+;CHECK: vxorps
;CHECK: vinsertf128
;CHECK: vpshufd
;CHECK: vpshufd
diff --git a/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
new file mode 100644
index 0000000000..6ebbb2e97d
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -enable-unsafe-fp-math
+; <rdar://problem/12180135>
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+define i32 @foo(float %mean) nounwind readnone ssp align 2 {
+entry:
+ %cmp = fcmp olt float %mean, -3.000000e+00
+ %f.0 = select i1 %cmp, float -3.000000e+00, float %mean
+ %cmp2 = fcmp ult float %f.0, 3.000000e+00
+ %f.1 = select i1 %cmp2, float %f.0, float 0x4007EB8520000000
+ %add = fadd float %f.1, 3.000000e+00
+ %div = fdiv float %add, 2.343750e-02
+ %0 = fpext float %div to double
+ %conv = select i1 undef, double 2.550000e+02, double %0
+ %add8 = fadd double %conv, 5.000000e-01
+ %conv9 = fptosi double %add8 to i32
+ %.conv9 = select i1 undef, i32 255, i32 %conv9
+ ret i32 %.conv9
+}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
new file mode 100644
index 0000000000..26ed0ecc0c
--- /dev/null
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -0,0 +1,272 @@
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR
+; RUN: llc -mcpu=corei7 -no-stack-coloring=true < %s | FileCheck %s --check-prefix=NOCOLOR
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;YESCOLOR: subq $136, %rsp
+;NOCOLOR: subq $264, %rsp
+
+
+define i32 @myCall_w2(i32 %in) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+}
+
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+
+define i32 @myCall2_no_merge(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ ret i32 %t7
+bb3:
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ ret i32 0
+}
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+
+define i32 @myCall2_w2(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+;YESCOLOR: subq $208, %rsp
+;NOCOLOR: subq $400, %rsp
+
+
+
+
+define i32 @myCall_w4(i32 %in) {
+entry:
+ %a1 = alloca [14 x i8*], align 8
+ %a2 = alloca [13 x i8*], align 8
+ %a3 = alloca [12 x i8*], align 8
+ %a4 = alloca [11 x i8*], align 8
+ %b1 = bitcast [14 x i8*]* %a1 to i8*
+ %b2 = bitcast [13 x i8*]* %a2 to i8*
+ %b3 = bitcast [12 x i8*]* %a3 to i8*
+ %b4 = bitcast [11 x i8*]* %a4 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b4)
+ call void @llvm.lifetime.start(i64 -1, i8* %b1)
+ %t1 = call i32 @foo(i32 %in, i8* %b1)
+ %t2 = call i32 @foo(i32 %in, i8* %b1)
+ call void @llvm.lifetime.end(i64 -1, i8* %b1)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t9 = call i32 @foo(i32 %in, i8* %b2)
+ %t8 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ call void @llvm.lifetime.start(i64 -1, i8* %b3)
+ %t3 = call i32 @foo(i32 %in, i8* %b3)
+ %t4 = call i32 @foo(i32 %in, i8* %b3)
+ call void @llvm.lifetime.end(i64 -1, i8* %b3)
+ %t11 = call i32 @foo(i32 %in, i8* %b4)
+ call void @llvm.lifetime.end(i64 -1, i8* %b4)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+}
+
+;YESCOLOR: subq $112, %rsp
+;NOCOLOR: subq $400, %rsp
+
+define i32 @myCall2_w4(i32 %in) {
+entry:
+ %a1 = alloca [14 x i8*], align 8
+ %a2 = alloca [13 x i8*], align 8
+ %a3 = alloca [12 x i8*], align 8
+ %a4 = alloca [11 x i8*], align 8
+ %b1 = bitcast [14 x i8*]* %a1 to i8*
+ %b2 = bitcast [13 x i8*]* %a2 to i8*
+ %b3 = bitcast [12 x i8*]* %a3 to i8*
+ %b4 = bitcast [11 x i8*]* %a4 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b1)
+ %t1 = call i32 @foo(i32 %in, i8* %b1)
+ %t2 = call i32 @foo(i32 %in, i8* %b1)
+ call void @llvm.lifetime.end(i64 -1, i8* %b1)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t9 = call i32 @foo(i32 %in, i8* %b2)
+ %t8 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ call void @llvm.lifetime.start(i64 -1, i8* %b3)
+ %t3 = call i32 @foo(i32 %in, i8* %b3)
+ %t4 = call i32 @foo(i32 %in, i8* %b3)
+ call void @llvm.lifetime.end(i64 -1, i8* %b3)
+ br i1 undef, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b4)
+ %t11 = call i32 @foo(i32 %in, i8* %b4)
+ call void @llvm.lifetime.end(i64 -1, i8* %b4)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+
+
+define i32 @myCall2_noend(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @myCall2_noend2(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @myCall2_nostart(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+; Adopt the test from Transforms/Inline/array_merge.ll'
+;YESCOLOR: subq $816, %rsp
+;NOCOLOR: subq $1616, %rsp
+define void @array_merge() nounwind ssp {
+entry:
+ %A.i1 = alloca [100 x i32], align 4
+ %B.i2 = alloca [100 x i32], align 4
+ %A.i = alloca [100 x i32], align 4
+ %B.i = alloca [100 x i32], align 4
+ %0 = bitcast [100 x i32]* %A.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+ %1 = bitcast [100 x i32]* %B.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+ %2 = bitcast [100 x i32]* %A.i1 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+ %3 = bitcast [100 x i32]* %B.i2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+ call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+ ret void
+}
+
+declare void @bar([100 x i32]* , [100 x i32]*) nounwind
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+ declare i32 @foo(i32, i8*)
+
diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll
new file mode 100644
index 0000000000..e7c9605d3e
--- /dev/null
+++ b/test/CodeGen/X86/atom-bypass-slow-division.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
+
+define i32 @test_get_quotient(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+ %result = sdiv i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @test_get_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+ %result = srem i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient_and_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+; CEECK-NOT: idivl
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, %b
+ %resultrem = srem i32 %a, %b
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
+; CHECK: test_use_div_and_idiv
+; CHECK: idivl
+; CHECK: divb
+; CHECK: divl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+ %resultidiv = sdiv i32 %a, %b
+ %resultdiv = udiv i32 %a, %b
+ %result = add i32 %resultidiv, %resultdiv
+ ret i32 %result
+}
+
+define i32 @test_use_div_imm_imm() nounwind {
+; CHECK: test_use_div_imm_imm
+; CHECK: movl $64
+ %resultdiv = sdiv i32 256, 4
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_div_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_div_reg_imm
+; CEHCK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, 33
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_rem_reg_imm
+; CEHCK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultrem = srem i32 %a, 33
+ ret i32 %resultrem
+}
+
+define i32 @test_use_divrem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_divrem_reg_imm
+; CEHCK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, 33
+ %resultrem = srem i32 %a, 33
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @test_use_div_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_div_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_rem_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 9b41709a3b..ec11654b35 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -229,9 +229,8 @@ define <8 x float> @test17(<4 x float> %y) {
}
; CHECK: test18
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovshdup
+; CHECK: vblendps
; CHECK: ret
define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
%S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -239,9 +238,8 @@ define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
}
; CHECK: test19
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovsldup
+; CHECK: vblendps
; CHECK: ret
define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
%S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
diff --git a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll
index fe0f6caed3..ff56a45499 100644
--- a/test/CodeGen/X86/avx-vextractf128.ll
+++ b/test/CodeGen/X86/avx-vextractf128.ll
@@ -19,12 +19,12 @@ entry:
}
; CHECK: @t0
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
entry:
- %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+ %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 1)
%1 = bitcast float* %addr to <4 x float>*
store <4 x float> %0, <4 x float>* %1, align 16
ret void
@@ -32,27 +32,13 @@ entry:
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
-; CHECK: @t1
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t1(float* %addr, <8 x float> %a) nounwind uwtable ssp {
-entry:
- %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
- %1 = bitcast float* %addr to i8*
- tail call void @llvm.x86.sse.storeu.ps(i8* %1, <4 x float> %0)
- ret void
-}
-
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
; CHECK: @t2
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
entry:
- %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+ %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 1)
%1 = bitcast double* %addr to <2 x double>*
store <2 x double> %0, <2 x double>* %1, align 16
ret void
@@ -60,28 +46,14 @@ entry:
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-; CHECK: @t3
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t3(double* %addr, <4 x double> %a) nounwind uwtable ssp {
-entry:
- %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
- %1 = bitcast double* %addr to i8*
- tail call void @llvm.x86.sse2.storeu.pd(i8* %1, <2 x double> %0)
- ret void
-}
-
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
; CHECK: @t4
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
- %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+ %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 1)
%2 = bitcast <4 x i32> %1 to <2 x i64>
store <2 x i64> %2, <2 x i64>* %addr, align 16
ret void
@@ -90,17 +62,43 @@ entry:
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
; CHECK: @t5
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovdqu %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t5(<2 x i64>* %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+entry:
+ %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+ %1 = bitcast float* %addr to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 16
+ ret void
+}
+
+; CHECK: @t6
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+entry:
+ %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+ %1 = bitcast double* %addr to <2 x double>*
+ store <2 x double> %0, <2 x double>* %1, align 16
+ ret void
+}
+
+; CHECK: @t7
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
- %2 = bitcast <2 x i64>* %addr to i8*
- %3 = bitcast <4 x i32> %1 to <16 x i8>
- tail call void @llvm.x86.sse2.storeu.dq(i8* %2, <16 x i8> %3)
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ store <2 x i64> %2, <2 x i64>* %addr, align 16
ret void
}
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+; CHECK: @t8
+; CHECK: vmovups %xmm0, (%rdi)
+define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+entry:
+ %0 = bitcast <4 x i64> %a to <8 x i32>
+ %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ store <2 x i64> %2, <2 x i64>* %addr, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
index c5899fa274..8af9373e50 100644
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -26,3 +26,14 @@ entry:
%shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle.i
}
+
+; CHECK: vpshufb_test
+; CHECK; vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+} \ No newline at end of file
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index d8f4663c94..85a70aad75 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
@@ -197,6 +198,11 @@ block2:
; CHECK: cvtsi2sdq {{.*}} %xmm0
; CHECK: movb $1, %al
; CHECK: callq _test16callee
+
+; AVX: movabsq $1
+; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
+; AVX: movb $1, %al
+; AVX: callq _test16callee
call void (...)* @test16callee(double 1.000000e+00)
ret void
}
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index b0c1d0a0dd..bd3514cc3f 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,11 +1,13 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
; CHECK: test_f32
; CHECK-FMA-INST: vfmadd213ss
-; CHECK-FMA-CALL: _fmaf
+; CHECK-FMA-CALL: fmaf
define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
entry:
@@ -15,7 +17,7 @@ entry:
; CHECK: test_f64
; CHECK-FMA-INST: vfmadd213sd
-; CHECK-FMA-CALL: _fma
+; CHECK-FMA-CALL: fma
define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
entry:
@@ -24,7 +26,7 @@ entry:
}
; CHECK: test_f80
-; CHECK: _fmal
+; CHECK: fmal
define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone ssp {
entry:
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
index 90529e09d7..e3910a6935 100755
--- a/test/CodeGen/X86/fma3-intrinsics.ll
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
+; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK: fmadd213ss %xmm
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index fd414b346e..2fe1ecd40e 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
; VFMADD
define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 5d97a87b3b..6d98d59b38 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1,8 +1,13 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
; CHECK: test_x86_fmadd_ps
-; CHECK: vfmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps
+; CHECK_FMA4: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fadd <4 x float> %x, %a2
@@ -10,8 +15,11 @@ define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
}
; CHECK: test_x86_fmsub_ps
-; CHECK: fmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: fmsub213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps
+; CHECK_FMA4: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fsub <4 x float> %x, %a2
@@ -19,8 +27,11 @@ define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
}
; CHECK: test_x86_fnmadd_ps
-; CHECK: fnmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: fnmadd213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps
+; CHECK_FMA4: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fsub <4 x float> %a2, %x
@@ -28,8 +39,11 @@ define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
}
; CHECK: test_x86_fnmsub_ps
-; CHECK: fnmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: fnmsub213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ps
+; CHECK_FMA4: fnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
@@ -38,8 +52,11 @@ define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
}
; CHECK: test_x86_fmadd_ps_y
-; CHECK: vfmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_y
+; CHECK_FMA4: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fadd <8 x float> %x, %a2
@@ -47,8 +64,11 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
}
; CHECK: test_x86_fmsub_ps_y
-; CHECK: vfmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_y
+; CHECK_FMA4: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fsub <8 x float> %x, %a2
@@ -56,8 +76,11 @@ define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
}
; CHECK: test_x86_fnmadd_ps_y
-; CHECK: vfnmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps_y
+; CHECK_FMA4: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fsub <8 x float> %a2, %x
@@ -65,7 +88,7 @@ define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
}
; CHECK: test_x86_fnmsub_ps_y
-; CHECK: vfnmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
@@ -75,8 +98,11 @@ define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
}
; CHECK: test_x86_fmadd_pd_y
-; CHECK: vfmadd213pd %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_pd_y
+; CHECK_FMA4: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
%x = fmul <4 x double> %a0, %a1
%res = fadd <4 x double> %x, %a2
@@ -84,8 +110,11 @@ define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4
}
; CHECK: test_x86_fmsub_pd_y
-; CHECK: vfmsub213pd %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd_y
+; CHECK_FMA4: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
%x = fmul <4 x double> %a0, %a1
%res = fsub <4 x double> %x, %a2
@@ -93,8 +122,11 @@ define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4
}
; CHECK: test_x86_fmsub_pd
-; CHECK: vfmsub213pd %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd
+; CHECK_FMA4: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
%x = fmul <2 x double> %a0, %a1
%res = fsub <2 x double> %x, %a2
@@ -102,8 +134,11 @@ define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x
}
; CHECK: test_x86_fnmadd_ss
-; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213ss %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ss
+; CHECK_FMA4: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
%x = fmul float %a0, %a1
%res = fsub float %a2, %x
@@ -111,8 +146,11 @@ define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
}
; CHECK: test_x86_fnmadd_sd
-; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213sd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_sd
+; CHECK_FMA4: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
%x = fmul double %a0, %a1
%res = fsub double %a2, %x
@@ -120,8 +158,11 @@ define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
}
; CHECK: test_x86_fmsub_sd
-; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213sd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_sd
+; CHECK_FMA4: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
%x = fmul double %a0, %a1
%res = fsub double %x, %a2
@@ -129,11 +170,43 @@ define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
}
; CHECK: test_x86_fnmsub_ss
-; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1
+; CHECK: vfnmsub213ss %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ss
+; CHECK_FMA4: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
%x = fsub float -0.000000e+00, %a0
%y = fmul float %x, %a1
%res = fsub float %y, %a2
ret float %res
}
+
+; CHECK: test_x86_fmadd_ps
+; CHECK: vmovaps (%rdi), %xmm2
+; CHECK: vfmadd213ps %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps
+; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = load <4 x float>* %a0
+ %y = fmul <4 x float> %x, %a1
+ %res = fadd <4 x float> %y, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps
+; CHECK: vmovaps (%rdi), %xmm2
+; CHECK: fmsub213ps %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps
+; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = load <4 x float>* %a0
+ %y = fmul <4 x float> %x, %a1
+ %res = fsub <4 x float> %y, %a2
+ ret <4 x float> %res
+}
+
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
new file mode 100644
index 0000000000..091f0de930
--- /dev/null
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=x86-64 -mattr=-fma4 -mtriple=x86_64-apple-darwin -enable-unsafe-fp-math < %s | FileCheck %s
+
+; CHECK: test1
+define float @test1(float %a) {
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fadd float %a, %a
+ %r = fadd float %t1, %t1
+ ret float %r
+}
+
+; CHECK: test2
+define float @test2(float %a) {
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fmul float 4.0, %a
+ %t2 = fadd float %a, %a
+ %r = fadd float %t1, %t2
+ ret float %r
+}
+
+; CHECK: test3
+define float @test3(float %a) {
+; CHECK-NOT: addss
+; CHECK: xorps
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fmul float 2.0, %a
+ %t2 = fadd float %a, %a
+ %r = fsub float %t1, %t2
+ ret float %r
+}
+
diff --git a/test/CodeGen/X86/inline-asm-tied.ll b/test/CodeGen/X86/inline-asm-tied.ll
index 91576fb09e..597236e362 100644
--- a/test/CodeGen/X86/inline-asm-tied.ll
+++ b/test/CodeGen/X86/inline-asm-tied.ll
@@ -19,3 +19,12 @@ entry:
%1 = load i64* %retval ; <i64> [#uses=1]
ret i64 %1
}
+
+; The tied operands are not necessarily in the same order as the defs.
+; PR13742
+define i64 @swapped(i64 %x, i64 %y) nounwind {
+entry:
+ %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
+ %x1 = extractvalue { i64, i64 } %x0, 0
+ ret i64 %x1
+}
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 984d7e57e0..51320dd6d0 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,14 +1,10 @@
; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
-; XFAIL: *
; rdar://5571034
; This requires physreg joining, %vreg13 is live everywhere:
; 304L %CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
; 320L %vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
; 336L %vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
-;
-; This test is XFAIL until the register allocator understands trivial physreg
-; interference. <rdar://9802098>
define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
; CHECK: foo:
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
new file mode 100644
index 0000000000..84102f148b
--- /dev/null
+++ b/test/CodeGen/X86/pr12312.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix AVX
+
+define i32 @veccond(<4 x i32> %input) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block: ; preds = %entry,
+ ret i32 1
+; SSE41: veccond
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond
+; AVX: vptest
+; AVX: ret
+}
+
+define i32 @vectest(<4 x i32> %input) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+; SSE41: vectest
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest
+; AVX: vptest
+; AVX: ret
+}
+
+define i32 @vecsel(<4 x i32> %input, i32 %a, i32 %b) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+; SSE41: vecsel
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel
+; AVX: vptest
+; AVX: ret
+}
diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll
new file mode 100644
index 0000000000..024b163fa7
--- /dev/null
+++ b/test/CodeGen/X86/pr12359.ll
@@ -0,0 +1,10 @@
+; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s
+define <16 x i8> @shuf(<16 x i8> %inval1) {
+entry:
+ %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4>
+ ret <16 x i8> %0
+; CHECK: shuf
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll
index 51c3d2363f..b823f0af2c 100644
--- a/test/CodeGen/X86/tls-pic.ll
+++ b/test/CodeGen/X86/tls-pic.ll
@@ -76,12 +76,12 @@ entry:
; X32: f5:
; X32: leal {{[jk]}}@TLSLDM(%ebx)
-; X32-NEXT: calll ___tls_get_addr@PLT
-; X32-NEXT: movl {{[jk]}}@DTPOFF(%eax)
-; X32-NEXT: addl {{[jk]}}@DTPOFF(%eax)
+; X32: calll ___tls_get_addr@PLT
+; X32: movl {{[jk]}}@DTPOFF(%e
+; X32: addl {{[jk]}}@DTPOFF(%e
; X64: f5:
; X64: leaq {{[jk]}}@TLSLD(%rip), %rdi
-; X64-NEXT: callq __tls_get_addr@PLT
-; X64-NEXT: movl {{[jk]}}@DTPOFF(%rax)
-; X64-NEXT: addl {{[jk]}}@DTPOFF(%rax)
+; X64: callq __tls_get_addr@PLT
+; X64: movl {{[jk]}}@DTPOFF(%r
+; X64: addl {{[jk]}}@DTPOFF(%r
diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
index 086af6bb11..8dfc2eab41 100644
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ b/test/CodeGen/X86/vec_shuffle-26.ll
@@ -1,6 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
-; RUN: grep unpcklps %t | count 1
-; RUN: grep unpckhps %t | count 3
+; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=atom -mattr=+sse41 | FileCheck -check-prefix=ATOM %s
; Transpose example using the more generic vector shuffle. Return float8
; instead of float16
@@ -14,6 +13,17 @@ target triple = "i386-apple-cl.1.0"
define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
entry:
+; CHECK: transpose2
+; CHECK: unpckhps
+; CHECK: unpckhps
+; CHECK: unpcklps
+; CHECK: unpckhps
+; Different instruction order for Atom.
+; ATOM: transpose2
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpcklps
%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2]
%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
@@ -27,3 +37,32 @@ entry:
; %r3 = shufflevector <8 x float> %r1, <8 x float> %r2, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >;
ret <8 x float> %r2
}
+
+define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind {
+entry:
+; movhps should happen before extractps to assure it gets the correct value.
+; CHECK: lo_hi_shift
+; CHECK: movhps ([[BASEREG:%[a-z]+]]),
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: lo_hi_shift
+; ATOM: movhps ([[BASEREG:%[a-z]+]]),
+; ATOM: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+ %v.i = bitcast float* %y to <4 x float>*
+ %0 = load <4 x float>* %v.i, align 1
+ %1 = bitcast float* %x to <1 x i64>*
+ %.val = load <1 x i64>* %1, align 1
+ %2 = bitcast <1 x i64> %.val to <2 x float>
+ %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %cast.i = bitcast <4 x float> %0 to <2 x i64>
+ %extract.i = extractelement <2 x i64> %cast.i, i32 1
+ %3 = bitcast float* %x to i64*
+ store i64 %extract.i, i64* %3, align 4
+ %4 = bitcast <4 x float> %0 to <16 x i8>
+ %5 = bitcast <4 x float> %shuffle1.i to <16 x i8>
+ %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %6 = bitcast <16 x i8> %palignr to <2 x i64>
+ ret <2 x i64> %6
+}
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index 9705d149dd..dfaa3d6dc9 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -1,12 +1,17 @@
-; RUN: llc %s -o - -march=x86-64 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: llc %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
+; RUN: llc %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
; PR4891
; PR5626
; This load should be before the call, not after.
-; CHECK: movaps compl+128(%rip), %xmm0
-; CHECK: movaps %xmm0, (%rsp)
-; CHECK: callq killcommon
+; SSE: movaps compl+128(%rip), %xmm0
+; SSE: movaps %xmm0, (%rsp)
+; SSE: callq killcommon
+
+; AVX: vmovapd compl+128(%rip), %xmm0
+; AVX: vmovapd %xmm0, (%rsp)
+; AVX: callq killcommon
@compl = linkonce global [20 x i64] zeroinitializer, align 64 ; <[20 x i64]*> [#uses=1]