Diffstat (limited to 'test')
-rw-r--r--  test/Assembler/AutoUpgradeIntrinsics.ll             |   2
-rw-r--r--  test/Bitcode/ssse3_palignr.ll.bc                    |  bin 1280 -> 1504 bytes
-rw-r--r--  test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll            |  41
-rw-r--r--  test/CodeGen/X86/2007-05-15-maskmovq.ll             |   8
-rw-r--r--  test/CodeGen/X86/2007-06-15-IntToMMX.ll             |  13
-rw-r--r--  test/CodeGen/X86/2007-07-03-GR64ToVR64.ll           |  14
-rw-r--r--  test/CodeGen/X86/2008-04-08-CoalescerCrash.ll       |   8
-rw-r--r--  test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll       |   6
-rw-r--r--  test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll       |  22
-rw-r--r--  test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll |   8
-rw-r--r--  test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll          |  64
-rw-r--r--  test/CodeGen/X86/fast-isel-bc.ll                    |  14
-rw-r--r--  test/CodeGen/X86/mmx-arg-passing.ll                 |  19
-rw-r--r--  test/CodeGen/X86/mmx-arg-passing2.ll                |  14
-rw-r--r--  test/CodeGen/X86/mmx-arith.ll                       | 380
-rw-r--r--  test/CodeGen/X86/mmx-bitcast-to-i64.ll              |  37
-rw-r--r--  test/CodeGen/X86/mmx-insert-element.ll              |  10
-rw-r--r--  test/CodeGen/X86/mmx-punpckhdq.ll                   |  14
-rw-r--r--  test/CodeGen/X86/mmx-shift.ll                       |  24
-rw-r--r--  test/CodeGen/X86/mmx-shuffle.ll                     |   6
-rw-r--r--  test/CodeGen/X86/mmx-vzmovl-2.ll                    |  20
-rw-r--r--  test/CodeGen/X86/mmx-vzmovl.ll                      |   4
-rw-r--r--  test/CodeGen/X86/vec_insert-5.ll                    |   9
-rw-r--r--  test/CodeGen/X86/vec_insert-7.ll                    |  12
-rw-r--r--  test/CodeGen/X86/vec_zero_cse.ll                    |   5
25 files changed, 540 insertions, 214 deletions
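
The recurring change in these tests is the same throughout: MMX intrinsics now take and
return the opaque x86_mmx type rather than <8 x i8>, <4 x i16>, <2 x i32>, or <1 x i64>,
and callers bitcast between the vector types and x86_mmx at the boundaries. A minimal
sketch of the new form, using llvm.x86.mmx.paddus.w as declared in the tests below (the
wrapper function @sketch itself is illustrative only, not part of the patch):

; old form:  declare <4 x i16> @llvm.x86.mmx.paddus.w(<4 x i16>, <4 x i16>)
; new form:
declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)

define <4 x i16> @sketch(<4 x i16> %a, <4 x i16> %b) {
entry:
  %ma = bitcast <4 x i16> %a to x86_mmx            ; enter the MMX domain explicitly
  %mb = bitcast <4 x i16> %b to x86_mmx
  %ms = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %ma, x86_mmx %mb)
  %s = bitcast x86_mmx %ms to <4 x i16>            ; leave the MMX domain explicitly
  ret <4 x i16> %s
}
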
diff --git a/test/Assembler/AutoUpgradeIntrinsics.ll b/test/Assembler/AutoUpgradeIntrinsics.ll
index af4ec92e09..6752bd8281 100644
--- a/test/Assembler/AutoUpgradeIntrinsics.ll
+++ b/test/Assembler/AutoUpgradeIntrinsics.ll
@@ -7,7 +7,7 @@
; RUN: llvm-as < %s | llvm-dis | \
; RUN: not grep {llvm\\.bswap\\.i\[0-9\]*\\.i\[0-9\]*}
; RUN: llvm-as < %s | llvm-dis | \
-; RUN: grep {llvm\\.x86\\.mmx\\.ps} | grep {\\\<2 x i32\\\>} | count 6
+; RUN: grep {llvm\\.x86\\.mmx\\.ps} | grep {x86_mmx} | count 16
declare i32 @llvm.ctpop.i28(i28 %val)
declare i32 @llvm.cttz.i29(i29 %val)
diff --git a/test/Bitcode/ssse3_palignr.ll.bc b/test/Bitcode/ssse3_palignr.ll.bc
index 642f4dedc4..3fc9cdf15a 100644
--- a/test/Bitcode/ssse3_palignr.ll.bc
+++ b/test/Bitcode/ssse3_palignr.ll.bc
Binary files differ
diff --git a/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll b/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll
index c39b82a1fe..256cbbb5d9 100644
--- a/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll
+++ b/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -o - -march=x86 -mattr=+mmx | FileCheck %s
+; There are no MMX instructions here. We use add+adcl for the adds.
define <1 x i64> @unsigned_add3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind {
entry:
@@ -7,9 +8,8 @@ entry:
bb26: ; preds = %bb26, %entry
-; CHECK: movq ({{.*}},8), %mm
-; CHECK: paddq ({{.*}},8), %mm
-; CHECK: paddq %mm{{[0-7]}}, %mm
+; CHECK: addl %eax, %ebx
+; CHECK: adcl %edx, %ebp
%i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ] ; <i32> [#uses=3]
%sum.035.0 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ] ; <<1 x i64>> [#uses=1]
@@ -27,3 +27,38 @@ bb31: ; preds = %bb26, %entry
%sum.035.1 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ] ; <<1 x i64>> [#uses=1]
ret <1 x i64> %sum.035.1
}
+
+
+; This is the original test converted to use MMX intrinsics.
+
+define <1 x i64> @unsigned_add3a(x86_mmx* %a, x86_mmx* %b, i32 %count) nounwind {
+entry:
+ %tmp2943 = bitcast <1 x i64><i64 0> to x86_mmx
+ %tmp2942 = icmp eq i32 %count, 0 ; <i1> [#uses=1]
+ br i1 %tmp2942, label %bb31, label %bb26
+
+bb26: ; preds = %bb26, %entry
+
+; CHECK: movq ({{.*}},8), %mm
+; CHECK: paddq ({{.*}},8), %mm
+; CHECK: paddq %mm{{[0-7]}}, %mm
+
+ %i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ] ; <i32> [#uses=3]
+ %sum.035.0 = phi x86_mmx [ %tmp2943, %entry ], [ %tmp22, %bb26 ] ; <x86_mmx> [#uses=1]
+ %tmp13 = getelementptr x86_mmx* %b, i32 %i.037.0 ; <x86_mmx*> [#uses=1]
+ %tmp14 = load x86_mmx* %tmp13 ; <x86_mmx> [#uses=1]
+ %tmp18 = getelementptr x86_mmx* %a, i32 %i.037.0 ; <x86_mmx*> [#uses=1]
+ %tmp19 = load x86_mmx* %tmp18 ; <x86_mmx> [#uses=1]
+ %tmp21 = call x86_mmx @llvm.x86.mmx.padd.q (x86_mmx %tmp19, x86_mmx %tmp14) ; <x86_mmx> [#uses=1]
+ %tmp22 = call x86_mmx @llvm.x86.mmx.padd.q (x86_mmx %tmp21, x86_mmx %sum.035.0) ; <x86_mmx> [#uses=2]
+ %tmp25 = add i32 %i.037.0, 1 ; <i32> [#uses=2]
+ %tmp29 = icmp ult i32 %tmp25, %count ; <i1> [#uses=1]
+ br i1 %tmp29, label %bb26, label %bb31
+
+bb31: ; preds = %bb26, %entry
+ %sum.035.1 = phi x86_mmx [ %tmp2943, %entry ], [ %tmp22, %bb26 ] ; <x86_mmx> [#uses=1]
+ %t = bitcast x86_mmx %sum.035.1 to <1 x i64>
+ ret <1 x i64> %t
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
diff --git a/test/CodeGen/X86/2007-05-15-maskmovq.ll b/test/CodeGen/X86/2007-05-15-maskmovq.ll
index 2093b8f687..006cf2e43a 100644
--- a/test/CodeGen/X86/2007-05-15-maskmovq.ll
+++ b/test/CodeGen/X86/2007-05-15-maskmovq.ll
@@ -5,10 +5,10 @@ target triple = "i686-apple-darwin8"
define void @test(<1 x i64> %c64, <1 x i64> %mask1, i8* %P) {
entry:
- %tmp4 = bitcast <1 x i64> %mask1 to <8 x i8> ; <<8 x i8>> [#uses=1]
- %tmp6 = bitcast <1 x i64> %c64 to <8 x i8> ; <<8 x i8>> [#uses=1]
- tail call void @llvm.x86.mmx.maskmovq( <8 x i8> %tmp6, <8 x i8> %tmp4, i8* %P )
+ %tmp4 = bitcast <1 x i64> %mask1 to x86_mmx ; <x86_mmx> [#uses=1]
+ %tmp6 = bitcast <1 x i64> %c64 to x86_mmx ; <x86_mmx> [#uses=1]
+ tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp4, x86_mmx %tmp6, i8* %P )
ret void
}
-declare void @llvm.x86.mmx.maskmovq(<8 x i8>, <8 x i8>, i8*)
+declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*)
diff --git a/test/CodeGen/X86/2007-06-15-IntToMMX.ll b/test/CodeGen/X86/2007-06-15-IntToMMX.ll
index 6128d8b92d..660d4fe7b1 100644
--- a/test/CodeGen/X86/2007-06-15-IntToMMX.ll
+++ b/test/CodeGen/X86/2007-06-15-IntToMMX.ll
@@ -1,17 +1,16 @@
; RUN: llc < %s -march=x86-64 -mattr=+mmx | grep paddusw
-@R = external global <1 x i64> ; <<1 x i64>*> [#uses=1]
+@R = external global x86_mmx ; <x86_mmx*> [#uses=1]
define void @foo(<1 x i64> %A, <1 x i64> %B) {
entry:
- %tmp4 = bitcast <1 x i64> %B to <4 x i16> ; <<4 x i16>> [#uses=1]
- %tmp6 = bitcast <1 x i64> %A to <4 x i16> ; <<4 x i16>> [#uses=1]
- %tmp7 = tail call <4 x i16> @llvm.x86.mmx.paddus.w( <4 x i16> %tmp6, <4 x i16> %tmp4 ) ; <<4 x i16>> [#uses=1]
- %tmp8 = bitcast <4 x i16> %tmp7 to <1 x i64> ; <<1 x i64>> [#uses=1]
- store <1 x i64> %tmp8, <1 x i64>* @R
+ %tmp2 = bitcast <1 x i64> %A to x86_mmx
+ %tmp3 = bitcast <1 x i64> %B to x86_mmx
+ %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp2, x86_mmx %tmp3 ) ; <x86_mmx> [#uses=1]
+ store x86_mmx %tmp7, x86_mmx* @R
tail call void @llvm.x86.mmx.emms( )
ret void
}
-declare <4 x i16> @llvm.x86.mmx.paddus.w(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll b/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
index 2c513f1781..1c5e6766fd 100644
--- a/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
+++ b/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
@@ -2,19 +2,17 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx | grep {movd %rdi, %mm1}
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx | grep {paddusw %mm0, %mm1}
-@R = external global <1 x i64> ; <<1 x i64>*> [#uses=1]
+@R = external global x86_mmx ; <x86_mmx*> [#uses=1]
define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind {
entry:
- %tmp4 = bitcast <1 x i64> %B to <4 x i16> ; <<4 x i16>> [#uses=1]
- %tmp6 = bitcast <1 x i64> %A to <4 x i16> ; <<4 x i16>> [#uses=1]
- %tmp7 = tail call <4 x i16> @llvm.x86.mmx.paddus.w( <4 x i16> %tmp6, <4 x i16> %tmp4 ) ; <<4 x i16>> [#uses=1]
- %tmp8 = bitcast <4 x i16> %tmp7 to <1 x i64> ; <<1 x i64>> [#uses=1]
- store <1 x i64> %tmp8, <1 x i64>* @R
+ %tmp4 = bitcast <1 x i64> %B to x86_mmx ; <<4 x i16>> [#uses=1]
+ %tmp6 = bitcast <1 x i64> %A to x86_mmx ; <<4 x i16>> [#uses=1]
+ %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp6, x86_mmx %tmp4 ) ; <x86_mmx> [#uses=1]
+ store x86_mmx %tmp7, x86_mmx* @R
tail call void @llvm.x86.mmx.emms( )
ret void
}
-declare <4 x i16> @llvm.x86.mmx.paddus.w(<4 x i16>, <4 x i16>)
-
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll b/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
index dc8c097efc..5089e8c5b6 100644
--- a/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
+++ b/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
@@ -5,15 +5,15 @@ entry:
tail call void asm sideeffect "# top of block", "~{dirflag},~{fpsr},~{flags},~{di},~{si},~{dx},~{cx},~{ax}"( ) nounwind
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 8", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
- %tmp1 = tail call <2 x i32> asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; <<2 x i32>> [#uses=1]
+ %tmp1 = tail call x86_mmx asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; <x86_mmx> [#uses=1]
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 9", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
- %tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( <2 x i32> undef ) nounwind ; <i32> [#uses=1]
+ %tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef ) nounwind ; <i32> [#uses=1]
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 10", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
- tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( <2 x i32> undef, i32 undef, i32 %tmp3 ) nounwind
+ tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef, i32 undef, i32 %tmp3 ) nounwind
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 11", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
- %tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( <2 x i32> %tmp1 ) nounwind ; <i32> [#uses=0]
+ %tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx %tmp1 ) nounwind ; <i32> [#uses=0]
ret i32 undef
}
diff --git a/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll b/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
index c76dd7de12..53402c0451 100644
--- a/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
+++ b/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
@@ -17,11 +17,13 @@ entry:
br i1 false, label %bb.nph144.split, label %bb133
bb.nph144.split: ; preds = %entry
- tail call void @llvm.x86.mmx.maskmovq( <8 x i8> zeroinitializer, <8 x i8> zeroinitializer, i8* null ) nounwind
+ %tmp = bitcast <8 x i8> zeroinitializer to x86_mmx
+ %tmp2 = bitcast <8 x i8> zeroinitializer to x86_mmx
+ tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp, x86_mmx %tmp2, i8* null ) nounwind
unreachable
bb133: ; preds = %entry
ret void
}
-declare void @llvm.x86.mmx.maskmovq(<8 x i8>, <8 x i8>, i8*) nounwind
+declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*) nounwind
diff --git a/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll b/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
index 60be0d51e7..2dc1deaf17 100644
--- a/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
+++ b/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
@@ -1,6 +1,9 @@
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mattr=+mmx | grep unpcklpd
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mattr=+mmx | grep unpckhpd
; RUN: llc < %s -march=x86 -mattr=+sse2 | grep cvttpd2pi | count 1
; RUN: llc < %s -march=x86 -mattr=+sse2 | grep cvtpi2pd | count 1
-; PR2687
+; originally from PR2687, but things don't work that way any more.
+; there are no MMX instructions here; we use XMM.
define <2 x double> @a(<2 x i32> %x) nounwind {
entry:
@@ -13,3 +16,20 @@ entry:
%y = fptosi <2 x double> %x to <2 x i32>
ret <2 x i32> %y
}
+
+; This is how to get MMX instructions.
+
+define <2 x double> @a2(x86_mmx %x) nounwind {
+entry:
+ %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %x)
+ ret <2 x double> %y
+}
+
+define x86_mmx @b2(<2 x double> %x) nounwind {
+entry:
+ %y = tail call x86_mmx @llvm.x86.sse.cvttpd2pi (<2 x double> %x)
+ ret x86_mmx %y
+}
+
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)
+declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>)
diff --git a/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll b/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll
index b9b09a3f00..288eef4f69 100644
--- a/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll
+++ b/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll
@@ -1,10 +1,12 @@
; RUN: llc < %s -march=x86-64
; PR4669
-declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)
+declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
define <1 x i64> @test(i64 %t) {
entry:
%t1 = insertelement <1 x i64> undef, i64 %t, i32 0
- %t2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %t1, i32 48)
- ret <1 x i64> %t2
+ %t0 = bitcast <1 x i64> %t1 to x86_mmx
+ %t2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %t0, i32 48)
+ %t3 = bitcast x86_mmx %t2 to <1 x i64>
+ ret <1 x i64> %t3
}
diff --git a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
index 4cd3be35e8..fa3d5fbcdc 100644
--- a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
+++ b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
@@ -1,12 +1,12 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | FileCheck %s
+; There are no MMX operations here, so we use XMM or i64.
define void @ti8(double %a, double %b) nounwind {
entry:
%tmp1 = bitcast double %a to <8 x i8>
-; CHECK: movdq2q
%tmp2 = bitcast double %b to <8 x i8>
-; CHECK: movdq2q
%tmp3 = add <8 x i8> %tmp1, %tmp2
+; CHECK: paddb %xmm1, %xmm0
store <8 x i8> %tmp3, <8 x i8>* null
ret void
}
@@ -14,10 +14,9 @@ entry:
define void @ti16(double %a, double %b) nounwind {
entry:
%tmp1 = bitcast double %a to <4 x i16>
-; CHECK: movdq2q
%tmp2 = bitcast double %b to <4 x i16>
-; CHECK: movdq2q
%tmp3 = add <4 x i16> %tmp1, %tmp2
+; CHECK: paddw %xmm1, %xmm0
store <4 x i16> %tmp3, <4 x i16>* null
ret void
}
@@ -25,10 +24,9 @@ entry:
define void @ti32(double %a, double %b) nounwind {
entry:
%tmp1 = bitcast double %a to <2 x i32>
-; CHECK: movdq2q
%tmp2 = bitcast double %b to <2 x i32>
-; CHECK: movdq2q
%tmp3 = add <2 x i32> %tmp1, %tmp2
+; CHECK: paddd %xmm1, %xmm0
store <2 x i32> %tmp3, <2 x i32>* null
ret void
}
@@ -36,10 +34,60 @@ entry:
define void @ti64(double %a, double %b) nounwind {
entry:
%tmp1 = bitcast double %a to <1 x i64>
-; CHECK: movdq2q
%tmp2 = bitcast double %b to <1 x i64>
-; CHECK: movdq2q
%tmp3 = add <1 x i64> %tmp1, %tmp2
+; CHECK: addq %rax, %rcx
store <1 x i64> %tmp3, <1 x i64>* null
ret void
}
+
+; MMX intrinsics calls get us MMX instructions.
+
+define void @ti8a(double %a, double %b) nounwind {
+entry:
+ %tmp1 = bitcast double %a to x86_mmx
+; CHECK: movdq2q
+ %tmp2 = bitcast double %b to x86_mmx
+; CHECK: movdq2q
+ %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %tmp1, x86_mmx %tmp2)
+ store x86_mmx %tmp3, x86_mmx* null
+ ret void
+}
+
+define void @ti16a(double %a, double %b) nounwind {
+entry:
+ %tmp1 = bitcast double %a to x86_mmx
+; CHECK: movdq2q
+ %tmp2 = bitcast double %b to x86_mmx
+; CHECK: movdq2q
+ %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %tmp1, x86_mmx %tmp2)
+ store x86_mmx %tmp3, x86_mmx* null
+ ret void
+}
+
+define void @ti32a(double %a, double %b) nounwind {
+entry:
+ %tmp1 = bitcast double %a to x86_mmx
+; CHECK: movdq2q
+ %tmp2 = bitcast double %b to x86_mmx
+; CHECK: movdq2q
+ %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %tmp1, x86_mmx %tmp2)
+ store x86_mmx %tmp3, x86_mmx* null
+ ret void
+}
+
+define void @ti64a(double %a, double %b) nounwind {
+entry:
+ %tmp1 = bitcast double %a to x86_mmx
+; CHECK: movdq2q
+ %tmp2 = bitcast double %b to x86_mmx
+; CHECK: movdq2q
+ %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %tmp1, x86_mmx %tmp2)
+ store x86_mmx %tmp3, x86_mmx* null
+ ret void
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
diff --git a/test/CodeGen/X86/fast-isel-bc.ll b/test/CodeGen/X86/fast-isel-bc.ll
index 8d7dc8f9a7..db846889d8 100644
--- a/test/CodeGen/X86/fast-isel-bc.ll
+++ b/test/CodeGen/X86/fast-isel-bc.ll
@@ -5,15 +5,19 @@ target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin9.8"
-declare void @func2(<1 x i64>)
+declare void @func2(x86_mmx)
define void @func1() nounwind {
; This isn't spectacular, but it's MMX code at -O0...
-; CHECK: movl $2, %eax
-; CHECK: movd %rax, %mm0
-; CHECK: movd %mm0, %rdi
+; CHECK: movq2dq %mm0, %xmm0
+; For now, handling of x86_mmx parameters in fast Isel is unimplemented,
+; so we get pretty poor code. The below is preferable.
+; CHEK: movl $2, %eax
+; CHEK: movd %rax, %mm0
+; CHEK: movd %mm0, %rdi
- call void @func2(<1 x i64> <i64 2>)
+ %tmp0 = bitcast <2 x i32><i32 0, i32 2> to x86_mmx
+ call void @func2(x86_mmx %tmp0)
ret void
}
diff --git a/test/CodeGen/X86/mmx-arg-passing.ll b/test/CodeGen/X86/mmx-arg-passing.ll
index 426e98e019..b348512b57 100644
--- a/test/CodeGen/X86/mmx-arg-passing.ll
+++ b/test/CodeGen/X86/mmx-arg-passing.ll
@@ -1,24 +1,27 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | grep mm0 | count 3
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | grep esp | count 1
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | grep mm0 | count 1
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | grep esp | count 2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep xmm0
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep rdi
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | not grep movups
;
; On Darwin x86-32, v8i8, v4i16, v2i32 values are passed in MM[0-2].
-; On Darwin x86-32, v1i64 values are passed in memory.
+; On Darwin x86-32, v1i64 values are passed in memory. In this example, they
+; are never moved into an MM register at all.
; On Darwin x86-64, v8i8, v4i16, v2i32 values are passed in XMM[0-7].
; On Darwin x86-64, v1i64 values are passed in 64-bit GPRs.
-@u1 = external global <8 x i8>
+@u1 = external global x86_mmx
-define void @t1(<8 x i8> %v1) nounwind {
- store <8 x i8> %v1, <8 x i8>* @u1, align 8
+define void @t1(x86_mmx %v1) nounwind {
+ store x86_mmx %v1, x86_mmx* @u1, align 8
ret void
}
-@u2 = external global <1 x i64>
+@u2 = external global x86_mmx
define void @t2(<1 x i64> %v1) nounwind {
- store <1 x i64> %v1, <1 x i64>* @u2, align 8
+ %tmp = bitcast <1 x i64> %v1 to x86_mmx
+ store x86_mmx %tmp, x86_mmx* @u2, align 8
ret void
}
+
diff --git a/test/CodeGen/X86/mmx-arg-passing2.ll b/test/CodeGen/X86/mmx-arg-passing2.ll
index c42af08236..c132d311b9 100644
--- a/test/CodeGen/X86/mmx-arg-passing2.ll
+++ b/test/CodeGen/X86/mmx-arg-passing2.ll
@@ -1,17 +1,21 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep movq2dq | count 1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep movdq2q | count 2
+; Since the add is not an MMX add, we don't have a movq2dq any more.
@g_v8qi = external global <8 x i8>
define void @t1() nounwind {
%tmp3 = load <8 x i8>* @g_v8qi, align 8
- %tmp4 = tail call i32 (...)* @pass_v8qi( <8 x i8> %tmp3 ) nounwind
+ %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
+ %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind
ret void
}
-define void @t2(<8 x i8> %v1, <8 x i8> %v2) nounwind {
- %tmp3 = add <8 x i8> %v1, %v2
- %tmp4 = tail call i32 (...)* @pass_v8qi( <8 x i8> %tmp3 ) nounwind
+define void @t2(x86_mmx %v1, x86_mmx %v2) nounwind {
+ %v1a = bitcast x86_mmx %v1 to <8 x i8>
+ %v2b = bitcast x86_mmx %v2 to <8 x i8>
+ %tmp3 = add <8 x i8> %v1a, %v2b
+ %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
+ %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind
ret void
}
diff --git a/test/CodeGen/X86/mmx-arith.ll b/test/CodeGen/X86/mmx-arith.ll
index e4dfdbfe1b..6817487324 100644
--- a/test/CodeGen/X86/mmx-arith.ll
+++ b/test/CodeGen/X86/mmx-arith.ll
@@ -1,131 +1,309 @@
; RUN: llc < %s -march=x86 -mattr=+mmx
;; A basic sanity check to make sure that MMX arithmetic actually compiles.
+;; First is a straight translation of the original with bitcasts as needed.
-define void @foo(<8 x i8>* %A, <8 x i8>* %B) {
+define void @foo(x86_mmx* %A, x86_mmx* %B) {
entry:
- %tmp1 = load <8 x i8>* %A ; <<8 x i8>> [#uses=1]
- %tmp3 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp4 = add <8 x i8> %tmp1, %tmp3 ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp4, <8 x i8>* %A
- %tmp7 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp12 = tail call <8 x i8> @llvm.x86.mmx.padds.b( <8 x i8> %tmp4, <8 x i8> %tmp7 ) ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp12, <8 x i8>* %A
- %tmp16 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp21 = tail call <8 x i8> @llvm.x86.mmx.paddus.b( <8 x i8> %tmp12, <8 x i8> %tmp16 ) ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp21, <8 x i8>* %A
- %tmp27 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp28 = sub <8 x i8> %tmp21, %tmp27 ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp28, <8 x i8>* %A
- %tmp31 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp36 = tail call <8 x i8> @llvm.x86.mmx.psubs.b( <8 x i8> %tmp28, <8 x i8> %tmp31 ) ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp36, <8 x i8>* %A
- %tmp40 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp45 = tail call <8 x i8> @llvm.x86.mmx.psubus.b( <8 x i8> %tmp36, <8 x i8> %tmp40 ) ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp45, <8 x i8>* %A
- %tmp51 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp52 = mul <8 x i8> %tmp45, %tmp51 ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp52, <8 x i8>* %A
- %tmp57 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp58 = and <8 x i8> %tmp52, %tmp57 ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp58, <8 x i8>* %A
- %tmp63 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp64 = or <8 x i8> %tmp58, %tmp63 ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp64, <8 x i8>* %A
- %tmp69 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp70 = xor <8 x i8> %tmp64, %tmp69 ; <<8 x i8>> [#uses=1]
- store <8 x i8> %tmp70, <8 x i8>* %A
+ %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1]
+ %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp1a = bitcast x86_mmx %tmp1 to <8 x i8>
+ %tmp3a = bitcast x86_mmx %tmp3 to <8 x i8>
+ %tmp4 = add <8 x i8> %tmp1a, %tmp3a ; <<8 x i8>> [#uses=2]
+ %tmp4a = bitcast <8 x i8> %tmp4 to x86_mmx
+ store x86_mmx %tmp4a, x86_mmx* %A
+ %tmp7 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b( x86_mmx %tmp4a, x86_mmx %tmp7 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp12, x86_mmx* %A
+ %tmp16 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b( x86_mmx %tmp12, x86_mmx %tmp16 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp21, x86_mmx* %A
+ %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp21a = bitcast x86_mmx %tmp21 to <8 x i8>
+ %tmp27a = bitcast x86_mmx %tmp27 to <8 x i8>
+ %tmp28 = sub <8 x i8> %tmp21a, %tmp27a ; <<8 x i8>> [#uses=2]
+ %tmp28a = bitcast <8 x i8> %tmp28 to x86_mmx
+ store x86_mmx %tmp28a, x86_mmx* %A
+ %tmp31 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b( x86_mmx %tmp28a, x86_mmx %tmp31 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp36, x86_mmx* %A
+ %tmp40 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b( x86_mmx %tmp36, x86_mmx %tmp40 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp45, x86_mmx* %A
+ %tmp51 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp45a = bitcast x86_mmx %tmp45 to <8 x i8>
+ %tmp51a = bitcast x86_mmx %tmp51 to <8 x i8>
+ %tmp52 = mul <8 x i8> %tmp45a, %tmp51a ; <<8 x i8>> [#uses=2]
+ %tmp52a = bitcast <8 x i8> %tmp52 to x86_mmx
+ store x86_mmx %tmp52a, x86_mmx* %A
+ %tmp57 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp57a = bitcast x86_mmx %tmp57 to <8 x i8>
+ %tmp58 = and <8 x i8> %tmp52, %tmp57a ; <<8 x i8>> [#uses=2]
+ %tmp58a = bitcast <8 x i8> %tmp58 to x86_mmx
+ store x86_mmx %tmp58a, x86_mmx* %A
+ %tmp63 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp63a = bitcast x86_mmx %tmp63 to <8 x i8>
+ %tmp64 = or <8 x i8> %tmp58, %tmp63a ; <<8 x i8>> [#uses=2]
+ %tmp64a = bitcast <8 x i8> %tmp64 to x86_mmx
+ store x86_mmx %tmp64a, x86_mmx* %A
+ %tmp69 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp69a = bitcast x86_mmx %tmp69 to <8 x i8>
+ %tmp64b = bitcast x86_mmx %tmp64a to <8 x i8>
+ %tmp70 = xor <8 x i8> %tmp64b, %tmp69a ; <<8 x i8>> [#uses=1]
+ %tmp70a = bitcast <8 x i8> %tmp70 to x86_mmx
+ store x86_mmx %tmp70a, x86_mmx* %A
tail call void @llvm.x86.mmx.emms( )
ret void
}
-define void @baz(<2 x i32>* %A, <2 x i32>* %B) {
+define void @baz(x86_mmx* %A, x86_mmx* %B) {
entry:
- %tmp1 = load <2 x i32>* %A ; <<2 x i32>> [#uses=1]
- %tmp3 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1]
- %tmp4 = add <2 x i32> %tmp1, %tmp3 ; <<2 x i32>> [#uses=2]
- store <2 x i32> %tmp4, <2 x i32>* %A
- %tmp9 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1]
- %tmp10 = sub <2 x i32> %tmp4, %tmp9 ; <<2 x i32>> [#uses=2]
- store <2 x i32> %tmp10, <2 x i32>* %A
- %tmp15 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1]
- %tmp16 = mul <2 x i32> %tmp10, %tmp15 ; <<2 x i32>> [#uses=2]
- store <2 x i32> %tmp16, <2 x i32>* %A
- %tmp21 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1]
- %tmp22 = and <2 x i32> %tmp16, %tmp21 ; <<2 x i32>> [#uses=2]
- store <2 x i32> %tmp22, <2 x i32>* %A
- %tmp27 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1]
- %tmp28 = or <2 x i32> %tmp22, %tmp27 ; <<2 x i32>> [#uses=2]
- store <2 x i32> %tmp28, <2 x i32>* %A
- %tmp33 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1]
- %tmp34 = xor <2 x i32> %tmp28, %tmp33 ; <<2 x i32>> [#uses=1]
- store <2 x i32> %tmp34, <2 x i32>* %A
+ %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1]
+ %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp1a = bitcast x86_mmx %tmp1 to <2 x i32>
+ %tmp3a = bitcast x86_mmx %tmp3 to <2 x i32>
+ %tmp4 = add <2 x i32> %tmp1a, %tmp3a ; <<2 x i32>> [#uses=2]
+ %tmp4a = bitcast <2 x i32> %tmp4 to x86_mmx
+ store x86_mmx %tmp4a, x86_mmx* %A
+ %tmp9 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp9a = bitcast x86_mmx %tmp9 to <2 x i32>
+ %tmp10 = sub <2 x i32> %tmp4, %tmp9a ; <<2 x i32>> [#uses=2]
+ %tmp10a = bitcast <2 x i32> %tmp4 to x86_mmx
+ store x86_mmx %tmp10a, x86_mmx* %A
+ %tmp15 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp10b = bitcast x86_mmx %tmp10a to <2 x i32>
+ %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32>
+ %tmp16 = mul <2 x i32> %tmp10b, %tmp15a ; <<2 x i32>> [#uses=2]
+ %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx
+ store x86_mmx %tmp16a, x86_mmx* %A
+ %tmp21 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp16b = bitcast x86_mmx %tmp16a to <2 x i32>
+ %tmp21a = bitcast x86_mmx %tmp21 to <2 x i32>
+ %tmp22 = and <2 x i32> %tmp16b, %tmp21a ; <<2 x i32>> [#uses=2]
+ %tmp22a = bitcast <2 x i32> %tmp22 to x86_mmx
+ store x86_mmx %tmp22a, x86_mmx* %A
+ %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp22b = bitcast x86_mmx %tmp22a to <2 x i32>
+ %tmp27a = bitcast x86_mmx %tmp27 to <2 x i32>
+ %tmp28 = or <2 x i32> %tmp22b, %tmp27a ; <<2 x i32>> [#uses=2]
+ %tmp28a = bitcast <2 x i32> %tmp28 to x86_mmx
+ store x86_mmx %tmp28a, x86_mmx* %A
+ %tmp33 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp28b = bitcast x86_mmx %tmp28a to <2 x i32>
+ %tmp33a = bitcast x86_mmx %tmp33 to <2 x i32>
+ %tmp34 = xor <2 x i32> %tmp28b, %tmp33a ; <<2 x i32>> [#uses=1]
+ %tmp34a = bitcast <2 x i32> %tmp34 to x86_mmx
+ store x86_mmx %tmp34a, x86_mmx* %A
tail call void @llvm.x86.mmx.emms( )
ret void
}
-define void @bar(<4 x i16>* %A, <4 x i16>* %B) {
+define void @bar(x86_mmx* %A, x86_mmx* %B) {
entry:
- %tmp1 = load <4 x i16>* %A ; <<4 x i16>> [#uses=1]
- %tmp3 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp4 = add <4 x i16> %tmp1, %tmp3 ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp4, <4 x i16>* %A
- %tmp7 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp12 = tail call <4 x i16> @llvm.x86.mmx.padds.w( <4 x i16> %tmp4, <4 x i16> %tmp7 ) ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp12, <4 x i16>* %A
- %tmp16 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp21 = tail call <4 x i16> @llvm.x86.mmx.paddus.w( <4 x i16> %tmp12, <4 x i16> %tmp16 ) ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp21, <4 x i16>* %A
- %tmp27 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp28 = sub <4 x i16> %tmp21, %tmp27 ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp28, <4 x i16>* %A
- %tmp31 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp36 = tail call <4 x i16> @llvm.x86.mmx.psubs.w( <4 x i16> %tmp28, <4 x i16> %tmp31 ) ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp36, <4 x i16>* %A
- %tmp40 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp45 = tail call <4 x i16> @llvm.x86.mmx.psubus.w( <4 x i16> %tmp36, <4 x i16> %tmp40 ) ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp45, <4 x i16>* %A
- %tmp51 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp52 = mul <4 x i16> %tmp45, %tmp51 ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp52, <4 x i16>* %A
- %tmp55 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp60 = tail call <4 x i16> @llvm.x86.mmx.pmulh.w( <4 x i16> %tmp52, <4 x i16> %tmp55 ) ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp60, <4 x i16>* %A
- %tmp64 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp69 = tail call <2 x i32> @llvm.x86.mmx.pmadd.wd( <4 x i16> %tmp60, <4 x i16> %tmp64 ) ; <<2 x i32>> [#uses=1]
- %tmp70 = bitcast <2 x i32> %tmp69 to <4 x i16> ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp70, <4 x i16>* %A
- %tmp75 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp76 = and <4 x i16> %tmp70, %tmp75 ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp76, <4 x i16>* %A
- %tmp81 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp82 = or <4 x i16> %tmp76, %tmp81 ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp82, <4 x i16>* %A
- %tmp87 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp88 = xor <4 x i16> %tmp82, %tmp87 ; <<4 x i16>> [#uses=1]
- store <4 x i16> %tmp88, <4 x i16>* %A
+ %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1]
+ %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp1a = bitcast x86_mmx %tmp1 to <4 x i16>
+ %tmp3a = bitcast x86_mmx %tmp3 to <4 x i16>
+ %tmp4 = add <4 x i16> %tmp1a, %tmp3a ; <<4 x i16>> [#uses=2]
+ %tmp4a = bitcast <4 x i16> %tmp4 to x86_mmx
+ store x86_mmx %tmp4a, x86_mmx* %A
+ %tmp7 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w( x86_mmx %tmp4a, x86_mmx %tmp7 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp12, x86_mmx* %A
+ %tmp16 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp12, x86_mmx %tmp16 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp21, x86_mmx* %A
+ %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp21a = bitcast x86_mmx %tmp21 to <4 x i16>
+ %tmp27a = bitcast x86_mmx %tmp27 to <4 x i16>
+ %tmp28 = sub <4 x i16> %tmp21a, %tmp27a ; <<4 x i16>> [#uses=2]
+ %tmp28a = bitcast <4 x i16> %tmp28 to x86_mmx
+ store x86_mmx %tmp28a, x86_mmx* %A
+ %tmp31 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w( x86_mmx %tmp28a, x86_mmx %tmp31 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp36, x86_mmx* %A
+ %tmp40 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w( x86_mmx %tmp36, x86_mmx %tmp40 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp45, x86_mmx* %A
+ %tmp51 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp45a = bitcast x86_mmx %tmp45 to <4 x i16>
+ %tmp51a = bitcast x86_mmx %tmp51 to <4 x i16>
+ %tmp52 = mul <4 x i16> %tmp45a, %tmp51a ; <<4 x i16>> [#uses=2]
+ %tmp52a = bitcast <4 x i16> %tmp52 to x86_mmx
+ store x86_mmx %tmp52a, x86_mmx* %A
+ %tmp55 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w( x86_mmx %tmp52a, x86_mmx %tmp55 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp60, x86_mmx* %A
+ %tmp64 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd( x86_mmx %tmp60, x86_mmx %tmp64 ) ; <x86_mmx> [#uses=1]
+ %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp70, x86_mmx* %A
+ %tmp75 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp70a = bitcast x86_mmx %tmp70 to <4 x i16>
+ %tmp75a = bitcast x86_mmx %tmp75 to <4 x i16>
+ %tmp76 = and <4 x i16> %tmp70a, %tmp75a ; <<4 x i16>> [#uses=2]
+ %tmp76a = bitcast <4 x i16> %tmp76 to x86_mmx
+ store x86_mmx %tmp76a, x86_mmx* %A
+ %tmp81 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp76b = bitcast x86_mmx %tmp76a to <4 x i16>
+ %tmp81a = bitcast x86_mmx %tmp81 to <4 x i16>
+ %tmp82 = or <4 x i16> %tmp76b, %tmp81a ; <<4 x i16>> [#uses=2]
+ %tmp82a = bitcast <4 x i16> %tmp82 to x86_mmx
+ store x86_mmx %tmp82a, x86_mmx* %A
+ %tmp87 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp82b = bitcast x86_mmx %tmp82a to <4 x i16>
+ %tmp87a = bitcast x86_mmx %tmp87 to <4 x i16>
+ %tmp88 = xor <4 x i16> %tmp82b, %tmp87a ; <<4 x i16>> [#uses=1]
+ %tmp88a = bitcast <4 x i16> %tmp88 to x86_mmx
+ store x86_mmx %tmp88a, x86_mmx* %A
tail call void @llvm.x86.mmx.emms( )
ret void
}
-declare <8 x i8> @llvm.x86.mmx.padds.b(<8 x i8>, <8 x i8>)
+;; The following is modified to use MMX intrinsics everywhere they work.
-declare <8 x i8> @llvm.x86.mmx.paddus.b(<8 x i8>, <8 x i8>)
+define void @fooa(x86_mmx* %A, x86_mmx* %B) {
+entry:
+ %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1]
+ %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp4 = tail call x86_mmx @llvm.x86.mmx.padd.b( x86_mmx %tmp1, x86_mmx %tmp3 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp4, x86_mmx* %A
+ %tmp7 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b( x86_mmx %tmp4, x86_mmx %tmp7 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp12, x86_mmx* %A
+ %tmp16 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b( x86_mmx %tmp12, x86_mmx %tmp16 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp21, x86_mmx* %A
+ %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp28 = tail call x86_mmx @llvm.x86.mmx.psub.b( x86_mmx %tmp21, x86_mmx %tmp27 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp28, x86_mmx* %A
+ %tmp31 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b( x86_mmx %tmp28, x86_mmx %tmp31 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp36, x86_mmx* %A
+ %tmp40 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b( x86_mmx %tmp36, x86_mmx %tmp40 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp45, x86_mmx* %A
+ %tmp51 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp51a = bitcast x86_mmx %tmp51 to i64
+ %tmp51aa = bitcast i64 %tmp51a to <8 x i8>
+ %tmp51b = bitcast x86_mmx %tmp45 to <8 x i8>
+ %tmp52 = mul <8 x i8> %tmp51b, %tmp51aa ; <x86_mmx> [#uses=2]
+ %tmp52a = bitcast <8 x i8> %tmp52 to i64
+ %tmp52aa = bitcast i64 %tmp52a to x86_mmx
+ store x86_mmx %tmp52aa, x86_mmx* %A
+ %tmp57 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp58 = tail call x86_mmx @llvm.x86.mmx.pand( x86_mmx %tmp51, x86_mmx %tmp57 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp58, x86_mmx* %A
+ %tmp63 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp64 = tail call x86_mmx @llvm.x86.mmx.por( x86_mmx %tmp58, x86_mmx %tmp63 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp64, x86_mmx* %A
+ %tmp69 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp70 = tail call x86_mmx @llvm.x86.mmx.pxor( x86_mmx %tmp64, x86_mmx %tmp69 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp70, x86_mmx* %A
+ tail call void @llvm.x86.mmx.emms( )
+ ret void
+}
-declare <8 x i8> @llvm.x86.mmx.psubs.b(<8 x i8>, <8 x i8>)
+define void @baza(x86_mmx* %A, x86_mmx* %B) {
+entry:
+ %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1]
+ %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp4 = tail call x86_mmx @llvm.x86.mmx.padd.d( x86_mmx %tmp1, x86_mmx %tmp3 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp4, x86_mmx* %A
+ %tmp9 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp10 = tail call x86_mmx @llvm.x86.mmx.psub.d( x86_mmx %tmp4, x86_mmx %tmp9 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp10, x86_mmx* %A
+ %tmp15 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp10a = bitcast x86_mmx %tmp10 to <2 x i32>
+ %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32>
+ %tmp16 = mul <2 x i32> %tmp10a, %tmp15a ; <x86_mmx> [#uses=2]
+ %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx
+ store x86_mmx %tmp16a, x86_mmx* %A
+ %tmp21 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp22 = tail call x86_mmx @llvm.x86.mmx.pand( x86_mmx %tmp16a, x86_mmx %tmp21 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp22, x86_mmx* %A
+ %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp28 = tail call x86_mmx @llvm.x86.mmx.por( x86_mmx %tmp22, x86_mmx %tmp27 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp28, x86_mmx* %A
+ %tmp33 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp34 = tail call x86_mmx @llvm.x86.mmx.pxor( x86_mmx %tmp28, x86_mmx %tmp33 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp34, x86_mmx* %A
+ tail call void @llvm.x86.mmx.emms( )
+ ret void
+}
-declare <8 x i8> @llvm.x86.mmx.psubus.b(<8 x i8>, <8 x i8>)
+define void @bara(x86_mmx* %A, x86_mmx* %B) {
+entry:
+ %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1]
+ %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp4 = tail call x86_mmx @llvm.x86.mmx.padd.w( x86_mmx %tmp1, x86_mmx %tmp3 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp4, x86_mmx* %A
+ %tmp7 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w( x86_mmx %tmp4, x86_mmx %tmp7 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp12, x86_mmx* %A
+ %tmp16 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp12, x86_mmx %tmp16 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp21, x86_mmx* %A
+ %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp28 = tail call x86_mmx @llvm.x86.mmx.psub.w( x86_mmx %tmp21, x86_mmx %tmp27 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp28, x86_mmx* %A
+ %tmp31 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w( x86_mmx %tmp28, x86_mmx %tmp31 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp36, x86_mmx* %A
+ %tmp40 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w( x86_mmx %tmp36, x86_mmx %tmp40 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp45, x86_mmx* %A
+ %tmp51 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp52 = tail call x86_mmx @llvm.x86.mmx.pmull.w( x86_mmx %tmp45, x86_mmx %tmp51 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp52, x86_mmx* %A
+ %tmp55 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w( x86_mmx %tmp52, x86_mmx %tmp55 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp60, x86_mmx* %A
+ %tmp64 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd( x86_mmx %tmp60, x86_mmx %tmp64 ) ; <x86_mmx> [#uses=1]
+ %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp70, x86_mmx* %A
+ %tmp75 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp76 = tail call x86_mmx @llvm.x86.mmx.pand( x86_mmx %tmp70, x86_mmx %tmp75 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp76, x86_mmx* %A
+ %tmp81 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp82 = tail call x86_mmx @llvm.x86.mmx.por( x86_mmx %tmp76, x86_mmx %tmp81 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp82, x86_mmx* %A
+ %tmp87 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp88 = tail call x86_mmx @llvm.x86.mmx.pxor( x86_mmx %tmp82, x86_mmx %tmp87 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp88, x86_mmx* %A
+ tail call void @llvm.x86.mmx.emms( )
+ ret void
+}
-declare <4 x i16> @llvm.x86.mmx.padds.w(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
-declare <4 x i16> @llvm.x86.mmx.paddus.w(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx)
-declare <4 x i16> @llvm.x86.mmx.psubs.w(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
-declare <4 x i16> @llvm.x86.mmx.psubus.w(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx)
-declare <4 x i16> @llvm.x86.mmx.pmulh.w(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx)
-declare <2 x i32> @llvm.x86.mmx.pmadd.wd(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx)
declare void @llvm.x86.mmx.emms()
+
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padds.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psubs.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx)
+
diff --git a/test/CodeGen/X86/mmx-bitcast-to-i64.ll b/test/CodeGen/X86/mmx-bitcast-to-i64.ll
index 1fd8f67a0c..8b1840abf6 100644
--- a/test/CodeGen/X86/mmx-bitcast-to-i64.ll
+++ b/test/CodeGen/X86/mmx-bitcast-to-i64.ll
@@ -1,26 +1,31 @@
; RUN: llc < %s -march=x86-64 | grep movd | count 4
-define i64 @foo(<1 x i64>* %p) {
- %t = load <1 x i64>* %p
- %u = add <1 x i64> %t, %t
- %s = bitcast <1 x i64> %u to i64
+define i64 @foo(x86_mmx* %p) {
+ %t = load x86_mmx* %p
+ %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %t)
+ %s = bitcast x86_mmx %u to i64
ret i64 %s
}
-define i64 @goo(<2 x i32>* %p) {
- %t = load <2 x i32>* %p
- %u = add <2 x i32> %t, %t
- %s = bitcast <2 x i32> %u to i64
+define i64 @goo(x86_mmx* %p) {
+ %t = load x86_mmx* %p
+ %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %t)
+ %s = bitcast x86_mmx %u to i64
ret i64 %s
}
-define i64 @hoo(<4 x i16>* %p) {
- %t = load <4 x i16>* %p
- %u = add <4 x i16> %t, %t
- %s = bitcast <4 x i16> %u to i64
+define i64 @hoo(x86_mmx* %p) {
+ %t = load x86_mmx* %p
+ %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %t)
+ %s = bitcast x86_mmx %u to i64
ret i64 %s
}
-define i64 @ioo(<8 x i8>* %p) {
- %t = load <8 x i8>* %p
- %u = add <8 x i8> %t, %t
- %s = bitcast <8 x i8> %u to i64
+define i64 @ioo(x86_mmx* %p) {
+ %t = load x86_mmx* %p
+ %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %t)
+ %s = bitcast x86_mmx %u to i64
ret i64 %s
}
+
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
diff --git a/test/CodeGen/X86/mmx-insert-element.ll b/test/CodeGen/X86/mmx-insert-element.ll
index a063ee1d6c..9f9bd8de2d 100644
--- a/test/CodeGen/X86/mmx-insert-element.ll
+++ b/test/CodeGen/X86/mmx-insert-element.ll
@@ -1,7 +1,9 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx | not grep movq
-; RUN: llc < %s -march=x86 -mattr=+mmx | grep psllq
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse | grep movq
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse | grep pshufd
+; This is not an MMX operation; promoted to XMM.
-define <2 x i32> @qux(i32 %A) nounwind {
+define x86_mmx @qux(i32 %A) nounwind {
%tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1 ; <<2 x i32>> [#uses=1]
- ret <2 x i32> %tmp3
+ %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
+ ret x86_mmx %tmp4
}
diff --git a/test/CodeGen/X86/mmx-punpckhdq.ll b/test/CodeGen/X86/mmx-punpckhdq.ll
index 0af7e017b6..4818baedc4 100644
--- a/test/CodeGen/X86/mmx-punpckhdq.ll
+++ b/test/CodeGen/X86/mmx-punpckhdq.ll
@@ -1,4 +1,6 @@
+; RUN: llc < %s -march=x86 -mattr=+mmx | grep pextrd
; RUN: llc < %s -march=x86 -mattr=+mmx | grep punpckhdq | count 1
+; There are no MMX operations in bork; promoted to XMM.
define void @bork(<1 x i64>* %x) {
entry:
@@ -11,4 +13,16 @@ entry:
ret void
}
+; pork uses MMX.
+
+define void @pork(x86_mmx* %x) {
+entry:
+ %tmp2 = load x86_mmx* %x ; <x86_mmx> [#uses=1]
+ %tmp9 = tail call x86_mmx @llvm.x86.mmx.punpckhdq (x86_mmx %tmp2, x86_mmx %tmp2)
+ store x86_mmx %tmp9, x86_mmx* %x
+ tail call void @llvm.x86.mmx.emms( )
+ ret void
+}
+
+declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx)
declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/mmx-shift.ll b/test/CodeGen/X86/mmx-shift.ll
index dd0aa2ca31..bafc75444d 100644
--- a/test/CodeGen/X86/mmx-shift.ll
+++ b/test/CodeGen/X86/mmx-shift.ll
@@ -5,28 +5,28 @@
define i64 @t1(<1 x i64> %mm1) nounwind {
entry:
- %tmp6 = tail call <1 x i64> @llvm.x86.mmx.pslli.q( <1 x i64> %mm1, i32 32 ) ; <<1 x i64>> [#uses=1]
- %retval1112 = bitcast <1 x i64> %tmp6 to i64 ; <i64> [#uses=1]
+ %tmp = bitcast <1 x i64> %mm1 to x86_mmx
+ %tmp6 = tail call x86_mmx @llvm.x86.mmx.pslli.q( x86_mmx %tmp, i32 32 ) ; <x86_mmx> [#uses=1]
+ %retval1112 = bitcast x86_mmx %tmp6 to i64
ret i64 %retval1112
}
-declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone
-define i64 @t2(<2 x i32> %mm1, <2 x i32> %mm2) nounwind {
+define i64 @t2(x86_mmx %mm1, x86_mmx %mm2) nounwind {
entry:
- %tmp7 = tail call <2 x i32> @llvm.x86.mmx.psra.d( <2 x i32> %mm1, <2 x i32> %mm2 ) nounwind readnone ; <<2 x i32>> [#uses=1]
- %retval1112 = bitcast <2 x i32> %tmp7 to i64 ; <i64> [#uses=1]
+ %tmp7 = tail call x86_mmx @llvm.x86.mmx.psra.d( x86_mmx %mm1, x86_mmx %mm2 ) nounwind readnone ; <x86_mmx> [#uses=1]
+ %retval1112 = bitcast x86_mmx %tmp7 to i64
ret i64 %retval1112
}
-declare <2 x i32> @llvm.x86.mmx.psra.d(<2 x i32>, <2 x i32>) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
-define i64 @t3(<1 x i64> %mm1, i32 %bits) nounwind {
+define i64 @t3(x86_mmx %mm1, i32 %bits) nounwind {
entry:
- %tmp6 = bitcast <1 x i64> %mm1 to <4 x i16> ; <<4 x i16>> [#uses=1]
- %tmp8 = tail call <4 x i16> @llvm.x86.mmx.psrli.w( <4 x i16> %tmp6, i32 %bits ) nounwind readnone ; <<4 x i16>> [#uses=1]
- %retval1314 = bitcast <4 x i16> %tmp8 to i64 ; <i64> [#uses=1]
+ %tmp8 = tail call x86_mmx @llvm.x86.mmx.psrli.w( x86_mmx %mm1, i32 %bits ) nounwind readnone ; <x86_mmx> [#uses=1]
+ %retval1314 = bitcast x86_mmx %tmp8 to i64
ret i64 %retval1314
}
-declare <4 x i16> @llvm.x86.mmx.psrli.w(<4 x i16>, i32) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone
diff --git a/test/CodeGen/X86/mmx-shuffle.ll b/test/CodeGen/X86/mmx-shuffle.ll
index e3125c7345..9f7501eb7c 100644
--- a/test/CodeGen/X86/mmx-shuffle.ll
+++ b/test/CodeGen/X86/mmx-shuffle.ll
@@ -22,8 +22,10 @@ entry:
%tmp542 = bitcast <2 x i32> %tmp529 to <4 x i16> ; <<4 x i16>> [#uses=1]
%tmp543 = add <4 x i16> %tmp542, < i16 0, i16 16448, i16 24672, i16 28784 > ; <<4 x i16>> [#uses=1]
%tmp555 = bitcast <4 x i16> %tmp543 to <8 x i8> ; <<8 x i8>> [#uses=1]
- tail call void @llvm.x86.mmx.maskmovq( <8 x i8> zeroinitializer, <8 x i8> %tmp555, i8* null )
+ %tmp556 = bitcast <8 x i8> %tmp555 to x86_mmx
+ %tmp557 = bitcast <8 x i8> zeroinitializer to x86_mmx
+ tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp557, x86_mmx %tmp556, i8* null )
ret void
}
-declare void @llvm.x86.mmx.maskmovq(<8 x i8>, <8 x i8>, i8*)
+declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*)
diff --git a/test/CodeGen/X86/mmx-vzmovl-2.ll b/test/CodeGen/X86/mmx-vzmovl-2.ll
index 8253c20032..4ad420b37c 100644
--- a/test/CodeGen/X86/mmx-vzmovl-2.ll
+++ b/test/CodeGen/X86/mmx-vzmovl-2.ll
@@ -4,7 +4,7 @@
%struct.vS1024 = type { [8 x <4 x i32>] }
%struct.vS512 = type { [4 x <4 x i32>] }
-declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
define void @t() nounwind {
entry:
@@ -12,14 +12,18 @@ entry:
bb554: ; preds = %bb554, %entry
%sum.0.reg2mem.0 = phi <1 x i64> [ %tmp562, %bb554 ], [ zeroinitializer, %entry ] ; <<1 x i64>> [#uses=1]
- %0 = load <1 x i64>* null, align 8 ; <<1 x i64>> [#uses=2]
- %1 = bitcast <1 x i64> %0 to <2 x i32> ; <<2 x i32>> [#uses=1]
+ %0 = load x86_mmx* null, align 8 ; <<1 x i64>> [#uses=2]
+ %1 = bitcast x86_mmx %0 to <2 x i32> ; <<2 x i32>> [#uses=1]
%tmp555 = and <2 x i32> %1, < i32 -1, i32 0 > ; <<2 x i32>> [#uses=1]
- %2 = bitcast <2 x i32> %tmp555 to <1 x i64> ; <<1 x i64>> [#uses=1]
- %3 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 32) nounwind readnone ; <<1 x i64>> [#uses=1]
+ %2 = bitcast <2 x i32> %tmp555 to x86_mmx ; <<1 x i64>> [#uses=1]
+ %3 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 32) nounwind readnone ; <<1 x i64>> [#uses=1]
store <1 x i64> %sum.0.reg2mem.0, <1 x i64>* null
- %tmp558 = add <1 x i64> %sum.0.reg2mem.0, %2 ; <<1 x i64>> [#uses=1]
- %4 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %tmp558, i32 32) nounwind readnone ; <<1 x i64>> [#uses=1]
- %tmp562 = add <1 x i64> %4, %3 ; <<1 x i64>> [#uses=1]
+ %tmp3 = bitcast x86_mmx %2 to <1 x i64>
+ %tmp558 = add <1 x i64> %sum.0.reg2mem.0, %tmp3 ; <<1 x i64>> [#uses=1]
+ %tmp5 = bitcast <1 x i64> %tmp558 to x86_mmx
+ %4 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %tmp5, i32 32) nounwind readnone ; <<1 x i64>> [#uses=1]
+ %tmp6 = bitcast x86_mmx %4 to <1 x i64>
+ %tmp7 = bitcast x86_mmx %3 to <1 x i64>
+ %tmp562 = add <1 x i64> %tmp6, %tmp7 ; <<1 x i64>> [#uses=1]
br label %bb554
}
diff --git a/test/CodeGen/X86/mmx-vzmovl.ll b/test/CodeGen/X86/mmx-vzmovl.ll
index d21e240488..e8b34263c6 100644
--- a/test/CodeGen/X86/mmx-vzmovl.ll
+++ b/test/CodeGen/X86/mmx-vzmovl.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=+mmx | grep movd
-; RUN: llc < %s -march=x86-64 -mattr=+mmx | grep movq
+; RUN: llc < %s -march=x86-64 -mattr=+mmx | grep movq | count 2
+; There are no MMX operations here; this is promoted to XMM.
define void @foo(<1 x i64>* %a, <1 x i64>* %b) nounwind {
entry:
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index 291fc0454c..471cc1611f 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -1,15 +1,16 @@
; RUN: llc < %s -march=x86 -mattr=+sse2 > %t
-; RUN: grep psllq %t | grep 32
+; RUN: grep shll %t | grep 12
; RUN: grep pslldq %t | grep 12
; RUN: grep psrldq %t | grep 8
; RUN: grep psrldq %t | grep 12
+; There are no MMX operations in @t1
-define void @t1(i32 %a, <1 x i64>* %P) nounwind {
+define void @t1(i32 %a, x86_mmx* %P) nounwind {
%tmp12 = shl i32 %a, 12
%tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
%tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
- %tmp23 = bitcast <2 x i32> %tmp22 to <1 x i64>
- store <1 x i64> %tmp23, <1 x i64>* %P
+ %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
+ store x86_mmx %tmp23, x86_mmx* %P
ret void
}
diff --git a/test/CodeGen/X86/vec_insert-7.ll b/test/CodeGen/X86/vec_insert-7.ll
index 9ede10f63d..ea7f919304 100644
--- a/test/CodeGen/X86/vec_insert-7.ll
+++ b/test/CodeGen/X86/vec_insert-7.ll
@@ -1,8 +1,12 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx -mtriple=i686-apple-darwin9 -o - | grep punpckldq
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse -mtriple=i686-apple-darwin9 -o - | grep pinsrd | count 2
+; MMX insertelement is not available; these are promoted to XMM.
+; (Without SSE they are split to two ints, and the code is much better.)
-define <2 x i32> @mmx_movzl(<2 x i32> %x) nounwind {
+define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
entry:
- %tmp3 = insertelement <2 x i32> %x, i32 32, i32 0 ; <<2 x i32>> [#uses=1]
+ %tmp = bitcast x86_mmx %x to <2 x i32>
+ %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 ; <<2 x i32>> [#uses=1]
%tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1 ; <<2 x i32>> [#uses=1]
- ret <2 x i32> %tmp8
+ %tmp9 = bitcast <2 x i32> %tmp8 to x86_mmx
+ ret x86_mmx %tmp9
}
diff --git a/test/CodeGen/X86/vec_zero_cse.ll b/test/CodeGen/X86/vec_zero_cse.ll
index 3b15d4cc40..8aa50945e6 100644
--- a/test/CodeGen/X86/vec_zero_cse.ll
+++ b/test/CodeGen/X86/vec_zero_cse.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pxor | count 2
-; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pcmpeqd | count 2
+; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pxor | count 1
+; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pcmpeqd | count 1
+; 64-bit stores here do not use MMX.
@M1 = external global <1 x i64>
@M2 = external global <2 x i32>