Diffstat (limited to 'test')
25 files changed, 540 insertions, 214 deletions
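The change that repeats across all of these tests: MMX intrinsics used to be declared and called on concrete vector types (<8 x i8>, <4 x i16>, <2 x i32>, <1 x i64>); they now take and return the opaque x86_mmx type, with explicit bitcasts at the boundaries. Below is a minimal sketch of the new shape, assuming an LLVM of this era (x86_mmx IR type, typed pointers, old intrinsic syntax); the function name @paddusw_sketch is invented for illustration and is not part of the patch.

; Values move between ordinary vector types and x86_mmx only via bitcast;
; the intrinsic itself is now declared on x86_mmx instead of <4 x i16>.
declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
declare void @llvm.x86.mmx.emms()

define <1 x i64> @paddusw_sketch(<1 x i64> %a, <1 x i64> %b) nounwind {
entry:
  %am = bitcast <1 x i64> %a to x86_mmx            ; was: bitcast to <4 x i16>
  %bm = bitcast <1 x i64> %b to x86_mmx
  %rm = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %am, x86_mmx %bm)
  %r = bitcast x86_mmx %rm to <1 x i64>
  tail call void @llvm.x86.mmx.emms()
  ret <1 x i64> %r
}

Generic vector arithmetic (add, sub, and, or, xor) on <8 x i8>, <4 x i16> and <2 x i32> is no longer lowered to MMX; it is promoted to XMM or scalar integer code. That is why several CHECK/grep patterns below move from %mm-register instructions to XMM or GPR instructions, while only calls to the x86_mmx intrinsics still guarantee MMX output.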
diff --git a/test/Assembler/AutoUpgradeIntrinsics.ll b/test/Assembler/AutoUpgradeIntrinsics.ll index af4ec92e09..6752bd8281 100644 --- a/test/Assembler/AutoUpgradeIntrinsics.ll +++ b/test/Assembler/AutoUpgradeIntrinsics.ll @@ -7,7 +7,7 @@ ; RUN: llvm-as < %s | llvm-dis | \ ; RUN: not grep {llvm\\.bswap\\.i\[0-9\]*\\.i\[0-9\]*} ; RUN: llvm-as < %s | llvm-dis | \ -; RUN: grep {llvm\\.x86\\.mmx\\.ps} | grep {\\\<2 x i32\\\>} | count 6 +; RUN: grep {llvm\\.x86\\.mmx\\.ps} | grep {x86_mmx} | count 16 declare i32 @llvm.ctpop.i28(i28 %val) declare i32 @llvm.cttz.i29(i29 %val) diff --git a/test/Bitcode/ssse3_palignr.ll.bc b/test/Bitcode/ssse3_palignr.ll.bc Binary files differindex 642f4dedc4..3fc9cdf15a 100644 --- a/test/Bitcode/ssse3_palignr.ll.bc +++ b/test/Bitcode/ssse3_palignr.ll.bc diff --git a/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll b/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll index c39b82a1fe..256cbbb5d9 100644 --- a/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll +++ b/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -o - -march=x86 -mattr=+mmx | FileCheck %s +; There are no MMX instructions here. We use add+adcl for the adds. define <1 x i64> @unsigned_add3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind { entry: @@ -7,9 +8,8 @@ entry: bb26: ; preds = %bb26, %entry -; CHECK: movq ({{.*}},8), %mm -; CHECK: paddq ({{.*}},8), %mm -; CHECK: paddq %mm{{[0-7]}}, %mm +; CHECK: addl %eax, %ebx +; CHECK: adcl %edx, %ebp %i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ] ; <i32> [#uses=3] %sum.035.0 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ] ; <<1 x i64>> [#uses=1] @@ -27,3 +27,38 @@ bb31: ; preds = %bb26, %entry %sum.035.1 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ] ; <<1 x i64>> [#uses=1] ret <1 x i64> %sum.035.1 } + + +; This is the original test converted to use MMX intrinsics. 
+ +define <1 x i64> @unsigned_add3a(x86_mmx* %a, x86_mmx* %b, i32 %count) nounwind { +entry: + %tmp2943 = bitcast <1 x i64><i64 0> to x86_mmx + %tmp2942 = icmp eq i32 %count, 0 ; <i1> [#uses=1] + br i1 %tmp2942, label %bb31, label %bb26 + +bb26: ; preds = %bb26, %entry + +; CHECK: movq ({{.*}},8), %mm +; CHECK: paddq ({{.*}},8), %mm +; CHECK: paddq %mm{{[0-7]}}, %mm + + %i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ] ; <i32> [#uses=3] + %sum.035.0 = phi x86_mmx [ %tmp2943, %entry ], [ %tmp22, %bb26 ] ; <x86_mmx> [#uses=1] + %tmp13 = getelementptr x86_mmx* %b, i32 %i.037.0 ; <x86_mmx*> [#uses=1] + %tmp14 = load x86_mmx* %tmp13 ; <x86_mmx> [#uses=1] + %tmp18 = getelementptr x86_mmx* %a, i32 %i.037.0 ; <x86_mmx*> [#uses=1] + %tmp19 = load x86_mmx* %tmp18 ; <x86_mmx> [#uses=1] + %tmp21 = call x86_mmx @llvm.x86.mmx.padd.q (x86_mmx %tmp19, x86_mmx %tmp14) ; <x86_mmx> [#uses=1] + %tmp22 = call x86_mmx @llvm.x86.mmx.padd.q (x86_mmx %tmp21, x86_mmx %sum.035.0) ; <x86_mmx> [#uses=2] + %tmp25 = add i32 %i.037.0, 1 ; <i32> [#uses=2] + %tmp29 = icmp ult i32 %tmp25, %count ; <i1> [#uses=1] + br i1 %tmp29, label %bb26, label %bb31 + +bb31: ; preds = %bb26, %entry + %sum.035.1 = phi x86_mmx [ %tmp2943, %entry ], [ %tmp22, %bb26 ] ; <x86_mmx> [#uses=1] + %t = bitcast x86_mmx %sum.035.1 to <1 x i64> + ret <1 x i64> %t +} + +declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) diff --git a/test/CodeGen/X86/2007-05-15-maskmovq.ll b/test/CodeGen/X86/2007-05-15-maskmovq.ll index 2093b8f687..006cf2e43a 100644 --- a/test/CodeGen/X86/2007-05-15-maskmovq.ll +++ b/test/CodeGen/X86/2007-05-15-maskmovq.ll @@ -5,10 +5,10 @@ target triple = "i686-apple-darwin8" define void @test(<1 x i64> %c64, <1 x i64> %mask1, i8* %P) { entry: - %tmp4 = bitcast <1 x i64> %mask1 to <8 x i8> ; <<8 x i8>> [#uses=1] - %tmp6 = bitcast <1 x i64> %c64 to <8 x i8> ; <<8 x i8>> [#uses=1] - tail call void @llvm.x86.mmx.maskmovq( <8 x i8> %tmp6, <8 x i8> %tmp4, i8* %P ) + %tmp4 = bitcast <1 x i64> %mask1 to x86_mmx ; <x86_mmx> [#uses=1] + %tmp6 = bitcast <1 x i64> %c64 to x86_mmx ; <x86_mmx> [#uses=1] + tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp4, x86_mmx %tmp6, i8* %P ) ret void } -declare void @llvm.x86.mmx.maskmovq(<8 x i8>, <8 x i8>, i8*) +declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*) diff --git a/test/CodeGen/X86/2007-06-15-IntToMMX.ll b/test/CodeGen/X86/2007-06-15-IntToMMX.ll index 6128d8b92d..660d4fe7b1 100644 --- a/test/CodeGen/X86/2007-06-15-IntToMMX.ll +++ b/test/CodeGen/X86/2007-06-15-IntToMMX.ll @@ -1,17 +1,16 @@ ; RUN: llc < %s -march=x86-64 -mattr=+mmx | grep paddusw -@R = external global <1 x i64> ; <<1 x i64>*> [#uses=1] +@R = external global x86_mmx ; <x86_mmx*> [#uses=1] define void @foo(<1 x i64> %A, <1 x i64> %B) { entry: - %tmp4 = bitcast <1 x i64> %B to <4 x i16> ; <<4 x i16>> [#uses=1] - %tmp6 = bitcast <1 x i64> %A to <4 x i16> ; <<4 x i16>> [#uses=1] - %tmp7 = tail call <4 x i16> @llvm.x86.mmx.paddus.w( <4 x i16> %tmp6, <4 x i16> %tmp4 ) ; <<4 x i16>> [#uses=1] - %tmp8 = bitcast <4 x i16> %tmp7 to <1 x i64> ; <<1 x i64>> [#uses=1] - store <1 x i64> %tmp8, <1 x i64>* @R + %tmp2 = bitcast <1 x i64> %A to x86_mmx + %tmp3 = bitcast <1 x i64> %B to x86_mmx + %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp2, x86_mmx %tmp3 ) ; <x86_mmx> [#uses=1] + store x86_mmx %tmp7, x86_mmx* @R tail call void @llvm.x86.mmx.emms( ) ret void } -declare <4 x i16> @llvm.x86.mmx.paddus.w(<4 x i16>, <4 x i16>) +declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) declare void @llvm.x86.mmx.emms() diff 
--git a/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll b/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll index 2c513f1781..1c5e6766fd 100644 --- a/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll +++ b/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll @@ -2,19 +2,17 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx | grep {movd %rdi, %mm1} ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx | grep {paddusw %mm0, %mm1} -@R = external global <1 x i64> ; <<1 x i64>*> [#uses=1] +@R = external global x86_mmx ; <x86_mmx*> [#uses=1] define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind { entry: - %tmp4 = bitcast <1 x i64> %B to <4 x i16> ; <<4 x i16>> [#uses=1] - %tmp6 = bitcast <1 x i64> %A to <4 x i16> ; <<4 x i16>> [#uses=1] - %tmp7 = tail call <4 x i16> @llvm.x86.mmx.paddus.w( <4 x i16> %tmp6, <4 x i16> %tmp4 ) ; <<4 x i16>> [#uses=1] - %tmp8 = bitcast <4 x i16> %tmp7 to <1 x i64> ; <<1 x i64>> [#uses=1] - store <1 x i64> %tmp8, <1 x i64>* @R + %tmp4 = bitcast <1 x i64> %B to x86_mmx ; <<4 x i16>> [#uses=1] + %tmp6 = bitcast <1 x i64> %A to x86_mmx ; <<4 x i16>> [#uses=1] + %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp6, x86_mmx %tmp4 ) ; <x86_mmx> [#uses=1] + store x86_mmx %tmp7, x86_mmx* @R tail call void @llvm.x86.mmx.emms( ) ret void } -declare <4 x i16> @llvm.x86.mmx.paddus.w(<4 x i16>, <4 x i16>) - +declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) declare void @llvm.x86.mmx.emms() diff --git a/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll b/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll index dc8c097efc..5089e8c5b6 100644 --- a/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll +++ b/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll @@ -5,15 +5,15 @@ entry: tail call void asm sideeffect "# top of block", "~{dirflag},~{fpsr},~{flags},~{di},~{si},~{dx},~{cx},~{ax}"( ) nounwind tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind tail call void asm sideeffect ".line 8", "~{dirflag},~{fpsr},~{flags}"( ) nounwind - %tmp1 = tail call <2 x i32> asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; <<2 x i32>> [#uses=1] + %tmp1 = tail call x86_mmx asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; <x86_mmx> [#uses=1] tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind tail call void asm sideeffect ".line 9", "~{dirflag},~{fpsr},~{flags}"( ) nounwind - %tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( <2 x i32> undef ) nounwind ; <i32> [#uses=1] + %tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef ) nounwind ; <i32> [#uses=1] tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind tail call void asm sideeffect ".line 10", "~{dirflag},~{fpsr},~{flags}"( ) nounwind - tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( <2 x i32> undef, i32 undef, i32 %tmp3 ) nounwind + tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef, i32 undef, i32 %tmp3 ) nounwind tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind tail call void asm sideeffect ".line 11", "~{dirflag},~{fpsr},~{flags}"( ) nounwind - %tmp8 = tail call i32 asm sideeffect "movd $1, $0", 
"=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( <2 x i32> %tmp1 ) nounwind ; <i32> [#uses=0] + %tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx %tmp1 ) nounwind ; <i32> [#uses=0] ret i32 undef } diff --git a/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll b/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll index c76dd7de12..53402c0451 100644 --- a/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll +++ b/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll @@ -17,11 +17,13 @@ entry: br i1 false, label %bb.nph144.split, label %bb133 bb.nph144.split: ; preds = %entry - tail call void @llvm.x86.mmx.maskmovq( <8 x i8> zeroinitializer, <8 x i8> zeroinitializer, i8* null ) nounwind + %tmp = bitcast <8 x i8> zeroinitializer to x86_mmx + %tmp2 = bitcast <8 x i8> zeroinitializer to x86_mmx + tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp, x86_mmx %tmp2, i8* null ) nounwind unreachable bb133: ; preds = %entry ret void } -declare void @llvm.x86.mmx.maskmovq(<8 x i8>, <8 x i8>, i8*) nounwind +declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*) nounwind diff --git a/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll b/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll index 60be0d51e7..2dc1deaf17 100644 --- a/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll +++ b/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll @@ -1,6 +1,9 @@ +; RUN: llc < %s -march=x86 -mattr=+sse2 -mattr=+mmx | grep unpcklpd +; RUN: llc < %s -march=x86 -mattr=+sse2 -mattr=+mmx | grep unpckhpd ; RUN: llc < %s -march=x86 -mattr=+sse2 | grep cvttpd2pi | count 1 ; RUN: llc < %s -march=x86 -mattr=+sse2 | grep cvtpi2pd | count 1 -; PR2687 +; originally from PR2687, but things don't work that way any more. +; there are no MMX instructions here; we use XMM. define <2 x double> @a(<2 x i32> %x) nounwind { entry: @@ -13,3 +16,20 @@ entry: %y = fptosi <2 x double> %x to <2 x i32> ret <2 x i32> %y } + +; This is how to get MMX instructions. 
+ +define <2 x double> @a2(x86_mmx %x) nounwind { +entry: + %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %x) + ret <2 x double> %y +} + +define x86_mmx @b2(<2 x double> %x) nounwind { +entry: + %y = tail call x86_mmx @llvm.x86.sse.cvttpd2pi (<2 x double> %x) + ret x86_mmx %y +} + +declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) +declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) diff --git a/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll b/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll index b9b09a3f00..288eef4f69 100644 --- a/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll +++ b/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll @@ -1,10 +1,12 @@ ; RUN: llc < %s -march=x86-64 ; PR4669 -declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) +declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) define <1 x i64> @test(i64 %t) { entry: %t1 = insertelement <1 x i64> undef, i64 %t, i32 0 - %t2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %t1, i32 48) - ret <1 x i64> %t2 + %t0 = bitcast <1 x i64> %t1 to x86_mmx + %t2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %t0, i32 48) + %t3 = bitcast x86_mmx %t2 to <1 x i64> + ret <1 x i64> %t3 } diff --git a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll index 4cd3be35e8..fa3d5fbcdc 100644 --- a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll +++ b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll @@ -1,12 +1,12 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | FileCheck %s +; There are no MMX operations here, so we use XMM or i64. define void @ti8(double %a, double %b) nounwind { entry: %tmp1 = bitcast double %a to <8 x i8> -; CHECK: movdq2q %tmp2 = bitcast double %b to <8 x i8> -; CHECK: movdq2q %tmp3 = add <8 x i8> %tmp1, %tmp2 +; CHECK: paddb %xmm1, %xmm0 store <8 x i8> %tmp3, <8 x i8>* null ret void } @@ -14,10 +14,9 @@ entry: define void @ti16(double %a, double %b) nounwind { entry: %tmp1 = bitcast double %a to <4 x i16> -; CHECK: movdq2q %tmp2 = bitcast double %b to <4 x i16> -; CHECK: movdq2q %tmp3 = add <4 x i16> %tmp1, %tmp2 +; CHECK: paddw %xmm1, %xmm0 store <4 x i16> %tmp3, <4 x i16>* null ret void } @@ -25,10 +24,9 @@ entry: define void @ti32(double %a, double %b) nounwind { entry: %tmp1 = bitcast double %a to <2 x i32> -; CHECK: movdq2q %tmp2 = bitcast double %b to <2 x i32> -; CHECK: movdq2q %tmp3 = add <2 x i32> %tmp1, %tmp2 +; CHECK: paddd %xmm1, %xmm0 store <2 x i32> %tmp3, <2 x i32>* null ret void } @@ -36,10 +34,60 @@ entry: define void @ti64(double %a, double %b) nounwind { entry: %tmp1 = bitcast double %a to <1 x i64> -; CHECK: movdq2q %tmp2 = bitcast double %b to <1 x i64> -; CHECK: movdq2q %tmp3 = add <1 x i64> %tmp1, %tmp2 +; CHECK: addq %rax, %rcx store <1 x i64> %tmp3, <1 x i64>* null ret void } + +; MMX intrinsics calls get us MMX instructions. 
+ +define void @ti8a(double %a, double %b) nounwind { +entry: + %tmp1 = bitcast double %a to x86_mmx +; CHECK: movdq2q + %tmp2 = bitcast double %b to x86_mmx +; CHECK: movdq2q + %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %tmp1, x86_mmx %tmp2) + store x86_mmx %tmp3, x86_mmx* null + ret void +} + +define void @ti16a(double %a, double %b) nounwind { +entry: + %tmp1 = bitcast double %a to x86_mmx +; CHECK: movdq2q + %tmp2 = bitcast double %b to x86_mmx +; CHECK: movdq2q + %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %tmp1, x86_mmx %tmp2) + store x86_mmx %tmp3, x86_mmx* null + ret void +} + +define void @ti32a(double %a, double %b) nounwind { +entry: + %tmp1 = bitcast double %a to x86_mmx +; CHECK: movdq2q + %tmp2 = bitcast double %b to x86_mmx +; CHECK: movdq2q + %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %tmp1, x86_mmx %tmp2) + store x86_mmx %tmp3, x86_mmx* null + ret void +} + +define void @ti64a(double %a, double %b) nounwind { +entry: + %tmp1 = bitcast double %a to x86_mmx +; CHECK: movdq2q + %tmp2 = bitcast double %b to x86_mmx +; CHECK: movdq2q + %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %tmp1, x86_mmx %tmp2) + store x86_mmx %tmp3, x86_mmx* null + ret void +} + +declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) diff --git a/test/CodeGen/X86/fast-isel-bc.ll b/test/CodeGen/X86/fast-isel-bc.ll index 8d7dc8f9a7..db846889d8 100644 --- a/test/CodeGen/X86/fast-isel-bc.ll +++ b/test/CodeGen/X86/fast-isel-bc.ll @@ -5,15 +5,19 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-apple-darwin9.8" -declare void @func2(<1 x i64>) +declare void @func2(x86_mmx) define void @func1() nounwind { ; This isn't spectacular, but it's MMX code at -O0... -; CHECK: movl $2, %eax -; CHECK: movd %rax, %mm0 -; CHECK: movd %mm0, %rdi +; CHECK: movq2dq %mm0, %xmm0 +; For now, handling of x86_mmx parameters in fast Isel is unimplemented, +; so we get pretty poor code. The below is preferable. +; CHEK: movl $2, %eax +; CHEK: movd %rax, %mm0 +; CHEK: movd %mm0, %rdi - call void @func2(<1 x i64> <i64 2>) + %tmp0 = bitcast <2 x i32><i32 0, i32 2> to x86_mmx + call void @func2(x86_mmx %tmp0) ret void } diff --git a/test/CodeGen/X86/mmx-arg-passing.ll b/test/CodeGen/X86/mmx-arg-passing.ll index 426e98e019..b348512b57 100644 --- a/test/CodeGen/X86/mmx-arg-passing.ll +++ b/test/CodeGen/X86/mmx-arg-passing.ll @@ -1,24 +1,27 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | grep mm0 | count 3 -; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | grep esp | count 1 +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | grep mm0 | count 1 +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | grep esp | count 2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep xmm0 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep rdi ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | not grep movups ; ; On Darwin x86-32, v8i8, v4i16, v2i32 values are passed in MM[0-2]. -; On Darwin x86-32, v1i64 values are passed in memory. +; On Darwin x86-32, v1i64 values are passed in memory. In this example, they +; are never moved into an MM register at all. ; On Darwin x86-64, v8i8, v4i16, v2i32 values are passed in XMM[0-7]. 
; On Darwin x86-64, v1i64 values are passed in 64-bit GPRs. -@u1 = external global <8 x i8> +@u1 = external global x86_mmx -define void @t1(<8 x i8> %v1) nounwind { - store <8 x i8> %v1, <8 x i8>* @u1, align 8 +define void @t1(x86_mmx %v1) nounwind { + store x86_mmx %v1, x86_mmx* @u1, align 8 ret void } -@u2 = external global <1 x i64> +@u2 = external global x86_mmx define void @t2(<1 x i64> %v1) nounwind { - store <1 x i64> %v1, <1 x i64>* @u2, align 8 + %tmp = bitcast <1 x i64> %v1 to x86_mmx + store x86_mmx %tmp, x86_mmx* @u2, align 8 ret void } + diff --git a/test/CodeGen/X86/mmx-arg-passing2.ll b/test/CodeGen/X86/mmx-arg-passing2.ll index c42af08236..c132d311b9 100644 --- a/test/CodeGen/X86/mmx-arg-passing2.ll +++ b/test/CodeGen/X86/mmx-arg-passing2.ll @@ -1,17 +1,21 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep movq2dq | count 1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep movdq2q | count 2 +; Since the add is not an MMX add, we don't have a movq2dq any more. @g_v8qi = external global <8 x i8> define void @t1() nounwind { %tmp3 = load <8 x i8>* @g_v8qi, align 8 - %tmp4 = tail call i32 (...)* @pass_v8qi( <8 x i8> %tmp3 ) nounwind + %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx + %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind ret void } -define void @t2(<8 x i8> %v1, <8 x i8> %v2) nounwind { - %tmp3 = add <8 x i8> %v1, %v2 - %tmp4 = tail call i32 (...)* @pass_v8qi( <8 x i8> %tmp3 ) nounwind +define void @t2(x86_mmx %v1, x86_mmx %v2) nounwind { + %v1a = bitcast x86_mmx %v1 to <8 x i8> + %v2b = bitcast x86_mmx %v2 to <8 x i8> + %tmp3 = add <8 x i8> %v1a, %v2b + %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx + %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind ret void } diff --git a/test/CodeGen/X86/mmx-arith.ll b/test/CodeGen/X86/mmx-arith.ll index e4dfdbfe1b..6817487324 100644 --- a/test/CodeGen/X86/mmx-arith.ll +++ b/test/CodeGen/X86/mmx-arith.ll @@ -1,131 +1,309 @@ ; RUN: llc < %s -march=x86 -mattr=+mmx ;; A basic sanity check to make sure that MMX arithmetic actually compiles. +;; First is a straight translation of the original with bitcasts as needed. 
-define void @foo(<8 x i8>* %A, <8 x i8>* %B) { +define void @foo(x86_mmx* %A, x86_mmx* %B) { entry: - %tmp1 = load <8 x i8>* %A ; <<8 x i8>> [#uses=1] - %tmp3 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1] - %tmp4 = add <8 x i8> %tmp1, %tmp3 ; <<8 x i8>> [#uses=2] - store <8 x i8> %tmp4, <8 x i8>* %A - %tmp7 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1] - %tmp12 = tail call <8 x i8> @llvm.x86.mmx.padds.b( <8 x i8> %tmp4, <8 x i8> %tmp7 ) ; <<8 x i8>> [#uses=2] - store <8 x i8> %tmp12, <8 x i8>* %A - %tmp16 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1] - %tmp21 = tail call <8 x i8> @llvm.x86.mmx.paddus.b( <8 x i8> %tmp12, <8 x i8> %tmp16 ) ; <<8 x i8>> [#uses=2] - store <8 x i8> %tmp21, <8 x i8>* %A - %tmp27 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1] - %tmp28 = sub <8 x i8> %tmp21, %tmp27 ; <<8 x i8>> [#uses=2] - store <8 x i8> %tmp28, <8 x i8>* %A - %tmp31 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1] - %tmp36 = tail call <8 x i8> @llvm.x86.mmx.psubs.b( <8 x i8> %tmp28, <8 x i8> %tmp31 ) ; <<8 x i8>> [#uses=2] - store <8 x i8> %tmp36, <8 x i8>* %A - %tmp40 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1] - %tmp45 = tail call <8 x i8> @llvm.x86.mmx.psubus.b( <8 x i8> %tmp36, <8 x i8> %tmp40 ) ; <<8 x i8>> [#uses=2] - store <8 x i8> %tmp45, <8 x i8>* %A - %tmp51 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1] - %tmp52 = mul <8 x i8> %tmp45, %tmp51 ; <<8 x i8>> [#uses=2] - store <8 x i8> %tmp52, <8 x i8>* %A - %tmp57 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1] - %tmp58 = and <8 x i8> %tmp52, %tmp57 ; <<8 x i8>> [#uses=2] - store <8 x i8> %tmp58, <8 x i8>* %A - %tmp63 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1] - %tmp64 = or <8 x i8> %tmp58, %tmp63 ; <<8 x i8>> [#uses=2] - store <8 x i8> %tmp64, <8 x i8>* %A - %tmp69 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1] - %tmp70 = xor <8 x i8> %tmp64, %tmp69 ; <<8 x i8>> [#uses=1] - store <8 x i8> %tmp70, <8 x i8>* %A + %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1] + %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp1a = bitcast x86_mmx %tmp1 to <8 x i8> + %tmp3a = bitcast x86_mmx %tmp3 to <8 x i8> + %tmp4 = add <8 x i8> %tmp1a, %tmp3a ; <<8 x i8>> [#uses=2] + %tmp4a = bitcast <8 x i8> %tmp4 to x86_mmx + store x86_mmx %tmp4a, x86_mmx* %A + %tmp7 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b( x86_mmx %tmp4a, x86_mmx %tmp7 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp12, x86_mmx* %A + %tmp16 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b( x86_mmx %tmp12, x86_mmx %tmp16 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp21, x86_mmx* %A + %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp21a = bitcast x86_mmx %tmp21 to <8 x i8> + %tmp27a = bitcast x86_mmx %tmp27 to <8 x i8> + %tmp28 = sub <8 x i8> %tmp21a, %tmp27a ; <<8 x i8>> [#uses=2] + %tmp28a = bitcast <8 x i8> %tmp28 to x86_mmx + store x86_mmx %tmp28a, x86_mmx* %A + %tmp31 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b( x86_mmx %tmp28a, x86_mmx %tmp31 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp36, x86_mmx* %A + %tmp40 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b( x86_mmx %tmp36, x86_mmx %tmp40 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp45, x86_mmx* %A + %tmp51 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp45a = bitcast x86_mmx %tmp45 to <8 x i8> + %tmp51a = bitcast x86_mmx %tmp51 to <8 x i8> + %tmp52 = mul <8 x i8> %tmp45a, %tmp51a ; <<8 x i8>> [#uses=2] + %tmp52a = bitcast <8 x i8> %tmp52 to x86_mmx + store x86_mmx %tmp52a, x86_mmx* %A + 
%tmp57 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp57a = bitcast x86_mmx %tmp57 to <8 x i8> + %tmp58 = and <8 x i8> %tmp52, %tmp57a ; <<8 x i8>> [#uses=2] + %tmp58a = bitcast <8 x i8> %tmp58 to x86_mmx + store x86_mmx %tmp58a, x86_mmx* %A + %tmp63 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp63a = bitcast x86_mmx %tmp63 to <8 x i8> + %tmp64 = or <8 x i8> %tmp58, %tmp63a ; <<8 x i8>> [#uses=2] + %tmp64a = bitcast <8 x i8> %tmp64 to x86_mmx + store x86_mmx %tmp64a, x86_mmx* %A + %tmp69 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp69a = bitcast x86_mmx %tmp69 to <8 x i8> + %tmp64b = bitcast x86_mmx %tmp64a to <8 x i8> + %tmp70 = xor <8 x i8> %tmp64b, %tmp69a ; <<8 x i8>> [#uses=1] + %tmp70a = bitcast <8 x i8> %tmp70 to x86_mmx + store x86_mmx %tmp70a, x86_mmx* %A tail call void @llvm.x86.mmx.emms( ) ret void } -define void @baz(<2 x i32>* %A, <2 x i32>* %B) { +define void @baz(x86_mmx* %A, x86_mmx* %B) { entry: - %tmp1 = load <2 x i32>* %A ; <<2 x i32>> [#uses=1] - %tmp3 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1] - %tmp4 = add <2 x i32> %tmp1, %tmp3 ; <<2 x i32>> [#uses=2] - store <2 x i32> %tmp4, <2 x i32>* %A - %tmp9 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1] - %tmp10 = sub <2 x i32> %tmp4, %tmp9 ; <<2 x i32>> [#uses=2] - store <2 x i32> %tmp10, <2 x i32>* %A - %tmp15 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1] - %tmp16 = mul <2 x i32> %tmp10, %tmp15 ; <<2 x i32>> [#uses=2] - store <2 x i32> %tmp16, <2 x i32>* %A - %tmp21 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1] - %tmp22 = and <2 x i32> %tmp16, %tmp21 ; <<2 x i32>> [#uses=2] - store <2 x i32> %tmp22, <2 x i32>* %A - %tmp27 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1] - %tmp28 = or <2 x i32> %tmp22, %tmp27 ; <<2 x i32>> [#uses=2] - store <2 x i32> %tmp28, <2 x i32>* %A - %tmp33 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1] - %tmp34 = xor <2 x i32> %tmp28, %tmp33 ; <<2 x i32>> [#uses=1] - store <2 x i32> %tmp34, <2 x i32>* %A + %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1] + %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp1a = bitcast x86_mmx %tmp1 to <2 x i32> + %tmp3a = bitcast x86_mmx %tmp3 to <2 x i32> + %tmp4 = add <2 x i32> %tmp1a, %tmp3a ; <<2 x i32>> [#uses=2] + %tmp4a = bitcast <2 x i32> %tmp4 to x86_mmx + store x86_mmx %tmp4a, x86_mmx* %A + %tmp9 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp9a = bitcast x86_mmx %tmp9 to <2 x i32> + %tmp10 = sub <2 x i32> %tmp4, %tmp9a ; <<2 x i32>> [#uses=2] + %tmp10a = bitcast <2 x i32> %tmp4 to x86_mmx + store x86_mmx %tmp10a, x86_mmx* %A + %tmp15 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp10b = bitcast x86_mmx %tmp10a to <2 x i32> + %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32> + %tmp16 = mul <2 x i32> %tmp10b, %tmp15a ; <<2 x i32>> [#uses=2] + %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx + store x86_mmx %tmp16a, x86_mmx* %A + %tmp21 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp16b = bitcast x86_mmx %tmp16a to <2 x i32> + %tmp21a = bitcast x86_mmx %tmp21 to <2 x i32> + %tmp22 = and <2 x i32> %tmp16b, %tmp21a ; <<2 x i32>> [#uses=2] + %tmp22a = bitcast <2 x i32> %tmp22 to x86_mmx + store x86_mmx %tmp22a, x86_mmx* %A + %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp22b = bitcast x86_mmx %tmp22a to <2 x i32> + %tmp27a = bitcast x86_mmx %tmp27 to <2 x i32> + %tmp28 = or <2 x i32> %tmp22b, %tmp27a ; <<2 x i32>> [#uses=2] + %tmp28a = bitcast <2 x i32> %tmp28 to x86_mmx + store x86_mmx %tmp28a, x86_mmx* %A + %tmp33 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp28b = bitcast x86_mmx %tmp28a to <2 x i32> + %tmp33a = bitcast x86_mmx %tmp33 to <2 x i32> + %tmp34 = xor 
<2 x i32> %tmp28b, %tmp33a ; <<2 x i32>> [#uses=1] + %tmp34a = bitcast <2 x i32> %tmp34 to x86_mmx + store x86_mmx %tmp34a, x86_mmx* %A tail call void @llvm.x86.mmx.emms( ) ret void } -define void @bar(<4 x i16>* %A, <4 x i16>* %B) { +define void @bar(x86_mmx* %A, x86_mmx* %B) { entry: - %tmp1 = load <4 x i16>* %A ; <<4 x i16>> [#uses=1] - %tmp3 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1] - %tmp4 = add <4 x i16> %tmp1, %tmp3 ; <<4 x i16>> [#uses=2] - store <4 x i16> %tmp4, <4 x i16>* %A - %tmp7 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1] - %tmp12 = tail call <4 x i16> @llvm.x86.mmx.padds.w( <4 x i16> %tmp4, <4 x i16> %tmp7 ) ; <<4 x i16>> [#uses=2] - store <4 x i16> %tmp12, <4 x i16>* %A - %tmp16 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1] - %tmp21 = tail call <4 x i16> @llvm.x86.mmx.paddus.w( <4 x i16> %tmp12, <4 x i16> %tmp16 ) ; <<4 x i16>> [#uses=2] - store <4 x i16> %tmp21, <4 x i16>* %A - %tmp27 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1] - %tmp28 = sub <4 x i16> %tmp21, %tmp27 ; <<4 x i16>> [#uses=2] - store <4 x i16> %tmp28, <4 x i16>* %A - %tmp31 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1] - %tmp36 = tail call <4 x i16> @llvm.x86.mmx.psubs.w( <4 x i16> %tmp28, <4 x i16> %tmp31 ) ; <<4 x i16>> [#uses=2] - store <4 x i16> %tmp36, <4 x i16>* %A - %tmp40 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1] - %tmp45 = tail call <4 x i16> @llvm.x86.mmx.psubus.w( <4 x i16> %tmp36, <4 x i16> %tmp40 ) ; <<4 x i16>> [#uses=2] - store <4 x i16> %tmp45, <4 x i16>* %A - %tmp51 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1] - %tmp52 = mul <4 x i16> %tmp45, %tmp51 ; <<4 x i16>> [#uses=2] - store <4 x i16> %tmp52, <4 x i16>* %A - %tmp55 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1] - %tmp60 = tail call <4 x i16> @llvm.x86.mmx.pmulh.w( <4 x i16> %tmp52, <4 x i16> %tmp55 ) ; <<4 x i16>> [#uses=2] - store <4 x i16> %tmp60, <4 x i16>* %A - %tmp64 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1] - %tmp69 = tail call <2 x i32> @llvm.x86.mmx.pmadd.wd( <4 x i16> %tmp60, <4 x i16> %tmp64 ) ; <<2 x i32>> [#uses=1] - %tmp70 = bitcast <2 x i32> %tmp69 to <4 x i16> ; <<4 x i16>> [#uses=2] - store <4 x i16> %tmp70, <4 x i16>* %A - %tmp75 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1] - %tmp76 = and <4 x i16> %tmp70, %tmp75 ; <<4 x i16>> [#uses=2] - store <4 x i16> %tmp76, <4 x i16>* %A - %tmp81 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1] - %tmp82 = or <4 x i16> %tmp76, %tmp81 ; <<4 x i16>> [#uses=2] - store <4 x i16> %tmp82, <4 x i16>* %A - %tmp87 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1] - %tmp88 = xor <4 x i16> %tmp82, %tmp87 ; <<4 x i16>> [#uses=1] - store <4 x i16> %tmp88, <4 x i16>* %A + %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1] + %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp1a = bitcast x86_mmx %tmp1 to <4 x i16> + %tmp3a = bitcast x86_mmx %tmp3 to <4 x i16> + %tmp4 = add <4 x i16> %tmp1a, %tmp3a ; <<4 x i16>> [#uses=2] + %tmp4a = bitcast <4 x i16> %tmp4 to x86_mmx + store x86_mmx %tmp4a, x86_mmx* %A + %tmp7 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w( x86_mmx %tmp4a, x86_mmx %tmp7 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp12, x86_mmx* %A + %tmp16 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp12, x86_mmx %tmp16 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp21, x86_mmx* %A + %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp21a = bitcast x86_mmx %tmp21 to <4 x i16> + %tmp27a = bitcast x86_mmx %tmp27 to <4 x i16> + %tmp28 = sub <4 x i16> %tmp21a, %tmp27a ; <<4 x i16>> [#uses=2] + %tmp28a = 
bitcast <4 x i16> %tmp28 to x86_mmx + store x86_mmx %tmp28a, x86_mmx* %A + %tmp31 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w( x86_mmx %tmp28a, x86_mmx %tmp31 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp36, x86_mmx* %A + %tmp40 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w( x86_mmx %tmp36, x86_mmx %tmp40 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp45, x86_mmx* %A + %tmp51 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp45a = bitcast x86_mmx %tmp45 to <4 x i16> + %tmp51a = bitcast x86_mmx %tmp51 to <4 x i16> + %tmp52 = mul <4 x i16> %tmp45a, %tmp51a ; <<4 x i16>> [#uses=2] + %tmp52a = bitcast <4 x i16> %tmp52 to x86_mmx + store x86_mmx %tmp52a, x86_mmx* %A + %tmp55 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w( x86_mmx %tmp52a, x86_mmx %tmp55 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp60, x86_mmx* %A + %tmp64 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd( x86_mmx %tmp60, x86_mmx %tmp64 ) ; <x86_mmx> [#uses=1] + %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx ; <x86_mmx> [#uses=2] + store x86_mmx %tmp70, x86_mmx* %A + %tmp75 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp70a = bitcast x86_mmx %tmp70 to <4 x i16> + %tmp75a = bitcast x86_mmx %tmp75 to <4 x i16> + %tmp76 = and <4 x i16> %tmp70a, %tmp75a ; <<4 x i16>> [#uses=2] + %tmp76a = bitcast <4 x i16> %tmp76 to x86_mmx + store x86_mmx %tmp76a, x86_mmx* %A + %tmp81 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp76b = bitcast x86_mmx %tmp76a to <4 x i16> + %tmp81a = bitcast x86_mmx %tmp81 to <4 x i16> + %tmp82 = or <4 x i16> %tmp76b, %tmp81a ; <<4 x i16>> [#uses=2] + %tmp82a = bitcast <4 x i16> %tmp82 to x86_mmx + store x86_mmx %tmp82a, x86_mmx* %A + %tmp87 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp82b = bitcast x86_mmx %tmp82a to <4 x i16> + %tmp87a = bitcast x86_mmx %tmp87 to <4 x i16> + %tmp88 = xor <4 x i16> %tmp82b, %tmp87a ; <<4 x i16>> [#uses=1] + %tmp88a = bitcast <4 x i16> %tmp88 to x86_mmx + store x86_mmx %tmp88a, x86_mmx* %A tail call void @llvm.x86.mmx.emms( ) ret void } -declare <8 x i8> @llvm.x86.mmx.padds.b(<8 x i8>, <8 x i8>) +;; The following is modified to use MMX intrinsics everywhere they work. 
-declare <8 x i8> @llvm.x86.mmx.paddus.b(<8 x i8>, <8 x i8>) +define void @fooa(x86_mmx* %A, x86_mmx* %B) { +entry: + %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1] + %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp4 = tail call x86_mmx @llvm.x86.mmx.padd.b( x86_mmx %tmp1, x86_mmx %tmp3 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp4, x86_mmx* %A + %tmp7 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b( x86_mmx %tmp4, x86_mmx %tmp7 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp12, x86_mmx* %A + %tmp16 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b( x86_mmx %tmp12, x86_mmx %tmp16 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp21, x86_mmx* %A + %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp28 = tail call x86_mmx @llvm.x86.mmx.psub.b( x86_mmx %tmp21, x86_mmx %tmp27 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp28, x86_mmx* %A + %tmp31 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b( x86_mmx %tmp28, x86_mmx %tmp31 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp36, x86_mmx* %A + %tmp40 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b( x86_mmx %tmp36, x86_mmx %tmp40 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp45, x86_mmx* %A + %tmp51 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp51a = bitcast x86_mmx %tmp51 to i64 + %tmp51aa = bitcast i64 %tmp51a to <8 x i8> + %tmp51b = bitcast x86_mmx %tmp45 to <8 x i8> + %tmp52 = mul <8 x i8> %tmp51b, %tmp51aa ; <x86_mmx> [#uses=2] + %tmp52a = bitcast <8 x i8> %tmp52 to i64 + %tmp52aa = bitcast i64 %tmp52a to x86_mmx + store x86_mmx %tmp52aa, x86_mmx* %A + %tmp57 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp58 = tail call x86_mmx @llvm.x86.mmx.pand( x86_mmx %tmp51, x86_mmx %tmp57 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp58, x86_mmx* %A + %tmp63 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp64 = tail call x86_mmx @llvm.x86.mmx.por( x86_mmx %tmp58, x86_mmx %tmp63 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp64, x86_mmx* %A + %tmp69 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp70 = tail call x86_mmx @llvm.x86.mmx.pxor( x86_mmx %tmp64, x86_mmx %tmp69 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp70, x86_mmx* %A + tail call void @llvm.x86.mmx.emms( ) + ret void +} -declare <8 x i8> @llvm.x86.mmx.psubs.b(<8 x i8>, <8 x i8>) +define void @baza(x86_mmx* %A, x86_mmx* %B) { +entry: + %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1] + %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp4 = tail call x86_mmx @llvm.x86.mmx.padd.d( x86_mmx %tmp1, x86_mmx %tmp3 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp4, x86_mmx* %A + %tmp9 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp10 = tail call x86_mmx @llvm.x86.mmx.psub.d( x86_mmx %tmp4, x86_mmx %tmp9 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp10, x86_mmx* %A + %tmp15 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp10a = bitcast x86_mmx %tmp10 to <2 x i32> + %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32> + %tmp16 = mul <2 x i32> %tmp10a, %tmp15a ; <x86_mmx> [#uses=2] + %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx + store x86_mmx %tmp16a, x86_mmx* %A + %tmp21 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp22 = tail call x86_mmx @llvm.x86.mmx.pand( x86_mmx %tmp16a, x86_mmx %tmp21 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp22, x86_mmx* %A + %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp28 = tail call x86_mmx @llvm.x86.mmx.por( x86_mmx %tmp22, x86_mmx %tmp27 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp28, x86_mmx* %A + %tmp33 = load x86_mmx* %B ; 
<x86_mmx> [#uses=1] + %tmp34 = tail call x86_mmx @llvm.x86.mmx.pxor( x86_mmx %tmp28, x86_mmx %tmp33 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp34, x86_mmx* %A + tail call void @llvm.x86.mmx.emms( ) + ret void +} -declare <8 x i8> @llvm.x86.mmx.psubus.b(<8 x i8>, <8 x i8>) +define void @bara(x86_mmx* %A, x86_mmx* %B) { +entry: + %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1] + %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp4 = tail call x86_mmx @llvm.x86.mmx.padd.w( x86_mmx %tmp1, x86_mmx %tmp3 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp4, x86_mmx* %A + %tmp7 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w( x86_mmx %tmp4, x86_mmx %tmp7 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp12, x86_mmx* %A + %tmp16 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp12, x86_mmx %tmp16 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp21, x86_mmx* %A + %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp28 = tail call x86_mmx @llvm.x86.mmx.psub.w( x86_mmx %tmp21, x86_mmx %tmp27 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp28, x86_mmx* %A + %tmp31 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w( x86_mmx %tmp28, x86_mmx %tmp31 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp36, x86_mmx* %A + %tmp40 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w( x86_mmx %tmp36, x86_mmx %tmp40 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp45, x86_mmx* %A + %tmp51 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp52 = tail call x86_mmx @llvm.x86.mmx.pmull.w( x86_mmx %tmp45, x86_mmx %tmp51 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp52, x86_mmx* %A + %tmp55 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w( x86_mmx %tmp52, x86_mmx %tmp55 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp60, x86_mmx* %A + %tmp64 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd( x86_mmx %tmp60, x86_mmx %tmp64 ) ; <x86_mmx> [#uses=1] + %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx ; <x86_mmx> [#uses=2] + store x86_mmx %tmp70, x86_mmx* %A + %tmp75 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp76 = tail call x86_mmx @llvm.x86.mmx.pand( x86_mmx %tmp70, x86_mmx %tmp75 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp76, x86_mmx* %A + %tmp81 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp82 = tail call x86_mmx @llvm.x86.mmx.por( x86_mmx %tmp76, x86_mmx %tmp81 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp82, x86_mmx* %A + %tmp87 = load x86_mmx* %B ; <x86_mmx> [#uses=1] + %tmp88 = tail call x86_mmx @llvm.x86.mmx.pxor( x86_mmx %tmp82, x86_mmx %tmp87 ) ; <x86_mmx> [#uses=2] + store x86_mmx %tmp88, x86_mmx* %A + tail call void @llvm.x86.mmx.emms( ) + ret void +} -declare <4 x i16> @llvm.x86.mmx.padds.w(<4 x i16>, <4 x i16>) +declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) -declare <4 x i16> @llvm.x86.mmx.paddus.w(<4 x i16>, <4 x i16>) +declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) -declare <4 x i16> @llvm.x86.mmx.psubs.w(<4 x i16>, <4 x i16>) +declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) -declare <4 x i16> @llvm.x86.mmx.psubus.w(<4 x i16>, <4 x i16>) +declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) -declare <4 x i16> @llvm.x86.mmx.pmulh.w(<4 x i16>, <4 x i16>) +declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) -declare <2 x i32> @llvm.x86.mmx.pmadd.wd(<4 x i16>, <4 x i16>) +declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) declare void 
@llvm.x86.mmx.emms() + +declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.padds.d(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.psubs.d(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) + diff --git a/test/CodeGen/X86/mmx-bitcast-to-i64.ll b/test/CodeGen/X86/mmx-bitcast-to-i64.ll index 1fd8f67a0c..8b1840abf6 100644 --- a/test/CodeGen/X86/mmx-bitcast-to-i64.ll +++ b/test/CodeGen/X86/mmx-bitcast-to-i64.ll @@ -1,26 +1,31 @@ ; RUN: llc < %s -march=x86-64 | grep movd | count 4 -define i64 @foo(<1 x i64>* %p) { - %t = load <1 x i64>* %p - %u = add <1 x i64> %t, %t - %s = bitcast <1 x i64> %u to i64 +define i64 @foo(x86_mmx* %p) { + %t = load x86_mmx* %p + %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %t) + %s = bitcast x86_mmx %u to i64 ret i64 %s } -define i64 @goo(<2 x i32>* %p) { - %t = load <2 x i32>* %p - %u = add <2 x i32> %t, %t - %s = bitcast <2 x i32> %u to i64 +define i64 @goo(x86_mmx* %p) { + %t = load x86_mmx* %p + %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %t) + %s = bitcast x86_mmx %u to i64 ret i64 %s } -define i64 @hoo(<4 x i16>* %p) { - %t = load <4 x i16>* %p - %u = add <4 x i16> %t, %t - %s = bitcast <4 x i16> %u to i64 +define i64 @hoo(x86_mmx* %p) { + %t = load x86_mmx* %p + %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %t) + %s = bitcast x86_mmx %u to i64 ret i64 %s } -define i64 @ioo(<8 x i8>* %p) { - %t = load <8 x i8>* %p - %u = add <8 x i8> %t, %t - %s = bitcast <8 x i8> %u to i64 +define i64 @ioo(x86_mmx* %p) { + %t = load x86_mmx* %p + %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %t) + %s = bitcast x86_mmx %u to i64 ret i64 %s } + +declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) +declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) diff --git a/test/CodeGen/X86/mmx-insert-element.ll b/test/CodeGen/X86/mmx-insert-element.ll index a063ee1d6c..9f9bd8de2d 100644 --- a/test/CodeGen/X86/mmx-insert-element.ll +++ b/test/CodeGen/X86/mmx-insert-element.ll @@ -1,7 +1,9 @@ -; RUN: llc < %s -march=x86 -mattr=+mmx | not grep movq -; RUN: llc < %s -march=x86 -mattr=+mmx | grep psllq +; RUN: llc < %s -march=x86 -mattr=+mmx,+sse | grep movq +; RUN: llc < %s -march=x86 -mattr=+mmx,+sse | grep pshufd +; This is not an MMX operation; promoted to XMM. 
-define <2 x i32> @qux(i32 %A) nounwind { +define x86_mmx @qux(i32 %A) nounwind { %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1 ; <<2 x i32>> [#uses=1] - ret <2 x i32> %tmp3 + %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx + ret x86_mmx %tmp4 } diff --git a/test/CodeGen/X86/mmx-punpckhdq.ll b/test/CodeGen/X86/mmx-punpckhdq.ll index 0af7e017b6..4818baedc4 100644 --- a/test/CodeGen/X86/mmx-punpckhdq.ll +++ b/test/CodeGen/X86/mmx-punpckhdq.ll @@ -1,4 +1,6 @@ +; RUN: llc < %s -march=x86 -mattr=+mmx | grep pextrd ; RUN: llc < %s -march=x86 -mattr=+mmx | grep punpckhdq | count 1 +; There are no MMX operations in bork; promoted to XMM. define void @bork(<1 x i64>* %x) { entry: @@ -11,4 +13,16 @@ entry: ret void } +; pork uses MMX. + +define void @pork(x86_mmx* %x) { +entry: + %tmp2 = load x86_mmx* %x ; <x86_mmx> [#uses=1] + %tmp9 = tail call x86_mmx @llvm.x86.mmx.punpckhdq (x86_mmx %tmp2, x86_mmx %tmp2) + store x86_mmx %tmp9, x86_mmx* %x + tail call void @llvm.x86.mmx.emms( ) + ret void +} + +declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) declare void @llvm.x86.mmx.emms() diff --git a/test/CodeGen/X86/mmx-shift.ll b/test/CodeGen/X86/mmx-shift.ll index dd0aa2ca31..bafc75444d 100644 --- a/test/CodeGen/X86/mmx-shift.ll +++ b/test/CodeGen/X86/mmx-shift.ll @@ -5,28 +5,28 @@ define i64 @t1(<1 x i64> %mm1) nounwind { entry: - %tmp6 = tail call <1 x i64> @llvm.x86.mmx.pslli.q( <1 x i64> %mm1, i32 32 ) ; <<1 x i64>> [#uses=1] - %retval1112 = bitcast <1 x i64> %tmp6 to i64 ; <i64> [#uses=1] + %tmp = bitcast <1 x i64> %mm1 to x86_mmx + %tmp6 = tail call x86_mmx @llvm.x86.mmx.pslli.q( x86_mmx %tmp, i32 32 ) ; <x86_mmx> [#uses=1] + %retval1112 = bitcast x86_mmx %tmp6 to i64 ret i64 %retval1112 } -declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) nounwind readnone +declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone -define i64 @t2(<2 x i32> %mm1, <2 x i32> %mm2) nounwind { +define i64 @t2(x86_mmx %mm1, x86_mmx %mm2) nounwind { entry: - %tmp7 = tail call <2 x i32> @llvm.x86.mmx.psra.d( <2 x i32> %mm1, <2 x i32> %mm2 ) nounwind readnone ; <<2 x i32>> [#uses=1] - %retval1112 = bitcast <2 x i32> %tmp7 to i64 ; <i64> [#uses=1] + %tmp7 = tail call x86_mmx @llvm.x86.mmx.psra.d( x86_mmx %mm1, x86_mmx %mm2 ) nounwind readnone ; <x86_mmx> [#uses=1] + %retval1112 = bitcast x86_mmx %tmp7 to i64 ret i64 %retval1112 } -declare <2 x i32> @llvm.x86.mmx.psra.d(<2 x i32>, <2 x i32>) nounwind readnone +declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone -define i64 @t3(<1 x i64> %mm1, i32 %bits) nounwind { +define i64 @t3(x86_mmx %mm1, i32 %bits) nounwind { entry: - %tmp6 = bitcast <1 x i64> %mm1 to <4 x i16> ; <<4 x i16>> [#uses=1] - %tmp8 = tail call <4 x i16> @llvm.x86.mmx.psrli.w( <4 x i16> %tmp6, i32 %bits ) nounwind readnone ; <<4 x i16>> [#uses=1] - %retval1314 = bitcast <4 x i16> %tmp8 to i64 ; <i64> [#uses=1] + %tmp8 = tail call x86_mmx @llvm.x86.mmx.psrli.w( x86_mmx %mm1, i32 %bits ) nounwind readnone ; <x86_mmx> [#uses=1] + %retval1314 = bitcast x86_mmx %tmp8 to i64 ret i64 %retval1314 } -declare <4 x i16> @llvm.x86.mmx.psrli.w(<4 x i16>, i32) nounwind readnone +declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone diff --git a/test/CodeGen/X86/mmx-shuffle.ll b/test/CodeGen/X86/mmx-shuffle.ll index e3125c7345..9f7501eb7c 100644 --- a/test/CodeGen/X86/mmx-shuffle.ll +++ b/test/CodeGen/X86/mmx-shuffle.ll @@ -22,8 +22,10 @@ entry: %tmp542 = bitcast <2 x i32> %tmp529 to <4 x i16> ; <<4 x i16>> [#uses=1] %tmp543 = add <4 x i16> 
%tmp542, < i16 0, i16 16448, i16 24672, i16 28784 > ; <<4 x i16>> [#uses=1] %tmp555 = bitcast <4 x i16> %tmp543 to <8 x i8> ; <<8 x i8>> [#uses=1] - tail call void @llvm.x86.mmx.maskmovq( <8 x i8> zeroinitializer, <8 x i8> %tmp555, i8* null ) + %tmp556 = bitcast <8 x i8> %tmp555 to x86_mmx + %tmp557 = bitcast <8 x i8> zeroinitializer to x86_mmx + tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp557, x86_mmx %tmp556, i8* null ) ret void } -declare void @llvm.x86.mmx.maskmovq(<8 x i8>, <8 x i8>, i8*) +declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*) diff --git a/test/CodeGen/X86/mmx-vzmovl-2.ll b/test/CodeGen/X86/mmx-vzmovl-2.ll index 8253c20032..4ad420b37c 100644 --- a/test/CodeGen/X86/mmx-vzmovl-2.ll +++ b/test/CodeGen/X86/mmx-vzmovl-2.ll @@ -4,7 +4,7 @@ %struct.vS1024 = type { [8 x <4 x i32>] } %struct.vS512 = type { [4 x <4 x i32>] } -declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone +declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone define void @t() nounwind { entry: @@ -12,14 +12,18 @@ entry: bb554: ; preds = %bb554, %entry %sum.0.reg2mem.0 = phi <1 x i64> [ %tmp562, %bb554 ], [ zeroinitializer, %entry ] ; <<1 x i64>> [#uses=1] - %0 = load <1 x i64>* null, align 8 ; <<1 x i64>> [#uses=2] - %1 = bitcast <1 x i64> %0 to <2 x i32> ; <<2 x i32>> [#uses=1] + %0 = load x86_mmx* null, align 8 ; <<1 x i64>> [#uses=2] + %1 = bitcast x86_mmx %0 to <2 x i32> ; <<2 x i32>> [#uses=1] %tmp555 = and <2 x i32> %1, < i32 -1, i32 0 > ; <<2 x i32>> [#uses=1] - %2 = bitcast <2 x i32> %tmp555 to <1 x i64> ; <<1 x i64>> [#uses=1] - %3 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 32) nounwind readnone ; <<1 x i64>> [#uses=1] + %2 = bitcast <2 x i32> %tmp555 to x86_mmx ; <<1 x i64>> [#uses=1] + %3 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 32) nounwind readnone ; <<1 x i64>> [#uses=1] store <1 x i64> %sum.0.reg2mem.0, <1 x i64>* null - %tmp558 = add <1 x i64> %sum.0.reg2mem.0, %2 ; <<1 x i64>> [#uses=1] - %4 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %tmp558, i32 32) nounwind readnone ; <<1 x i64>> [#uses=1] - %tmp562 = add <1 x i64> %4, %3 ; <<1 x i64>> [#uses=1] + %tmp3 = bitcast x86_mmx %2 to <1 x i64> + %tmp558 = add <1 x i64> %sum.0.reg2mem.0, %tmp3 ; <<1 x i64>> [#uses=1] + %tmp5 = bitcast <1 x i64> %tmp558 to x86_mmx + %4 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %tmp5, i32 32) nounwind readnone ; <<1 x i64>> [#uses=1] + %tmp6 = bitcast x86_mmx %4 to <1 x i64> + %tmp7 = bitcast x86_mmx %3 to <1 x i64> + %tmp562 = add <1 x i64> %tmp6, %tmp7 ; <<1 x i64>> [#uses=1] br label %bb554 } diff --git a/test/CodeGen/X86/mmx-vzmovl.ll b/test/CodeGen/X86/mmx-vzmovl.ll index d21e240488..e8b34263c6 100644 --- a/test/CodeGen/X86/mmx-vzmovl.ll +++ b/test/CodeGen/X86/mmx-vzmovl.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=x86-64 -mattr=+mmx | grep movd -; RUN: llc < %s -march=x86-64 -mattr=+mmx | grep movq +; RUN: llc < %s -march=x86-64 -mattr=+mmx | grep movq | count 2 +; There are no MMX operations here; this is promoted to XMM. 
define void @foo(<1 x i64>* %a, <1 x i64>* %b) nounwind { entry: diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll index 291fc0454c..471cc1611f 100644 --- a/test/CodeGen/X86/vec_insert-5.ll +++ b/test/CodeGen/X86/vec_insert-5.ll @@ -1,15 +1,16 @@ ; RUN: llc < %s -march=x86 -mattr=+sse2 > %t -; RUN: grep psllq %t | grep 32 +; RUN: grep shll %t | grep 12 ; RUN: grep pslldq %t | grep 12 ; RUN: grep psrldq %t | grep 8 ; RUN: grep psrldq %t | grep 12 +; There are no MMX operations in @t1 -define void @t1(i32 %a, <1 x i64>* %P) nounwind { +define void @t1(i32 %a, x86_mmx* %P) nounwind { %tmp12 = shl i32 %a, 12 %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1 %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0 - %tmp23 = bitcast <2 x i32> %tmp22 to <1 x i64> - store <1 x i64> %tmp23, <1 x i64>* %P + %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx + store x86_mmx %tmp23, x86_mmx* %P ret void } diff --git a/test/CodeGen/X86/vec_insert-7.ll b/test/CodeGen/X86/vec_insert-7.ll index 9ede10f63d..ea7f919304 100644 --- a/test/CodeGen/X86/vec_insert-7.ll +++ b/test/CodeGen/X86/vec_insert-7.ll @@ -1,8 +1,12 @@ -; RUN: llc < %s -march=x86 -mattr=+mmx -mtriple=i686-apple-darwin9 -o - | grep punpckldq +; RUN: llc < %s -march=x86 -mattr=+mmx,+sse -mtriple=i686-apple-darwin9 -o - | grep pinsrd | count 2 +; MMX insertelement is not available; these are promoted to XMM. +; (Without SSE they are split to two ints, and the code is much better.) -define <2 x i32> @mmx_movzl(<2 x i32> %x) nounwind { +define x86_mmx @mmx_movzl(x86_mmx %x) nounwind { entry: - %tmp3 = insertelement <2 x i32> %x, i32 32, i32 0 ; <<2 x i32>> [#uses=1] + %tmp = bitcast x86_mmx %x to <2 x i32> + %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 ; <<2 x i32>> [#uses=1] %tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1 ; <<2 x i32>> [#uses=1] - ret <2 x i32> %tmp8 + %tmp9 = bitcast <2 x i32> %tmp8 to x86_mmx + ret x86_mmx %tmp9 } diff --git a/test/CodeGen/X86/vec_zero_cse.ll b/test/CodeGen/X86/vec_zero_cse.ll index 3b15d4cc40..8aa50945e6 100644 --- a/test/CodeGen/X86/vec_zero_cse.ll +++ b/test/CodeGen/X86/vec_zero_cse.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pxor | count 2 -; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pcmpeqd | count 2 +; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pxor | count 1 +; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pcmpeqd | count 1 +; 64-bit stores here do not use MMX. @M1 = external global <1 x i64> @M2 = external global <2 x i32> |