diff options
-rw-r--r-- | lib/Target/X86/X86InstrInfo.td | 10 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 82 | ||||
-rw-r--r-- | test/CodeGen/X86/sse41-pmovx.ll | 47 |
3 files changed, 133 insertions, 6 deletions
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 1b68dda448..bd61c3aed4 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -243,6 +243,16 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (ld node:$ptr)), [{ return false; }]>; +def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (ld node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + if (LD->getAddressingMode() != ISD::UNINDEXED) + return false; + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 2 && !LD->isVolatile(); + return false; +}]>; + def loadi32 : PatFrag<(ops node:$ptr), (i32 (ld node:$ptr)), [{ LoadSDNode *LD = cast<LoadSDNode>(N); if (LD->getAddressingMode() != ISD::UNINDEXED) diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 3e7262ab5c..e89e8a3b56 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -162,6 +162,17 @@ def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>; def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>; def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>; +def vzmovl_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; +def vzmovl_v4i32 : PatFrag<(ops node:$src), + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 node:$src))))))>; + +def vzload_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzload node:$src)))>; + + def fp32imm0 : PatLeaf<(f32 fpimm), [{ return N->isExactlyValue(+0.0); }]>; @@ -3368,8 +3379,9 @@ multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> { def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, - (IntId (bitconvert (v4i32 (load addr:$src)))))]>, OpSize; + [(set VR128:$dst, + (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>, + OpSize; } defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>; @@ -3379,6 +3391,38 @@ defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>; defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>; defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>; +// Common patterns involving scalar load. +def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), + (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>; +def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), + (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>; + +def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), + (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>; +def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), + (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>; + +def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), + (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>; +def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), + (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>; + +def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), + (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>; +def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), + (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>; + +def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), + (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>; +def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), + (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>; + +def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), + (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>; +def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), + (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>; + + multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> { def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -3386,8 +3430,9 @@ multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> { def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, - (IntId (bitconvert (v4i32 (load addr:$src)))))]>, OpSize; + [(set VR128:$dst, + (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>, + OpSize; } defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>; @@ -3395,20 +3440,45 @@ defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>; defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>; defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>; +// Common patterns involving scalar load +def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), + (PMOVSXBDrm addr:$src)>; +def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), + (PMOVSXWQrm addr:$src)>; + +def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), + (PMOVZXBDrm addr:$src)>; +def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), + (PMOVZXWQrm addr:$src)>; + + multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> { def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId VR128:$src))]>, OpSize; + // Expecting a i16 load any extended to i32 value. def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, - (IntId (bitconvert (v4i32 (load addr:$src)))))]>, OpSize; + [(set VR128:$dst, (IntId (bitconvert + (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>, + OpSize; } defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>; defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovsxbq", int_x86_sse41_pmovzxbq>; +// Common patterns involving scalar load +def : Pat<(int_x86_sse41_pmovsxbq + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVSXBQrm addr:$src)>; + +def : Pat<(int_x86_sse41_pmovzxbq + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVZXBQrm addr:$src)>; + /// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { diff --git a/test/CodeGen/X86/sse41-pmovx.ll b/test/CodeGen/X86/sse41-pmovx.ll new file mode 100644 index 0000000000..2db23c19f6 --- /dev/null +++ b/test/CodeGen/X86/sse41-pmovx.ll @@ -0,0 +1,47 @@ +; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | not grep movd +; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | not grep movq +; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | grep pmovsxbd +; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | grep pmovsxwd +; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 | grep pmovsxbq +; RUN: llvm-as < %s | llc -march=x86-64 -mattr=sse41 | grep movq | count 1 + +define <2 x i64> @t1(i32* %p) nounwind { +entry: + %0 = load i32* %p, align 4 ; <i32> [#uses=1] + %1 = insertelement <4 x i32> undef, i32 %0, i32 0 ; <<4 x i32>> [#uses=1] + %2 = insertelement <4 x i32> %1, i32 0, i32 1 ; <<4 x i32>> [#uses=1] + %3 = insertelement <4 x i32> %2, i32 0, i32 2 ; <<4 x i32>> [#uses=1] + %4 = insertelement <4 x i32> %3, i32 0, i32 3 ; <<4 x i32>> [#uses=1] + %5 = bitcast <4 x i32> %4 to <16 x i8> ; <<16 x i8>> [#uses=1] + %6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone ; <<4 x i32>> [#uses=1] + %7 = bitcast <4 x i32> %6 to <2 x i64> ; <<2 x i64>> [#uses=1] + ret <2 x i64> %7 +} + +declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone + +define <2 x i64> @t2(i64* %p) nounwind readonly { +entry: + %0 = load i64* %p ; <i64> [#uses=1] + %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 ; <<2 x i64>> [#uses=1] + %1 = bitcast <2 x i64> %tmp2 to <8 x i16> ; <<8 x i16>> [#uses=1] + %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone ; <<4 x i32>> [#uses=1] + %3 = bitcast <4 x i32> %2 to <2 x i64> ; <<2 x i64>> [#uses=1] + ret <2 x i64> %3 +} + +declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone + + +@gv = external global i16 ; <i16*> [#uses=1] + +define <2 x i64> @t3() nounwind { +entry: + %0 = load i16* @gv, align 2 ; <i16> [#uses=1] + %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1] + %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1] + %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1] + ret <2 x i64> %3 +} + +declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone |