diff options
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 2255 |
1 files changed, 1251 insertions, 1004 deletions
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 57149b8742..1d454aa67b 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2,8 +2,8 @@ // // The LLVM Compiler Infrastructure // -// This file was developed by the Evan Cheng and is distributed under -// the University of Illinois Open Source License. See LICENSE.TXT for details. +// This file was developed by Evan Cheng and is distributed under the University +// of Illinois Open Source License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // @@ -41,6 +41,21 @@ def X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>; def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>; //===----------------------------------------------------------------------===// +// SSE 'Special' Instructions +//===----------------------------------------------------------------------===// + +def IMPLICIT_DEF_VR128 : I<0, Pseudo, (ops VR128:$dst), + "#IMPLICIT_DEF $dst", + [(set VR128:$dst, (v4f32 (undef)))]>, + Requires<[HasSSE1]>; +def IMPLICIT_DEF_FR32 : I<0, Pseudo, (ops FR32:$dst), + "#IMPLICIT_DEF $dst", + [(set FR32:$dst, (undef))]>, Requires<[HasSSE2]>; +def IMPLICIT_DEF_FR64 : I<0, Pseudo, (ops FR64:$dst), + "#IMPLICIT_DEF $dst", + [(set FR64:$dst, (undef))]>, Requires<[HasSSE2]>; + +//===----------------------------------------------------------------------===// // SSE Complex Patterns //===----------------------------------------------------------------------===// @@ -70,6 +85,7 @@ def X86loadpf64 : PatFrag<(ops node:$ptr), (f64 (X86loadp node:$ptr))>; def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; +def loadv2i32 : PatFrag<(ops node:$ptr), (v2i32 (load node:$ptr))>; def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 
(bitconvert node:$in))>; @@ -182,106 +198,6 @@ def PSHUFD_binary_shuffle_mask : PatLeaf<(build_vector), [{ // SSE scalar FP Instructions //===----------------------------------------------------------------------===// -// Instruction templates -// SSI - SSE1 instructions with XS prefix. -// SDI - SSE2 instructions with XD prefix. -// PSI - SSE1 instructions with TB prefix. -// PDI - SSE2 instructions with TB and OpSize prefixes. -// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix. -// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes. -// S3I - SSE3 instructions with TB and OpSize prefixes. -// S3SI - SSE3 instructions with XS prefix. -// S3DI - SSE3 instructions with XD prefix. -// SS38I - SSSE3 instructions with T8 and OpSize prefixes. -// SS3AI - SSSE3 instructions with TA and OpSize prefixes. -class SSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> - : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE1]>; -class SDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> - : I<o, F, ops, asm, pattern>, XD, Requires<[HasSSE2]>; -class PSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> - : I<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>; -class PDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> - : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>; -class PSIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> - : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>; -class PDIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> - : Ii8<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>; - -class S3SI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> - : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE3]>; -class S3DI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> - : I<o, F, ops, asm, pattern>, XD, Requires<[HasSSE3]>; -class S3I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> - : I<o, F, ops, asm, 
pattern>, TB, OpSize, Requires<[HasSSE3]>; - -class SS38I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> - : I<o, F, ops, asm, pattern>, T8, OpSize, Requires<[HasSSSE3]>; -class SS3AI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> - : I<o, F, ops, asm, pattern>, TA, OpSize, Requires<[HasSSSE3]>; - -//===----------------------------------------------------------------------===// -// Helpers for defining instructions that directly correspond to intrinsics. - -multiclass SS_IntUnary<bits<8> o, string OpcodeStr, Intrinsic IntId> { - def r : SSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src), - !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"), - [(set VR128:$dst, (v4f32 (IntId VR128:$src)))]>; - def m : SSI<o, MRMSrcMem, (ops VR128:$dst, ssmem:$src), - !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"), - [(set VR128:$dst, (v4f32 (IntId sse_load_f32:$src)))]>; -} - -multiclass SD_IntUnary<bits<8> o, string OpcodeStr, Intrinsic IntId> { - def r : SDI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src), - !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"), - [(set VR128:$dst, (v2f64 (IntId VR128:$src)))]>; - def m : SDI<o, MRMSrcMem, (ops VR128:$dst, sdmem:$src), - !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"), - [(set VR128:$dst, (v2f64 (IntId sse_load_f64:$src)))]>; -} - -class PS_Intr<bits<8> o, string OpcodeStr, Intrinsic IntId> - : PSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src), - !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))]>; -class PS_Intm<bits<8> o, string OpcodeStr, Intrinsic IntId> - : PSI<o, MRMSrcMem, (ops VR128:$dst, f32mem:$src), - !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId (load addr:$src)))]>; -class PD_Intr<bits<8> o, string OpcodeStr, Intrinsic IntId> - : PDI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src), - !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))]>; -class PD_Intm<bits<8> o, string OpcodeStr, Intrinsic IntId> 
- : PDI<o, MRMSrcMem, (ops VR128:$dst, f64mem:$src), - !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId (load addr:$src)))]>; - -class PS_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId> - : PSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>; -class PS_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId> - : PSI<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f32mem:$src2), - !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId VR128:$src1, (load addr:$src2)))]>; -class PD_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId> - : PDI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>; -class PD_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId> - : PDI<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f64mem:$src2), - !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId VR128:$src1, (load addr:$src2)))]>; - -// Some 'special' instructions -def IMPLICIT_DEF_FR32 : I<0, Pseudo, (ops FR32:$dst), - "#IMPLICIT_DEF $dst", - [(set FR32:$dst, (undef))]>, Requires<[HasSSE2]>; -def IMPLICIT_DEF_FR64 : I<0, Pseudo, (ops FR64:$dst), - "#IMPLICIT_DEF $dst", - [(set FR64:$dst, (undef))]>, Requires<[HasSSE2]>; - // CMOV* - Used to implement the SSE SELECT DAG operation. Expanded by the // scheduler into a branch sequence. let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler. @@ -310,28 +226,204 @@ let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler. 
(v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>; } +//===----------------------------------------------------------------------===// +// SSE1 Instructions +//===----------------------------------------------------------------------===// + +// SSE1 Instruction Templates: +// +// SSI - SSE1 instructions with XS prefix. +// PSI - SSE1 instructions with TB prefix. +// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix. + +class SSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE1]>; +class PSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : I<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>; +class PSIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>; + +// Helpers for defining instructions that directly correspond to intrinsics. +multiclass SS_IntUnary<bits<8> o, string OpcodeStr, Intrinsic IntId> { + def r : SSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src), + !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (IntId VR128:$src)))]>; + def m : SSI<o, MRMSrcMem, (ops VR128:$dst, ssmem:$src), + !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (IntId sse_load_f32:$src)))]>; +} + // Move Instructions def MOVSSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, FR32:$src), - "movss {$src, $dst|$dst, $src}", []>; + "movss {$src, $dst|$dst, $src}", []>; def MOVSSrm : SSI<0x10, MRMSrcMem, (ops FR32:$dst, f32mem:$src), - "movss {$src, $dst|$dst, $src}", - [(set FR32:$dst, (loadf32 addr:$src))]>; -def MOVSDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, FR64:$src), - "movsd {$src, $dst|$dst, $src}", []>; -def MOVSDrm : SDI<0x10, MRMSrcMem, (ops FR64:$dst, f64mem:$src), - "movsd {$src, $dst|$dst, $src}", - [(set FR64:$dst, (loadf64 addr:$src))]>; - + "movss {$src, $dst|$dst, $src}", + [(set FR32:$dst, (loadf32 addr:$src))]>; def MOVSSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, 
FR32:$src), - "movss {$src, $dst|$dst, $src}", - [(store FR32:$src, addr:$dst)]>; -def MOVSDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, FR64:$src), - "movsd {$src, $dst|$dst, $src}", - [(store FR64:$src, addr:$dst)]>; + "movss {$src, $dst|$dst, $src}", + [(store FR32:$src, addr:$dst)]>; -/// scalar_sse12_fp_binop_rm - Scalar SSE binops come in four basic forms: -/// 1. f32 vs f64 - These come in SSE1/SSE2 forms for float/doubles. -/// 2. rr vs rm - They include a reg+reg form and a ref+mem form. +def SQRTSSr : SSI<0x51, MRMSrcReg, (ops FR32:$dst, FR32:$src), + "sqrtss {$src, $dst|$dst, $src}", + [(set FR32:$dst, (fsqrt FR32:$src))]>; +def SQRTSSm : SSI<0x51, MRMSrcMem, (ops FR32:$dst, f32mem:$src), + "sqrtss {$src, $dst|$dst, $src}", + [(set FR32:$dst, (fsqrt (loadf32 addr:$src)))]>; + +// Aliases to match intrinsics which expect XMM operand(s). +defm SQRTSS_Int : SS_IntUnary<0x51, "sqrtss" , int_x86_sse_sqrt_ss>; +defm RSQRTSS_Int : SS_IntUnary<0x52, "rsqrtss", int_x86_sse_rsqrt_ss>; +defm RCPSS_Int : SS_IntUnary<0x53, "rcpss" , int_x86_sse_rcp_ss>; + +// Conversion instructions +def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, FR32:$src), + "cvttss2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (fp_to_sint FR32:$src))]>; +def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src), + "cvttss2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>; +def CVTSI2SSrr : SSI<0x2A, MRMSrcReg, (ops FR32:$dst, GR32:$src), + "cvtsi2ss {$src, $dst|$dst, $src}", + [(set FR32:$dst, (sint_to_fp GR32:$src))]>; +def CVTSI2SSrm : SSI<0x2A, MRMSrcMem, (ops FR32:$dst, i32mem:$src), + "cvtsi2ss {$src, $dst|$dst, $src}", + [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>; + +// Match intrinsics which expect XMM operand(s). 
+def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (ops GR32:$dst, VR128:$src), + "cvtss2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>; +def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (ops GR32:$dst, f32mem:$src), + "cvtss2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse_cvtss2si + (load addr:$src)))]>; + +// Aliases for intrinsics +def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src), + "cvttss2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, + (int_x86_sse_cvttss2si VR128:$src))]>; +def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src), + "cvttss2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, + (int_x86_sse_cvttss2si(load addr:$src)))]>; + +let isTwoAddress = 1 in { + def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, GR32:$src2), + "cvtsi2ss {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1, + GR32:$src2))]>; + def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i32mem:$src2), + "cvtsi2ss {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1, + (loadi32 addr:$src2)))]>; +} + +// Comparison instructions +let isTwoAddress = 1 in { + def CMPSSrr : SSI<0xC2, MRMSrcReg, + (ops FR32:$dst, FR32:$src1, FR32:$src, SSECC:$cc), + "cmp${cc}ss {$src, $dst|$dst, $src}", + []>; + def CMPSSrm : SSI<0xC2, MRMSrcMem, + (ops FR32:$dst, FR32:$src1, f32mem:$src, SSECC:$cc), + "cmp${cc}ss {$src, $dst|$dst, $src}", []>; +} + +def UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops FR32:$src1, FR32:$src2), + "ucomiss {$src2, $src1|$src1, $src2}", + [(X86cmp FR32:$src1, FR32:$src2)]>; +def UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops FR32:$src1, f32mem:$src2), + "ucomiss {$src2, $src1|$src1, $src2}", + [(X86cmp FR32:$src1, (loadf32 addr:$src2))]>; + +// Aliases to match intrinsics which expect XMM operand(s). 
+let isTwoAddress = 1 in { + def Int_CMPSSrr : SSI<0xC2, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc), + "cmp${cc}ss {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1, + VR128:$src, imm:$cc))]>; + def Int_CMPSSrm : SSI<0xC2, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f32mem:$src, SSECC:$cc), + "cmp${cc}ss {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1, + (load addr:$src), imm:$cc))]>; +} + +def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops VR128:$src1, VR128:$src2), + "ucomiss {$src2, $src1|$src1, $src2}", + [(X86ucomi (v4f32 VR128:$src1), VR128:$src2)]>; +def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops VR128:$src1, f128mem:$src2), + "ucomiss {$src2, $src1|$src1, $src2}", + [(X86ucomi (v4f32 VR128:$src1), (load addr:$src2))]>; + +def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (ops VR128:$src1, VR128:$src2), + "comiss {$src2, $src1|$src1, $src2}", + [(X86comi (v4f32 VR128:$src1), VR128:$src2)]>; +def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (ops VR128:$src1, f128mem:$src2), + "comiss {$src2, $src1|$src1, $src2}", + [(X86comi (v4f32 VR128:$src1), (load addr:$src2))]>; + +// Aliases of packed SSE1 instructions for scalar use. These all have names that +// start with 'Fs'. + +// Alias instructions that map fld0 to pxor for sse. +def FsFLD0SS : I<0xEF, MRMInitReg, (ops FR32:$dst), + "pxor $dst, $dst", [(set FR32:$dst, fp32imm0)]>, + Requires<[HasSSE1]>, TB, OpSize; + +// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are +// disregarded. +def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (ops FR32:$dst, FR32:$src), + "movaps {$src, $dst|$dst, $src}", []>; + +// Alias instruction to load FR32 from f128mem using movaps. Upper bits are +// disregarded. +def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (ops FR32:$dst, f128mem:$src), + "movaps {$src, $dst|$dst, $src}", + [(set FR32:$dst, (X86loadpf32 addr:$src))]>; + +// Alias bitwise logical operations using SSE logical ops on packed FP values. 
+let isTwoAddress = 1 in { + +let isCommutable = 1 in { + def FsANDPSrr : PSI<0x54, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2), + "andps {$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86fand FR32:$src1, FR32:$src2))]>; + def FsORPSrr : PSI<0x56, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2), + "orps {$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86for FR32:$src1, FR32:$src2))]>; + def FsXORPSrr : PSI<0x57, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2), + "xorps {$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86fxor FR32:$src1, FR32:$src2))]>; +} + +def FsANDPSrm : PSI<0x54, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2), + "andps {$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86fand FR32:$src1, + (X86loadpf32 addr:$src2)))]>; +def FsORPSrm : PSI<0x56, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2), + "orps {$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86for FR32:$src1, + (X86loadpf32 addr:$src2)))]>; +def FsXORPSrm : PSI<0x57, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2), + "xorps {$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86fxor FR32:$src1, + (X86loadpf32 addr:$src2)))]>; + +def FsANDNPSrr : PSI<0x55, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2), + "andnps {$src2, $dst|$dst, $src2}", []>; +def FsANDNPSrm : PSI<0x55, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2), + "andnps {$src2, $dst|$dst, $src2}", []>; +} + +/// scalar_sse1_fp_binop_rm - Scalar SSE1 binops come in three basic forms: +/// +/// 1. f32 - This comes in SSE1 form for floats. +/// 2. rr vs rm - They include a reg+reg form and a reg+mem form. /// /// In addition, scalar SSE ops have an intrinsic form. This form is unlike the /// normal form, in that they take an entire vector (instead of a scalar) and @@ -339,27 +431,20 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, FR64:$src), /// above permutations, giving us 8 forms for 'instruction'. 
/// let isTwoAddress = 1 in { -multiclass scalar_sse12_fp_binop_rm<bits<8> opc, string OpcodeStr, - SDNode OpNode, Intrinsic F32Int, - Intrinsic F64Int, bit Commutable = 0> { +multiclass scalar_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode, Intrinsic F32Int, + bit Commutable = 0> { // Scalar operation, reg+reg. def SSrr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2), !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"), [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> { let isCommutable = Commutable; } - def SDrr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2), - !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), - [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> { - let isCommutable = Commutable; - } + // Scalar operation, reg+mem. def SSrm : SSI<opc, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2), !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"), [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>; - def SDrm : SDI<opc, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f64mem:$src2), - !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), - [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>; // Vector intrinsic operation, reg+reg. def SSrr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), @@ -367,219 +452,499 @@ multiclass scalar_sse12_fp_binop_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> { let isCommutable = Commutable; } - def SDrr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> { - let isCommutable = Commutable; - } + // Vector intrinsic operation, reg+mem. 
def SSrm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, ssmem:$src2), !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (F32Int VR128:$src1, sse_load_f32:$src2))]>; - def SDrm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, sdmem:$src2), - !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (F64Int VR128:$src1, - sse_load_f64:$src2))]>; } } // Arithmetic instructions +defm ADD : scalar_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>; +defm MUL : scalar_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>; +defm SUB : scalar_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>; +defm DIV : scalar_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>; -defm ADD : scalar_sse12_fp_binop_rm<0x58, "add", fadd, - int_x86_sse_add_ss, int_x86_sse2_add_sd, 1>; -defm MUL : scalar_sse12_fp_binop_rm<0x59, "mul", fmul, - int_x86_sse_mul_ss, int_x86_sse2_mul_sd, 1>; -defm SUB : scalar_sse12_fp_binop_rm<0x5C, "sub", fsub, - int_x86_sse_sub_ss, int_x86_sse2_sub_sd>; -defm DIV : scalar_sse12_fp_binop_rm<0x5E, "div", fdiv, - int_x86_sse_div_ss, int_x86_sse2_div_sd>; +defm MAX : scalar_sse1_fp_binop_rm<0x5F, "max", X86fmax, int_x86_sse_max_ss>; +defm MIN : scalar_sse1_fp_binop_rm<0x5D, "min", X86fmin, int_x86_sse_min_ss>; -defm MAX : scalar_sse12_fp_binop_rm<0x5F, "max", X86fmax, - int_x86_sse_max_ss, int_x86_sse2_max_sd>; -defm MIN : scalar_sse12_fp_binop_rm<0x5D, "min", X86fmin, - int_x86_sse_min_ss, int_x86_sse2_min_sd>; +//===----------------------------------------------------------------------===// +// SSE packed FP Instructions +// Move Instructions +def MOVAPSrr : PSI<0x28, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movaps {$src, $dst|$dst, $src}", []>; +def MOVAPSrm : PSI<0x28, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + "movaps {$src, $dst|$dst, $src}", + [(set VR128:$dst, (loadv4f32 addr:$src))]>; + +def MOVAPSmr : PSI<0x29, MRMDestMem, (ops f128mem:$dst, VR128:$src), + "movaps {$src, 
$dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)]>; + +def MOVUPSrr : PSI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movups {$src, $dst|$dst, $src}", []>; +def MOVUPSrm : PSI<0x10, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + "movups {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>; +def MOVUPSmr : PSI<0x11, MRMDestMem, (ops f128mem:$dst, VR128:$src), + "movups {$src, $dst|$dst, $src}", + [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>; + +let isTwoAddress = 1 in { +let AddedComplexity = 20 in { +def MOVLPSrm : PSI<0x12, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f64mem:$src2), + "movlps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))), + MOVLP_shuffle_mask)))]>; +def MOVHPSrm : PSI<0x16, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f64mem:$src2), + "movhps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))), + MOVHP_shuffle_mask)))]>; +} // AddedComplexity +} // isTwoAddress + +def MOVLPSmr : PSI<0x13, MRMDestMem, (ops f64mem:$dst, VR128:$src), + "movlps {$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)]>; + +// v2f64 extract element 1 is always custom lowered to unpack high to low +// and extract element 0 so the non-store version isn't too horrible. 
+def MOVHPSmr : PSI<0x17, MRMDestMem, (ops f64mem:$dst, VR128:$src), + "movhps {$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (vector_shuffle + (bc_v2f64 (v4f32 VR128:$src)), (undef), + UNPCKH_shuffle_mask)), (iPTR 0))), + addr:$dst)]>; + +let isTwoAddress = 1 in { +let AddedComplexity = 15 in { +def MOVLHPSrr : PSI<0x16, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + "movlhps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle VR128:$src1, VR128:$src2, + MOVHP_shuffle_mask)))]>; + +def MOVHLPSrr : PSI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + "movhlps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle VR128:$src1, VR128:$src2, + MOVHLPS_shuffle_mask)))]>; +} // AddedComplexity +} // isTwoAddress + + + +/// packed_sse1_fp_binop_rm - Packed SSE binops come in two basic forms: +/// 1. v4f32 - This comes in SSE1 form for float. +/// 2. rr vs rm - They include a reg+reg form and a reg+mem form. +/// +let isTwoAddress = 1 in { +multiclass packed_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode, bit Commutable = 0> { + // Packed operation, reg+reg. + def PSrr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> { + let isCommutable = Commutable; + } + + // Packed operation, reg+mem. 
+ def PSrm : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>; +} +} + +defm ADD : packed_sse1_fp_binop_rm<0x58, "add", fadd, 1>; +defm MUL : packed_sse1_fp_binop_rm<0x59, "mul", fmul, 1>; +defm DIV : packed_sse1_fp_binop_rm<0x5E, "div", fdiv>; +defm SUB : packed_sse1_fp_binop_rm<0x5C, "sub", fsub>; + +// Arithmetic + +class PS_Intr<bits<8> o, string OpcodeStr, Intrinsic IntId> + : PSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src), + !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId VR128:$src))]>; +class PS_Intm<bits<8> o, string OpcodeStr, Intrinsic IntId> + : PSI<o, MRMSrcMem, (ops VR128:$dst, f32mem:$src), + !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId (load addr:$src)))]>; + +class PS_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId> + : PSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>; +class PS_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId> + : PSI<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f32mem:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (IntId VR128:$src1, (load addr:$src2)))]>; + +def SQRTPSr : PS_Intr<0x51, "sqrtps", int_x86_sse_sqrt_ps>; +def SQRTPSm : PS_Intm<0x51, "sqrtps", int_x86_sse_sqrt_ps>; + +def RSQRTPSr : PS_Intr<0x52, "rsqrtps", int_x86_sse_rsqrt_ps>; +def RSQRTPSm : PS_Intm<0x52, "rsqrtps", int_x86_sse_rsqrt_ps>; +def RCPPSr : PS_Intr<0x53, "rcpps", int_x86_sse_rcp_ps>; +def RCPPSm : PS_Intm<0x53, "rcpps", int_x86_sse_rcp_ps>; + +let isTwoAddress = 1 in { + let isCommutable = 1 in { + def MAXPSrr : PS_Intrr<0x5F, "maxps", int_x86_sse_max_ps>; + def MINPSrr : PS_Intrr<0x5D, "minps", int_x86_sse_min_ps>; + } + + def MAXPSrm : PS_Intrm<0x5F, "maxps", int_x86_sse_max_ps>; + def 
MINPSrm : PS_Intrm<0x5D, "minps", int_x86_sse_min_ps>; +} + +// Logical +let isTwoAddress = 1 in { + let isCommutable = 1 in { + def ANDPSrr : PSI<0x54, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "andps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (v2i64 + (and VR128:$src1, VR128:$src2)))]>; + def ORPSrr : PSI<0x56, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "orps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (v2i64 + (or VR128:$src1, VR128:$src2)))]>; + def XORPSrr : PSI<0x57, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "xorps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (v2i64 + (xor VR128:$src1, VR128:$src2)))]>; + } + + def ANDPSrm : PSI<0x54, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "andps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (and VR128:$src1, + (bc_v2i64 (loadv4f32 addr:$src2))))]>; + def ORPSrm : PSI<0x56, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "orps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (or VR128:$src1, + (bc_v2i64 (loadv4f32 addr:$src2))))]>; + def XORPSrm : PSI<0x57, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "xorps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (xor VR128:$src1, + (bc_v2i64 (loadv4f32 addr:$src2))))]>; + def ANDNPSrr : PSI<0x55, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "andnps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (and (xor VR128:$src1, + (bc_v2i64 (v4i32 immAllOnesV))), + VR128:$src2)))]>; + def ANDNPSrm : PSI<0x55, MRMSrcMem, + (ops VR128:$dst, VR128:$src1,f128mem:$src2), + "andnps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (and (xor VR128:$src1, + (bc_v2i64 (v4i32 immAllOnesV))), + (bc_v2i64 (loadv4f32 addr:$src2)))))]>; +} + +let isTwoAddress = 1 in { + def CMPPSrri : PSIi8<0xC2, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc), + "cmp${cc}ps {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1, + 
VR128:$src, imm:$cc))]>; + def CMPPSrmi : PSIi8<0xC2, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src, SSECC:$cc), + "cmp${cc}ps {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1, + (load addr:$src), imm:$cc))]>; +} + +// Shuffle and unpack instructions +let isTwoAddress = 1 in { + let isConvertibleToThreeAddress = 1 in // Convert to pshufd + def SHUFPSrri : PSIi8<0xC6, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, + VR128:$src2, i32i8imm:$src3), + "shufps {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (v4f32 (vector_shuffle + VR128:$src1, VR128:$src2, + SHUFP_shuffle_mask:$src3)))]>; + def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, + f128mem:$src2, i32i8imm:$src3), + "shufps {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (v4f32 (vector_shuffle + VR128:$src1, (load addr:$src2), + SHUFP_shuffle_mask:$src3)))]>; + + let AddedComplexity = 10 in { + def UNPCKHPSrr : PSI<0x15, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "unpckhps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle + VR128:$src1, VR128:$src2, + UNPCKH_shuffle_mask)))]>; + def UNPCKHPSrm : PSI<0x15, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "unpckhps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle + VR128:$src1, (load addr:$src2), + UNPCKH_shuffle_mask)))]>; + + def UNPCKLPSrr : PSI<0x14, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "unpcklps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle + VR128:$src1, VR128:$src2, + UNPCKL_shuffle_mask)))]>; + def UNPCKLPSrm : PSI<0x14, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "unpcklps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle + VR128:$src1, (load addr:$src2), + UNPCKL_shuffle_mask)))]>; + } // AddedComplexity +} // isTwoAddress + +// Mask creation +def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (ops GR32:$dst, 
VR128:$src), + "movmskps {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>; +def MOVMSKPDrr : PSI<0x50, MRMSrcReg, (ops GR32:$dst, VR128:$src), + "movmskpd {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>; + +// Prefetching loads. +// TODO: no intrinsics for these? +def PREFETCHT0 : PSI<0x18, MRM1m, (ops i8mem:$src), "prefetcht0 $src", []>; +def PREFETCHT1 : PSI<0x18, MRM2m, (ops i8mem:$src), "prefetcht1 $src", []>; +def PREFETCHT2 : PSI<0x18, MRM3m, (ops i8mem:$src), "prefetcht2 $src", []>; +def PREFETCHNTA : PSI<0x18, MRM0m, (ops i8mem:$src), "prefetchnta $src", []>; + +// Non-temporal stores +def MOVNTPSmr : PSI<0x2B, MRMDestMem, (ops i128mem:$dst, VR128:$src), + "movntps {$src, $dst|$dst, $src}", + [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>; + +// Load, store, and memory fence +def SFENCE : PSI<0xAE, MRM7m, (ops), "sfence", [(int_x86_sse_sfence)]>; + +// MXCSR register +def LDMXCSR : PSI<0xAE, MRM2m, (ops i32mem:$src), + "ldmxcsr $src", [(int_x86_sse_ldmxcsr addr:$src)]>; +def STMXCSR : PSI<0xAE, MRM3m, (ops i32mem:$dst), + "stmxcsr $dst", [(int_x86_sse_stmxcsr addr:$dst)]>; + +// Alias instructions that map zero vector to pxor / xorp* for sse. +// FIXME: remove when we can teach regalloc that xor reg, reg is ok. +let isReMaterializable = 1 in +def V_SET0 : PSI<0x57, MRMInitReg, (ops VR128:$dst), + "xorps $dst, $dst", + [(set VR128:$dst, (v4f32 immAllZerosV))]>; + +// FR32 to 128-bit vector conversion. +def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (ops VR128:$dst, FR32:$src), + "movss {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4f32 (scalar_to_vector FR32:$src)))]>; +def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src), + "movss {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>; + +// FIXME: may not be able to eliminate this movss with coalescing because the src and +// dest register classes are different. 
We really want to write this pattern +// like this: +// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), +// (f32 FR32:$src)>; +def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, VR128:$src), + "movss {$src, $dst|$dst, $src}", + [(set FR32:$dst, (vector_extract (v4f32 VR128:$src), + (iPTR 0)))]>; +def MOVPS2SSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, VR128:$src), + "movss {$src, $dst|$dst, $src}", + [(store (f32 (vector_extract (v4f32 VR128:$src), + (iPTR 0))), addr:$dst)]>; + + +// Move to lower bits of a VR128, leaving upper bits alone. +// Three operand (but two address) aliases. +let isTwoAddress = 1 in { + def MOVLSS2PSrr : SSI<0x10, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, FR32:$src2), + "movss {$src2, $dst|$dst, $src2}", []>; + + let AddedComplexity = 15 in + def MOVLPSrr : SSI<0x10, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "movss {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle VR128:$src1, VR128:$src2, + MOVL_shuffle_mask)))]>; +} + +// Move to lower bits of a VR128 and zeroing upper bits. |