aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/Target/X86/X86InstrSSE.td2255
1 files changed, 1251 insertions, 1004 deletions
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 57149b8742..1d454aa67b 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2,8 +2,8 @@
//
// The LLVM Compiler Infrastructure
//
-// This file was developed by the Evan Cheng and is distributed under
-// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// This file was developed by Evan Cheng and is distributed under the University
+// of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
@@ -41,6 +41,21 @@ def X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>;
def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>;
//===----------------------------------------------------------------------===//
+// SSE 'Special' Instructions
+//===----------------------------------------------------------------------===//
+
+def IMPLICIT_DEF_VR128 : I<0, Pseudo, (ops VR128:$dst),
+ "#IMPLICIT_DEF $dst",
+ [(set VR128:$dst, (v4f32 (undef)))]>,
+ Requires<[HasSSE1]>;
+def IMPLICIT_DEF_FR32 : I<0, Pseudo, (ops FR32:$dst),
+ "#IMPLICIT_DEF $dst",
+ [(set FR32:$dst, (undef))]>, Requires<[HasSSE2]>;
+def IMPLICIT_DEF_FR64 : I<0, Pseudo, (ops FR64:$dst),
+ "#IMPLICIT_DEF $dst",
+ [(set FR64:$dst, (undef))]>, Requires<[HasSSE2]>;
+
+//===----------------------------------------------------------------------===//
// SSE Complex Patterns
//===----------------------------------------------------------------------===//
@@ -70,6 +85,7 @@ def X86loadpf64 : PatFrag<(ops node:$ptr), (f64 (X86loadp node:$ptr))>;
def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv2i32 : PatFrag<(ops node:$ptr), (v2i32 (load node:$ptr))>;
def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
@@ -182,106 +198,6 @@ def PSHUFD_binary_shuffle_mask : PatLeaf<(build_vector), [{
// SSE scalar FP Instructions
//===----------------------------------------------------------------------===//
-// Instruction templates
-// SSI - SSE1 instructions with XS prefix.
-// SDI - SSE2 instructions with XD prefix.
-// PSI - SSE1 instructions with TB prefix.
-// PDI - SSE2 instructions with TB and OpSize prefixes.
-// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
-// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
-// S3I - SSE3 instructions with TB and OpSize prefixes.
-// S3SI - SSE3 instructions with XS prefix.
-// S3DI - SSE3 instructions with XD prefix.
-// SS38I - SSSE3 instructions with T8 and OpSize prefixes.
-// SS3AI - SSSE3 instructions with TA and OpSize prefixes.
-class SSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
- : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE1]>;
-class SDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
- : I<o, F, ops, asm, pattern>, XD, Requires<[HasSSE2]>;
-class PSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
- : I<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>;
-class PDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
- : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
-class PSIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
- : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>;
-class PDIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
- : Ii8<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
-
-class S3SI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
- : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE3]>;
-class S3DI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
- : I<o, F, ops, asm, pattern>, XD, Requires<[HasSSE3]>;
-class S3I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
- : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE3]>;
-
-class SS38I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
- : I<o, F, ops, asm, pattern>, T8, OpSize, Requires<[HasSSSE3]>;
-class SS3AI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
- : I<o, F, ops, asm, pattern>, TA, OpSize, Requires<[HasSSSE3]>;
-
-//===----------------------------------------------------------------------===//
-// Helpers for defining instructions that directly correspond to intrinsics.
-
-multiclass SS_IntUnary<bits<8> o, string OpcodeStr, Intrinsic IntId> {
- def r : SSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src),
- !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v4f32 (IntId VR128:$src)))]>;
- def m : SSI<o, MRMSrcMem, (ops VR128:$dst, ssmem:$src),
- !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v4f32 (IntId sse_load_f32:$src)))]>;
-}
-
-multiclass SD_IntUnary<bits<8> o, string OpcodeStr, Intrinsic IntId> {
- def r : SDI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src),
- !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v2f64 (IntId VR128:$src)))]>;
- def m : SDI<o, MRMSrcMem, (ops VR128:$dst, sdmem:$src),
- !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v2f64 (IntId sse_load_f64:$src)))]>;
-}
-
-class PS_Intr<bits<8> o, string OpcodeStr, Intrinsic IntId>
- : PSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src),
- !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId VR128:$src))]>;
-class PS_Intm<bits<8> o, string OpcodeStr, Intrinsic IntId>
- : PSI<o, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
- !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId (load addr:$src)))]>;
-class PD_Intr<bits<8> o, string OpcodeStr, Intrinsic IntId>
- : PDI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src),
- !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId VR128:$src))]>;
-class PD_Intm<bits<8> o, string OpcodeStr, Intrinsic IntId>
- : PDI<o, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
- !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId (load addr:$src)))]>;
-
-class PS_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
- : PSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
- [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
-class PS_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
- : PSI<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f32mem:$src2),
- !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
- [(set VR128:$dst, (IntId VR128:$src1, (load addr:$src2)))]>;
-class PD_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
- : PDI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
- [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
-class PD_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
- : PDI<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f64mem:$src2),
- !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
- [(set VR128:$dst, (IntId VR128:$src1, (load addr:$src2)))]>;
-
-// Some 'special' instructions
-def IMPLICIT_DEF_FR32 : I<0, Pseudo, (ops FR32:$dst),
- "#IMPLICIT_DEF $dst",
- [(set FR32:$dst, (undef))]>, Requires<[HasSSE2]>;
-def IMPLICIT_DEF_FR64 : I<0, Pseudo, (ops FR64:$dst),
- "#IMPLICIT_DEF $dst",
- [(set FR64:$dst, (undef))]>, Requires<[HasSSE2]>;
-
// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded by the
// scheduler into a branch sequence.
let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
@@ -310,28 +226,204 @@ let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
(v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>;
}
+//===----------------------------------------------------------------------===//
+// SSE1 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSE1 Instruction Templates:
+//
+// SSI - SSE1 instructions with XS prefix.
+// PSI - SSE1 instructions with TB prefix.
+// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
+
+class SSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE1]>;
+class PSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>;
+class PSIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>;
+
+// SS_IntUnary - Helper for unary SSE1 scalar instructions that correspond
+// directly to an intrinsic. It expands to two records: 'r' takes its source in
+// a VR128 register and 'm' takes it from an ssmem operand matched through
+// sse_load_f32. Both produce a v4f32 result in VR128 by applying IntId.
+multiclass SS_IntUnary<bits<8> o, string OpcodeStr, Intrinsic IntId> {
+  // Register source form.
+  def r : SSI<o, MRMSrcReg,
+              (ops VR128:$dst, VR128:$src),
+              !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v4f32 (IntId VR128:$src)))]>;
+
+  // Memory source form.
+  def m : SSI<o, MRMSrcMem,
+              (ops VR128:$dst, ssmem:$src),
+              !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v4f32 (IntId sse_load_f32:$src)))]>;
+}
+
// Move Instructions
def MOVSSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, FR32:$src),
- "movss {$src, $dst|$dst, $src}", []>;
+ "movss {$src, $dst|$dst, $src}", []>;
def MOVSSrm : SSI<0x10, MRMSrcMem, (ops FR32:$dst, f32mem:$src),
- "movss {$src, $dst|$dst, $src}",
- [(set FR32:$dst, (loadf32 addr:$src))]>;
-def MOVSDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, FR64:$src),
- "movsd {$src, $dst|$dst, $src}", []>;
-def MOVSDrm : SDI<0x10, MRMSrcMem, (ops FR64:$dst, f64mem:$src),
- "movsd {$src, $dst|$dst, $src}",
- [(set FR64:$dst, (loadf64 addr:$src))]>;
-
+ "movss {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (loadf32 addr:$src))]>;
def MOVSSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, FR32:$src),
- "movss {$src, $dst|$dst, $src}",
- [(store FR32:$src, addr:$dst)]>;
-def MOVSDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, FR64:$src),
- "movsd {$src, $dst|$dst, $src}",
- [(store FR64:$src, addr:$dst)]>;
+ "movss {$src, $dst|$dst, $src}",
+ [(store FR32:$src, addr:$dst)]>;
-/// scalar_sse12_fp_binop_rm - Scalar SSE binops come in four basic forms:
-/// 1. f32 vs f64 - These come in SSE1/SSE2 forms for float/doubles.
-/// 2. rr vs rm - They include a reg+reg form and a ref+mem form.
+def SQRTSSr : SSI<0x51, MRMSrcReg, (ops FR32:$dst, FR32:$src),
+ "sqrtss {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fsqrt FR32:$src))]>;
+def SQRTSSm : SSI<0x51, MRMSrcMem, (ops FR32:$dst, f32mem:$src),
+ "sqrtss {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fsqrt (loadf32 addr:$src)))]>;
+
+// Aliases to match intrinsics which expect XMM operand(s).
+defm SQRTSS_Int : SS_IntUnary<0x51, "sqrtss" , int_x86_sse_sqrt_ss>;
+defm RSQRTSS_Int : SS_IntUnary<0x52, "rsqrtss", int_x86_sse_rsqrt_ss>;
+defm RCPSS_Int : SS_IntUnary<0x53, "rcpss" , int_x86_sse_rcp_ss>;
+
+// Conversion instructions
+def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, FR32:$src),
+ "cvttss2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src),
+ "cvttss2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def CVTSI2SSrr : SSI<0x2A, MRMSrcReg, (ops FR32:$dst, GR32:$src),
+ "cvtsi2ss {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SSrm : SSI<0x2A, MRMSrcMem, (ops FR32:$dst, i32mem:$src),
+ "cvtsi2ss {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+ "cvtss2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>;
+def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (ops GR32:$dst, f32mem:$src),
+ "cvtss2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_cvtss2si
+ (load addr:$src)))]>;
+
+// Aliases for intrinsics
+def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+ "cvttss2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse_cvttss2si VR128:$src))]>;
+def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src),
+ "cvttss2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse_cvttss2si(load addr:$src)))]>;
+
+let isTwoAddress = 1 in {
+ def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, GR32:$src2),
+ "cvtsi2ss {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+ GR32:$src2))]>;
+ def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i32mem:$src2),
+ "cvtsi2ss {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+ (loadi32 addr:$src2)))]>;
+}
+
+// Comparison instructions
+// Scalar single-precision compare, reg+reg and reg+mem. The SSECC:$cc operand
+// is an immediate, so these records are built on Ii8 (as the packed
+// CMPPSrri/CMPPSrmi records are through PSIi8) rather than on SSI, which
+// derives from I and declares no immediate. XS and Requires<[HasSSE1]> are
+// spelled out to keep the same prefix and predicate that SSI supplied.
+// No selection pattern is attached to either record.
+let isTwoAddress = 1 in {
+  def CMPSSrr : Ii8<0xC2, MRMSrcReg,
+                    (ops FR32:$dst, FR32:$src1, FR32:$src, SSECC:$cc),
+                    "cmp${cc}ss {$src, $dst|$dst, $src}",
+                    []>,
+                XS, Requires<[HasSSE1]>;
+  def CMPSSrm : Ii8<0xC2, MRMSrcMem,
+                    (ops FR32:$dst, FR32:$src1, f32mem:$src, SSECC:$cc),
+                    "cmp${cc}ss {$src, $dst|$dst, $src}", []>,
+                XS, Requires<[HasSSE1]>;
+}
+
+def UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops FR32:$src1, FR32:$src2),
+ "ucomiss {$src2, $src1|$src1, $src2}",
+ [(X86cmp FR32:$src1, FR32:$src2)]>;
+def UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops FR32:$src1, f32mem:$src2),
+ "ucomiss {$src2, $src1|$src1, $src2}",
+ [(X86cmp FR32:$src1, (loadf32 addr:$src2))]>;
+
+// Aliases to match intrinsics which expect XMM operand(s).
+// Intrinsic (whole-XMM operand) forms of the scalar single-precision compare,
+// selected from int_x86_sse_cmp_ss. The SSECC:$cc operand is an immediate, so
+// these records are built on Ii8 (as the packed CMPPSrri/CMPPSrmi records are
+// through PSIi8) rather than on SSI, which derives from I and declares no
+// immediate. XS and Requires<[HasSSE1]> keep the prefix and predicate that SSI
+// supplied.
+let isTwoAddress = 1 in {
+  def Int_CMPSSrr : Ii8<0xC2, MRMSrcReg,
+                        (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+                        "cmp${cc}ss {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+                                           VR128:$src, imm:$cc))]>,
+                    XS, Requires<[HasSSE1]>;
+  def Int_CMPSSrm : Ii8<0xC2, MRMSrcMem,
+                        (ops VR128:$dst, VR128:$src1, f32mem:$src, SSECC:$cc),
+                        "cmp${cc}ss {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+                                           (load addr:$src), imm:$cc))]>,
+                    XS, Requires<[HasSSE1]>;
+}
+
+def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+ "ucomiss {$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v4f32 VR128:$src1), VR128:$src2)]>;
+def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+ "ucomiss {$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v4f32 VR128:$src1), (load addr:$src2))]>;
+
+def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+ "comiss {$src2, $src1|$src1, $src2}",
+ [(X86comi (v4f32 VR128:$src1), VR128:$src2)]>;
+def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+ "comiss {$src2, $src1|$src1, $src2}",
+ [(X86comi (v4f32 VR128:$src1), (load addr:$src2))]>;
+
+// Aliases of packed SSE1 instructions for scalar use. These all have names that
+// start with 'Fs'.
+
+// Alias instruction that materializes the fp32imm0 constant in an FR32 register
+// by xor-ing the register with itself. XORPS is used rather than PXOR: the XMM
+// form of PXOR needs the operand-size prefix and is an SSE2 encoding, whereas
+// this record is only predicated on SSE1. This is the same opcode V_SET0 uses
+// for VR128.
+def FsFLD0SS : PSI<0x57, MRMInitReg, (ops FR32:$dst),
+                   "xorps $dst, $dst", [(set FR32:$dst, fp32imm0)]>;
+
+// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are
+// disregarded.
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (ops FR32:$dst, FR32:$src),
+ "movaps {$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR32 from f128mem using movaps. Upper bits are
+// disregarded.
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (ops FR32:$dst, f128mem:$src),
+ "movaps {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (X86loadpf32 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let isTwoAddress = 1 in {
+
+let isCommutable = 1 in {
+ def FsANDPSrr : PSI<0x54, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+ "andps {$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fand FR32:$src1, FR32:$src2))]>;
+ def FsORPSrr : PSI<0x56, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+ "orps {$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86for FR32:$src1, FR32:$src2))]>;
+ def FsXORPSrr : PSI<0x57, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+ "xorps {$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fxor FR32:$src1, FR32:$src2))]>;
+}
+
+def FsANDPSrm : PSI<0x54, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+ "andps {$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fand FR32:$src1,
+ (X86loadpf32 addr:$src2)))]>;
+def FsORPSrm : PSI<0x56, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+ "orps {$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86for FR32:$src1,
+ (X86loadpf32 addr:$src2)))]>;
+def FsXORPSrm : PSI<0x57, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+ "xorps {$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fxor FR32:$src1,
+ (X86loadpf32 addr:$src2)))]>;
+
+def FsANDNPSrr : PSI<0x55, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+ "andnps {$src2, $dst|$dst, $src2}", []>;
+def FsANDNPSrm : PSI<0x55, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+ "andnps {$src2, $dst|$dst, $src2}", []>;
+}
+
+/// scalar_sse1_fp_binop_rm - Scalar SSE1 binops come in three basic forms:
+///
+/// 1. f32 - This comes in SSE1 form for floats.
+/// 2. rr vs rm - They include a reg+reg form and a reg+mem form.
///
/// In addition, scalar SSE ops have an intrinsic form. This form is unlike the
/// normal form, in that they take an entire vector (instead of a scalar) and
@@ -339,27 +431,20 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, FR64:$src),
/// above permutations, giving us 8 forms for 'instruction'.
///
let isTwoAddress = 1 in {
-multiclass scalar_sse12_fp_binop_rm<bits<8> opc, string OpcodeStr,
- SDNode OpNode, Intrinsic F32Int,
- Intrinsic F64Int, bit Commutable = 0> {
+multiclass scalar_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, Intrinsic F32Int,
+ bit Commutable = 0> {
// Scalar operation, reg+reg.
def SSrr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
!strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
[(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
let isCommutable = Commutable;
}
- def SDrr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
- !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
- [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
- let isCommutable = Commutable;
- }
+
// Scalar operation, reg+mem.
def SSrm : SSI<opc, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2),
!strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
[(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
- def SDrm : SDI<opc, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f64mem:$src2),
- !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
- [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
// Vector intrinsic operation, reg+reg.
def SSrr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
@@ -367,219 +452,499 @@ multiclass scalar_sse12_fp_binop_rm<bits<8> opc, string OpcodeStr,
[(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
let isCommutable = Commutable;
}
- def SDrr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
- [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
- let isCommutable = Commutable;
- }
+
// Vector intrinsic operation, reg+mem.
def SSrm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, ssmem:$src2),
!strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
[(set VR128:$dst, (F32Int VR128:$src1,
sse_load_f32:$src2))]>;
- def SDrm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, sdmem:$src2),
- !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
- [(set VR128:$dst, (F64Int VR128:$src1,
- sse_load_f64:$src2))]>;
}
}
// Arithmetic instructions
+defm ADD : scalar_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>;
+defm MUL : scalar_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>;
+defm SUB : scalar_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>;
+defm DIV : scalar_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>;
-defm ADD : scalar_sse12_fp_binop_rm<0x58, "add", fadd,
- int_x86_sse_add_ss, int_x86_sse2_add_sd, 1>;
-defm MUL : scalar_sse12_fp_binop_rm<0x59, "mul", fmul,
- int_x86_sse_mul_ss, int_x86_sse2_mul_sd, 1>;
-defm SUB : scalar_sse12_fp_binop_rm<0x5C, "sub", fsub,
- int_x86_sse_sub_ss, int_x86_sse2_sub_sd>;
-defm DIV : scalar_sse12_fp_binop_rm<0x5E, "div", fdiv,
- int_x86_sse_div_ss, int_x86_sse2_div_sd>;
+defm MAX : scalar_sse1_fp_binop_rm<0x5F, "max", X86fmax, int_x86_sse_max_ss>;
+defm MIN : scalar_sse1_fp_binop_rm<0x5D, "min", X86fmin, int_x86_sse_min_ss>;
-defm MAX : scalar_sse12_fp_binop_rm<0x5F, "max", X86fmax,
- int_x86_sse_max_ss, int_x86_sse2_max_sd>;
-defm MIN : scalar_sse12_fp_binop_rm<0x5D, "min", X86fmin,
- int_x86_sse_min_ss, int_x86_sse2_min_sd>;
+//===----------------------------------------------------------------------===//
+// SSE packed FP Instructions
+// Move Instructions
+def MOVAPSrr : PSI<0x28, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "movaps {$src, $dst|$dst, $src}", []>;
+def MOVAPSrm : PSI<0x28, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "movaps {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv4f32 addr:$src))]>;
+
+def MOVAPSmr : PSI<0x29, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+ "movaps {$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)]>;
+
+def MOVUPSrr : PSI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "movups {$src, $dst|$dst, $src}", []>;
+def MOVUPSrm : PSI<0x10, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "movups {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+ "movups {$src, $dst|$dst, $src}",
+ [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>;
+
+let isTwoAddress = 1 in {
+let AddedComplexity = 20 in {
+def MOVLPSrm : PSI<0x12, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+ "movlps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
+ MOVLP_shuffle_mask)))]>;
+def MOVHPSrm : PSI<0x16, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+ "movhps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
+ MOVHP_shuffle_mask)))]>;
+} // AddedComplexity
+} // isTwoAddress
+
+def MOVLPSmr : PSI<0x13, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+ "movlps {$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def MOVHPSmr : PSI<0x17, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+ "movhps {$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (v2f64 (vector_shuffle
+ (bc_v2f64 (v4f32 VR128:$src)), (undef),
+ UNPCKH_shuffle_mask)), (iPTR 0))),
+ addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let AddedComplexity = 15 in {
+def MOVLHPSrr : PSI<0x16, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "movlhps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVHP_shuffle_mask)))]>;
+
+def MOVHLPSrr : PSI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "movhlps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVHLPS_shuffle_mask)))]>;
+} // AddedComplexity
+} // isTwoAddress
+
+
+
+/// packed_sse1_fp_binop_rm - Packed SSE1 binops come in two basic forms:
+///   1. rr - A reg+reg form operating on v4f32 values.
+///   2. rm - A reg+mem form whose second operand is a loaded v4f32.
+///
+let isTwoAddress = 1 in {
+// Expands to <NAME>PSrr and <NAME>PSrm for the packed single-precision binary
+// operation OpNode. Commutable only affects the reg+reg record.
+multiclass packed_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                                   SDNode OpNode, bit Commutable = 0> {
+  // reg+reg: both sources are VR128 registers.
+  def PSrr : PSI<opc, MRMSrcReg,
+                 (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                 !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst,
+                   (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // reg+mem: the second source is a v4f32 loaded from memory.
+  def PSrm : PSI<opc, MRMSrcMem,
+                 (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst,
+                   (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>;
+}
+}
+
+defm ADD : packed_sse1_fp_binop_rm<0x58, "add", fadd, 1>;
+defm MUL : packed_sse1_fp_binop_rm<0x59, "mul", fmul, 1>;
+defm DIV : packed_sse1_fp_binop_rm<0x5E, "div", fdiv>;
+defm SUB : packed_sse1_fp_binop_rm<0x5C, "sub", fsub>;
+
+// Arithmetic
+
+// Helper classes for packed single-precision instructions that correspond
+// directly to an intrinsic. The memory forms feed a whole vector load into the
+// intrinsic, so their memory operand is declared as f128mem, like the other
+// packed reg+mem records in this file, rather than f32mem.
+
+// Unary, register source.
+class PS_Intr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : PSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+        !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
+        [(set VR128:$dst, (IntId VR128:$src))]>;
+// Unary, memory source.
+class PS_Intm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : PSI<o, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+        !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
+        [(set VR128:$dst, (IntId (load addr:$src)))]>;
+
+// Binary, reg+reg. Only $src2 appears in the assembly string; users wrap these
+// in isTwoAddress so that $src1 is tied to $dst.
+class PS_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : PSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+        !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+        [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
+// Binary, reg+mem.
+class PS_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : PSI<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+        !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+        [(set VR128:$dst, (IntId VR128:$src1, (load addr:$src2)))]>;
+
+def SQRTPSr : PS_Intr<0x51, "sqrtps", int_x86_sse_sqrt_ps>;
+def SQRTPSm : PS_Intm<0x51, "sqrtps", int_x86_sse_sqrt_ps>;
+
+def RSQRTPSr : PS_Intr<0x52, "rsqrtps", int_x86_sse_rsqrt_ps>;
+def RSQRTPSm : PS_Intm<0x52, "rsqrtps", int_x86_sse_rsqrt_ps>;
+def RCPPSr : PS_Intr<0x53, "rcpps", int_x86_sse_rcp_ps>;
+def RCPPSm : PS_Intm<0x53, "rcpps", int_x86_sse_rcp_ps>;
+
+// Packed single-precision max/min, selected from the int_x86_sse_max_ps and
+// int_x86_sse_min_ps intrinsics. The reg+reg forms are deliberately NOT marked
+// isCommutable: the hardware instructions return the second source operand
+// when an element is a NaN or when both elements are zeros, so swapping the
+// operands can change the result. This matches the scalar MAX/MIN defms, which
+// leave Commutable at its default of 0.
+let isTwoAddress = 1 in {
+  def MAXPSrr : PS_Intrr<0x5F, "maxps", int_x86_sse_max_ps>;
+  def MINPSrr : PS_Intrr<0x5D, "minps", int_x86_sse_min_ps>;
+
+  def MAXPSrm : PS_Intrm<0x5F, "maxps", int_x86_sse_max_ps>;
+  def MINPSrm : PS_Intrm<0x5D, "minps", int_x86_sse_min_ps>;
+}
+
+// Logical
+let isTwoAddress = 1 in {
+ let isCommutable = 1 in {
+ def ANDPSrr : PSI<0x54, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "andps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (and VR128:$src1, VR128:$src2)))]>;
+ def ORPSrr : PSI<0x56, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "orps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (or VR128:$src1, VR128:$src2)))]>;
+ def XORPSrr : PSI<0x57, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "xorps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (xor VR128:$src1, VR128:$src2)))]>;
+ }
+
+ def ANDPSrm : PSI<0x54, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "andps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (and VR128:$src1,
+ (bc_v2i64 (loadv4f32 addr:$src2))))]>;
+ def ORPSrm : PSI<0x56, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "orps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (or VR128:$src1,
+ (bc_v2i64 (loadv4f32 addr:$src2))))]>;
+ def XORPSrm : PSI<0x57, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "xorps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (xor VR128:$src1,
+ (bc_v2i64 (loadv4f32 addr:$src2))))]>;
+ def ANDNPSrr : PSI<0x55, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "andnps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (and (xor VR128:$src1,
+ (bc_v2i64 (v4i32 immAllOnesV))),
+ VR128:$src2)))]>;
+ def ANDNPSrm : PSI<0x55, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1,f128mem:$src2),
+ "andnps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (and (xor VR128:$src1,
+ (bc_v2i64 (v4i32 immAllOnesV))),
+ (bc_v2i64 (loadv4f32 addr:$src2)))))]>;
+}
+
+let isTwoAddress = 1 in {
+ def CMPPSrri : PSIi8<0xC2, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}ps {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def CMPPSrmi : PSIi8<0xC2, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src, SSECC:$cc),
+ "cmp${cc}ps {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+ (load addr:$src), imm:$cc))]>;
+}
+
+// Shuffle and unpack instructions
+let isTwoAddress = 1 in {
+ let isConvertibleToThreeAddress = 1 in // Convert to pshufd
+ def SHUFPSrri : PSIi8<0xC6, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1,
+ VR128:$src2, i32i8imm:$src3),
+ "shufps {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle
+ VR128:$src1, VR128:$src2,
+ SHUFP_shuffle_mask:$src3)))]>;
+ def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1,
+ f128mem:$src2, i32i8imm:$src3),
+ "shufps {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle
+ VR128:$src1, (load addr:$src2),
+ SHUFP_shuffle_mask:$src3)))]>;
+
+ let AddedComplexity = 10 in {
+ def UNPCKHPSrr : PSI<0x15, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "unpckhps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle
+ VR128:$src1, VR128:$src2,
+ UNPCKH_shuffle_mask)))]>;
+ def UNPCKHPSrm : PSI<0x15, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "unpckhps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle
+ VR128:$src1, (load addr:$src2),
+ UNPCKH_shuffle_mask)))]>;
+
+ def UNPCKLPSrr : PSI<0x14, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "unpcklps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle
+ VR128:$src1, VR128:$src2,
+ UNPCKL_shuffle_mask)))]>;
+ def UNPCKLPSrm : PSI<0x14, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "unpcklps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle
+ VR128:$src1, (load addr:$src2),
+ UNPCKL_shuffle_mask)))]>;
+ } // AddedComplexity
+} // isTwoAddress
+
+// Mask creation
+def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+ "movmskps {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>;
+// MOVMSKPD is the double-precision (SSE2) mask extract. It shares opcode 0x50
+// with MOVMSKPS and is distinguished only by the operand-size prefix, so it is
+// declared with TB + OpSize and predicated on HasSSE2 rather than using PSI,
+// which would give it the same encoding and predicate as MOVMSKPSrr.
+def MOVMSKPDrr : I<0x50, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                   "movmskpd {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>,
+                 TB, OpSize, Requires<[HasSSE2]>;
+
+// Prefetching loads.
+// TODO: no intrinsics for these?
+def PREFETCHT0 : PSI<0x18, MRM1m, (ops i8mem:$src), "prefetcht0 $src", []>;
+def PREFETCHT1 : PSI<0x18, MRM2m, (ops i8mem:$src), "prefetcht1 $src", []>;
+def PREFETCHT2 : PSI<0x18, MRM3m, (ops i8mem:$src), "prefetcht2 $src", []>;
+def PREFETCHNTA : PSI<0x18, MRM0m, (ops i8mem:$src), "prefetchnta $src", []>;
+
+// Non-temporal stores
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (ops i128mem:$dst, VR128:$src),
+ "movntps {$src, $dst|$dst, $src}",
+ [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
+
+// Load, store, and memory fence
+def SFENCE : PSI<0xAE, MRM7m, (ops), "sfence", [(int_x86_sse_sfence)]>;
+
+// MXCSR register
+def LDMXCSR : PSI<0xAE, MRM2m, (ops i32mem:$src),
+ "ldmxcsr $src", [(int_x86_sse_ldmxcsr addr:$src)]>;
+def STMXCSR : PSI<0xAE, MRM3m, (ops i32mem:$dst),
+ "stmxcsr $dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
+
+// Alias instructions that map zero vector to pxor / xorp* for sse.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let isReMaterializable = 1 in
+def V_SET0 : PSI<0x57, MRMInitReg, (ops VR128:$dst),
+ "xorps $dst, $dst",
+ [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+
+// FR32 to 128-bit vector conversion.
+def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (ops VR128:$dst, FR32:$src),
+ "movss {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4f32 (scalar_to_vector FR32:$src)))]>;
+def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
+ "movss {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>;
+
+// FIXME: we may not be able to eliminate this movss with coalescing because
+// the src and dest register classes are different. We really want to write
+// this pattern like this:
+// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+// (f32 FR32:$src)>;
+def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, VR128:$src),
+ "movss {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (vector_extract (v4f32 VR128:$src),
+ (iPTR 0)))]>;
+def MOVPS2SSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, VR128:$src),
+ "movss {$src, $dst|$dst, $src}",
+ [(store (f32 (vector_extract (v4f32 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+
+// Move to lower bits of a VR128, leaving upper bits alone.
+// Three operand (but two address) aliases.
+let isTwoAddress = 1 in {
+ def MOVLSS2PSrr : SSI<0x10, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, FR32:$src2),
+ "movss {$src2, $dst|$dst, $src2}", []>;
+
+ let AddedComplexity = 15 in
+ def MOVLPSrr : SSI<0x10, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "movss {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVL_shuffle_mask)))]>;
+}
+
+// Move to lower bits of a VR128 and zeroing upper bits.