diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 92 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 8 |
2 files changed, 52 insertions, 48 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 8d4b4645b1..f73c3c24c4 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -2782,23 +2782,28 @@ static SDOperand getSwapEltZeroMask(unsigned NumElems, unsigned DestElt, return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size()); } -/// PromoteSplat - Promote a splat of v8i16 or v16i8 to v4i32. -/// -static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG) { +/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32. +static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG, bool HasSSE2) { + MVT::ValueType PVT = HasSSE2 ? MVT::v4i32 : MVT::v4f32; + MVT::ValueType VT = Op.getValueType(); + if (PVT == VT) + return Op; SDOperand V1 = Op.getOperand(0); SDOperand Mask = Op.getOperand(2); - MVT::ValueType VT = Op.getValueType(); unsigned NumElems = Mask.getNumOperands(); - Mask = getUnpacklMask(NumElems, DAG); - while (NumElems != 4) { - V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask); - NumElems >>= 1; + // Special handling of v4f32 -> v4i32. + if (VT != MVT::v4f32) { + Mask = getUnpacklMask(NumElems, DAG); + while (NumElems > 4) { + V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask); + NumElems >>= 1; + } + Mask = getZeroVector(MVT::v4i32, DAG); } - V1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V1); - Mask = getZeroVector(MVT::v4i32, DAG); - SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, V1, - DAG.getNode(ISD::UNDEF, MVT::v4i32), Mask); + V1 = DAG.getNode(ISD::BIT_CONVERT, PVT, V1); + SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, PVT, V1, + DAG.getNode(ISD::UNDEF, PVT), Mask); return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle); } @@ -3426,6 +3431,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) { SDOperand PermMask = Op.getOperand(2); MVT::ValueType VT = Op.getValueType(); unsigned NumElems = PermMask.getNumOperands(); + bool isMMX = MVT::getSizeInBits(VT) == 64; bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsSplat = false; @@ -3443,9 +3449,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) { return V2; if (isSplatMask(PermMask.Val)) { - if (NumElems <= 4) return Op; - // Promote it to a v4i32 splat. - return PromoteSplat(Op, DAG); + if (isMMX || NumElems < 4) return Op; + // Promote it to a v4{if}32 splat. + return PromoteSplat(Op, DAG, Subtarget->hasSSE2()); } // If the shuffle can be profitably rewritten as a narrower shuffle, then @@ -3556,35 +3562,39 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) { return Op; } - // If VT is integer, try PSHUF* first, then SHUFP*. - if (MVT::isInteger(VT)) { - // MMX doesn't have PSHUFD; it does have PSHUFW. While it's theoretically - // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented. - if (((MVT::getSizeInBits(VT) != 64 || NumElems == 4) && - X86::isPSHUFDMask(PermMask.Val)) || - X86::isPSHUFHWMask(PermMask.Val) || - X86::isPSHUFLWMask(PermMask.Val)) { - if (V2.getOpcode() != ISD::UNDEF) - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, - DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask); + // Try PSHUF* first, then SHUFP*. + // MMX doesn't have PSHUFD but it does have PSHUFW. While it's theoretically + // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented. + if (isMMX && NumElems == 4 && X86::isPSHUFDMask(PermMask.Val)) { + if (V2.getOpcode() != ISD::UNDEF) + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, + DAG.getNode(ISD::UNDEF, VT), PermMask); + return Op; + } + + if (!isMMX) { + if (Subtarget->hasSSE2() && + (X86::isPSHUFDMask(PermMask.Val) || + X86::isPSHUFHWMask(PermMask.Val) || + X86::isPSHUFLWMask(PermMask.Val))) { + MVT::ValueType RVT = VT; + if (VT == MVT::v4f32) { + RVT = MVT::v4i32; + Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT, + DAG.getNode(ISD::BIT_CONVERT, RVT, V1), + DAG.getNode(ISD::UNDEF, RVT), PermMask); + } else if (V2.getOpcode() != ISD::UNDEF) + Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT, V1, + DAG.getNode(ISD::UNDEF, RVT), PermMask); + if (RVT != VT) + Op = DAG.getNode(ISD::BIT_CONVERT, VT, Op); return Op; } - if (X86::isSHUFPMask(PermMask.Val) && - MVT::getSizeInBits(VT) != 64) // Don't do this for MMX. + // Binary or unary shufps. + if (X86::isSHUFPMask(PermMask.Val) || + (V2.getOpcode() == ISD::UNDEF && X86::isPSHUFDMask(PermMask.Val))) return Op; - } else { - // Floating point cases in the other order. - if (X86::isSHUFPMask(PermMask.Val)) - return Op; - if (X86::isPSHUFDMask(PermMask.Val) || - X86::isPSHUFHWMask(PermMask.Val) || - X86::isPSHUFLWMask(PermMask.Val)) { - if (V2.getOpcode() != ISD::UNDEF) - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, - DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask); - return Op; - } } // Handle v8i16 specifically since SSE can do byte extraction and insertion. @@ -3595,7 +3605,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) { } // Handle all 4 wide cases with a number of shuffles. - if (NumElems == 4 && MVT::getSizeInBits(VT) != 64) { + if (NumElems == 4 && !isMMX) { // Don't do this for MMX. MVT::ValueType MaskVT = PermMask.getValueType(); MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT); diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 7d70480a82..dbc04b01e2 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2803,13 +2803,7 @@ def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm), (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; } -// Splat v4f32 -def : Pat<(vector_shuffle (v4f32 VR128:$src), (undef), SSE_splat_mask:$sm), - (SHUFPSrri VR128:$src, VR128:$src, SSE_splat_mask:$sm)>, - Requires<[HasSSE1]>; - // Special unary SHUFPSrri case. -// FIXME: when we want non two-address code, then we should use PSHUFD? def : Pat<(v4f32 (vector_shuffle VR128:$src1, (undef), SHUFP_unary_shuffle_mask:$sm)), (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>, @@ -2820,7 +2814,7 @@ def : Pat<(v2f64 (vector_shuffle VR128:$src1, (undef), (SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>, Requires<[HasSSE2]>; // Unary v4f32 shuffle with PSHUF* in order to fold a load. -def : Pat<(vector_shuffle (memopv4f32 addr:$src1), (undef), +def : Pat<(vector_shuffle (bc_v4i32 (memopv4f32 addr:$src1)), (undef), SHUFP_unary_shuffle_mask:$sm), (PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm)>, Requires<[HasSSE2]>; |