 lib/Target/X86/X86ISelLowering.cpp      | 152
 lib/Target/X86/X86ISelLowering.h        |   5
 lib/Target/X86/X86InstrFragmentsSIMD.td |   5
 lib/Target/X86/X86InstrSSE.td           |  10
 test/CodeGen/X86/avx-vpermil.ll         |  16
 5 files changed, 156 insertions(+), 32 deletions(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f51a455b70..13cab27a6b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -2717,7 +2717,10 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::PUNPCKHBW:
   case X86ISD::PUNPCKHDQ:
   case X86ISD::PUNPCKHQDQ:
-  case X86ISD::VPERMIL:
+  case X86ISD::VPERMILPS:
+  case X86ISD::VPERMILPSY:
+  case X86ISD::VPERMILPD:
+  case X86ISD::VPERMILPDY:
     return true;
   }
   return false;
@@ -2743,7 +2746,10 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
-  case X86ISD::VPERMIL:
+  case X86ISD::VPERMILPS:
+  case X86ISD::VPERMILPSY:
+  case X86ISD::VPERMILPD:
+  case X86ISD::VPERMILPDY:
     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
   }
@@ -3400,21 +3406,63 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
   return ::isMOVLMask(M, N->getValueType(0));
 }
 
-/// isVPERMILMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to VPERMIL*.
-static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
+/// Note that VPERMIL mask matching differs depending on whether the underlying
+/// type is 32- or 64-bit. In VPERMILPS the high half of the mask should point
+/// to the same elements as the low half, but in the higher half of the source.
+/// In VPERMILPD the two lanes can be shuffled independently of each other,
+/// with the same restriction that lanes can't be crossed.
+static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                            const X86Subtarget *Subtarget) {
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits()/128;
+
+  if (!Subtarget->hasAVX())
+    return false;
+
+  // Match any permutation of a 128-bit vector with 64-bit types
+  if (NumLanes == 1 && NumElts != 2)
+    return false;
+
+  // Only match 256-bit with 64-bit types
+  if (VT.getSizeInBits() == 256 && NumElts != 4)
+    return false;
+
+  // The mask on the high lane is independent of the low. Both can match
+  // any element inside its own lane, but can't cross.
+  int LaneSize = NumElts/NumLanes;
+  for (int l = 0; l < NumLanes; ++l)
+    for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
+      int LaneStart = l*LaneSize;
+      if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize))
+        return false;
+    }
+
+  return true;
+}
+
+/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPS*.
+/// Note that VPERMIL mask matching differs depending on whether the underlying
+/// type is 32- or 64-bit. In VPERMILPS the high half of the mask should point
+/// to the same elements as the low half, but in the higher half of the source.
+/// In VPERMILPD the two lanes can be shuffled independently of each other,
+/// with the same restriction that lanes can't be crossed.
+static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                            const X86Subtarget *Subtarget) {
   unsigned NumElts = VT.getVectorNumElements();
   unsigned NumLanes = VT.getSizeInBits()/128;
 
-  // Match any permutation of 128-bit vector with 32/64-bit types
-  if (NumLanes == 1) {
-    if (NumElts == 4 || NumElts == 2)
-      return true;
+  if (!Subtarget->hasAVX())
+    return false;
+
+  // Match any permutation of a 128-bit vector with 32-bit types
+  if (NumLanes == 1 && NumElts != 4)
     return false;
-  }
 
-  // Only match 256-bit with 32/64-bit types
-  if (NumElts != 8 && NumElts != 4)
+  // Only match 256-bit with 32-bit types
+  if (VT.getSizeInBits() == 256 && NumElts != 8)
     return false;
 
   // The mask on the high lane should be the same as the low. Actually,
@@ -3424,7 +3472,6 @@ static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
     int HighElt = i+LaneSize;
     if (Mask[i] < 0 || Mask[HighElt] < 0)
       continue;
-
     if (Mask[HighElt]-Mask[i] != LaneSize)
       return false;
   }
@@ -3432,9 +3479,9 @@ static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
   return true;
 }
 
-/// getShuffleVPERMILImmediateediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMIL* instructions.
-static unsigned getShuffleVPERMILImmediate(SDNode *N) {
+/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERMILPS* instructions.
+static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   EVT VT = SVOp->getValueType(0);
 
@@ -3448,6 +3495,24 @@ static unsigned getShuffleVPERMILImmediate(SDNode *N) {
   return Mask;
 }
 
+/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERMILPD* instructions.
+static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits()/128;
+
+  unsigned Mask = 0;
+  int LaneSize = NumElts/NumLanes;
+  for (int l = 0; l < NumLanes; ++l)
+    for (int i = l*LaneSize; i < LaneSize*(l+1); ++i)
+      Mask |= (SVOp->getMaskElt(i)-l*LaneSize) << i;
+
+  return Mask;
+}
+
 /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
 /// of what x86 movss want. X86 movs requires the lowest element to be lowest
 /// element of vector 2 and the other elements to come from vector 1 in order.
@@ -4163,7 +4228,9 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
     return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                Depth+1);
   }
-  case X86ISD::VPERMIL:
+  case X86ISD::VPERMILPS:
+  case X86ISD::VPERMILPSY:
+    // FIXME: Implement the other types
     ImmN = N->getOperand(N->getNumOperands()-1);
     DecodeVPERMILMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                       ShuffleMask);
@@ -5784,6 +5851,22 @@ static inline unsigned getUNPCKHOpcode(EVT VT) {
   return 0;
 }
 
+static inline unsigned getVPERMILOpcode(EVT VT) {
+  switch(VT.getSimpleVT().SimpleTy) {
+  case MVT::v4i32:
+  case MVT::v4f32: return X86ISD::VPERMILPS;
+  case MVT::v2i64:
+  case MVT::v2f64: return X86ISD::VPERMILPD;
+  case MVT::v8i32:
+  case MVT::v8f32: return X86ISD::VPERMILPSY;
+  case MVT::v4i64:
+  case MVT::v4f64: return X86ISD::VPERMILPDY;
+  default:
+    llvm_unreachable("Unknown type for vpermil");
+  }
+  return 0;
+}
+
 static SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
                                       const TargetLowering &TLI,
@@ -6123,14 +6206,25 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
 
   //===--------------------------------------------------------------------===//
-  // Custom lower or generate target specific nodes for 256-bit shuffles.
+  // Generate target-specific nodes for 128- or 256-bit shuffles that are
+  // only supported in the AVX instruction set.
+  //
 
-  // Handle VPERMIL permutations
-  if (isVPERMILMask(M, VT)) {
-    unsigned TargetMask = getShuffleVPERMILImmediate(SVOp);
-    if (VT == MVT::v8f32)
-      return getTargetShuffleNode(X86ISD::VPERMIL, dl, VT, V1, TargetMask, DAG);
-  }
+  // Handle VPERMILPS* permutations
+  if (isVPERMILPSMask(M, VT, Subtarget))
+    return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
+                                getShuffleVPERMILPSImmediate(SVOp), DAG);
+
+  // Handle VPERMILPD* permutations
+  if (isVPERMILPDMask(M, VT, Subtarget))
+    return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
+                                getShuffleVPERMILPDImmediate(SVOp), DAG);
+
+  //===--------------------------------------------------------------------===//
+  // Since no target-specific shuffle was selected for this generic one,
+  // lower it into other known shuffles. FIXME: this isn't true yet, but
+  // this is the plan.
+  //
 
   // Handle general 256-bit shuffles
   if (VT.is256BitVector())
@@ -9748,7 +9842,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::PUNPCKHWD:         return "X86ISD::PUNPCKHWD";
   case X86ISD::PUNPCKHDQ:         return "X86ISD::PUNPCKHDQ";
   case X86ISD::PUNPCKHQDQ:        return "X86ISD::PUNPCKHQDQ";
-  case X86ISD::VPERMIL:           return "X86ISD::VPERMIL";
+  case X86ISD::VPERMILPS:         return "X86ISD::VPERMILPS";
+  case X86ISD::VPERMILPSY:        return "X86ISD::VPERMILPSY";
+  case X86ISD::VPERMILPD:         return "X86ISD::VPERMILPD";
+  case X86ISD::VPERMILPDY:        return "X86ISD::VPERMILPDY";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   case X86ISD::VAARG_64:          return "X86ISD::VAARG_64";
   case X86ISD::WIN_ALLOCA:        return "X86ISD::WIN_ALLOCA";
@@ -12666,7 +12763,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::PSHUFLW:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
-  case X86ISD::VPERMIL:
+  case X86ISD::VPERMILPS:
+  case X86ISD::VPERMILPSY:
+  case X86ISD::VPERMILPD:
+  case X86ISD::VPERMILPDY:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
   }
 
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 5d6f36febf..c960ad6d22 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -271,7 +271,10 @@ namespace llvm {
       PUNPCKHWD,
       PUNPCKHDQ,
       PUNPCKHQDQ,
-      VPERMIL,
+      VPERMILPS,
+      VPERMILPSY,
+      VPERMILPD,
+      VPERMILPDY,
 
       // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
       // according to %al. An operator is needed so that this can be expanded
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 4522924858..4e8d6ef935 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -153,7 +153,10 @@ def X86Punpckhwd  : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>;
 def X86Punpckhdq  : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>;
 def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>;
 
-def X86VPermil : SDNode<"X86ISD::VPERMIL", SDTShuff2OpI>;
+def X86VPermilps  : SDNode<"X86ISD::VPERMILPS", SDTShuff2OpI>;
+def X86VPermilpsy : SDNode<"X86ISD::VPERMILPSY", SDTShuff2OpI>;
+def X86VPermilpd  : SDNode<"X86ISD::VPERMILPD", SDTShuff2OpI>;
+def X86VPermilpdy : SDNode<"X86ISD::VPERMILPDY", SDTShuff2OpI>;
 
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 59f1dc183b..1594d82237 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -5522,6 +5522,12 @@ def : Pat<(int_x86_avx_vperm2f128_si_256 VR256:$src1, (memopv8i32 addr:$src2),
           imm:$src3),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;
 
+// Shuffle with VPERMIL instructions
+def : Pat<(v8f32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPSYri VR256:$src1, imm:$imm)>;
+def : Pat<(v4f64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPDYri VR256:$src1, imm:$imm)>;
+
 //===----------------------------------------------------------------------===//
 // VZERO - Zero YMM registers
 //
@@ -5543,10 +5549,6 @@ def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
 // The AVX version of some but not all of them are described here, and more
 // should come in a near future.
 
-// Shuffle with VPERMIL instructions
-def : Pat<(v8f32 (X86VPermil VR256:$src1, (i8 imm:$imm))),
-          (VPERMILPSYri VR256:$src1, imm:$imm)>;
-
 // Shuffle with PSHUFD instruction folding loads. The first two patterns match
 // SSE2 loads, which are always promoted to v2i64. The last one should match
 // the SSE1 case, where the only legal load is v4f32, but there is no PSHUFD
diff --git a/test/CodeGen/X86/avx-vpermil.ll b/test/CodeGen/X86/avx-vpermil.ll
new file mode 100644
index 0000000000..d57c1737c4
--- /dev/null
+++ b/test/CodeGen/X86/avx-vpermil.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+; CHECK: vpermilps
+define <8 x float> @funcA(<8 x float> %a) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7, i32 5>
+  ret <8 x float> %shuffle
+}
+
+; CHECK: vpermilpd
+define <4 x double> @funcB(<4 x double> %a) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
+  ret <4 x double> %shuffle
+}
+
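
A note on the immediates these helpers produce: the standalone C++ sketch below (not LLVM code) re-derives the immediates for the two shuffles in avx-vpermil.ll. The VPERMILPD loop mirrors the getShuffleVPERMILPDImmediate body added above; the VPERMILPS side assumes the AVX format of two selector bits per 32-bit element taken from the low lane, since the body of getShuffleVPERMILPSImmediate is elided in the hunk above, so treat that encoding as an assumption.

#include <cstdio>

int main() {
  // funcA: <8 x float> mask <1,2,3,1,5,6,7,5>. isVPERMILPSMask requires the
  // high lane to repeat the low-lane pattern, so only the low-lane entries
  // feed the immediate: two selector bits per 32-bit element (assumed format).
  int psMask[4] = {1, 2, 3, 1};
  unsigned psImm = 0;
  for (int i = 0; i < 4; ++i)
    psImm |= psMask[i] << (i * 2);

  // funcB: <4 x double> mask <1,0,3,3>. Each 128-bit lane is encoded
  // independently, one bit per 64-bit element, with the index taken
  // relative to its own lane start (mirrors getShuffleVPERMILPDImmediate).
  int pdMask[4] = {1, 0, 3, 3};
  unsigned pdImm = 0;
  for (int l = 0; l < 2; ++l)                 // two 128-bit lanes
    for (int i = l * 2; i < (l + 1) * 2; ++i)
      pdImm |= (pdMask[i] - l * 2) << i;

  printf("vpermilps imm = 0x%x\n", psImm);    // prints 0x79
  printf("vpermilpd imm = 0x%x\n", pdImm);    // prints 0xd
  return 0;
}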