author | Nadav Rotem <nadav.rotem@intel.com> | 2012-04-11 06:40:27 +0000
---|---|---
committer | Nadav Rotem <nadav.rotem@intel.com> | 2012-04-11 06:40:27 +0000
commit | e611378a6e45fcb4a039d8c0089cd8fed2d311dc |
tree | d9891754db505d67c3d03c330ba7a2b1b7bfc666 |
parent | bee78fe5fcd8464f58bc729dede1a87d763ac3ae |
Reapply 154396 after fixing a test.
Original message:
Modify the code that lowers shuffles to blends so that it uses vblendXX (immediate-mask blends) instead of blendvXX (register-mask blends).
BLENDV takes its per-lane selector in a register, while VBLEND encodes it as an immediate.
On Sandy Bridge both forms have the same latency and execute on the same execution ports.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154483 91177308-0d34-0410-b5e6-96231b3b80d8
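
For readers skimming the patch below: the heart of the new lowering is a loop that folds a constant shuffle mask into a single blend immediate, with bit i of the immediate set when result lane i comes from the first input. The following is a minimal standalone sketch of that idea only (plain C++ with a hypothetical helper name, not the LLVM code or its DAG API):

```cpp
#include <cstdio>
#include <optional>

// Sketch of the mask-folding idea only; the real lowering works on
// ShuffleVectorSDNode masks and emits X86ISD::BLENDPS/PD/PW nodes.
std::optional<unsigned> shuffleMaskToBlendImm(const int *Mask, int NumElts) {
  unsigned Imm = 0;
  for (int i = 0; i < NumElts; ++i) {
    int Idx = Mask[i];
    if (Idx == i || Idx == -1)       // lane i comes from the first input (or is undef)
      Imm |= 1u << i;
    else if (Idx != i + NumElts)     // not the matching lane of the second input
      return std::nullopt;           // shuffle is not expressible as a per-lane blend
  }
  return Imm;                        // a clear bit means the lane comes from the second input
}

int main() {
  const int Mask[4] = {0, 5, 2, 7};  // same mask as the blend2 test below
  if (auto Imm = shuffleMaskToBlendImm(Mask, 4))
    std::printf("blend immediate = 0x%X\n", *Imm);  // prints 0x5
  return 0;
}
```

With the mask reduced to an immediate, the lowering no longer has to materialize a constant-pool mask vector and load it into a register, which is what the removed code in the diff below did.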
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 89 |
-rw-r--r-- | lib/Target/X86/X86ISelLowering.h | 7 |
-rw-r--r-- | lib/Target/X86/X86InstrFragmentsSIMD.td | 6 |
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 21 |
-rw-r--r-- | test/CodeGen/X86/avx-shuffle.ll | 10 |
-rw-r--r-- | test/CodeGen/X86/vec_shuffle-20.ll | 2 |
6 files changed, 93 insertions, 42 deletions
```diff
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4f14a0e20b..d662d12b7a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5391,59 +5391,75 @@ static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op,
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
   DebugLoc dl = SVOp->getDebugLoc();
-  LLVMContext *Context = DAG.getContext();
   EVT VT = Op.getValueType();
   EVT InVT = V1.getValueType();
-  EVT EltVT = VT.getVectorElementType();
-  unsigned EltSize = EltVT.getSizeInBits();
   int MaskSize = VT.getVectorNumElements();
   int InSize = InVT.getVectorNumElements();
 
-  // TODO: At the moment we only use AVX blends. We could also use SSE4 blends.
-  if (!Subtarget->hasAVX())
+  if (!Subtarget->hasSSE41())
     return SDValue();
 
   if (MaskSize != InSize)
     return SDValue();
 
-  SmallVector<Constant*,2> MaskVals;
-  ConstantInt *Zero = ConstantInt::get(*Context, APInt(EltSize, 0));
-  ConstantInt *NegOne = ConstantInt::get(*Context, APInt(EltSize, -1));
+  int ISDNo = 0;
+  MVT OpTy;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return SDValue();
+  case MVT::v8i16:
+    ISDNo = X86ISD::BLENDPW;
+    OpTy = MVT::v8i16;
+    break;
+  case MVT::v4i32:
+  case MVT::v4f32:
+    ISDNo = X86ISD::BLENDPS;
+    OpTy = MVT::v4f32;
+    break;
+  case MVT::v2i64:
+  case MVT::v2f64:
+    ISDNo = X86ISD::BLENDPD;
+    OpTy = MVT::v2f64;
+    break;
+  case MVT::v8i32:
+  case MVT::v8f32:
+    if (!Subtarget->hasAVX())
+      return SDValue();
+    ISDNo = X86ISD::BLENDPS;
+    OpTy = MVT::v8f32;
+    break;
+  case MVT::v4i64:
+  case MVT::v4f64:
+    if (!Subtarget->hasAVX())
+      return SDValue();
+    ISDNo = X86ISD::BLENDPD;
+    OpTy = MVT::v4f64;
+    break;
+  case MVT::v16i16:
+    if (!Subtarget->hasAVX2())
+      return SDValue();
+    ISDNo = X86ISD::BLENDPW;
+    OpTy = MVT::v16i16;
+    break;
+  }
+  assert(ISDNo && "Invalid Op Number");
+
+  unsigned MaskVals = 0;
 
   for (int i = 0; i < MaskSize; ++i) {
     int EltIdx = SVOp->getMaskElt(i);
     if (EltIdx == i || EltIdx == -1)
-      MaskVals.push_back(NegOne);
+      MaskVals |= (1<<i);
    else if (EltIdx == (i + MaskSize))
-      MaskVals.push_back(Zero);
+      continue; // Bit is set to zero;
     else return SDValue();
   }
 
-  Constant *MaskC = ConstantVector::get(MaskVals);
-  EVT MaskTy = EVT::getEVT(MaskC->getType());
-  assert(MaskTy.getSizeInBits() == VT.getSizeInBits() && "Invalid mask size");
-  SDValue MaskIdx = DAG.getConstantPool(MaskC, PtrTy);
-  unsigned Alignment = cast<ConstantPoolSDNode>(MaskIdx)->getAlignment();
-  SDValue Mask = DAG.getLoad(MaskTy, dl, DAG.getEntryNode(), MaskIdx,
-                             MachinePointerInfo::getConstantPool(),
-                             false, false, false, Alignment);
-
-  if (Subtarget->hasAVX2() && MaskTy == MVT::v32i8)
-    return DAG.getNode(ISD::VSELECT, dl, VT, Mask, V1, V2);
-
-  if (Subtarget->hasAVX()) {
-    switch (MaskTy.getSimpleVT().SimpleTy) {
-    default: return SDValue();
-    case MVT::v16i8:
-    case MVT::v4i32:
-    case MVT::v2i64:
-    case MVT::v8i32:
-    case MVT::v4i64:
-      return DAG.getNode(ISD::VSELECT, dl, VT, Mask, V1, V2);
-    }
-  }
-
-  return SDValue();
+  V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
+  V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2);
+  SDValue Ret = DAG.getNode(ISDNo, dl, OpTy, V1, V2,
+                            DAG.getConstant(MaskVals, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
 }
 
 // v8i16 shuffles - Prefer shuffles in the following order:
@@ -11050,6 +11066,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::ANDNP: return "X86ISD::ANDNP";
   case X86ISD::PSIGN: return "X86ISD::PSIGN";
   case X86ISD::BLENDV: return "X86ISD::BLENDV";
+  case X86ISD::BLENDPW: return "X86ISD::BLENDPW";
+  case X86ISD::BLENDPS: return "X86ISD::BLENDPS";
+  case X86ISD::BLENDPD: return "X86ISD::BLENDPD";
   case X86ISD::HADD: return "X86ISD::HADD";
   case X86ISD::HSUB: return "X86ISD::HSUB";
   case X86ISD::FHADD: return "X86ISD::FHADD";
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index ca8efe64dd..4e0073365a 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -175,9 +175,14 @@ namespace llvm {
       /// PSIGN - Copy integer sign.
       PSIGN,
 
-      /// BLEND family of opcodes
+      /// BLENDV - Blend where the selector is an XMM.
       BLENDV,
 
+      /// BLENDxx - Blend where the selector is an immediate.
+      BLENDPW,
+      BLENDPS,
+      BLENDPD,
+
       /// HADD - Integer horizontal add.
       HADD,
 
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index ae3ed1bcb3..041a64f336 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -126,6 +126,8 @@ def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                 SDTCisSameAs<0,2>, SDTCisInt<3>]>;
 
 def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
 
 def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>;
 
@@ -158,6 +160,10 @@ def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
 
 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
 
+def X86Blendpw : SDNode<"X86ISD::BLENDPW", SDTBlend>;
+def X86Blendps : SDNode<"X86ISD::BLENDPS", SDTBlend>;
+def X86Blendpd : SDNode<"X86ISD::BLENDPD", SDTBlend>;
+
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f4e4418491..7741f409db 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -6735,12 +6735,22 @@ let Predicates = [HasAVX] in {
   def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
                             (v4f64 VR256:$src2))),
             (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+
+  def : Pat<(v8f32 (X86Blendps (v8f32 VR256:$src1), (v8f32 VR256:$src2),
+                               (imm:$mask))),
+            (VBLENDPSYrri VR256:$src2, VR256:$src1, imm:$mask)>;
+  def : Pat<(v4f64 (X86Blendpd (v4f64 VR256:$src1), (v4f64 VR256:$src2),
+                               (imm:$mask))),
+            (VBLENDPDYrri VR256:$src2, VR256:$src1, imm:$mask)>;
 }
 
 let Predicates = [HasAVX2] in {
   def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                             (v32i8 VR256:$src2))),
             (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v16i16 (X86Blendpw (v16i16 VR256:$src1), (v16i16 VR256:$src2),
+                                (imm:$mask))),
+            (VPBLENDWYrri VR256:$src2, VR256:$src1, imm:$mask)>;
 }
 
 /// SS41I_ternary_int - SSE 4.1 ternary operator
@@ -6789,6 +6799,17 @@ let Predicates = [HasSSE41] in {
   def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
                             (v2f64 VR128:$src2))),
             (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+
+  def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2),
+                               (imm:$mask))),
+            (VPBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>;
+  def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2),
+                               (imm:$mask))),
+            (VBLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>;
+  def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2),
+                               (imm:$mask))),
+            (VBLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>;
+
 }
 
 let Predicates = [HasAVX] in
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index f323f3fd20..16c447be17 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -164,7 +164,7 @@ i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32
 }
 
 ; CHECK: blend1
-; CHECK: vblendvps
+; CHECK: vblendps
 ; CHECK: ret
 define <4 x i32> @blend1(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
   %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
@@ -172,7 +172,7 @@ define <4 x i32> @blend1(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
 }
 
 ; CHECK: blend2
-; CHECK: vblendvps
+; CHECK: vblendps
 ; CHECK: ret
 define <4 x i32> @blend2(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
   %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -180,7 +180,7 @@ define <4 x i32> @blend2(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
 }
 
 ; CHECK: blend2a
-; CHECK: vblendvps
+; CHECK: vblendps
 ; CHECK: ret
 define <4 x float> @blend2a(<4 x float> %a, <4 x float> %b) nounwind alwaysinline {
   %t = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -188,7 +188,7 @@ define <4 x float> @blend2a(<4 x float> %a, <4 x float> %b) nounwind alwaysinlin
 }
 
 ; CHECK: blend3
-; CHECK-NOT: vblendvps
+; CHECK-NOT: vblendps
 ; CHECK: ret
 define <4 x i32> @blend3(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
   %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 2, i32 7>
@@ -196,7 +196,7 @@ define <4 x i32> @blend3(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
 }
 
 ; CHECK: blend4
-; CHECK: vblendvpd
+; CHECK: vblendpd
 ; CHECK: ret
 define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
   %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
diff --git a/test/CodeGen/X86/vec_shuffle-20.ll b/test/CodeGen/X86/vec_shuffle-20.ll
index fc06b9514e..b6b8ba6f84 100644
--- a/test/CodeGen/X86/vec_shuffle-20.ll
+++ b/test/CodeGen/X86/vec_shuffle-20.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o /dev/null -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 3
+; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 2
 
 define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) nounwind {
 entry:
```
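
As a companion to the TableGen patterns and the tests above, here is a scalar model of the per-lane behaviour the immediate encodes, using the same mask convention as the lowering code above (a set bit selects the first operand). This is only an illustrative sketch, not the instruction's architectural definition; the patterns swap $src1 and $src2 because the hardware encoding picks operands in the opposite order.

```cpp
#include <array>
#include <cstdio>

// Scalar sketch of an immediate blend: each result lane is chosen by a
// compile-time constant bit, so no mask register is read at run time
// (unlike the blendv-style instructions this patch moves away from).
template <typename T, std::size_t N>
std::array<T, N> blendByImmediate(const std::array<T, N> &A,
                                  const std::array<T, N> &B, unsigned Imm) {
  std::array<T, N> R{};
  for (std::size_t i = 0; i < N; ++i)
    R[i] = (Imm & (1u << i)) ? A[i] : B[i];  // bit i set: lane i from A
  return R;
}

int main() {
  std::array<float, 4> A{0, 1, 2, 3}, B{4, 5, 6, 7};
  auto R = blendByImmediate(A, B, 0x5u);     // lanes 0 and 2 from A, 1 and 3 from B
  std::printf("%g %g %g %g\n", R[0], R[1], R[2], R[3]);  // prints: 0 5 2 7
  return 0;
}
```

The 0x5 immediate here is exactly what the blend2 test's shuffle mask <0, 5, 2, 7> folds to, so the test's vblendps check and this sketch describe the same selection.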