diff options
-rw-r--r-- | lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 | ||||
-rw-r--r-- | lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 62 | ||||
-rw-r--r-- | lib/Target/ARM/ARMTargetTransformInfo.cpp | 6 | ||||
-rw-r--r-- | test/Analysis/CostModel/ARM/cast.ll | 4 | ||||
-rw-r--r-- | test/CodeGen/ARM/vcvt-cost.ll | 46 |
5 files changed, 81 insertions, 38 deletions
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index adc0e9d86e..1c4274a910 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -581,6 +581,7 @@ private: SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N); + SDValue SplitVecOp_TRUNCATE(SDNode *N); SDValue SplitVecOp_VSETCC(SDNode *N); SDValue SplitVecOp_FP_ROUND(SDNode *N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index bd8b6ac0c7..04c6bfd0c2 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1046,6 +1046,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break; case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break; case ISD::CONCAT_VECTORS: Res = SplitVecOp_CONCAT_VECTORS(N); break; + case ISD::TRUNCATE: Res = SplitVecOp_TRUNCATE(N); break; case ISD::FP_ROUND: Res = SplitVecOp_FP_ROUND(N); break; case ISD::STORE: Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo); @@ -1062,7 +1063,6 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::FTRUNC: - case ISD::TRUNCATE: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: @@ -1293,6 +1293,66 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) { &Elts[0], Elts.size()); } +SDValue DAGTypeLegalizer::SplitVecOp_TRUNCATE(SDNode *N) { + // The result type is legal, but the input type is illegal. If splitting + // ends up with the result type of each half still being legal, just + // do that. If, however, that would result in an illegal result type, + // we can try to get more clever with power-two vectors. Specifically, + // split the input type, but also widen the result element size, then + // concatenate the halves and truncate again. For example, consider a target + // where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit + // vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do: + // %inlo = v4i32 extract_subvector %in, 0 + // %inhi = v4i32 extract_subvector %in, 4 + // %lo16 = v4i16 trunc v4i32 %inlo + // %hi16 = v4i16 trunc v4i32 %inhi + // %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16 + // %res = v8i8 trunc v8i16 %in16 + // + // Without this transform, the original truncate would end up being + // scalarized, which is pretty much always a last resort. + SDValue InVec = N->getOperand(0); + EVT InVT = InVec->getValueType(0); + EVT OutVT = N->getValueType(0); + unsigned NumElements = OutVT.getVectorNumElements(); + // Widening should have already made sure this is a power-two vector + // if we're trying to split it at all. assert() that's true, just in case. + assert(!(NumElements & 1) && "Splitting vector, but not in half!"); + + unsigned InElementSize = InVT.getVectorElementType().getSizeInBits(); + unsigned OutElementSize = OutVT.getVectorElementType().getSizeInBits(); + + // If the input elements are only 1/2 the width of the result elements, + // just use the normal splitting. Our trick only work if there's room + // to split more than once. + if (InElementSize <= OutElementSize * 2) + return SplitVecOp_UnaryOp(N); + DebugLoc DL = N->getDebugLoc(); + + // Extract the halves of the input via extract_subvector. + EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), + InVT.getVectorElementType(), NumElements/2); + SDValue InLoVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec, + DAG.getIntPtrConstant(0)); + SDValue InHiVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec, + DAG.getIntPtrConstant(NumElements/2)); + // Truncate them to 1/2 the element size. + EVT HalfElementVT = EVT::getIntegerVT(*DAG.getContext(), InElementSize/2); + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, + NumElements/2); + SDValue HalfLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InLoVec); + SDValue HalfHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InHiVec); + // Concatenate them to get the full intermediate truncation result. + EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements); + SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo, + HalfHi); + // Now finish up by truncating all the way down to the original result + // type. This should normally be something that ends up being legal directly, + // but in theory if a target has very wide vectors and an annoyingly + // restricted set of legal types, this split can chain to build things up. + return DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec); +} + SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 1019b972e9..3149f19b78 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -223,9 +223,9 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, - // Operations that we legalize using load/stores to the stack. - { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4*1 + 16*2 + 2*1 }, - { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2*1 + 8*2 + 1 }, + // Operations that we legalize using splitting. + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, // Vector float <-> i32 conversions. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, diff --git a/test/Analysis/CostModel/ARM/cast.ll b/test/Analysis/CostModel/ARM/cast.ll index ba9d84cf3e..0cdd61cac4 100644 --- a/test/Analysis/CostModel/ARM/cast.ll +++ b/test/Analysis/CostModel/ARM/cast.ll @@ -175,9 +175,9 @@ define i32 @casts() { %rext_5 = zext <4 x i16> undef to <4 x i64> ; Vector cast cost of instructions lowering the cast to the stack. - ; CHECK: cost of 19 {{.*}} trunc + ; CHECK: cost of 3 {{.*}} trunc %r74 = trunc <8 x i32> undef to <8 x i8> - ; CHECK: cost of 38 {{.*}} trunc + ; CHECK: cost of 6 {{.*}} trunc %r75 = trunc <16 x i32> undef to <16 x i8> ; Floating point truncation costs. diff --git a/test/CodeGen/ARM/vcvt-cost.ll b/test/CodeGen/ARM/vcvt-cost.ll index 04619b9c6c..0d45c40b88 100644 --- a/test/CodeGen/ARM/vcvt-cost.ll +++ b/test/CodeGen/ARM/vcvt-cost.ll @@ -32,29 +32,22 @@ define void @func_cvt1(%TA0_5* %loadaddr, %TA1_5* %storeaddr) { store %TA1_5 %r, %TA1_5* %storeaddr ret void } -;; We currently estimate the cost of this instruction as expensive. If lowering -;; is improved the cost needs to change. + %T0_51 = type <8 x i32> %T1_51 = type <8 x i8> ; CHECK: func_cvt51: define void @func_cvt51(%T0_51* %loadaddr, %T1_51* %storeaddr) { -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb +; CHECK: vmovn.i32 +; CHECK: vmovn.i32 +; CHECK: vmovn.i16 %v0 = load %T0_51* %loadaddr ; COST: func_cvt51 -; COST: cost of 19 {{.*}} trunc +; COST: cost of 3 {{.*}} trunc %r = trunc %T0_51 %v0 to %T1_51 store %T1_51 %r, %T1_51* %storeaddr ret void } -;; We currently estimate the cost of this instruction as expensive. If lowering -;; is improved the cost needs to change. + %TT0_5 = type <16 x i8> %TT1_5 = type <16 x i32> ; CHECK: func_cvt52: @@ -87,31 +80,20 @@ define void @func_cvt12(%TTA0_5* %loadaddr, %TTA1_5* %storeaddr) { store %TTA1_5 %r, %TTA1_5* %storeaddr ret void } -;; We currently estimate the cost of this instruction as expensive. If lowering -;; is improved the cost needs to change. + %TT0_51 = type <16 x i32> %TT1_51 = type <16 x i8> ; CHECK: func_cvt512: define void @func_cvt512(%TT0_51* %loadaddr, %TT1_51* %storeaddr) { -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb -; CHECK: strb +; CHECK: vmovn.i32 +; CHECK: vmovn.i32 +; CHECK: vmovn.i32 +; CHECK: vmovn.i32 +; CHECK: vmovn.i16 +; CHECK: vmovn.i16 %v0 = load %TT0_51* %loadaddr ; COST: func_cvt512 -; COST: cost of 38 {{.*}} trunc +; COST: cost of 6 {{.*}} trunc %r = trunc %TT0_51 %v0 to %TT1_51 store %TT1_51 %r, %TT1_51* %storeaddr ret void |