aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.h1
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp62
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.cpp6
-rw-r--r--test/Analysis/CostModel/ARM/cast.ll4
-rw-r--r--test/CodeGen/ARM/vcvt-cost.ll46
5 files changed, 81 insertions, 38 deletions
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index adc0e9d86e..1c4274a910 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -581,6 +581,7 @@ private:
SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
+ SDValue SplitVecOp_TRUNCATE(SDNode *N);
SDValue SplitVecOp_VSETCC(SDNode *N);
SDValue SplitVecOp_FP_ROUND(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index bd8b6ac0c7..04c6bfd0c2 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1046,6 +1046,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
case ISD::CONCAT_VECTORS: Res = SplitVecOp_CONCAT_VECTORS(N); break;
+ case ISD::TRUNCATE: Res = SplitVecOp_TRUNCATE(N); break;
case ISD::FP_ROUND: Res = SplitVecOp_FP_ROUND(N); break;
case ISD::STORE:
Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
@@ -1062,7 +1063,6 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
case ISD::FTRUNC:
- case ISD::TRUNCATE:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
@@ -1293,6 +1293,66 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
&Elts[0], Elts.size());
}
+SDValue DAGTypeLegalizer::SplitVecOp_TRUNCATE(SDNode *N) {
+ // The result type is legal, but the input type is illegal. If splitting
+ // ends up with the result type of each half still being legal, just
+ // do that. If, however, that would result in an illegal result type,
+ // we can try to get more clever with power-two vectors. Specifically,
+ // split the input type, but also widen the result element size, then
+ // concatenate the halves and truncate again. For example, consider a target
+ // where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit
+ // vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do:
+ // %inlo = v4i32 extract_subvector %in, 0
+ // %inhi = v4i32 extract_subvector %in, 4
+ // %lo16 = v4i16 trunc v4i32 %inlo
+ // %hi16 = v4i16 trunc v4i32 %inhi
+ // %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16
+ // %res = v8i8 trunc v8i16 %in16
+ //
+ // Without this transform, the original truncate would end up being
+ // scalarized, which is pretty much always a last resort.
+ SDValue InVec = N->getOperand(0);
+ EVT InVT = InVec->getValueType(0);
+ EVT OutVT = N->getValueType(0);
+ unsigned NumElements = OutVT.getVectorNumElements();
+ // Widening should have already made sure this is a power-two vector
+ // if we're trying to split it at all. assert() that's true, just in case.
+ assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+
+ unsigned InElementSize = InVT.getVectorElementType().getSizeInBits();
+ unsigned OutElementSize = OutVT.getVectorElementType().getSizeInBits();
+
+ // If the input elements are only 1/2 the width of the result elements,
+ // just use the normal splitting. Our trick only work if there's room
+ // to split more than once.
+ if (InElementSize <= OutElementSize * 2)
+ return SplitVecOp_UnaryOp(N);
+ DebugLoc DL = N->getDebugLoc();
+
+ // Extract the halves of the input via extract_subvector.
+ EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
+ InVT.getVectorElementType(), NumElements/2);
+ SDValue InLoVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
+ DAG.getIntPtrConstant(0));
+ SDValue InHiVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
+ DAG.getIntPtrConstant(NumElements/2));
+ // Truncate them to 1/2 the element size.
+ EVT HalfElementVT = EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
+ NumElements/2);
+ SDValue HalfLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InLoVec);
+ SDValue HalfHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InHiVec);
+ // Concatenate them to get the full intermediate truncation result.
+ EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements);
+ SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo,
+ HalfHi);
+ // Now finish up by truncating all the way down to the original result
+ // type. This should normally be something that ends up being legal directly,
+ // but in theory if a target has very wide vectors and an annoyingly
+ // restricted set of legal types, this split can chain to build things up.
+ return DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec);
+}
+
SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 1019b972e9..3149f19b78 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -223,9 +223,9 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
- // Operations that we legalize using load/stores to the stack.
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4*1 + 16*2 + 2*1 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2*1 + 8*2 + 1 },
+ // Operations that we legalize using splitting.
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
// Vector float <-> i32 conversions.
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
diff --git a/test/Analysis/CostModel/ARM/cast.ll b/test/Analysis/CostModel/ARM/cast.ll
index ba9d84cf3e..0cdd61cac4 100644
--- a/test/Analysis/CostModel/ARM/cast.ll
+++ b/test/Analysis/CostModel/ARM/cast.ll
@@ -175,9 +175,9 @@ define i32 @casts() {
%rext_5 = zext <4 x i16> undef to <4 x i64>
; Vector cast cost of instructions lowering the cast to the stack.
- ; CHECK: cost of 19 {{.*}} trunc
+ ; CHECK: cost of 3 {{.*}} trunc
%r74 = trunc <8 x i32> undef to <8 x i8>
- ; CHECK: cost of 38 {{.*}} trunc
+ ; CHECK: cost of 6 {{.*}} trunc
%r75 = trunc <16 x i32> undef to <16 x i8>
; Floating point truncation costs.
diff --git a/test/CodeGen/ARM/vcvt-cost.ll b/test/CodeGen/ARM/vcvt-cost.ll
index 04619b9c6c..0d45c40b88 100644
--- a/test/CodeGen/ARM/vcvt-cost.ll
+++ b/test/CodeGen/ARM/vcvt-cost.ll
@@ -32,29 +32,22 @@ define void @func_cvt1(%TA0_5* %loadaddr, %TA1_5* %storeaddr) {
store %TA1_5 %r, %TA1_5* %storeaddr
ret void
}
-;; We currently estimate the cost of this instruction as expensive. If lowering
-;; is improved the cost needs to change.
+
%T0_51 = type <8 x i32>
%T1_51 = type <8 x i8>
; CHECK: func_cvt51:
define void @func_cvt51(%T0_51* %loadaddr, %T1_51* %storeaddr) {
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i16
%v0 = load %T0_51* %loadaddr
; COST: func_cvt51
-; COST: cost of 19 {{.*}} trunc
+; COST: cost of 3 {{.*}} trunc
%r = trunc %T0_51 %v0 to %T1_51
store %T1_51 %r, %T1_51* %storeaddr
ret void
}
-;; We currently estimate the cost of this instruction as expensive. If lowering
-;; is improved the cost needs to change.
+
%TT0_5 = type <16 x i8>
%TT1_5 = type <16 x i32>
; CHECK: func_cvt52:
@@ -87,31 +80,20 @@ define void @func_cvt12(%TTA0_5* %loadaddr, %TTA1_5* %storeaddr) {
store %TTA1_5 %r, %TTA1_5* %storeaddr
ret void
}
-;; We currently estimate the cost of this instruction as expensive. If lowering
-;; is improved the cost needs to change.
+
%TT0_51 = type <16 x i32>
%TT1_51 = type <16 x i8>
; CHECK: func_cvt512:
define void @func_cvt512(%TT0_51* %loadaddr, %TT1_51* %storeaddr) {
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i16
+; CHECK: vmovn.i16
%v0 = load %TT0_51* %loadaddr
; COST: func_cvt512
-; COST: cost of 38 {{.*}} trunc
+; COST: cost of 6 {{.*}} trunc
%r = trunc %TT0_51 %v0 to %TT1_51
store %TT1_51 %r, %TT1_51* %storeaddr
ret void