5 files changed, 81 insertions, 38 deletions
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index adc0e9d86e..1c4274a910 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -581,6 +581,7 @@ private:
   SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
+  SDValue SplitVecOp_TRUNCATE(SDNode *N);
   SDValue SplitVecOp_VSETCC(SDNode *N);
   SDValue SplitVecOp_FP_ROUND(SDNode *N);
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index bd8b6ac0c7..04c6bfd0c2 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1046,6 +1046,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
     case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
     case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
     case ISD::CONCAT_VECTORS:    Res = SplitVecOp_CONCAT_VECTORS(N); break;
+    case ISD::TRUNCATE:          Res = SplitVecOp_TRUNCATE(N); break;
     case ISD::FP_ROUND:          Res = SplitVecOp_FP_ROUND(N); break;
     case ISD::STORE:
       Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
@@ -1062,7 +1063,6 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
     case ISD::SINT_TO_FP:
     case ISD::UINT_TO_FP:
     case ISD::FTRUNC:
-    case ISD::TRUNCATE:
     case ISD::SIGN_EXTEND:
     case ISD::ZERO_EXTEND:
     case ISD::ANY_EXTEND:
@@ -1293,6 +1293,66 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
                      &Elts[0], Elts.size());
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_TRUNCATE(SDNode *N) {
+  // The result type is legal, but the input type is illegal.  If splitting
+  // ends up with the result type of each half still being legal, just
+  // do that.  If, however, that would result in an illegal result type,
+  // we can try to get more clever with power-two vectors. Specifically,
+  // split the input type, but also widen the result element size, then
+  // concatenate the halves and truncate again.  For example, consider a target
+  // where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit
+  // vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do:
+  //   %inlo = v4i32 extract_subvector %in, 0
+  //   %inhi = v4i32 extract_subvector %in, 4
+  //   %lo16 = v4i16 trunc v4i32 %inlo
+  //   %hi16 = v4i16 trunc v4i32 %inhi
+  //   %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16
+  //   %res = v8i8 trunc v8i16 %in16
+  //
+  // Without this transform, the original truncate would end up being
+  // scalarized, which is pretty much always a last resort.
+  SDValue InVec = N->getOperand(0);
+  EVT InVT = InVec->getValueType(0);
+  EVT OutVT = N->getValueType(0);
+  unsigned NumElements = OutVT.getVectorNumElements();
+  // Widening should have already made sure this is a power-two vector
+  // if we're trying to split it at all. assert() that's true, just in case.
+  assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+
+  unsigned InElementSize = InVT.getVectorElementType().getSizeInBits();
+  unsigned OutElementSize = OutVT.getVectorElementType().getSizeInBits();
+
+  // If the input elements are only 1/2 the width of the result elements,
+  // just use the normal splitting. Our trick only work if there's room
+  // to split more than once.
+  if (InElementSize <= OutElementSize * 2)
+    return SplitVecOp_UnaryOp(N);
+  DebugLoc DL = N->getDebugLoc();
+
+  // Extract the halves of the input via extract_subvector.
+  EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
+                                 InVT.getVectorElementType(), NumElements/2);
+  SDValue InLoVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
+                                DAG.getIntPtrConstant(0));
+  SDValue InHiVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
+                                DAG.getIntPtrConstant(NumElements/2));
+  // Truncate them to 1/2 the element size.
+  EVT HalfElementVT = EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
+  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
+                                NumElements/2);
+  SDValue HalfLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InLoVec);
+  SDValue HalfHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InHiVec);
+  // Concatenate them to get the full intermediate truncation result.
+  EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements);
+  SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo,
+                                 HalfHi);
+  // Now finish up by truncating all the way down to the original result
+  // type. This should normally be something that ends up being legal directly,
+  // but in theory if a target has very wide vectors and an annoyingly
+  // restricted set of legal types, this split can chain to build things up.
+  return DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec);
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
   assert(N->getValueType(0).isVector() &&
          N->getOperand(0).getValueType().isVector() &&
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 1019b972e9..3149f19b78 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -223,9 +223,9 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
 
-    // Operations that we legalize using load/stores to the stack.
-    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 4*1 + 16*2 + 2*1 },
-    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 2*1 + 8*2 + 1 },
+    // Operations that we legalize using splitting.
+    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
+    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },
 
     // Vector float <-> i32 conversions.
     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
diff --git a/test/Analysis/CostModel/ARM/cast.ll b/test/Analysis/CostModel/ARM/cast.ll
index ba9d84cf3e..0cdd61cac4 100644
--- a/test/Analysis/CostModel/ARM/cast.ll
+++ b/test/Analysis/CostModel/ARM/cast.ll
@@ -175,9 +175,9 @@ define i32 @casts() {
   %rext_5 = zext <4 x i16> undef to <4 x i64>
 
   ; Vector cast cost of instructions lowering the cast to the stack.
-  ; CHECK: cost of 19 {{.*}} trunc
+  ; CHECK: cost of 3 {{.*}} trunc
   %r74 = trunc <8 x i32> undef to <8 x i8>
-  ; CHECK: cost of 38 {{.*}} trunc
+  ; CHECK: cost of 6 {{.*}} trunc
   %r75 = trunc <16 x i32> undef to <16 x i8>
 
   ; Floating point truncation costs.
diff --git a/test/CodeGen/ARM/vcvt-cost.ll b/test/CodeGen/ARM/vcvt-cost.ll
index 04619b9c6c..0d45c40b88 100644
--- a/test/CodeGen/ARM/vcvt-cost.ll
+++ b/test/CodeGen/ARM/vcvt-cost.ll
@@ -32,29 +32,22 @@ define void @func_cvt1(%TA0_5* %loadaddr, %TA1_5* %storeaddr) {
   store %TA1_5 %r, %TA1_5* %storeaddr
   ret void
 }
-;; We currently estimate the cost of this instruction as expensive. If lowering
-;; is improved the cost needs to change.
+
 %T0_51 = type <8 x i32>
 %T1_51 = type <8 x i8>
 ; CHECK: func_cvt51:
 define void @func_cvt51(%T0_51* %loadaddr, %T1_51* %storeaddr) {
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i16
   %v0 = load %T0_51* %loadaddr
 ; COST: func_cvt51
-; COST: cost of 19 {{.*}} trunc
+; COST: cost of 3 {{.*}} trunc
   %r = trunc %T0_51 %v0 to %T1_51
   store %T1_51 %r, %T1_51* %storeaddr
   ret void
 }
-;; We currently estimate the cost of this instruction as expensive. If lowering
-;; is improved the cost needs to change.
+
 %TT0_5 = type <16 x i8>
 %TT1_5 = type <16 x i32>
 ; CHECK: func_cvt52:
@@ -87,31 +80,20 @@ define void @func_cvt12(%TTA0_5* %loadaddr, %TTA1_5* %storeaddr) {
   store %TTA1_5 %r, %TTA1_5* %storeaddr
   ret void
 }
-;; We currently estimate the cost of this instruction as expensive. If lowering
-;; is improved the cost needs to change.
+
 %TT0_51 = type <16 x i32>
 %TT1_51 = type <16 x i8>
 ; CHECK: func_cvt512:
 define void @func_cvt512(%TT0_51* %loadaddr, %TT1_51* %storeaddr) {
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i16
+; CHECK: vmovn.i16
   %v0 = load %TT0_51* %loadaddr
 ; COST: func_cvt512
-; COST: cost of 38 {{.*}} trunc
+; COST: cost of 6 {{.*}} trunc
   %r = trunc %TT0_51 %v0 to %TT1_51
   store %TT1_51 %r, %TT1_51* %storeaddr
   ret void