3 files changed, 161 insertions, 45 deletions
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 0a41d6d957..ce4a2c9068 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -4293,28 +4293,11 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
 /// operands.
 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                          TargetLowering::DAGCombinerInfo &DCI) {
-  SelectionDAG &DAG = DCI.DAG;
-
   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
   if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) {
     SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
     if (Result.getNode()) return Result;
   }
-
-  // fold (add (arm_neon_vabd a, b) c) -> (arm_neon_vaba c, a, b)
-  EVT VT = N->getValueType(0);
-  if (N0.getOpcode() == ISD::INTRINSIC_WO_CHAIN && VT.isInteger()) {
-    unsigned IntNo = cast<ConstantSDNode>(N0.getOperand(0))->getZExtValue();
-    if (IntNo == Intrinsic::arm_neon_vabds)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), VT,
-                         DAG.getConstant(Intrinsic::arm_neon_vabas, MVT::i32),
-                         N1, N0.getOperand(1), N0.getOperand(2));
-    if (IntNo == Intrinsic::arm_neon_vabdu)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), VT,
-                         DAG.getConstant(Intrinsic::arm_neon_vabau, MVT::i32),
-                         N1, N0.getOperand(1), N0.getOperand(2));
-  }
-
   return SDValue();
 }
 
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index fb51a2f873..8c8d1d7b8a 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -1288,6 +1288,24 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                                    (ResTy (NEONvduplane (OpTy DPR_8:$src3),
                                                         imm:$lane)))))))]>;
 
+// Neon Intrinsic-Op instructions (VABA): double- and quad-register.
+class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType Ty, Intrinsic IntOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
+        [(set DPR:$dst, (Ty (OpNode DPR:$src1,  // e.g. add acc, (vabd a, b)
+                             (Ty (IntOp (Ty DPR:$src2), (Ty DPR:$src3))))))]>;
+class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType Ty, Intrinsic IntOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
+        [(set QPR:$dst, (Ty (OpNode QPR:$src1,  // e.g. add acc, (vabd a, b)
+                             (Ty (IntOp (Ty QPR:$src2), (Ty QPR:$src3))))))]>;
+
 // Neon 3-argument intrinsics, both double- and quad-register.
 // The destination register is also used as the first source operand register.
 class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@@ -1342,6 +1360,17 @@ class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
                               (TyD (NEONvduplane (TyD DPR_8:$src3),
                                                  imm:$lane))))))]>;
 
+// Long Intrinsic-Op vector operations with explicit extend (VABAL).
+class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                   InstrItinClass itin, string OpcodeStr, string Dt,
+                   ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp,
+                   SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
+        [(set QPR:$dst, (OpNode (TyQ QPR:$src1),  // e.g. add acc, zext(vabd)
+                                (TyQ (ExtOp (TyD (IntOp (TyD DPR:$src2),
+                                                        (TyD DPR:$src3)))))))]>;
 
 // Neon Long 3-argument intrinsic.  The destination register is
 // a quad-register and is also used as the first source operand register.
@@ -1433,6 +1462,19 @@ class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
   let isCommutable = Commutable;
 }
 
+// Long 3-register intrinsics with explicit extend (VABDL): ExtOp (IntOp D, D).
+class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                 InstrItinClass itin, string OpcodeStr, string Dt,
+                 ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp,
+                 bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
+        OpcodeStr, Dt, "$dst, $src1, $src2", "",
+        [(set QPR:$dst, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$src1),
+                                                (TyD DPR:$src2))))))]> {
+  let isCommutable = Commutable;
+}
+
 // Long 3-register intrinsics.
 class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
               InstrItinClass itin, string OpcodeStr, string Dt,
@@ -1918,6 +1960,21 @@ multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
                       v8i16, v8i8, IntOp, Commutable>;
 }
 
+// ....with explicit extend (VABDL); Dt suffix = source element size.
+multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       Intrinsic IntOp, SDNode ExtOp, bit Commutable = 0> {
+  def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin,
+                         OpcodeStr, !strconcat(Dt, "8"),
+                         v8i16, v8i8, IntOp, ExtOp, Commutable>;
+  def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin, 
+                         OpcodeStr, !strconcat(Dt, "16"),
+                         v4i32, v4i16, IntOp, ExtOp, Commutable>;
+  def v2i64 : N3VLIntExt<op24, op23, 0b10, op11_8, op4, itin,
+                         OpcodeStr, !strconcat(Dt, "32"),
+                         v2i64, v2i32, IntOp, ExtOp, Commutable>;
+}
+
 
 // Neon Wide 3-register vector intrinsics,
 //   source operand element sizes of 8, 16 and 32 bits:
@@ -1975,6 +2032,29 @@ multiclass N3VMulOpSL_HS<bits<4> op11_8,
                           mul, ShOp>;
 }
 
+// Neon Intrinsic-Op vector operations,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                        InstrItinClass itinD, InstrItinClass itinQ,
+                        string OpcodeStr, string Dt, Intrinsic IntOp,
+                        SDNode OpNode> {
+  // 64-bit vector types (N3VDIntOp, scheduled with itinD).
+  def v8i8  : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, IntOp, OpNode>;
+  def v4i16 : N3VDIntOp<op24, op23, 0b01, op11_8, op4, itinD,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp, OpNode>;
+  def v2i32 : N3VDIntOp<op24, op23, 0b10, op11_8, op4, itinD,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp, OpNode>;
+
+  // 128-bit vector types (N3VQIntOp, scheduled with itinQ).
+  def v16i8 : N3VQIntOp<op24, op23, 0b00, op11_8, op4, itinQ,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, IntOp, OpNode>;
+  def v8i16 : N3VQIntOp<op24, op23, 0b01, op11_8, op4, itinQ,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, IntOp, OpNode>;
+  def v4i32 : N3VQIntOp<op24, op23, 0b10, op11_8, op4, itinQ,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, IntOp, OpNode>;
+}
+
 // Neon 3-argument intrinsics,
 //   element sizes of 8, 16 and 32 bits:
 multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
@@ -2050,6 +2130,21 @@ multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
                        OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
 }
 
+// ....with explicit extend (VABAL); Dt suffix = source element size.
+multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                            InstrItinClass itin, string OpcodeStr, string Dt,
+                            Intrinsic IntOp, SDNode ExtOp, SDNode OpNode> {
+  def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin,
+                           OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8,
+                           IntOp, ExtOp, OpNode>;
+  def v4i32 : N3VLIntExtOp<op24, op23, 0b01, op11_8, op4, itin,
+                           OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16,
+                           IntOp, ExtOp, OpNode>;
+  def v2i64 : N3VLIntExtOp<op24, op23, 0b10, op11_8, op4, itin,
+                           OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32,
+                           IntOp, ExtOp, OpNode>;
+}
+
 
 // Neon 2-register vector intrinsics,
 //   element sizes of 8, 16 and 32 bits:
@@ -2765,32 +2860,32 @@ def  VBITq    : N3VX<1, 0, 0b10, 0b0001, 1, 1,
 //   VABD     : Vector Absolute Difference
 defm VABDs    : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm,
                            IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
-                           "vabd", "s", int_arm_neon_vabds, 0>;
+                           "vabd", "s", int_arm_neon_vabds, 1>;
 defm VABDu    : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm,
                            IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
-                           "vabd", "u", int_arm_neon_vabdu, 0>;
+                           "vabd", "u", int_arm_neon_vabdu, 1>;
 def  VABDfd   : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND,
-                        "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 0>;
+                        "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>;
 def  VABDfq   : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ,
-                        "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 0>;
+                        "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>;
 
 //   VABDL    : Vector Absolute Difference Long (Q = | D - D |)
-defm VABDLs   : N3VLInt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q,
-                            "vabdl", "s", int_arm_neon_vabdls, 0>;
-defm VABDLu   : N3VLInt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q,
-                            "vabdl", "u", int_arm_neon_vabdlu, 0>;
+defm VABDLs   : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,  // zext: |a-b| >= 0
+                               "vabdl", "s", int_arm_neon_vabds, zext, 1>;
+defm VABDLu   : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
+                               "vabdl", "u", int_arm_neon_vabdu, zext, 1>;
 
 //   VABA     : Vector Absolute Difference and Accumulate
-defm VABAs    : N3VInt3_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
-                            "vaba", "s", int_arm_neon_vabas>;
-defm VABAu    : N3VInt3_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
-                            "vaba", "u", int_arm_neon_vabau>;
+defm VABAs    : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
+                             "vaba", "s", int_arm_neon_vabds, add>;
+defm VABAu    : N3VIntOp_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
+                             "vaba", "u", int_arm_neon_vabdu, add>;
 
 //   VABAL    : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
-defm VABALs   : N3VLInt3_QHS<0,1,0b0101,0, IIC_VABAD, IIC_VABAD,
-                             "vabal", "s", int_arm_neon_vabals>;
-defm VABALu   : N3VLInt3_QHS<1,1,0b0101,0, IIC_VABAD, IIC_VABAD,
-                             "vabal", "u", int_arm_neon_vabalu>;
+defm VABALs   : N3VLIntExtOp_QHS<0,1,0b0101,0, IIC_VABAD,  // zext: |a-b| >= 0
+                                 "vabal", "s", int_arm_neon_vabds, zext, add>;
+defm VABALu   : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD,
+                                 "vabal", "u", int_arm_neon_vabdu, zext, add>;
 
 // Vector Maximum and Minimum.
 
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index e625d413f5..fd64460d6f 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -81,21 +81,21 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
     } else if (Name.compare(5, 9, "arm.neon.", 9) == 0) {
       if (((Name.compare(14, 5, "vmovl", 5) == 0 ||
             Name.compare(14, 5, "vaddl", 5) == 0 ||
-            Name.compare(14, 5, "vsubl", 5) == 0) &&
-           (Name.compare(19, 2, "s.", 2) == 0 ||
-            Name.compare(19, 2, "u.", 2) == 0)) ||
-
-          ((Name.compare(14, 5, "vaddw", 5) == 0 ||
-            Name.compare(14, 5, "vsubw", 5) == 0) &&
-           (Name.compare(19, 2, "s.", 2) == 0 ||
-            Name.compare(19, 2, "u.", 2) == 0)) ||
-
-          ((Name.compare(14, 5, "vmull", 5) == 0 ||
+            Name.compare(14, 5, "vsubl", 5) == 0 ||
+            Name.compare(14, 5, "vaddw", 5) == 0 ||
+            Name.compare(14, 5, "vsubw", 5) == 0 ||
+            Name.compare(14, 5, "vmull", 5) == 0 ||
             Name.compare(14, 5, "vmlal", 5) == 0 ||
-            Name.compare(14, 5, "vmlsl", 5) == 0) &&
+            Name.compare(14, 5, "vmlsl", 5) == 0 ||
+            Name.compare(14, 5, "vabdl", 5) == 0 ||
+            Name.compare(14, 5, "vabal", 5) == 0) &&
            (Name.compare(19, 2, "s.", 2) == 0 ||
             Name.compare(19, 2, "u.", 2) == 0)) ||
 
+          (Name.compare(14, 4, "vaba", 4) == 0 &&
+           (Name.compare(18, 2, "s.", 2) == 0 ||
+            Name.compare(18, 2, "u.", 2) == 0)) ||
+
           (Name.compare(14, 6, "vmovn.", 6) == 0)) {
 
         // Calls to these are transformed into IR without intrinsics.
@@ -391,6 +391,35 @@ static void ExtendNEONArgs(CallInst *CI, Value *Arg0, Value *Arg1,
   }
 }
 
+/// CallVABD - As part of expanding a call to one of the old NEON vabdl, vaba,
+/// or vabal intrinsics, construct a call to a vabd intrinsic.  Examine the
+/// name of the old intrinsic to determine whether to use a signed or unsigned
+/// vabd intrinsic.  Get the type from the old call instruction, adjusted for
+/// half-size vector elements if the old intrinsic was vabdl or vabal.
+static Instruction *CallVABD(CallInst *CI, Value *Arg0, Value *Arg1) {
+  Function *F = CI->getCalledFunction();
+  const std::string& Name = F->getName();
+  bool isLong = (Name.at(18) == 'l');  // vabdl/vabal, as opposed to vaba
+  bool isSigned = (Name.at(isLong ? 19 : 18) == 's');  // otherwise 'u'
+
+  Intrinsic::ID intID;
+  if (isSigned)
+    intID = Intrinsic::arm_neon_vabds;
+  else
+    intID = Intrinsic::arm_neon_vabdu;
+
+  const Type *Ty = CI->getType();  // result type of the old call
+  if (isLong)  // the vabd itself is done on the narrow (pre-extension) type
+    Ty = VectorType::getTruncatedElementVectorType(cast<const VectorType>(Ty));
+
+  Function *VABD = Intrinsic::getDeclaration(F->getParent(), intID, &Ty, 1);
+  Value *Operands[2];
+  Operands[0] = Arg0;
+  Operands[1] = Arg1;
+  return CallInst::Create(VABD, Operands, Operands+2, 
+                          "upgraded."+CI->getName(), CI);
+}
+
 // UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the 
 // upgraded intrinsic. All argument and return casting must be provided in 
 // order to seamlessly integrate with existing context.
@@ -434,6 +463,15 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
         Instruction *MulI = BinaryOperator::CreateMul(V0, V1, "", CI);
         NewI = BinaryOperator::CreateSub(CI->getArgOperand(0), MulI,
                                          "upgraded."+CI->getName(), CI);
+      } else if (Name.compare(14, 4, "vabd", 4) == 0) {
+        NewI = CallVABD(CI, CI->getArgOperand(0), CI->getArgOperand(1));
+        NewI = new ZExtInst(NewI, CI->getType(), "upgraded."+CI->getName(), CI);
+      } else if (Name.compare(14, 4, "vaba", 4) == 0) {
+        NewI = CallVABD(CI, CI->getArgOperand(1), CI->getArgOperand(2));
+        if (Name.at(18) == 'l')
+          NewI = new ZExtInst(NewI, CI->getType(), "", CI);
+        NewI = BinaryOperator::CreateAdd(CI->getArgOperand(0), NewI,
+                                         "upgraded."+CI->getName(), CI);
       } else if (Name.compare(14, 6, "vmovn.", 6) == 0) {
         NewI = new TruncInst(CI->getArgOperand(0), CI->getType(),
                              "upgraded." + CI->getName(), CI);
@@ -675,7 +713,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
   }
 
   switch (NewFn->getIntrinsicID()) {
-  default:  llvm_unreachable("Unknown function for CallInst upgrade.");
+  default: llvm_unreachable("Unknown function for CallInst upgrade.");
   case Intrinsic::arm_neon_vld1:
   case Intrinsic::arm_neon_vld2:
   case Intrinsic::arm_neon_vld3: