author | Bob Wilson <bob.wilson@apple.com> | 2009-06-22 23:27:02 +0000
committer | Bob Wilson <bob.wilson@apple.com> | 2009-06-22 23:27:02 +0000
commit | 5bafff36c798608a189c517d37527e4a38863071
tree | 79bd2abbc5253e6f00db07023cf7d829cbcdee5a /lib
parent | 5de83afcdc3f4f0edf8caacba523f5d05ee48048
Add support for ARM's Advanced SIMD (NEON) instruction set.
This is still a work in progress but most of the NEON instruction set
is supported.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@73919 91177308-0d34-0410-b5e6-96231b3b80d8
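For context (this example is not part of the commit), the kind of source code a NEON-enabled backend ultimately targets looks like the C sketch below, written with the standard arm_neon.h intrinsics. With NEON support in the code generator, a 128-bit integer add like this can be selected to a single vector add on a Q register instead of being scalarized; the v2f64/v4i32 calling-convention changes in the diff cover passing and returning such 128-bit values.

#include <arm_neon.h>

/* Add two 128-bit vectors of four 32-bit ints.  A NEON-capable backend
   can lower the vector add to one VADD.I32 on a Q register. */
int32x4_t add_four_ints(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}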
Diffstat (limited to 'lib')
-rw-r--r-- | lib/Target/ARM/ARMCallingConv.td | 41
-rw-r--r-- | lib/Target/ARM/ARMISelDAGToDAG.cpp | 62
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.cpp | 1077
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.h | 72
-rw-r--r-- | lib/Target/ARM/ARMInstrFormats.td | 119
-rw-r--r-- | lib/Target/ARM/ARMInstrInfo.cpp | 8
-rw-r--r-- | lib/Target/ARM/ARMInstrInfo.h | 6
-rw-r--r-- | lib/Target/ARM/ARMInstrInfo.td | 10
-rw-r--r-- | lib/Target/ARM/ARMInstrNEON.td | 1665
-rw-r--r-- | lib/Target/ARM/ARMRegisterInfo.td | 127
-rw-r--r-- | lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp | 18
-rw-r--r-- | lib/Target/ARM/README.txt | 20
12 files changed, 3100 insertions, 125 deletions
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 47151e667c..8a4c741faf 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -24,19 +24,29 @@ def CC_ARM_APCS : CallingConv<[
   CCIfType<[i8, i16], CCPromoteToType<i32>>,
 
-  // f64 is passed in pairs of GPRs, possibly split onto the stack
-  CCIfType<[f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
+  CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
 
   CCIfType<[f32], CCBitConvertToType<i32>>,
   CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
 
   CCIfType<[i32], CCAssignToStack<4, 4>>,
-  CCIfType<[f64], CCAssignToStack<8, 4>>
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+  CCIfType<[v2f64], CCAssignToStack<16, 4>>
 ]>;
 
 def RetCC_ARM_APCS : CallingConv<[
   CCIfType<[f32], CCBitConvertToType<i32>>,
-  CCIfType<[f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
 
   CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
   CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
@@ -59,7 +69,8 @@ def CC_ARM_AAPCS_Common : CallingConv<[
                   CCAssignToReg<[R0, R1, R2, R3]>>>,
 
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
-  CCIfType<[f64], CCAssignToStack<8, 8>>
+  CCIfType<[f64], CCAssignToStack<8, 8>>,
+  CCIfType<[v2f64], CCAssignToStack<16, 8>>
 ]>;
 
 def RetCC_ARM_AAPCS_Common : CallingConv<[
@@ -72,13 +83,21 @@ def RetCC_ARM_AAPCS_Common : CallingConv<[
 //===----------------------------------------------------------------------===//
 
 def CC_ARM_AAPCS : CallingConv<[
-  CCIfType<[f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
   CCDelegateTo<CC_ARM_AAPCS_Common>
 ]>;
 
 def RetCC_ARM_AAPCS : CallingConv<[
-  CCIfType<[f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
   CCDelegateTo<RetCC_ARM_AAPCS_Common>
 ]>;
@@ -88,6 +107,10 @@ def RetCC_ARM_AAPCS : CallingConv<[
 //===----------------------------------------------------------------------===//
 
 def CC_ARM_AAPCS_VFP : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
   CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7,
                                  S8, S9, S10, S11, S12, S13, S14, S15]>>,
@@ -95,6 +118,10 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
 ]>;
 
 def RetCC_ARM_AAPCS_VFP : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
   CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7,
                                  S8, S9, S10, S11, S12, S13, S14, S15]>>,
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index d3c3047fc9..ee9dadff15 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -32,6 +32,9 @@
 #include "llvm/Support/Debug.h"
 
 using namespace llvm;
 
+static const unsigned arm_dsubreg_0 = 5;
+static const unsigned arm_dsubreg_1 = 6;
+
 //===--------------------------------------------------------------------===//
 /// ARMDAGToDAGISel - ARM specific code to select ARM machine
 /// instructions for SelectionDAG operations.
@@ -918,6 +921,65 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
     return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl,
                                  MVT::Other, Ops, 3);
   }
+
+  case ISD::CONCAT_VECTORS: {
+    MVT VT = Op.getValueType();
+    assert(VT.is128BitVector() && Op.getNumOperands() == 2 &&
+           "unexpected CONCAT_VECTORS");
+    SDValue N0 = Op.getOperand(0);
+    SDValue N1 = Op.getOperand(1);
+    SDNode *Result =
+      CurDAG->getTargetNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT);
+    if (N0.getOpcode() != ISD::UNDEF)
+      Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
+                                     SDValue(Result, 0), N0,
+                                     CurDAG->getTargetConstant(arm_dsubreg_0,
+                                                               MVT::i32));
+    if (N1.getOpcode() != ISD::UNDEF)
+      Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
+                                     SDValue(Result, 0), N1,
+                                     CurDAG->getTargetConstant(arm_dsubreg_1,
+                                                               MVT::i32));
+    return Result;
+  }
+
+  case ISD::VECTOR_SHUFFLE: {
+    MVT VT = Op.getValueType();
+
+    // Match 128-bit splat to VDUPLANEQ.  (This could be done with a Pat in
+    // ARMInstrNEON.td but it is awkward because the shuffle mask needs to be
+    // transformed first into a lane number and then to both a subregister
+    // index and an adjusted lane number.)  If the source operand is a
+    // SCALAR_TO_VECTOR, leave it so it will be matched later as a VDUP.
+    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+    if (VT.is128BitVector() && SVOp->isSplat() &&
+        Op.getOperand(0).getOpcode() != ISD::SCALAR_TO_VECTOR &&
+        Op.getOperand(1).getOpcode() == ISD::UNDEF) {
+      unsigned LaneVal = SVOp->getSplatIndex();
+
+      MVT HalfVT;
+      unsigned Opc = 0;
+      switch (VT.getVectorElementType().getSimpleVT()) {
+      default: assert(false && "unhandled VDUP splat type");
+      case MVT::i8:  Opc = ARM::VDUPLN8q;  HalfVT = MVT::v8i8;  break;
+      case MVT::i16: Opc = ARM::VDUPLN16q; HalfVT = MVT::v4i16; break;
+      case MVT::i32: Opc = ARM::VDUPLN32q; HalfVT = MVT::v2i32; break;
+      case MVT::f32: Opc = ARM::VDUPLNfq;  HalfVT = MVT::v2f32; break;
+      }
+
+      // The source operand needs to be changed to a subreg of the original
+      // 128-bit operand, and the lane number needs to be adjusted accordingly.
+      unsigned NumElts = VT.getVectorNumElements() / 2;
+      unsigned SRVal = (LaneVal < NumElts ? arm_dsubreg_0 : arm_dsubreg_1);
+      SDValue SR = CurDAG->getTargetConstant(SRVal, MVT::i32);
+      SDValue NewLane = CurDAG->getTargetConstant(LaneVal % NumElts, MVT::i32);
+      SDNode *SubReg = CurDAG->getTargetNode(TargetInstrInfo::EXTRACT_SUBREG,
+                                             dl, HalfVT, N->getOperand(0), SR);
+      return CurDAG->SelectNodeTo(N, Opc, VT, SDValue(SubReg, 0), NewLane);
+    }
+
+    break;
+  }
   }
 
   return SelectCode(Op);
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 2443625d4b..29d3da2b79 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -56,6 +56,52 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                                        ISD::ArgFlagsTy &ArgFlags,
                                        CCState &State);
 
+void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
+                                       MVT PromotedBitwiseVT) {
+  if (VT != PromotedLdStVT) {
+    setOperationAction(ISD::LOAD, VT, Promote);
+    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
+
+    setOperationAction(ISD::STORE, VT, Promote);
+    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
+  }
+
+  MVT ElemTy = VT.getVectorElementType();
+  if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
+    setOperationAction(ISD::VSETCC, VT, Custom);
+  if (ElemTy == MVT::i8 || ElemTy == MVT::i16)
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+  setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+  if (VT.isInteger()) {
+    setOperationAction(ISD::SHL, VT, Custom);
+    setOperationAction(ISD::SRA, VT, Custom);
+    setOperationAction(ISD::SRL, VT, Custom);
+  }
+
+  // Promote all bit-wise operations.
+  if (VT.isInteger() && VT != PromotedBitwiseVT) {
+    setOperationAction(ISD::AND, VT, Promote);
+    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
+    setOperationAction(ISD::OR,  VT, Promote);
+    AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
+    setOperationAction(ISD::XOR, VT, Promote);
+    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
+  }
+}
+
+void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, ARM::DPRRegisterClass);
+  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
+}
+
+void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, ARM::QPRRegisterClass);
+  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
+}
+
 ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     : TargetLowering(TM), ARMPCLabelIndex(0) {
   Subtarget = &TM.getSubtarget<ARMSubtarget>();
@@ -152,6 +198,30 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   }
+
+  if (Subtarget->hasNEON()) {
+    addDRTypeForNEON(MVT::v2f32);
+    addDRTypeForNEON(MVT::v8i8);
+    addDRTypeForNEON(MVT::v4i16);
+    addDRTypeForNEON(MVT::v2i32);
+    addDRTypeForNEON(MVT::v1i64);
+
+    addQRTypeForNEON(MVT::v4f32);
+    addQRTypeForNEON(MVT::v2f64);
+    addQRTypeForNEON(MVT::v16i8);
+    addQRTypeForNEON(MVT::v8i16);
+    addQRTypeForNEON(MVT::v4i32);
+    addQRTypeForNEON(MVT::v2i64);
+
+    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+    setTargetDAGCombine(ISD::SHL);
+    setTargetDAGCombine(ISD::SRL);
+    setTargetDAGCombine(ISD::SRA);
+    setTargetDAGCombine(ISD::SIGN_EXTEND);
+    setTargetDAGCombine(ISD::ZERO_EXTEND);
+    setTargetDAGCombine(ISD::ANY_EXTEND);
+  }
+
   computeRegisterProperties();
 
   // ARM does not have f32 extending load.
@@ -352,6 +422,36 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::FMDRR:         return "ARMISD::FMDRR";
   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
+
+  case ARMISD::VCEQ:          return "ARMISD::VCEQ";
+  case ARMISD::VCGE:          return "ARMISD::VCGE";
+  case ARMISD::VCGEU:         return "ARMISD::VCGEU";
+  case ARMISD::VCGT:          return "ARMISD::VCGT";
+  case ARMISD::VCGTU:         return "ARMISD::VCGTU";
+  case ARMISD::VTST:          return "ARMISD::VTST";
+
+  case ARMISD::VSHL:          return "ARMISD::VSHL";
+  case ARMISD::VSHRs:         return "ARMISD::VSHRs";
+  case ARMISD::VSHRu:         return "ARMISD::VSHRu";
+  case ARMISD::VSHLLs:        return "ARMISD::VSHLLs";
+  case ARMISD::VSHLLu:        return "ARMISD::VSHLLu";
+  case ARMISD::VSHLLi:        return "ARMISD::VSHLLi";
+  case ARMISD::VSHRN:         return "ARMISD::VSHRN";
+  case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
+  case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
+  case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
+  case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
+  case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
+  case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
+  case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
+  case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
+  case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
+  case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
+  case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
+  case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
+  case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
+  case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
+  case ARMISD::VDUPLANEQ:     return "ARMISD::VDUPLANEQ";
   }
 }
 
@@ -423,63 +523,93 @@ static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
 
 #include "ARMGenCallingConv.inc"
 
 // APCS f64 is in register pairs, possibly split to stack
-static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
-                                   CCValAssign::LocInfo &LocInfo,
-                                   ISD::ArgFlagsTy &ArgFlags,
-                                   CCState &State) {
-  static const unsigned HiRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
-  static const unsigned LoRegList[] = { ARM::R1,
-                                        ARM::R2,
-                                        ARM::R3,
-                                        ARM::NoRegister };
-
-  unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 4);
-  if (Reg == 0)
-    return false; // we didn't handle it
+static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          CCState &State, bool CanFail) {
+  static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+  // Try to get the first register.
+  if (unsigned Reg = State.AllocateReg(RegList, 4))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else {
+    // For the 2nd half of a v2f64, do not fail.
+    if (CanFail)
+      return false;
 
-  unsigned i;
-  for (i = 0; i < 4; ++i)
-    if (HiRegList[i] == Reg)
-      break;
+    // Put the whole thing on the stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 4),
+                                           LocVT, LocInfo));
+    return true;
+  }
 
-  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
-  if (LoRegList[i] != ARM::NoRegister)
-    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
-                                           MVT::i32, LocInfo));
+  // Try to get the second register.
+  if (unsigned Reg = State.AllocateReg(RegList, 4))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
   else
     State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
                                            State.AllocateStack(4, 4),
-                                           MVT::i32, LocInfo));
+                                           LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags,
+                                   CCState &State) {
+  if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  if (LocVT == MVT::v2f64 &&
+      !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+    return false;
   return true;  // we handled it
 }
 
 // AAPCS f64 is in aligned register pairs
-static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
-                                    CCValAssign::LocInfo &LocInfo,
-                                    ISD::ArgFlagsTy &ArgFlags,
-                                    CCState &State) {
+static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                           CCValAssign::LocInfo &LocInfo,
+                           CCState &State, bool CanFail) {
   static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
   static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
 
   unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2);
-  if (Reg == 0)
-    return false; // we didn't handle it
+  if (Reg == 0) {
+    // For the 2nd half of a v2f64, do not just fail.
+    if (CanFail)
+      return false;
+
+    // Put the whole thing on the stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 8),
+                                           LocVT, LocInfo));
+    return true;
+  }
 
   unsigned i;
   for (i = 0; i < 2; ++i)
     if (HiRegList[i] == Reg)
      break;
 
-  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
   State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
-                                         MVT::i32, LocInfo));
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                    CCValAssign::LocInfo &LocInfo,
+                                    ISD::ArgFlagsTy &ArgFlags,
+                                    CCState &State) {
+  if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  if (LocVT == MVT::v2f64 &&
+      !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+    return false;
   return true;  // we handled it
 }
 
-static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
-                                      CCValAssign::LocInfo &LocInfo,
-                                      ISD::ArgFlagsTy &ArgFlags,
-                                      CCState &State) {
+static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                         CCValAssign::LocInfo &LocInfo, CCState &State) {
   static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
   static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
 
@@ -492,9 +622,20 @@ static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
     if (HiRegList[i] == Reg)
       break;
 
-  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
   State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
-                                         MVT::i32, LocInfo));
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                      CCValAssign::LocInfo &LocInfo,
+                                      ISD::ArgFlagsTy &ArgFlags,
+                                      CCState &State) {
+  if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
+  if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
   return true;  // we handled it
 }
 
@@ -558,7 +699,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
 
     SDValue Val;
     if (VA.needsCustom()) {
-      // Handle f64 as custom.
+      // Handle f64 or half of a v2f64.
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                       InFlag);
       Chain = Lo.getValue(1);
@@ -569,6 +710,24 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
       Chain = Hi.getValue(1);
       InFlag = Hi.getValue(2);
       Val = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, Lo, Hi);
+
+      if (VA.getLocVT() == MVT::v2f64) {
+        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+                          DAG.getConstant(0, MVT::i32));
+
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+        Chain = Lo.getValue(1);
+        InFlag = Lo.getValue(2);
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+        Chain = Hi.getValue(1);
+        InFlag = Hi.getValue(2);
+        Val = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, Lo, Hi);
+        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+                          DAG.getConstant(1, MVT::i32));
+      }
     } else {
       Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                                InFlag);
@@ -625,6 +784,31 @@ ARMTargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
                       PseudoSourceValue::getStack(), LocMemOffset);
 }
 
+void ARMTargetLowering::PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
+                                         SDValue Chain, SDValue &Arg,
+                                         RegsToPassVector &RegsToPass,
+                                         CCValAssign &VA, CCValAssign &NextVA,
+                                         SDValue &StackPtr,
+                                         SmallVector<SDValue, 8> &MemOpChains,
+                                         ISD::ArgFlagsTy Flags) {
+  DebugLoc dl = TheCall->getDebugLoc();
+
+  SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
+                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
+  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
+
+  if (NextVA.isRegLoc())
+    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1)));
+  else {
+    assert(NextVA.isMemLoc());
+    if (StackPtr.getNode() == 0)
+      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+    MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, NextVA,
+                                           Chain, fmrrd.getValue(1), Flags));
+  }
+}
+
 /// LowerCALL - Lowering a ISD::CALL node into a callseq_start <-
 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
 /// nodes.
@@ -651,7 +835,7 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
 
   SDValue StackPtr = DAG.getRegister(ARM::SP, MVT::i32);
 
-  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  RegsToPassVector RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
 
   // Walk the register/memloc assignments, inserting copies/loads.  In the case
@@ -681,22 +865,32 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
       break;
     }
 
-    // f64 is passed in i32 pairs and must be combined
+    // f64 and v2f64 are passed in i32 pairs and must be split into pieces
     if (VA.needsCustom()) {
-      SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
-                                  DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
-      RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
-      VA = ArgLocs[++i]; // skip ahead to next loc
-      if (VA.isRegLoc())
-        RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(1)));
-      else {
-        assert(VA.isMemLoc());
-        if (StackPtr.getNode() == 0)
-          StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
-
-        MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
-                                               Chain, fmrrd.getValue(1),
-                                               Flags));
+      if (VA.getLocVT() == MVT::v2f64) {
+        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                  DAG.getConstant(0, MVT::i32));
+        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                  DAG.getConstant(1, MVT::i32));
+
+        PassF64ArgInRegs(TheCall, DAG, Chain, Op0, RegsToPass,
+                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+
+        VA = ArgLocs[++i]; // skip ahead to next loc
+        if (VA.isRegLoc()) {
+          PassF64ArgInRegs(TheCall, DAG, Chain, Op1, RegsToPass,
+                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+        } else {
+          assert(VA.isMemLoc());
+          if (StackPtr.getNode() == 0)
+            StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+          MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
+                                                 Chain, Op1, Flags));
+        }
+      } else {
+        PassF64ArgInRegs(TheCall, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+                         StackPtr, MemOpChains, Flags);
       }
     } else if (VA.isRegLoc()) {
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
@@ -864,9 +1058,28 @@ SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
       break;
    }
 
-    // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
-    // available.
     if (VA.needsCustom()) {
+      if (VA.getLocVT() == MVT::v2f64) {
+        // Extract the first half and return it in two registers.
+        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                   DAG.getConstant(0, MVT::i32));
+        SDValue HalfGPRs = DAG.getNode(ARMISD::FMRRD, dl,
+                                       DAG.getVTList(MVT::i32, MVT::i32), Half);
+
+        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
+        Flag = Chain.getValue(1);
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                                 HalfGPRs.getValue(1), Flag);
+        Flag = Chain.getValue(1);
+        VA = RVLocs[++i]; // skip ahead to next loc
+
+        // Extract the 2nd half and fall through to handle it as an f64 value.
+        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                          DAG.getConstant(1, MVT::i32));
+      }
+      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
+      // available.
       SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
                                   DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
@@ -1117,6 +1330,40 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
 }
 
 SDValue
+ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
+                                        SDValue &Root, SelectionDAG &DAG,
+                                        DebugLoc dl) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  TargetRegisterClass *RC;
+  if (AFI->isThumbFunction())
+    RC = ARM::tGPRRegisterClass;
+  else
+    RC = ARM::GPRRegisterClass;
+
+  // Transform the arguments stored in physical registers into virtual ones.
+  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+
+  SDValue ArgValue2;
+  if (NextVA.isMemLoc()) {
+    unsigned ArgSize = NextVA.getLocVT().getSizeInBits()/8;
+    MachineFrameInfo *MFI = MF.getFrameInfo();
+    int FI = MFI->CreateFixedObject(ArgSize, NextVA.getLocMemOffset());
+
+    // Create load node to retrieve arguments from the stack.
+    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+    ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, NULL, 0);
+  } else {
+    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
+    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+  }
+
+  return DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, ArgValue, ArgValue2);
+}
+
+SDValue
 ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -1141,47 +1388,45 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
 
     // Arguments stored in registers.
     if (VA.isRegLoc()) {
       MVT RegVT = VA.getLocVT();
-      TargetRegisterClass *RC;
-      if (AFI->isThumbFunction())
-        RC = ARM::tGPRRegisterClass;
-      else
-        RC = ARM::GPRRegisterClass;
-
-      if (FloatABIType == FloatABI::Hard) {
-        if (RegVT == MVT::f32)
-          RC = ARM::SPRRegisterClass;
-        else if (RegVT == MVT::f64)
-          RC = ARM::DPRRegisterClass;
-      } else if (RegVT == MVT::f64) {
-        // f64 is passed in pairs of GPRs and must be combined.
+      SDValue ArgValue;
+      if (VA.needsCustom()) {
+        // f64 and vector types are split up into multiple registers or
+        // combinations of registers and stack slots.
         RegVT = MVT::i32;
-      } else if (!((RegVT == MVT::i32) || (RegVT == MVT::f32)))
-        assert(0 && "RegVT not supported by FORMAL_ARGUMENTS Lowering");
-
-      // Transform the arguments stored in physical registers into virtual ones.
-      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
-      SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
+        if (VA.getLocVT() == MVT::v2f64) {
+          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
+                                                   Root, DAG, dl);
+          VA = ArgLocs[++i]; // skip ahead to next loc
+          SDValue ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
+                                                   Root, DAG, dl);
+          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+                                 ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
+          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+                                 ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
+        } else
+          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Root, DAG, dl);
 
-      // f64 is passed in i32 pairs and must be combined.
-      if (VA.needsCustom()) {
-        SDValue ArgValue2;
+      } else {
+        TargetRegisterClass *RC;
+        if (FloatABIType == FloatABI::Hard && RegVT == MVT::f32)
+          RC = ARM::SPRRegisterClass;
+        else if (FloatABIType == FloatABI::Hard && RegVT == MVT::f64)
+          RC = ARM::DPRRegisterClass;
+        else if (AFI->isThumbFunction())
+          RC = ARM::tGPRRegisterClass;
+        else
+          RC = ARM::GPRRegisterClass;
 
-        VA = ArgLocs[++i]; // skip ahead to next loc
-        if (VA.isMemLoc()) {
-          // must be APCS to split like this
-          unsigned ArgSize = VA.getLocVT().getSizeInBits()/8;
-          int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset());
-
-          // Create load node to retrieve arguments from the stack.
-          SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
-          ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, NULL, 0);
-        } else {
-          Reg = MF.addLiveIn(VA.getLocReg(), RC);
-          ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
-        }
+        assert((RegVT == MVT::i32 || RegVT == MVT::f32 ||
+                (FloatABIType == FloatABI::Hard && RegVT == MVT::f64)) &&
+               "RegVT not supported by FORMAL_ARGUMENTS Lowering");
 
-        ArgValue = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64,
-                               ArgValue, ArgValue2);
+        // Transform the arguments in physical registers into virtual ones.
+        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+        ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
       }
 
       // If this is an 8 or 16-bit value, it is really passed promoted
@@ -1638,8 +1883,78 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
 }
 
-static SDValue ExpandSRx(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) {
-  assert(N->getValueType(0) == MVT::i64 &&
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDValue getZeroVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  // Zero vectors are used to represent vector negation and in those cases
+  // will be implemented with the NEON VNEG instruction.  However, VNEG does
+  // not support i64 elements, so sometimes the zero vectors will need to be
+  // explicitly constructed.  For those cases, and potentially other uses in
+  // the future, always build zero vectors as <4 x i32> or <2 x i32> bitcasted
+  // to their dest type.  This ensures they get CSE'd.
+  SDValue Vec;
+  SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+  if (VT.getSizeInBits() == 64)
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
+  else
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+///
+static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+  // type.  This ensures they get CSE'd.
+  SDValue Vec;
+  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+  if (VT.getSizeInBits() == 64)
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
+  else
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}