diff options
-rw-r--r-- | lib/Target/PowerPC/PPC.td | 94 | ||||
-rw-r--r-- | lib/Target/PowerPC/PPCISelLowering.cpp | 207 | ||||
-rw-r--r-- | lib/Target/PowerPC/PPCISelLowering.h | 7 | ||||
-rw-r--r-- | lib/Target/PowerPC/PPCInstrAltivec.td | 3 | ||||
-rw-r--r-- | lib/Target/PowerPC/PPCInstrInfo.td | 28 | ||||
-rw-r--r-- | lib/Target/PowerPC/PPCSubtarget.cpp | 5 | ||||
-rw-r--r-- | lib/Target/PowerPC/PPCSubtarget.h | 7 | ||||
-rw-r--r-- | test/CodeGen/PowerPC/recipest.ll | 178 |
8 files changed, 499 insertions, 30 deletions
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td index a1ea2297bf..98869eaa17 100644 --- a/lib/Target/PowerPC/PPC.td +++ b/lib/Target/PowerPC/PPC.td @@ -57,6 +57,16 @@ def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true", "Enable the MFOCRF instruction">; def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true", "Enable the fsqrt instruction">; +def FeatureFRE : SubtargetFeature<"fre", "HasFRE", "true", + "Enable the fre instruction">; +def FeatureFRES : SubtargetFeature<"fres", "HasFRES", "true", + "Enable the fres instruction">; +def FeatureFRSQRTE : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true", + "Enable the frsqrte instruction">; +def FeatureFRSQRTES : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true", + "Enable the frsqrtes instruction">; +def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true", + "Assume higher precision reciprocal estimates">; def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true", "Enable the stfiwx instruction">; def FeatureLFIWAX : SubtargetFeature<"lfiwax","HasLFIWAX", "true", @@ -101,30 +111,46 @@ include "PPCInstrInfo.td" def : Processor<"generic", G3Itineraries, [Directive32]>; def : Processor<"440", PPC440Itineraries, [Directive440, FeatureISEL, + FeatureFRES, FeatureFRSQRTE, FeatureBookE]>; def : Processor<"450", PPC440Itineraries, [Directive440, FeatureISEL, + FeatureFRES, FeatureFRSQRTE, FeatureBookE]>; def : Processor<"601", G3Itineraries, [Directive601]>; def : Processor<"602", G3Itineraries, [Directive602]>; -def : Processor<"603", G3Itineraries, [Directive603]>; -def : Processor<"603e", G3Itineraries, [Directive603]>; -def : Processor<"603ev", G3Itineraries, [Directive603]>; -def : Processor<"604", G3Itineraries, [Directive604]>; -def : Processor<"604e", G3Itineraries, [Directive604]>; -def : Processor<"620", G3Itineraries, [Directive620]>; -def : Processor<"750", G4Itineraries, [Directive750]>; -def : Processor<"g3", G3Itineraries, [Directive750]>; -def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec]>; -def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec]>; -def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec]>; -def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec]>; +def : Processor<"603", G3Itineraries, [Directive603, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"603e", G3Itineraries, [Directive603, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"603ev", G3Itineraries, [Directive603, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"604", G3Itineraries, [Directive604, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"604e", G3Itineraries, [Directive604, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"620", G3Itineraries, [Directive620, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"750", G4Itineraries, [Directive750, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"g3", G3Itineraries, [Directive750, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE]>; def : Processor<"970", G5Itineraries, [Directive970, FeatureAltivec, - FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, + FeatureMFOCRF, FeatureFSqrt, + FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX, Feature64Bit /*, Feature64BitRegs */]>; def : Processor<"g5", G5Itineraries, [Directive970, FeatureAltivec, FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, + FeatureFRES, FeatureFRSQRTE, Feature64Bit /*, Feature64BitRegs */]>; def : ProcessorModel<"e500mc", PPCE500mcModel, [DirectiveE500mc, FeatureMFOCRF, @@ -134,43 +160,57 @@ def : ProcessorModel<"e5500", PPCE5500Model, FeatureSTFIWX, FeatureBookE, FeatureISEL]>; def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE, FeatureMFOCRF, - FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX, + FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, + FeatureSTFIWX, FeatureLFIWAX, FeatureFPRND, FeatureFPCVT, FeatureISEL, FeaturePOPCNTD, FeatureLDBRX, Feature64Bit /*, Feature64BitRegs */]>; def : Processor<"a2q", PPCA2Itineraries, [DirectiveA2, FeatureBookE, FeatureMFOCRF, - FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX, + FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, + FeatureSTFIWX, FeatureLFIWAX, FeatureFPRND, FeatureFPCVT, FeatureISEL, FeaturePOPCNTD, FeatureLDBRX, Feature64Bit /*, Feature64BitRegs */, FeatureQPX]>; def : Processor<"pwr3", G5Itineraries, - [DirectivePwr3, FeatureAltivec, FeatureMFOCRF, + [DirectivePwr3, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF, FeatureSTFIWX, Feature64Bit]>; def : Processor<"pwr4", G5Itineraries, [DirectivePwr4, FeatureAltivec, FeatureMFOCRF, - FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>; + FeatureFSqrt, FeatureFRES, FeatureFRSQRTE, + FeatureSTFIWX, Feature64Bit]>; def : Processor<"pwr5", G5Itineraries, [DirectivePwr5, FeatureAltivec, FeatureMFOCRF, - FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>; + FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, + FeatureSTFIWX, Feature64Bit]>; def : Processor<"pwr5x", G5Itineraries, [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF, - FeatureFSqrt, FeatureSTFIWX, FeatureFPRND, - Feature64Bit]>; + FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, + FeatureSTFIWX, FeatureFPRND, Feature64Bit]>; def : Processor<"pwr6", G5Itineraries, [DirectivePwr6, FeatureAltivec, - FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, - FeatureLFIWAX, FeatureFPRND, Feature64Bit - /*, Feature64BitRegs */]>; + FeatureMFOCRF, FeatureFSqrt, FeatureFRE, + FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, + FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, Feature64Bit /*, Feature64BitRegs */]>; def : Processor<"pwr6x", G5Itineraries, [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF, - FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX, + FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, + FeatureSTFIWX, FeatureLFIWAX, FeatureFPRND, Feature64Bit]>; def : Processor<"pwr7", G5Itineraries, [DirectivePwr7, FeatureAltivec, - FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, - FeatureLFIWAX, FeatureFPRND, FeatureFPCVT, - FeatureISEL, FeaturePOPCNTD, FeatureLDBRX, + FeatureMFOCRF, FeatureFSqrt, FeatureFRE, + FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, + FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, FeatureFPCVT, FeatureISEL, + FeaturePOPCNTD, FeatureLDBRX, Feature64Bit /*, Feature64BitRegs */]>; def : Processor<"ppc", G3Itineraries, [Directive32]>; def : Processor<"ppc64", G5Itineraries, diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index ee43fc668e..3bda37b43e 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -150,10 +150,15 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); // If we're enabling GP optimizations, use hardware square root - if (!Subtarget->hasFSQRT()) { + if (!Subtarget->hasFSQRT() && + !(TM.Options.UnsafeFPMath && + Subtarget->hasFRSQRTE() && Subtarget->hasFRE())) setOperationAction(ISD::FSQRT, MVT::f64, Expand); + + if (!Subtarget->hasFSQRT() && + !(TM.Options.UnsafeFPMath && + Subtarget->hasFRSQRTES() && Subtarget->hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); - } setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); @@ -469,6 +474,12 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::MUL, MVT::v4f32, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); + + if (TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + } + setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); @@ -519,6 +530,12 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setTargetDAGCombine(ISD::BR_CC); setTargetDAGCombine(ISD::BSWAP); + // Use reciprocal estimates. + if (TM.Options.UnsafeFPMath) { + setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::FSQRT); + } + // Darwin long double math library functions have $LDBL128 appended. if (Subtarget->isDarwin()) { setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); @@ -590,6 +607,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::FCFID: return "PPCISD::FCFID"; case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ"; case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ"; + case PPCISD::FRE: return "PPCISD::FRE"; + case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; case PPCISD::STFIWX: return "PPCISD::STFIWX"; case PPCISD::VMADDFP: return "PPCISD::VMADDFP"; case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP"; @@ -6658,6 +6677,153 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Target Optimization Hooks //===----------------------------------------------------------------------===// +SDValue PPCTargetLowering::DAGCombineFastRecip(SDNode *N, + DAGCombinerInfo &DCI, + bool UseOperand) const { + if (DCI.isAfterLegalizeVectorOps()) + return SDValue(); + + if ((N->getValueType(0) == MVT::f32 && PPCSubTarget.hasFRES()) || + (N->getValueType(0) == MVT::f64 && PPCSubTarget.hasFRE()) || + (N->getValueType(0) == MVT::v4f32 && PPCSubTarget.hasAltivec())) { + + // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) + // For the reciprocal, we need to find the zero of the function: + // F(X) = A X - 1 [which has a zero at X = 1/A] + // => + // X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form + // does not require additional intermediate precision] + + // Convergence is quadratic, so we essentially double the number of digits + // correct after every iteration. The minimum architected relative + // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has + // 23 digits and double has 52 digits. + int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3; + if (N->getValueType(0).getScalarType() == MVT::f64) + ++Iterations; + + SelectionDAG &DAG = DCI.DAG; + DebugLoc dl = N->getDebugLoc(); + + SDValue FPOne = + DAG.getConstantFP(1.0, N->getValueType(0).getScalarType()); + if (N->getValueType(0).isVector()) { + assert(N->getValueType(0).getVectorNumElements() == 4 && + "Unknown vector type"); + FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0), + FPOne, FPOne, FPOne, FPOne); + } + + SDValue Est = DAG.getNode(PPCISD::FRE, dl, + N->getValueType(0), + UseOperand ? N->getOperand(1) : + SDValue(N, 0)); + DCI.AddToWorklist(Est.getNode()); + + // Newton iterations: Est = Est + Est (1 - Arg * Est) + for (int i = 0; i < Iterations; ++i) { + SDValue NewEst = DAG.getNode(ISD::FMUL, dl, + N->getValueType(0), + UseOperand ? N->getOperand(1) : + SDValue(N, 0), + Est); + DCI.AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FSUB, dl, + N->getValueType(0), FPOne, NewEst); + DCI.AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FMUL, dl, + N->getValueType(0), Est, NewEst); + DCI.AddToWorklist(NewEst.getNode()); + + Est = DAG.getNode(ISD::FADD, dl, + N->getValueType(0), Est, NewEst); + DCI.AddToWorklist(Est.getNode()); + } + + return Est; + } + + return SDValue(); +} + +SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDNode *N, + DAGCombinerInfo &DCI) const { + if (DCI.isAfterLegalizeVectorOps()) + return SDValue(); + + if ((N->getValueType(0) == MVT::f32 && PPCSubTarget.hasFRSQRTES()) || + (N->getValueType(0) == MVT::f64 && PPCSubTarget.hasFRSQRTE()) || + (N->getValueType(0) == MVT::v4f32 && PPCSubTarget.hasAltivec())) { + + // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) + // For the reciprocal sqrt, we need to find the zero of the function: + // F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] + // => + // X_{i+1} = X_i (1.5 - A X_i^2 / 2) + // As a result, we precompute A/2 prior to the iteration loop. + + // Convergence is quadratic, so we essentially double the number of digits + // correct after every iteration. The minimum architected relative + // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has + // 23 digits and double has 52 digits. + int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3; + if (N->getValueType(0).getScalarType() == MVT::f64) + ++Iterations; + + SelectionDAG &DAG = DCI.DAG; + DebugLoc dl = N->getDebugLoc(); + + SDValue FPThreeHalfs = + DAG.getConstantFP(1.5, N->getValueType(0).getScalarType()); + if (N->getValueType(0).isVector()) { + assert(N->getValueType(0).getVectorNumElements() == 4 && + "Unknown vector type"); + FPThreeHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0), + FPThreeHalfs, FPThreeHalfs, + FPThreeHalfs, FPThreeHalfs); + } + + SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl, + N->getValueType(0), N->getOperand(0)); + DCI.AddToWorklist(Est.getNode()); + + // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that + // this entire sequence requires only one FP constant. + SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, N->getValueType(0), + FPThreeHalfs, N->getOperand(0)); + DCI.AddToWorklist(HalfArg.getNode()); + + HalfArg = DAG.getNode(ISD::FSUB, dl, N->getValueType(0), + HalfArg, N->getOperand(0)); + DCI.AddToWorklist(HalfArg.getNode()); + + // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) + for (int i = 0; i < Iterations; ++i) { + SDValue NewEst = DAG.getNode(ISD::FMUL, dl, + N->getValueType(0), Est, Est); + DCI.AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FMUL, dl, + N->getValueType(0), HalfArg, NewEst); + DCI.AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FSUB, dl, + N->getValueType(0), FPThreeHalfs, NewEst); + DCI.AddToWorklist(NewEst.getNode()); + + Est = DAG.getNode(ISD::FMUL, dl, + N->getValueType(0), Est, NewEst); + DCI.AddToWorklist(Est.getNode()); + } + + return Est; + } + + return SDValue(); +} + SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { const TargetMachine &TM = getTargetMachine(); @@ -6684,7 +6850,44 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return N->getOperand(0); } break; + case ISD::FDIV: { + assert(TM.Options.UnsafeFPMath && + "Reciprocal estimates require UnsafeFPMath"); + + if (N->getOperand(1).getOpcode() == ISD::FSQRT) { + SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(1).getNode(), DCI); + if (RV.getNode() != 0) { + DCI.AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, dl, N->getValueType(0), + N->getOperand(0), RV); + } + } + + SDValue RV = DAGCombineFastRecip(N, DCI); + if (RV.getNode() != 0) { + DCI.AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, dl, N->getValueType(0), + N->getOperand(0), RV); + } + + } + break; + case ISD::FSQRT: { + assert(TM.Options.UnsafeFPMath && + "Reciprocal estimates require UnsafeFPMath"); + + // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the + // reciprocal sqrt. + SDValue RV = DAGCombineFastRecipFSQRT(N, DCI); + if (RV.getNode() != 0) { + DCI.AddToWorklist(RV.getNode()); + RV = DAGCombineFastRecip(RV.getNode(), DCI, false); + if (RV.getNode() != 0) + return RV; + } + } + break; case ISD::SINT_TO_FP: if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) { if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) { diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 6690899e5d..2ddcb25a23 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -49,6 +49,9 @@ namespace llvm { /// unsigned integers. FCTIDUZ, FCTIWUZ, + /// Reciprocal estimate instructions (unary FP ops). + FRE, FRSQRTE, + // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking // three v4f32 operands and producing a v4f32 result. VMADDFP, VNMSUBFP, @@ -620,6 +623,10 @@ namespace llvm { SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + + SDValue DAGCombineFastRecip(SDNode *N, DAGCombinerInfo &DCI, + bool UseOperand = true) const; + SDValue DAGCombineFastRecipFSQRT(SDNode *N, DAGCombinerInfo &DCI) const; }; } diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index fff91df418..18968ba0dd 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -795,6 +795,9 @@ def : Pat<(int_ppc_altivec_vnmsubfp v4f32:$A, v4f32:$B, v4f32:$C), def : Pat<(PPCvperm (v16i8 VRRC:$vA), VRRC:$vB, VRRC:$vC), (VPERM $vA, $vB, $vC)>; +def : Pat<(PPCfre v4f32:$A), (VREFP $A)>; +def : Pat<(PPCfrsqrte v4f32:$A), (VRSQRTEFP $A)>; + // Vector shifts def : Pat<(v16i8 (shl v16i8:$vA, v16i8:$vB)), (v16i8 (VSLB $vA, $vB))>; diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 067f5aacfa..feca7c0370 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -62,6 +62,9 @@ def SDT_PPCTC_ret : SDTypeProfile<0, 2, [ // PowerPC specific DAG Nodes. // +def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>; +def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>; + def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>; def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>; def PPCfcfids : SDNode<"PPCISD::FCFIDS", SDTFPRoundOp, []>; @@ -1223,8 +1226,21 @@ def FNEGS : XForm_26<63, 40, (outs F4RC:$frD), (ins F4RC:$frB), def FNEGD : XForm_26<63, 40, (outs F8RC:$frD), (ins F8RC:$frB), "fneg $frD, $frB", FPGeneral, [(set f64:$frD, (fneg f64:$frB))]>; + +// Reciprocal estimates. +def FRE : XForm_26<63, 24, (outs F8RC:$frD), (ins F8RC:$frB), + "fre $frD, $frB", FPGeneral, + [(set f64:$frD, (PPCfre f64:$frB))]>; +def FRES : XForm_26<59, 24, (outs F4RC:$frD), (ins F4RC:$frB), + "fres $frD, $frB", FPGeneral, + [(set f32:$frD, (PPCfre f32:$frB))]>; +def FRSQRTE : XForm_26<63, 26, (outs F8RC:$frD), (ins F8RC:$frB), + "frsqrte $frD, $frB", FPGeneral, + [(set f64:$frD, (PPCfrsqrte f64:$frB))]>; +def FRSQRTES : XForm_26<59, 26, (outs F4RC:$frD), (ins F4RC:$frB), + "frsqrtes $frD, $frB", FPGeneral, + [(set f32:$frD, (PPCfrsqrte f32:$frB))]>; } - // XL-Form instructions. condition register logical ops. // @@ -1687,5 +1703,15 @@ def : Pat<(membarrier (i32 imm /*ll*/), def : Pat<(atomic_fence (imm), (imm)), (SYNC)>; +// Additional FNMSUB patterns: -a*c + b == -(a*c - b) +def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B), + (FNMSUB $A, $C, $B)>; +def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B), + (FNMSUB $A, $C, $B)>; +def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B), + (FNMSUBS $A, $C, $B)>; +def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B), + (FNMSUBS $A, $C, $B)>; + include "PPCInstrAltivec.td" include "PPCInstr64Bit.td" diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index 1b7150fd37..a8f2b3f47d 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -38,6 +38,11 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, , HasAltivec(false) , HasQPX(false) , HasFSQRT(false) + , HasFRE(false) + , HasFRES(false) + , HasFRSQRTE(false) + , HasFRSQRTES(false) + , HasRecipPrec(false) , HasSTFIWX(false) , HasLFIWAX(false) , HasFPRND(false) diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index d5dfa1e8e1..65b4d211fc 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -77,6 +77,8 @@ protected: bool HasAltivec; bool HasQPX; bool HasFSQRT; + bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES; + bool HasRecipPrec; bool HasSTFIWX; bool HasLFIWAX; bool HasFPRND; @@ -159,6 +161,11 @@ public: // Specific obvious features. bool hasFSQRT() const { return HasFSQRT; } + bool hasFRE() const { return HasFRE; } + bool hasFRES() const { return HasFRES; } + bool hasFRSQRTE() const { return HasFRSQRTE; } + bool hasFRSQRTES() const { return HasFRSQRTES; } + bool hasRecipPrec() const { return HasRecipPrec; } bool hasSTFIWX() const { return HasSTFIWX; } bool hasLFIWAX() const { return HasLFIWAX; } bool hasFPRND() const { return HasFPRND; } diff --git a/test/CodeGen/PowerPC/recipest.ll b/test/CodeGen/PowerPC/recipest.ll new file mode 100644 index 0000000000..cad98c5a35 --- /dev/null +++ b/test/CodeGen/PowerPC/recipest.ll @@ -0,0 +1,178 @@ +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck -check-prefix=CHECK-SAFE %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +declare double @llvm.sqrt.f64(double) +declare float @llvm.sqrt.f32(float) +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define double @foo(double %a, double %b) nounwind { +entry: + %x = call double @llvm.sqrt.f64(double %b) + %r = fdiv double %a, %x + ret double %r + +; CHECK: @foo +; CHECK: frsqrte +; CHECK: fnmsub +; CHECK: fmul +; CHECK: fmadd +; CHECK: fmul +; CHECK: fmul +; CHECK: fmadd +; CHECK: fmul +; CHECK: fmul +; CHECK: blr + +; CHECK-SAFE: @foo +; CHECK-SAFE: fsqrt +; CHECK-SAFE: fdiv +; CHECK-SAFE: blr +} + +define float @goo(float %a, float %b) nounwind { +entry: + %x = call float @llvm.sqrt.f32(float %b) + %r = fdiv float %a, %x + ret float %r + +; CHECK: @goo +; CHECK: frsqrtes +; CHECK: fnmsubs +; CHECK: fmuls +; CHECK: fmadds +; CHECK: fmuls +; CHECK: fmuls +; CHECK: blr + +; CHECK-SAFE: @goo +; CHECK-SAFE: fsqrts +; CHECK-SAFE: fdivs +; CHECK-SAFE: blr +} + +define <4 x float> @hoo(<4 x float> %a, <4 x float> %b) nounwind { +entry: + %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) + %r = fdiv <4 x float> %a, %x + ret <4 x float> %r + +; CHECK: @hoo +; CHECK: vrsqrtefp + +; CHECK-SAFE: @hoo +; CHECK-SAFE-NOT: vrsqrtefp +; CHECK-SAFE: blr +} + +define double @foo2(double %a, double %b) nounwind { +entry: + %r = fdiv double %a, %b + ret double %r + +; CHECK: @foo2 +; CHECK: fre +; CHECK: fnmsub +; CHECK: fmadd +; CHECK: fnmsub +; CHECK: fmadd +; CHECK: fmul +; CHECK: blr + +; CHECK-SAFE: @foo2 +; CHECK-SAFE: fdiv +; CHECK-SAFE: blr +} + +define float @goo2(float %a, float %b) nounwind { +entry: + %r = fdiv float %a, %b + ret float %r + +; CHECK: @goo2 +; CHECK: fres +; CHECK: fnmsubs +; CHECK: fmadds +; CHECK: fmuls +; CHECK: blr + +; CHECK-SAFE: @goo2 +; CHECK-SAFE: fdivs +; CHECK-SAFE: blr +} + +define <4 x float> @hoo2(<4 x float> %a, <4 x float> %b) nounwind { +entry: + %r = fdiv <4 x float> %a, %b + ret <4 x float> %r + +; CHECK: @hoo2 +; CHECK: vrefp + +; CHECK-SAFE: @hoo2 +; CHECK-SAFE-NOT: vrefp +; CHECK-SAFE: blr +} + +define double @foo3(double %a) nounwind { +entry: + %r = call double @llvm.sqrt.f64(double %a) + ret double %r + +; CHECK: @foo3 +; CHECK: frsqrte +; CHECK: fnmsub +; CHECK: fmul +; CHECK: fmadd +; CHECK: fmul +; CHECK: fmul +; CHECK: fmadd +; CHECK: fmul +; CHECK: fre +; CHECK: fnmsub +; CHECK: fmadd +; CHECK: fnmsub +; CHECK: fmadd +; CHECK: blr + +; CHECK-SAFE: @foo3 +; CHECK-SAFE: fsqrt +; CHECK-SAFE: blr +} + +define float @goo3(float %a) nounwind { +entry: + %r = call float @llvm.sqrt.f32(float %a) + ret float %r + +; CHECK: @goo3 +; CHECK: frsqrtes +; CHECK: fnmsubs +; CHECK: fmuls +; CHECK: fmadds +; CHECK: fmuls +; CHECK: fres +; CHECK: fnmsubs +; CHECK: fmadds +; CHECK: blr + +; CHECK-SAFE: @goo3 +; CHECK-SAFE: fsqrts +; CHECK-SAFE: blr +} + +define <4 x float> @hoo3(<4 x float> %a) nounwind { +entry: + %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) + ret <4 x float> %r + +; CHECK: @hoo3 +; CHECK: vrsqrtefp +; CHECK: vrefp + +; CHECK-SAFE: @hoo3 +; CHECK-SAFE-NOT: vrsqrtefp +; CHECK-SAFE: blr +} + |