 lib/Target/ARM/ARM.h                      |   1
 lib/Target/ARM/ARM.td                     |  26
 lib/Target/ARM/ARMBaseInstrInfo.cpp       |  67
 lib/Target/ARM/ARMBaseInstrInfo.h         |  40
 lib/Target/ARM/ARMHazardRecognizer.cpp    | 114
 lib/Target/ARM/ARMHazardRecognizer.h      |  53
 lib/Target/ARM/ARMISelDAGToDAG.cpp        |  56
 lib/Target/ARM/ARMInstrInfo.td            |  17
 lib/Target/ARM/ARMInstrNEON.td            |  66
 lib/Target/ARM/ARMInstrVFP.td             |  78
 lib/Target/ARM/ARMSubtarget.cpp           |   2
 lib/Target/ARM/ARMSubtarget.h             |   8
 lib/Target/ARM/ARMTargetMachine.cpp       |   6
 lib/Target/ARM/CMakeLists.txt             |   2
 lib/Target/ARM/MLxExpansionPass.cpp       | 324
 lib/Target/ARM/Thumb2HazardRecognizer.cpp |  53
 lib/Target/ARM/Thumb2HazardRecognizer.h   |  40
 lib/Target/ARM/Thumb2InstrInfo.cpp        |   6
 lib/Target/ARM/Thumb2InstrInfo.h          |   3
 test/CodeGen/ARM/reg_sequence.ll          |   3
 20 files changed, 773 insertions(+), 192 deletions(-)
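
Before the per-file diffs, a rough sketch of the mechanism for orientation. The change is table-driven: each fp multiply-accumulate opcode is mapped to the multiply and add / sub opcodes it can be split into, plus a couple of flags. The standalone C++ model below only mirrors the shape of ARM_MLxTable and isFpMLxInstruction from ARMBaseInstrInfo.cpp; the Opcode enum and its values are illustrative stand-ins, not the commit's real ARM opcodes.

    // mlx_table_sketch.cpp -- illustrative model only, not part of the commit.
    #include <cassert>

    // Stand-in opcode constants; the real table uses ARM::VMLAS, ARM::VMULS, ...
    enum Opcode { VMLAS, VMULS, VADDS, VMLSS, VSUBS };

    struct MLxEntry {
      Opcode MLxOpc;     // fused multiply-accumulate opcode
      Opcode MulOpc;     // multiply opcode it expands to
      Opcode AddSubOpc;  // add / sub opcode it expands to
      bool NegAcc;       // accumulator negated before the add / sub
      bool HasLane;      // extra "lane" operand
    };

    static const MLxEntry Table[] = {
      { VMLAS, VMULS, VADDS, false, false },
      { VMLSS, VMULS, VSUBS, false, false },
    };

    // Mirrors the shape of ARMBaseInstrInfo::isFpMLxInstruction: look the
    // MLx opcode up in the table and hand back its expansion.
    static bool isFpMLx(Opcode Opc, Opcode &Mul, Opcode &AddSub,
                        bool &Neg, bool &Lane) {
      for (unsigned i = 0; i != sizeof(Table) / sizeof(Table[0]); ++i)
        if (Table[i].MLxOpc == Opc) {
          Mul = Table[i].MulOpc;
          AddSub = Table[i].AddSubOpc;
          Neg = Table[i].NegAcc;
          Lane = Table[i].HasLane;
          return true;
        }
      return false;
    }

    int main() {
      Opcode Mul, AddSub;
      bool Neg, Lane;
      assert(isFpMLx(VMLAS, Mul, AddSub, Neg, Lane));
      assert(Mul == VMULS && AddSub == VADDS && !Neg && !Lane);
      return 0;
    }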
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index 83c83335b2..4679f7443b 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -49,6 +49,7 @@ FunctionPass *createARMExpandPseudoPass(); FunctionPass *createARMGlobalMergePass(const TargetLowering* tli); FunctionPass *createARMConstantIslandPass(); FunctionPass *createNEONMoveFixPass(); +FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); FunctionPass *createThumb2SizeReductionPass(); diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index a99dbfbd64..35d3d1ed66 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -46,14 +46,11 @@ def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", "Floating point unit supports single precision only">; -// Some processors have multiply-accumulate instructions that don't -// play nicely with other VFP instructions, and it's generally better +// Some processors have FP multiply-accumulate instructions that don't +// play nicely with other VFP / NEON instructions, and it's generally better // to just not use them. -// FIXME: Currently, this is only flagged for Cortex-A8. It may be true for -// others as well. We should do more benchmarking and confirm one way or -// the other. -def FeatureHasSlowVMLx : SubtargetFeature<"vmlx", "SlowVMLx", "true", - "Disable VFP MAC instructions">; +def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", + "Disable VFP / NEON MAC instructions">; // Some processors benefit from using NEON instructions for scalar // single-precision FP operations. def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", @@ -150,26 +147,29 @@ def : ProcNoItin<"iwmmxt", [ArchV5TE]>; // V6 Processors. def : Processor<"arm1136j-s", ARMV6Itineraries, [ArchV6]>; def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2, - FeatureHasSlowVMLx]>; + FeatureHasSlowFPVMLx]>; def : Processor<"arm1176jz-s", ARMV6Itineraries, [ArchV6]>; -def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; +def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2, + FeatureHasSlowFPVMLx]>; def : Processor<"mpcorenovfp", ARMV6Itineraries, [ArchV6]>; -def : Processor<"mpcore", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; +def : Processor<"mpcore", ARMV6Itineraries, [ArchV6, FeatureVFP2, + FeatureHasSlowFPVMLx]>; // V6M Processors. def : Processor<"cortex-m0", ARMV6Itineraries, [ArchV6M]>; // V6T2 Processors. def : Processor<"arm1156t2-s", ARMV6Itineraries, [ArchV6T2]>; -def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ArchV6T2, FeatureVFP2]>; +def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ArchV6T2, FeatureVFP2, + FeatureHasSlowFPVMLx]>; // V7 Processors. def : Processor<"cortex-a8", CortexA8Itineraries, [ArchV7A, ProcA8, - FeatureHasSlowVMLx, FeatureT2XtPk]>; + FeatureHasSlowFPVMLx, FeatureT2XtPk]>; def : Processor<"cortex-a9", CortexA9Itineraries, [ArchV7A, ProcA9, - FeatureHasSlowVMLx, FeatureT2XtPk]>; + FeatureHasSlowFPVMLx, FeatureT2XtPk]>; // V7M Processors. 
def : ProcNoItin<"cortex-m3", [ArchV7M]>; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 4838b7dcc0..afec5c2bd3 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -15,6 +15,7 @@ #include "ARM.h" #include "ARMAddressingModes.h" #include "ARMConstantPoolValue.h" +#include "ARMHazardRecognizer.h" #include "ARMMachineFunctionInfo.h" #include "ARMRegisterInfo.h" #include "ARMGenInstrInfo.inc" @@ -40,9 +41,58 @@ static cl::opt<bool> EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden, cl::desc("Enable ARM 2-addr to 3-addr conv")); + +/// ARM_MLxEntry - Record information about MLA / MLS instructions. +struct ARM_MLxEntry { + unsigned MLxOpc; // MLA / MLS opcode + unsigned MulOpc; // Expanded multiplication opcode + unsigned AddSubOpc; // Expanded add / sub opcode + bool NegAcc; // True if the acc is negated before the add / sub. + bool HasLane; // True if instruction has an extra "lane" operand. +}; + +static const ARM_MLxEntry ARM_MLxTable[] = { + // MLxOpc, MulOpc, AddSubOpc, NegAcc, HasLane + // fp scalar ops + { ARM::VMLAS, ARM::VMULS, ARM::VADDS, false, false }, + { ARM::VMLSS, ARM::VMULS, ARM::VSUBS, false, false }, + { ARM::VMLAD, ARM::VMULD, ARM::VADDD, false, false }, + { ARM::VMLSD, ARM::VMULD, ARM::VSUBD, false, false }, + { ARM::VMLAfd_sfp, ARM::VMULfd_sfp, ARM::VADDfd_sfp, false, false }, + { ARM::VMLSfd_sfp, ARM::VMULfd_sfp, ARM::VSUBfd_sfp, false, false }, + { ARM::VNMLAS, ARM::VNMULS, ARM::VSUBS, true, false }, + { ARM::VNMLSS, ARM::VMULS, ARM::VSUBS, true, false }, + { ARM::VNMLAD, ARM::VNMULD, ARM::VSUBD, true, false }, + { ARM::VNMLSD, ARM::VMULD, ARM::VSUBD, true, false }, + + // fp SIMD ops + { ARM::VMLAfd, ARM::VMULfd, ARM::VADDfd, false, false }, + { ARM::VMLSfd, ARM::VMULfd, ARM::VSUBfd, false, false }, + { ARM::VMLAfq, ARM::VMULfq, ARM::VADDfq, false, false }, + { ARM::VMLSfq, ARM::VMULfq, ARM::VSUBfq, false, false }, + { ARM::VMLAslfd, ARM::VMULslfd, ARM::VADDfd, false, true }, + { ARM::VMLSslfd, ARM::VMULslfd, ARM::VSUBfd, false, true }, + { ARM::VMLAslfq, ARM::VMULslfq, ARM::VADDfq, false, true }, + { ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true }, +}; + ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI) : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)), Subtarget(STI) { + for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) { + if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) + assert(false && "Duplicated entries?"); + MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc); + MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc); + } +} + +ScheduleHazardRecognizer *ARMBaseInstrInfo:: +CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const { + if (Subtarget.isThumb2() || Subtarget.hasVFP2()) + return (ScheduleHazardRecognizer *) + new ARMHazardRecognizer(II, *this, getRegisterInfo(), Subtarget); + return TargetInstrInfoImpl::CreateTargetPostRAHazardRecognizer(II); } MachineInstr * @@ -197,7 +247,6 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return NewMIs[0]; } - // Branch analysis. 
 bool
 ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
@@ -2196,3 +2245,19 @@ hasLowDefLatency(const InstrItineraryData *ItinData,
   }
   return false;
 }
+
+bool
+ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
+                                     unsigned &AddSubOpc,
+                                     bool &NegAcc, bool &HasLane) const {
+  DenseMap<unsigned, unsigned>::const_iterator I = MLxEntryMap.find(Opcode);
+  if (I == MLxEntryMap.end())
+    return false;
+
+  const ARM_MLxEntry &Entry = ARM_MLxTable[I->second];
+  MulOpc = Entry.MulOpc;
+  AddSubOpc = Entry.AddSubOpc;
+  NegAcc = Entry.NegAcc;
+  HasLane = Entry.HasLane;
+  return true;
+}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 776be4d564..ca8b9a0240 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -17,6 +17,8 @@
 #include "ARM.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
 
 namespace llvm {
   class ARMSubtarget;
@@ -191,9 +193,11 @@ namespace ARMII {
 
 class ARMBaseInstrInfo : public TargetInstrInfoImpl {
   const ARMSubtarget &Subtarget;
+
 protected:
   // Can be only subclassed.
   explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
+
 public:
   // Return the non-pre/post incrementing version of 'Opc'. Return 0
   // if there is not such an opcode.
@@ -206,7 +210,9 @@ public:
   virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0;
   const ARMSubtarget &getSubtarget() const { return Subtarget; }
 
-public:
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const;
+
   // Branch analysis.
   virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                              MachineBasicBlock *&FBB,
@@ -393,6 +399,38 @@ private:
                     const MachineInstr *UseMI, unsigned UseIdx) const;
   bool hasLowDefLatency(const InstrItineraryData *ItinData,
                         const MachineInstr *DefMI, unsigned DefIdx) const;
+
+private:
+  /// Modeling special VFP / NEON fp MLA / MLS hazards.
+
+  /// MLxEntryMap - Map fp MLA / MLS to the corresponding entry in the internal
+  /// MLx table.
+  DenseMap<unsigned, unsigned> MLxEntryMap;
+
+  /// MLxHazardOpcodes - Set of add / sub and multiply opcodes that would cause
+  /// stalls when scheduled together with fp MLA / MLS opcodes.
+  SmallSet<unsigned, 16> MLxHazardOpcodes;
+
+public:
+  /// isFpMLxInstruction - Return true if the specified opcode is an fp
+  /// MLA / MLS instruction.
+  bool isFpMLxInstruction(unsigned Opcode) const {
+    return MLxEntryMap.count(Opcode);
+  }
+
+  /// isFpMLxInstruction - This version also returns the multiply opcode and
+  /// the addition / subtraction opcode to expand to, and sets 'HasLane' to
+  /// true for MLx instructions with an extra lane operand.
+  bool isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
+                          unsigned &AddSubOpc, bool &NegAcc,
+                          bool &HasLane) const;
+
+  /// canCauseFpMLxStall - Return true if an instruction of the specified
+  /// opcode will cause stalls when scheduled after (within a 4-cycle window)
+  /// an fp MLA / MLS instruction.
+ bool canCauseFpMLxStall(unsigned Opcode) const { + return MLxHazardOpcodes.count(Opcode); + } }; static inline diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp new file mode 100644 index 0000000000..317934faec --- /dev/null +++ b/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -0,0 +1,114 @@ +//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARMHazardRecognizer.h" +#include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Target/TargetRegisterInfo.h" +using namespace llvm; + +static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI, + const TargetRegisterInfo &TRI) { + // FIXME: Detect integer instructions properly. + const TargetInstrDesc &TID = MI->getDesc(); + unsigned Domain = TID.TSFlags & ARMII::DomainMask; + if (Domain == ARMII::DomainVFP) { + unsigned Opcode = MI->getOpcode(); + if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD || + Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) + return false; + } else if (Domain == ARMII::DomainNEON) { + if (MI->getDesc().mayStore() || MI->getDesc().mayLoad()) + return false; + } else + return false; + return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI); +} + +ScheduleHazardRecognizer::HazardType +ARMHazardRecognizer::getHazardType(SUnit *SU) { + MachineInstr *MI = SU->getInstr(); + + if (!MI->isDebugValue()) { + if (ITBlockSize && MI != ITBlockMIs[ITBlockSize-1]) + return Hazard; + + // Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following + // a VMLA / VMLS will cause 4 cycle stall. + const TargetInstrDesc &TID = MI->getDesc(); + if (LastMI && (TID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) { + MachineInstr *DefMI = LastMI; + const TargetInstrDesc &LastTID = LastMI->getDesc(); + // Skip over one non-VFP / NEON instruction. + if (!LastTID.isBarrier() && + (LastTID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) { + MachineBasicBlock::iterator I = LastMI; + if (I != LastMI->getParent()->begin()) { + I = llvm::prior(I); + DefMI = &*I; + } + } + + if (TII.isFpMLxInstruction(DefMI->getOpcode()) && + (TII.canCauseFpMLxStall(MI->getOpcode()) || + hasRAWHazard(DefMI, MI, TRI))) { + // Try to schedule another instruction for the next 4 cycles. + if (Stalls == 0) + Stalls = 4; + return Hazard; + } + } + } + + return PostRAHazardRecognizer::getHazardType(SU); +} + +void ARMHazardRecognizer::Reset() { + LastMI = 0; + Stalls = 0; + ITBlockSize = 0; + PostRAHazardRecognizer::Reset(); +} + +void ARMHazardRecognizer::EmitInstruction(SUnit *SU) { + MachineInstr *MI = SU->getInstr(); + unsigned Opcode = MI->getOpcode(); + if (ITBlockSize) { + --ITBlockSize; + } else if (Opcode == ARM::t2IT) { + unsigned Mask = MI->getOperand(1).getImm(); + unsigned NumTZ = CountTrailingZeros_32(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + ITBlockSize = 4 - NumTZ; + MachineBasicBlock::iterator I = MI; + for (unsigned i = 0; i < ITBlockSize; ++i) { + // Advance to the next instruction, skipping any dbg_value instructions. 
+ do { + ++I; + } while (I->isDebugValue()); + ITBlockMIs[ITBlockSize-1-i] = &*I; + } + } + + if (!MI->isDebugValue()) { + LastMI = MI; + Stalls = 0; + } + + PostRAHazardRecognizer::EmitInstruction(SU); +} + +void ARMHazardRecognizer::AdvanceCycle() { + if (Stalls && --Stalls == 0) + // Stalled for 4 cycles but still can't schedule any other instructions. + LastMI = 0; + PostRAHazardRecognizer::AdvanceCycle(); +} diff --git a/lib/Target/ARM/ARMHazardRecognizer.h b/lib/Target/ARM/ARMHazardRecognizer.h new file mode 100644 index 0000000000..d1919d8cf4 --- /dev/null +++ b/lib/Target/ARM/ARMHazardRecognizer.h @@ -0,0 +1,53 @@ +//===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines hazard recognizers for scheduling ARM functions. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMHAZARDRECOGNIZER_H +#define ARMHAZARDRECOGNIZER_H + +#include "llvm/CodeGen/PostRAHazardRecognizer.h" + +namespace llvm { + +class ARMBaseInstrInfo; +class ARMBaseRegisterInfo; +class ARMSubtarget; +class MachineInstr; + +class ARMHazardRecognizer : public PostRAHazardRecognizer { + const ARMBaseInstrInfo &TII; + const ARMBaseRegisterInfo &TRI; + const ARMSubtarget &STI; + + MachineInstr *LastMI; + unsigned Stalls; + unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled. + MachineInstr *ITBlockMIs[4]; + +public: + ARMHazardRecognizer(const InstrItineraryData *ItinData, + const ARMBaseInstrInfo &tii, + const ARMBaseRegisterInfo &tri, + const ARMSubtarget &sti) : + PostRAHazardRecognizer(ItinData), TII(tii), TRI(tri), STI(sti), + LastMI(0), ITBlockSize(0) {} + + virtual HazardType getHazardType(SUnit *SU); + virtual void Reset(); + virtual void EmitInstruction(SUnit *SU); + virtual void AdvanceCycle(); +}; + + +} // end namespace llvm + +#endif // ARMHAZARDRECOGNIZER_H diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 96bdf482d9..bfba11449f 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -13,6 +13,7 @@ #define DEBUG_TYPE "arm-isel" #include "ARM.h" +#include "ARMBaseInstrInfo.h" #include "ARMAddressingModes.h" #include "ARMTargetMachine.h" #include "llvm/CallingConv.h" @@ -41,6 +42,11 @@ DisableShifterOp("disable-shifter-op", cl::Hidden, cl::desc("Disable isel of shifter-op"), cl::init(false)); +static cl::opt<bool> +CheckVMLxHazard("check-vmlx-hazard", cl::Hidden, + cl::desc("Check fp vmla / vmls hazard at isel time"), + cl::init(false)); + //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine /// instructions for SelectionDAG operations. @@ -54,6 +60,7 @@ enum AddrMode2Type { class ARMDAGToDAGISel : public SelectionDAGISel { ARMBaseTargetMachine &TM; + const ARMBaseInstrInfo *TII; /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can /// make the right decision when generating code for different targets. 
@@ -63,7 +70,8 @@ public:
   explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
                            CodeGenOpt::Level OptLevel)
     : SelectionDAGISel(tm, OptLevel), TM(tm),
-      Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
+      TII(static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo())),
+      Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
   }
 
   virtual const char *getPassName() const {
@@ -78,6 +86,8 @@ public:
 
   SDNode *Select(SDNode *N);
 
+
+  bool hasNoVMLxHazardUse(SDNode *N) const;
   bool isShifterOpProfitable(const SDValue &Shift,
                              ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
   bool SelectShifterOperandReg(SDValue N, SDValue &A,
@@ -272,6 +282,50 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
          isInt32Immediate(N->getOperand(1).getNode(), Imm);
 }
 
+/// hasNoVMLxHazardUse - Return true if it's desirable to select an FP MLA /
+/// MLS node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards
+/// (at least on current ARM implementations) which should be avoided.
+bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
+  if (OptLevel == CodeGenOpt::None)
+    return true;
+
+  if (!CheckVMLxHazard)
+    return true;
+
+  if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9())
+    return true;
+
+  if (!N->hasOneUse())
+    return false;
+
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() == ISD::CopyToReg)
+    return true;
+  if (Use->isMachineOpcode()) {
+    const TargetInstrDesc &TID = TII->get(Use->getMachineOpcode());
+    if (TID.mayStore())
+      return true;
+    unsigned Opcode = TID.getOpcode();
+    if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+      return true;
+    // vmlx feeding into another vmlx. We actually want to unfold
+    // the use later in the MLxExpansion pass. e.g.
+    // vmla
+    // vmla (stall 8 cycles)
+    //
+    // vmul (5 cycles)
+    // vadd (5 cycles)
+    // vmla
+    // This adds up to about 18 - 19 cycles.
+    //
+    // vmla
+    // vmul (stall 4 cycles)
+    // vadd; this adds up to about 14 cycles.
+    return TII->isFpMLxInstruction(Opcode);
+  }
+
+  return false;
+}
+
 bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
                                             ARM_AM::ShiftOpc ShOpcVal,
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 9ca36d86d2..8a59ff5b58 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -175,7 +175,7 @@ def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">;
 // FIXME: Eventually this will be just "hasV6T2Ops".
 def UseMovt : Predicate<"Subtarget->useMovt()">;
 def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
-def UseVMLx : Predicate<"Subtarget->useVMLx()">;
+def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
 
 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
@@ -279,6 +279,21 @@ def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{
   return N->hasOneUse();
 }]>;
 
+// An 'fmul' node with a single use.
+def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
+  return N->hasOneUse();
+}]>;
+
+// An 'fadd' node which checks for a single non-hazardous use.
+def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
+  return hasNoVMLxHazardUse(N);
+}]>;
+
+// An 'fsub' node which checks for a single non-hazardous use.
+def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
+  return hasNoVMLxHazardUse(N);
+}]>;
+
 //===----------------------------------------------------------------------===//
 // Operand Definitions.
// diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index f5b30c35d3..bc45d781c8 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -1907,7 +1907,7 @@ class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Multiply-Add/Sub operations: single-, double- and quad-register. class N3VSMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode MulOp, SDNode OpNode> + ValueType Ty, SDPatternOperator MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR_VFP2:$Vd), (ins DPR_VFP2:$src1, DPR_VFP2:$Vn, DPR_VFP2:$Vm), N3RegFrm, itin, @@ -1915,7 +1915,7 @@ class N3VSMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode MulOp, SDNode OpNode> + ValueType Ty, SDPatternOperator MulOp, SDPatternOperator OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", @@ -1924,7 +1924,7 @@ class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode MulOp, SDNode ShOp> + ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), @@ -1951,7 +1951,7 @@ class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, - SDNode MulOp, SDNode OpNode> + SDPatternOperator MulOp, SDPatternOperator OpNode> : N3V<op24, op23, op21_20, op11_8, 1, op4, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", @@ -1959,7 +1959,7 @@ class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (Ty (MulOp QPR:$Vn, QPR:$Vm)))))]>; class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - SDNode MulOp, SDNode ShOp> + SDPatternOperator MulOp, SDPatternOperator ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), @@ -3282,15 +3282,19 @@ defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D, defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", - v2f32, fmul, fadd>; + v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", - v4f32, fmul, fadd>; + v4f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", - v2f32, fmul, fadd>; + v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32", - v4f32, v2f32, fmul, fadd>; + v4f32, v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; def : Pat<(v8i16 (add (v8i16 QPR:$src1), (mul 
(v8i16 QPR:$src2), @@ -3308,14 +3312,15 @@ def : Pat<(v4i32 (add (v4i32 QPR:$src1), (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; -def : Pat<(v4f32 (fadd (v4f32 QPR:$src1), - (fmul (v4f32 QPR:$src2), +def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1), + (fmul_su (v4f32 QPR:$src2), (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), (v4f32 (VMLAslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2), (v2f32 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i32_reg imm:$lane))), - (SubReg_i32_lane imm:$lane)))>; + (SubReg_i32_lane imm:$lane)))>, + Requires<[HasNEON, UseFPVMLx]>; // VMLAL : Vector Multiply Accumulate Long (Q += D * D) defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, @@ -3335,15 +3340,19 @@ defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>; defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", - v2f32, fmul, fsub>; + v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", - v4f32, fmul, fsub>; + v4f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", - v2f32, fmul, fsub>; + v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32", - v4f32, v2f32, fmul, fsub>; + v4f32, v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; def : Pat<(v8i16 (sub (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), @@ -3361,13 +3370,14 @@ def : Pat<(v4i32 (sub (v4i32 QPR:$src1), (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; -def : Pat<(v4f32 (fsub (v4f32 QPR:$src1), - (fmul (v4f32 QPR:$src2), +def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1), + (fmul_su (v4f32 QPR:$src2), (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2), (v2f32 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i32_reg imm:$lane))), - (SubReg_i32_lane imm:$lane)))>; + (SubReg_i32_lane imm:$lane)))>, + Requires<[HasNEON, UseFPVMLx]>; // VMLSL : Vector Multiply Subtract Long (Q -= D * D) defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, @@ -4706,15 +4716,17 @@ def : N3VSPat<fmul, VMULfd_sfp>; // vml[as].f32 can cause 4-8 cycle stalls in following ASIMD instructions, so // we want to avoid them for now. e.g., alternating vmla/vadd instructions. 
-//let neverHasSideEffects = 1 in -//def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32", -// v2f32, fmul, fadd>; -//def : N3VSMulOpPat<fmul, fadd, VMLAfd_sfp>; +let neverHasSideEffects = 1 in +def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32", + v2f32, fmul_su, fadd>; +def : N3VSMulOpPat<fmul, fadd, VMLAfd_sfp>, + Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; -//let neverHasSideEffects = 1 in -//def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32", -// v2f32, fmul, fsub>; -//def : N3VSMulOpPat<fmul, fsub, VMLSfd_sfp>; +let neverHasSideEffects = 1 in +def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32", + v2f32, fmul_su, fsub>; +def : N3VSMulOpPat<fmul, fsub, VMLSfd_sfp>, + Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; // Vector Absolute used for single-precision FP let neverHasSideEffects = 1 in diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index d6221691b3..568c74a89d 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -751,93 +751,93 @@ def VULTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 1, def VMLAD : ADbI<0b11100, 0b00, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fadd (fmul DPR:$Dn, DPR:$Dm), - (f64 DPR:$Ddin)))]>, + [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), + (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,UseVMLx]>; + Requires<[HasVFP2,UseFPVMLx]>; def VMLAS : ASbIn<0b11100, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fadd (fmul SPR:$Sn, SPR:$Sm), - SPR:$Sdin))]>, + [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), + SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; -def : Pat<(fadd DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))), +def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,UseVMLx]>; -def : Pat<(fadd SPR:$dstin, (fmul SPR:$a, SPR:$b)), + Requires<[HasVFP2,UseFPVMLx]>; +def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP, UseVMLx]>; + Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>; def VMLSD : ADbI<0b11100, 0b00, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fadd (fneg (fmul DPR:$Dn,DPR:$Dm)), - (f64 DPR:$Ddin)))]>, + [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), + (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,UseVMLx]>; + Requires<[HasVFP2,UseFPVMLx]>; def VMLSS : ASbIn<0b11100, 0b00, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fadd (fneg (fmul SPR:$Sn, SPR:$Sm)), - SPR:$Sdin))]>, + [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), + SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; -def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))), +def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,UseVMLx]>; -def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)), + Requires<[HasVFP2,UseFPVMLx]>; +def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLSS 
SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def VNMLAD : ADbI<0b11100, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd,(fsub (fneg (fmul DPR:$Dn,DPR:$Dm)), - (f64 DPR:$Ddin)))]>, + [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), + (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,UseVMLx]>; + Requires<[HasVFP2,UseFPVMLx]>; def VNMLAS : ASbI<0b11100, 0b01, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fsub (fneg (fmul SPR:$Sn, SPR:$Sm)), - SPR:$Sdin))]>, + [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), + SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; -def : Pat<(fsub (fneg (fmul DPR:$a, (f64 DPR:$b))), DPR:$dstin), +def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,UseVMLx]>; -def : Pat<(fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin), + Requires<[HasVFP2,UseFPVMLx]>; +def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def VNMLSD : ADbI<0b11100, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm", - [(set DPR:$Dd, (fsub (fmul DPR:$Dn, DPR:$Dm), - (f64 DPR:$Ddin)))]>, + [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), + (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,UseVMLx]>; + Requires<[HasVFP2,UseFPVMLx]>; def VNMLSS : ASbI<0b11100, 0b01, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm", - [(set SPR:$Sd, (fsub (fmul SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, + [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; -def : Pat<(fsub (fmul DPR:$a, (f64 DPR:$b)), DPR:$dstin), +def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,UseVMLx]>; -def : Pat<(fsub (fmul SPR:$a, SPR:$b), SPR:$dstin), + Requires<[HasVFP2,UseFPVMLx]>; +def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index e39c30e092..5508dc8578 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -37,7 +37,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, , ARMProcFamily(Others) , ARMFPUType(None) , UseNEONForSinglePrecisionFP(false) - , SlowVMLx(false) + , SlowFPVMLx(false) , SlowFPBrcc(false) , IsThumb(isT) , ThumbMode(Thumb1) diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 551b2f99ff..240364fa21 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -57,9 +57,9 @@ protected: /// determine if NEON should actually be used. 
   bool UseNEONForSinglePrecisionFP;
 
-  /// SlowVMLx - If the VFP2 instructions are available, indicates whether
-  /// the VML[AS] instructions are slow (if so, don't use them).
-  bool SlowVMLx;
+  /// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
+  /// whether the FP VML[AS] instructions are slow (if so, don't use them).
+  bool SlowFPVMLx;
 
   /// SlowFPBrcc - True if floating point compare + branch is slow.
   bool SlowFPBrcc;
@@ -176,7 +176,7 @@ protected:
   bool hasDivide() const { return HasHardwareDivide; }
   bool hasT2ExtractPack() const { return HasT2ExtractPack; }
   bool hasDataBarrier() const { return HasDataBarrier; }
-  bool useVMLx() const {return hasVFP2() && !SlowVMLx; }
+  bool useFPVMLx() const { return !SlowFPVMLx; }
   bool isFPBrccSlow() const { return SlowFPBrcc; }
   bool isFPOnlySP() const { return FPOnlySP; }
   bool prefers32BitThumb() const { return Pref32BitThumb; }
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 1c05ab5797..89047f4051 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -16,11 +16,14 @@
 #include "ARM.h"
 #include "llvm/PassManager.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegistry.h"
 using namespace llvm;
 
+static cl::opt<bool> ExpandMLx("expand-fp-mlx", cl::init(false), cl::Hidden);
+
 static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
   Triple TheTriple(TT);
   switch (TheTriple.getOS()) {
@@ -146,6 +149,9 @@ bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM,
   // FIXME: temporarily disabling load / store optimization pass for Thumb1.
   if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
     PM.add(createARMLoadStoreOptimizationPass(true));
+  if (ExpandMLx &&
+      OptLevel != CodeGenOpt::None && Subtarget.hasVFP2())
+    PM.add(createMLxExpansionPass());
 
   return true;
 }
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 59a580ab0e..632ab95313 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -29,6 +29,7 @@ add_llvm_target(ARMCodeGen
   ARMFastISel.cpp
   ARMFrameInfo.cpp
   ARMGlobalMerge.cpp
+  ARMHazardRecognizer.cpp
   ARMISelDAGToDAG.cpp
   ARMISelLowering.cpp
   ARMInstrInfo.cpp
@@ -46,7 +47,6 @@ add_llvm_target(ARMCodeGen
   Thumb1InstrInfo.cpp
   Thumb1FrameInfo.cpp
   Thumb1RegisterInfo.cpp
-  Thumb2HazardRecognizer.cpp
  Thumb2ITBlockPass.cpp
   Thumb2InstrInfo.cpp
   Thumb2RegisterInfo.cpp
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
new file mode 100644
index 0000000000..ec7257b2d1
--- /dev/null
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -0,0 +1,324 @@
+//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of
+// multiply and add / sub instructions) when special VMLx hazards are detected.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mlx-expansion"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static cl::opt<bool>
+ForceExpand("expand-all-fp-mlx", cl::init(false), cl::Hidden);
+static cl::opt<unsigned>
+ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden);
+
+STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded");
+
+namespace {
+  struct MLxExpansion : public MachineFunctionPass {
+    static char ID;
+    MLxExpansion() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM MLA / MLS expansion pass";
+    }
+
+  private:
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineRegisterInfo *MRI;
+
+    unsigned HazardLimit;
+    unsigned MIIdx;
+    MachineInstr* LastMIs[4];
+
+    void clearStack();
+    void pushStack(MachineInstr *MI);
+    MachineInstr *getAccDefMI(MachineInstr *MI) const;
+    unsigned getDefReg(MachineInstr *MI) const;
+    bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
+    bool FindMLxHazard(MachineInstr *MI) const;
+    void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
+                                unsigned MulOpc, unsigned AddSubOpc,
+                                bool NegAcc, bool HasLane);
+    bool ExpandFPMLxInstructions(MachineBasicBlock &MBB);
+  };
+  char MLxExpansion::ID = 0;
+}
+
+void MLxExpansion::clearStack() {
+  std::fill(LastMIs, LastMIs + 4, (MachineInstr*)0);
+  MIIdx = 0;
+}
+
+void MLxExpansion::pushStack(MachineInstr *MI) {
+  LastMIs[MIIdx] = MI;
+  if (++MIIdx == 4)
+    MIIdx = 0;
+}
+
+MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
+  // Look past COPY and INSERT_SUBREG instructions to find the
+  // real definition MI. This is important for _sfp instructions.
+  unsigned Reg = MI->getOperand(1).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return 0;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *DefMI = MRI->getVRegDef(Reg);
+  while (true) {
+    if (DefMI->getParent() != MBB)
+      break;
+    if (DefMI->isCopyLike()) {
+      Reg = DefMI->getOperand(1).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    } else if (DefMI->isInsertSubreg()) {
+      Reg = DefMI->getOperand(2).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    }
+    break;
+  }
+  return DefMI;
+}
+
+unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
+  unsigned Reg = MI->getOperand(0).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+      !MRI->hasOneNonDBGUse(Reg))
+    return Reg;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *UseMI = &*MRI->use_nodbg_begin(Reg);
+  if (UseMI->getParent() != MBB)
+    return Reg;
+
+  while (UseMI->isCopy() || UseMI->isInsertSubreg()) {
+    Reg = UseMI->getOperand(0).getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+        !MRI->hasOneNonDBGUse(Reg))
+      return Reg;
+    UseMI = &*MRI->use_nodbg_begin(Reg);
+    if (UseMI->getParent() != MBB)
+      return Reg;
+  }
+
+  return Reg;
+}
+
+bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
+  const TargetInstrDesc &TID = MI->getDesc();
+  // FIXME: Detect integer instructions properly.
+  unsigned Domain = TID.TSFlags & ARMII::DomainMask;
+  if (Domain == ARMII::DomainVFP) {
+    unsigned Opcode = TID.getOpcode();
+    if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD ||
+        Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+      return false;
+  } else if (Domain == ARMII::DomainNEON) {
+    if (TID.mayStore() || TID.mayLoad())
+      return false;
+  } else {
+    return false;
+  }
+
+  return MI->readsRegister(Reg, TRI);
+}
+
+
+bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const {
+  if (NumExpand >= ExpandLimit)
+    return false;
+
+  if (ForceExpand)
+    return true;
+
+  MachineInstr *DefMI = getAccDefMI(MI);
+  if (TII->isFpMLxInstruction(DefMI->getOpcode()))
+    // r0 = vmla
+    // r3 = vmla r0, r1, r2
+    // takes 16 - 17 cycles
+    //
+    // r0 = vmla
+    // r4 = vmul r1, r2
+    // r3 = vadd r0, r4
+    // takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
+    return true;
+
+  // If a VMLA.F is followed by a VADD.F or VMUL.F with no RAW hazard, the
+  // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall
+  // preserves the in-order retirement of the instructions.
+  // Look at the next few instructions; if *most* of them can cause hazards,
+  // the scheduler can't *fix* this, and we'd better break up the VMLA.
+  for (unsigned i = 1; i <= 4; ++i) {
+    int Idx = ((int)MIIdx - i + 4) % 4;
+    MachineInstr *NextMI = LastMIs[Idx];
+    if (!NextMI)
+      continue;
+
+    if (TII->canCauseFpMLxStall(NextMI->getOpcode()))
+      return true;
+
+    // Look for VMLx RAW hazard.
+    if (hasRAWHazard(getDefReg(MI), NextMI))
+      return true;
+  }
+
+  return false;
+}
+
+/// ExpandFPMLxInstruction - Expand an MLA / MLS instruction into a pair
+/// of MUL + ADD / SUB instructions.
+void +MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, + unsigned MulOpc, unsigned AddSubOpc, + bool NegAcc, bool HasLane) { + unsigned DstReg = MI->getOperand(0).getReg(); + bool DstDead = MI->getOperand(0).isDead(); + unsigned AccReg = MI->getOperand(1).getReg(); + unsigned Src1Reg = MI->getOperand(2).getReg(); + unsigned Src2Reg = MI->getOperand(3).getReg(); + bool Src1Kill = MI->getOperand(2).isKill(); + bool Src2Kill = MI->getOperand(3).isKill(); + unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0; + unsigned NextOp = HasLane ? 5 : 4; + ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm(); + unsigned PredReg = MI->getOperand(++NextOp).getReg(); + + const TargetInstrDesc &TID1 = TII->get(MulOpc); + const TargetInstrDesc &TID2 = TII->get(AddSubOpc); + unsigned TmpReg = MRI->createVirtualRegister(TID1.getRegClass(0, TRI)); + + MachineInstrBuilder MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), TID1, TmpReg) + .addReg(Src1Reg, getKillRegState(Src1Kill)) + .addReg(Src2Reg, getKillRegState(Src2Kill)); + if (HasLane) + MIB.addImm(LaneImm); + MIB.addImm(Pred).addReg(PredReg); + + MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), TID2) + .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead)); + + if (NegAcc) { + bool AccKill = MRI->hasOneNonDBGUse(AccReg); + MIB.addReg(TmpReg, getKillRegState(true)) + .addReg(AccReg, getKillRegState(AccKill)); + } else { + MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true)); + } + MIB.addImm(Pred).addReg(PredReg); + + DEBUG({ + dbgs() << "Expanding: " << *MI; + dbgs() << " to:\n"; + MachineBasicBlock::iterator MII = MI; + MII = llvm::prior(MII); + MachineInstr &MI2 = *MII; + MII = llvm::prior(MII); + MachineInstr &MI1 = *MII; + dbgs() << " " << MI1; + dbgs() << " " << MI2; + }); + + MI->eraseFromParent(); + ++NumExpand; +} + +bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) { + bool Changed = false; + + clearStack(); + + unsigned Skip = 0; + MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend(); + while (MII != E) { + MachineInstr *MI = &*MII; + + if (MI->isLabel() || MI->isImplicitDef() || MI->isCopy()) { + ++MII; + continue; + } + + const TargetInstrDesc &TID = MI->getDesc(); + if (TID.isBarrier()) { + clearStack(); + Skip = 0; + ++MII; + continue; + } + + unsigned Domain = TID.TSFlags & ARMII::DomainMask; + if (Domain == ARMII::DomainGeneral) { + if (++Skip == 2) + // Assume dual issues of non-VFP / NEON instructions. + pushStack(0); + } else { + Skip = 0; + + unsigned MulOpc, AddSubOpc; + bool NegAcc, HasLane; + if (!TII->isFpMLxInstruction(TID.getOpcode(), + MulOpc, AddSubOpc, NegAcc, HasLane) || + !FindMLxHazard(MI)) + pushStack(MI); + else { + ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane); + E = MBB.rend(); // May have changed if MI was the 1st instruction. 
+ Changed = true; + continue; + } + } + + ++MII; + } + + return Changed; +} + +bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) { + TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo()); + TRI = Fn.getTarget().getRegisterInfo(); + MRI = &Fn.getRegInfo(); + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + Modified |= ExpandFPMLxInstructions(MBB); + } + + return Modified; +} + +FunctionPass *llvm::createMLxExpansionPass() { + return new MLxExpansion(); +} diff --git a/lib/Target/ARM/Thumb2HazardRecognizer.cpp b/lib/Target/ARM/Thumb2HazardRecognizer.cpp deleted file mode 100644 index 172908da22..0000000000 --- a/lib/Target/ARM/Thumb2HazardRecognizer.cpp +++ /dev/null @@ -1,53 +0,0 @@ -//===-- Thumb2HazardRecognizer.cpp - Thumb2 postra hazard recognizer ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "ARM.h" -#include "Thumb2HazardRecognizer.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/ScheduleDAG.h" -using namespace llvm; - -ScheduleHazardRecognizer::HazardType -Thumb2HazardRecognizer::getHazardType(SUnit *SU) { - if (ITBlockSize) { - MachineInstr *MI = SU->getInstr(); - if (!MI->isDebugValue() && MI != ITBlockMIs[ITBlockSize-1]) - return Hazard; - } - - return PostRAHazardRecognizer::getHazardType(SU); -} - -void Thumb2HazardRecognizer::Reset() { - ITBlockSize = 0; - PostRAHazardRecognizer::Reset(); -} - -void Thumb2HazardRecognizer::EmitInstruction(SUnit *SU) { - MachineInstr *MI = SU->getInstr(); - unsigned Opcode = MI->getOpcode(); - if (ITBlockSize) { - --ITBlockSize; - } else if (Opcode == ARM::t2IT) { - unsigned Mask = MI->getOperand(1).getImm(); - unsigned NumTZ = CountTrailingZeros_32(Mask); - assert(NumTZ <= 3 && "Invalid IT mask!"); - ITBlockSize = 4 - NumTZ; - MachineBasicBlock::iterator I = MI; - for (unsigned i = 0; i < ITBlockSize; ++i) { - // Advance to the next instruction, skipping any dbg_value instructions. - do { - ++I; - } while (I->isDebugValue()); - ITBlockMIs[ITBlockSize-1-i] = &*I; - } - } - - PostRAHazardRecognizer::EmitInstruction(SU); -} diff --git a/lib/Target/ARM/Thumb2HazardRecognizer.h b/lib/Target/ARM/Thumb2HazardRecognizer.h deleted file mode 100644 index aa4411f186..0000000000 --- a/lib/Target/ARM/Thumb2HazardRecognizer.h +++ /dev/null @@ -1,40 +0,0 @@ -//===-- Thumb2HazardRecognizer.h - Thumb2 Hazard Recognizers ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines hazard recognizers for scheduling Thumb2 functions on -// ARM processors. -// -//===----------------------------------------------------------------------===// - -#ifndef THUMB2HAZARDRECOGNIZER_H -#define THUMB2HAZARDRECOGNIZER_H - -#include "llvm/CodeGen/PostRAHazardRecognizer.h" - -namespace llvm { - -class MachineInstr; - -class Thumb2HazardRecognizer : public PostRAHazardRecognizer { - unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled. 
- MachineInstr *ITBlockMIs[4]; - -public: - Thumb2HazardRecognizer(const InstrItineraryData *ItinData) : - PostRAHazardRecognizer(ItinData) {} - - virtual HazardType getHazardType(SUnit *SU); - virtual void Reset(); - virtual void EmitInstruction(SUnit *SU); -}; - - -} // end namespace llvm - -#endif // THUMB2HAZARDRECOGNIZER_H diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 719b140ce9..2f67257f8f 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -17,7 +17,6 @@ #include "ARMAddressingModes.h" #include "ARMGenInstrInfo.inc" #include "ARMMachineFunctionInfo.h" -#include "Thumb2HazardRecognizer.h" #include "Thumb2InstrInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -175,11 +174,6 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI); } -ScheduleHazardRecognizer *Thumb2InstrInfo:: -CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const { - return (ScheduleHazardRecognizer *)new Thumb2HazardRecognizer(II); -} - void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h index 9ed7eea7e2..f2637d7fbc 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.h +++ b/lib/Target/ARM/Thumb2InstrInfo.h @@ -65,9 +65,6 @@ public: /// always be able to get register info as well (through this method). /// const Thumb2RegisterInfo &getRegisterInfo() const { return RI; } - - ScheduleHazardRecognizer * - CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const; }; /// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll index 24eb3a88fb..53214fd4c3 100644 --- a/test/CodeGen/ARM/reg_sequence.ll +++ b/test/CodeGen/ARM/reg_sequence.ll @@ -270,8 +270,9 @@ define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind { define arm_aapcs_vfpcc i32 @t10() nounwind { entry: ; CHECK: t10: +; CHECK: vmul.f32 q8, q8, d0[0] ; CHECK: vmov.i32 q9, #0x3F000000 -; CHECK: vmla.f32 q8, q8, d0[0] +; CHECK: vadd.f32 q8, q8, q8 %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1] %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1] %2 = insertelement <4 x float> %1, float undef, i32 2 ; <<4 x float>> [#uses=1] |
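
A closing note on the shared bookkeeping: ARMHazardRecognizer models the stall window with a simple counter (Stalls = 4), while MLxExpansion keeps the four most recently visited instructions (the pass walks each block bottom-up) in the LastMIs ring buffer and scans it in FindMLxHazard. The standalone C++ sketch below reproduces only that four-slot ring buffer and its wrap-around scan; the HazardWindow name and string payloads are stand-ins for illustration, not names from the commit.

    // hazard_window_sketch.cpp -- illustrative model only, not part of the commit.
    #include <cassert>
    #include <cstring>

    struct HazardWindow {
      static const unsigned Size = 4;   // 4-cycle window after an fp VMLx
      const char *Slots[Size];          // most recent entries; strings stand in
      unsigned Idx;                     // next slot to overwrite (== oldest)

      HazardWindow() : Idx(0) { std::memset(Slots, 0, sizeof(Slots)); }

      // Mirrors MLxExpansion::pushStack: the newest entry overwrites the oldest.
      void push(const char *MI) {
        Slots[Idx] = MI;
        if (++Idx == Size)
          Idx = 0;
      }

      // Mirrors the scan in MLxExpansion::FindMLxHazard: i = 1 (newest) .. 4
      // (oldest), using the same wrap-around index arithmetic.
      const char *nth(unsigned i) const {
        int Slot = ((int)Idx - (int)i + (int)Size) % (int)Size;
        return Slots[Slot];
      }
    };

    int main() {
      HazardWindow W;
      W.push("vmul");
      W.push("vadd");
      W.push("vldr");
      W.push("vmla");
      W.push("vsub");                              // wraps; "vmul" falls out
      assert(std::strcmp(W.nth(1), "vsub") == 0);  // newest
      assert(std::strcmp(W.nth(4), "vadd") == 0);  // oldest still in window
      return 0;
    }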