diff options
author | Evan Cheng <evan.cheng@apple.com> | 2010-12-05 22:04:16 +0000 |
---|---|---|
committer | Evan Cheng <evan.cheng@apple.com> | 2010-12-05 22:04:16 +0000 |
commit | 48575f6ea7d5cd21ab29ca370f58fcf9ca31400b (patch) | |
tree | fd7f84a4921afa7c4baac36c5772ae688f4f31da /lib/Target/ARM/MLxExpansionPass.cpp | |
parent | 0a3fdd6e11cd351737b4451c05ec5d794e6855cf (diff) |
Making use of VFP / NEON floating point multiply-accumulate / subtraction is
difficult on current ARM implementations for a few reasons.
1. Even though a single vmla has latency that is one cycle shorter than a pair
of vmul + vadd, a RAW hazard during the first few cycles (4? on Cortex-A8) can
cause additional pipeline stalls. So it's frequently better to simply codegen
vmul + vadd.
2. A vmla followed by a vmul, vadd, or vsub causes the second fp instruction to
stall for 4 cycles. We need to schedule them apart.
3. A vmla followed by a vmla is a special case. Obviously, issuing back-to-back
RAW vmla + vmla is very bad. But this isn't ideal either:
vmul
vadd
vmla
Instead, we want to expand the second vmla:
vmla
vmul
vadd
Even with the 4 cycle vmul stall, the second sequence is still 2 cycles
faster.
Up to now, isel simply avoids codegen'ing fp vmla / vmls. This works well enough
but it isn't the optimal solution. This patch attempts to make it possible to
use vmla / vmls in cases where it is profitable.
A. Add missing isel predicates which cause vmla to be codegen'ed.
B. Make sure the fmul in (fadd (fmul)) has a single use. We don't want to
compute a fmul and a fmla.
C. Add additional isel checks for vmla, avoid cases where vmla is feeding into
fp instructions (except for the #3 exceptional case).
D. Add ARM hazard recognizer to model the vmla / vmls hazards.
E. Add a special pre-regalloc case to expand vmla / vmls when it's likely the
vmla / vmls will trigger one of the special hazards.
Work in progress, only A+B are enabled.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@120960 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/ARM/MLxExpansionPass.cpp')
-rw-r--r-- | lib/Target/ARM/MLxExpansionPass.cpp | 324 |
1 files changed, 324 insertions, 0 deletions
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp new file mode 100644 index 0000000000..ec7257b2d1 --- /dev/null +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -0,0 +1,324 @@ +//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ----------=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of +// multiple and add / sub instructions) when special VMLx hazards are detected. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mlx-expansion" +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +static cl::opt<bool> +ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden); +static cl::opt<unsigned> +ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden); + +STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded"); + +namespace { + struct MLxExpansion : public MachineFunctionPass { + static char ID; + MLxExpansion() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "ARM MLA / MLS expansion pass"; + } + + private: + const ARMBaseInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + + unsigned HazardLimit; + 
unsigned MIIdx; + MachineInstr* LastMIs[4]; + + void clearStack(); + void pushStack(MachineInstr *MI); + MachineInstr *getAccDefMI(MachineInstr *MI) const; + unsigned getDefReg(MachineInstr *MI) const; + bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const; + bool FindMLxHazard(MachineInstr *MI) const; + void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, + unsigned MulOpc, unsigned AddSubOpc, + bool NegAcc, bool HasLane); + bool ExpandFPMLxInstructions(MachineBasicBlock &MBB); + }; + char MLxExpansion::ID = 0; +} + +void MLxExpansion::clearStack() { + std::fill(LastMIs, LastMIs + 4, (MachineInstr*)0); + MIIdx = 0; +} + +void MLxExpansion::pushStack(MachineInstr *MI) { + LastMIs[MIIdx] = MI; + if (++MIIdx == 4) + MIIdx = 0; +} + +MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { + // Look past COPY and INSERT_SUBREG instructions to find the + // real definition MI. This is important for _sfp instructions. + unsigned Reg = MI->getOperand(1).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return 0; + + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *DefMI = MRI->getVRegDef(Reg); + while (true) { + if (DefMI->getParent() != MBB) + break; + if (DefMI->isCopyLike()) { + Reg = DefMI->getOperand(1).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + DefMI = MRI->getVRegDef(Reg); + continue; + } + } else if (DefMI->isInsertSubreg()) { + Reg = DefMI->getOperand(2).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + DefMI = MRI->getVRegDef(Reg); + continue; + } + } + break; + } + return DefMI; +} + +unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { + unsigned Reg = MI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || + !MRI->hasOneNonDBGUse(Reg)) + return Reg; + + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *UseMI = &*MRI->use_nodbg_begin(Reg); + if (UseMI->getParent() != MBB) + return Reg; + + while (UseMI->isCopy() || 
UseMI->isInsertSubreg()) { + Reg = UseMI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || + !MRI->hasOneNonDBGUse(Reg)) + return Reg; + UseMI = &*MRI->use_nodbg_begin(Reg); + if (UseMI->getParent() != MBB) + return Reg; + } + + return Reg; +} + +bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const { + const TargetInstrDesc &TID = MI->getDesc(); + // FIXME: Detect integer instructions properly. + unsigned Domain = TID.TSFlags & ARMII::DomainMask; + if (Domain == ARMII::DomainVFP) { + unsigned Opcode = TID.getOpcode(); + if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD || + Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) + return false; + } else if (Domain == ARMII::DomainNEON) { + if (TID.mayStore() || TID.mayLoad()) + return false; + } else { + return false; + } + + return MI->readsRegister(Reg, TRI); + return false; +} + + +bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const { + if (NumExpand >= ExpandLimit) + return false; + + if (ForceExapnd) + return true; + + MachineInstr *DefMI = getAccDefMI(MI); + if (TII->isFpMLxInstruction(DefMI->getOpcode())) + // r0 = vmla + // r3 = vmla r0, r1, r2 + // takes 16 - 17 cycles + // + // r0 = vmla + // r4 = vmul r1, r2 + // r3 = vadd r0, r4 + // takes about 14 - 15 cycles even with vmul stalling for 4 cycles. + return true; + + // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the + // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall + // preserves the in-order retirement of the instructions. + // Look at the next few instructions, if *most* of them can cause hazards, + // then the scheduler can't *fix* this, we'd better break up the VMLA. + for (unsigned i = 1; i <= 4; ++i) { + int Idx = ((int)MIIdx - i + 4) % 4; + MachineInstr *NextMI = LastMIs[Idx]; + if (!NextMI) + continue; + + if (TII->canCauseFpMLxStall(NextMI->getOpcode())) + return true; + + // Look for VMLx RAW hazard. 
+ if (hasRAWHazard(getDefReg(MI), NextMI)) + return true; + } + + return false; +} + +/// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair +/// of MUL + ADD / SUB instructions. +void +MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, + unsigned MulOpc, unsigned AddSubOpc, + bool NegAcc, bool HasLane) { + unsigned DstReg = MI->getOperand(0).getReg(); + bool DstDead = MI->getOperand(0).isDead(); + unsigned AccReg = MI->getOperand(1).getReg(); + unsigned Src1Reg = MI->getOperand(2).getReg(); + unsigned Src2Reg = MI->getOperand(3).getReg(); + bool Src1Kill = MI->getOperand(2).isKill(); + bool Src2Kill = MI->getOperand(3).isKill(); + unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0; + unsigned NextOp = HasLane ? 5 : 4; + ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm(); + unsigned PredReg = MI->getOperand(++NextOp).getReg(); + + const TargetInstrDesc &TID1 = TII->get(MulOpc); + const TargetInstrDesc &TID2 = TII->get(AddSubOpc); + unsigned TmpReg = MRI->createVirtualRegister(TID1.getRegClass(0, TRI)); + + MachineInstrBuilder MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), TID1, TmpReg) + .addReg(Src1Reg, getKillRegState(Src1Kill)) + .addReg(Src2Reg, getKillRegState(Src2Kill)); + if (HasLane) + MIB.addImm(LaneImm); + MIB.addImm(Pred).addReg(PredReg); + + MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), TID2) + .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead)); + + if (NegAcc) { + bool AccKill = MRI->hasOneNonDBGUse(AccReg); + MIB.addReg(TmpReg, getKillRegState(true)) + .addReg(AccReg, getKillRegState(AccKill)); + } else { + MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true)); + } + MIB.addImm(Pred).addReg(PredReg); + + DEBUG({ + dbgs() << "Expanding: " << *MI; + dbgs() << " to:\n"; + MachineBasicBlock::iterator MII = MI; + MII = llvm::prior(MII); + MachineInstr &MI2 = *MII; + MII = llvm::prior(MII); + MachineInstr &MI1 = *MII; + dbgs() << " " << MI1; + dbgs() << " " << MI2; + }); 
+ + MI->eraseFromParent(); + ++NumExpand; +} + +bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) { + bool Changed = false; + + clearStack(); + + unsigned Skip = 0; + MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend(); + while (MII != E) { + MachineInstr *MI = &*MII; + + if (MI->isLabel() || MI->isImplicitDef() || MI->isCopy()) { + ++MII; + continue; + } + + const TargetInstrDesc &TID = MI->getDesc(); + if (TID.isBarrier()) { + clearStack(); + Skip = 0; + ++MII; + continue; + } + + unsigned Domain = TID.TSFlags & ARMII::DomainMask; + if (Domain == ARMII::DomainGeneral) { + if (++Skip == 2) + // Assume dual issues of non-VFP / NEON instructions. + pushStack(0); + } else { + Skip = 0; + + unsigned MulOpc, AddSubOpc; + bool NegAcc, HasLane; + if (!TII->isFpMLxInstruction(TID.getOpcode(), + MulOpc, AddSubOpc, NegAcc, HasLane) || + !FindMLxHazard(MI)) + pushStack(MI); + else { + ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane); + E = MBB.rend(); // May have changed if MI was the 1st instruction. + Changed = true; + continue; + } + } + + ++MII; + } + + return Changed; +} + +bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) { + TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo()); + TRI = Fn.getTarget().getRegisterInfo(); + MRI = &Fn.getRegInfo(); + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + Modified |= ExpandFPMLxInstructions(MBB); + } + + return Modified; +} + +FunctionPass *llvm::createMLxExpansionPass() { + return new MLxExpansion(); +} |