From 82d3d4524f2595b2dce617e963b6d67876b4f9ba Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 18 Jan 2013 21:15:53 +0000 Subject: R600: Proper insert S_WAITCNT instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some instructions like memory reads/writes are executed asynchronously, so we need to insert S_WAITCNT instructions to block before accessing their results. Previously we just inserted an S_WAITCNT instruction after each async instruction; this patch fixes this and adds a proper insertion pass. Patch by: Christian König Tested-by: Michel Dänzer Reviewed-by: Tom Stellard Signed-off-by: Christian König git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@172846 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPU.h | 1 + lib/Target/R600/AMDGPUTargetMachine.cpp | 5 + lib/Target/R600/CMakeLists.txt | 1 + lib/Target/R600/SIISelLowering.cpp | 12 -- lib/Target/R600/SIISelLowering.h | 5 - lib/Target/R600/SIInsertWaits.cpp | 353 ++++++++++++++++++++++++++++++++ lib/Target/R600/SIInstrInfo.h | 4 +- lib/Target/R600/SIInstrInfo.td | 30 +-- 8 files changed, 379 insertions(+), 32 deletions(-) create mode 100644 lib/Target/R600/SIInsertWaits.cpp (limited to 'lib') diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 0f5125d39b..c75ec245e0 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -30,6 +30,7 @@ FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm); +FunctionPass *createSIInsertWaits(TargetMachine &tm); // Passes common to R600 and SI Pass *createAMDGPUStructurizeCFGPass(); diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index d09dc2efff..26ac928347 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp 
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -116,6 +116,11 @@ bool AMDGPUPassConfig::addPreRegAlloc() { } bool AMDGPUPassConfig::addPostRegAlloc() { + const AMDGPUSubtarget &ST = TM->getSubtarget(); + + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + addPass(createSIInsertWaits(*TM)); + } return false; } diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index ce0b56bdf0..790a4aa4db 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -40,6 +40,7 @@ add_llvm_target(R600CodeGen R600RegisterInfo.cpp SIAnnotateControlFlow.cpp SIAssignInterpRegs.cpp + SIInsertWaits.cpp SIInstrInfo.cpp SIISelLowering.cpp SILowerLiteralConstants.cpp diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 18fa9081b0..ef9d17c2dc 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -66,11 +66,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineRegisterInfo & MRI = BB->getParent()->getRegInfo(); MachineBasicBlock::iterator I = MI; - if (TII->get(MI->getOpcode()).TSFlags & SIInstrFlags::NEED_WAIT) { - AppendS_WAITCNT(MI, *BB, llvm::next(I)); - return BB; - } - switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); @@ -141,13 +136,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( return BB; } -void SITargetLowering::AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I) const { - BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); -} - - void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC) diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index db36eefa14..8528c24a3c 100644 --- 
a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -23,11 +23,6 @@ namespace llvm { class SITargetLowering : public AMDGPUTargetLowering { const SIInstrInfo * TII; - /// Memory reads and writes are syncronized using the S_WAITCNT instruction. - /// This function takes the most conservative approach and inserts an - /// S_WAITCNT instruction after every read and write. - void AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I) const; void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB, MachineBasicBlock::iterator I, unsigned Opocde) const; void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp new file mode 100644 index 0000000000..24fc929369 --- /dev/null +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -0,0 +1,353 @@ +//===-- SIInsertWaits.cpp - Insert wait instructions ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Insert wait instructions for memory reads and writes. +/// +/// Memory reads and writes are issued asynchronously, so we need to insert +/// S_WAITCNT instructions when we want to access any of their results or +/// overwrite any register that's used asynchronously. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief One variable for each of the hardware counters
+typedef union {
+  struct {
+    unsigned VM;
+    unsigned EXP;
+    unsigned LGKM;
+  } Named;
+  unsigned Array[3];
+
+} Counters;
+
+typedef Counters RegCounters[512];
+typedef std::pair<unsigned, unsigned> RegInterval;
+
+class SIInsertWaits : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const SIInstrInfo *TII;
+  const SIRegisterInfo &TRI;
+  const MachineRegisterInfo *MRI;
+
+  /// \brief Constant hardware limits
+  static const Counters WaitCounts;
+
+  /// \brief Constant zero value
+  static const Counters ZeroCounts;
+
+  /// \brief Counter values we have already waited on.
+  Counters WaitedOn;
+
+  /// \brief Counter values for last instruction issued.
+  Counters LastIssued;
+
+  /// \brief Registers used by async instructions.
+  RegCounters UsedRegs;
+
+  /// \brief Registers defined by async instructions.
+  RegCounters DefinedRegs;
+
+  /// \brief Different export instruction types seen since last wait.
+  unsigned ExpInstrTypesSeen;
+
+  /// \brief Get increment/decrement amount for this instruction.
+  Counters getHwCounts(MachineInstr &MI);
+
+  /// \brief Is operand relevant for async execution?
+  bool isOpRelevant(MachineOperand &Op);
+
+  /// \brief Get register interval an operand affects.
+  RegInterval getRegInterval(MachineOperand &Op);
+
+  /// \brief Record an instruction's counter increments and async registers
+  void pushInstruction(MachineInstr &MI);
+
+  /// \brief Insert the actual wait instruction
+  bool insertWait(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  const Counters &Counts);
+
+  /// \brief Resolve all operand dependencies to counter requirements
+  Counters handleOperands(MachineInstr &MI);
+
+public:
+  SIInsertWaits(TargetMachine &tm) :
+    MachineFunctionPass(ID),
+    TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())),
+    TRI(TII->getRegisterInfo()) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const {
+    return "SI insert wait instructions";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SIInsertWaits::ID = 0;
+
+const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
+const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
+
+FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
+  return new SIInsertWaits(tm);
+}
+
+Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
+
+  uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
+  Counters Result;
+
+  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
+
+  // Only consider stores or EXP for EXP_CNT
+  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
+      (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
+
+  // LGKM may use larger values: results wider than 4 bytes count twice
+  if (TSFlags & SIInstrFlags::LGKM_CNT) {
+
+    MachineOperand &Op = MI.getOperand(0);
+    assert(Op.isReg() && "First LGKM operand must be a register!");
+
+    unsigned Reg = Op.getReg();
+    unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
+    Result.Named.LGKM = Size > 4 ? 2 : 1;
+
+  } else {
+    Result.Named.LGKM = 0;
+  }
+
+  return Result;
+}
+
+bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
+
+  // Constants are always irrelevant
+  if (!Op.isReg())
+    return false;
+
+  // Defines are always relevant
+  if (Op.isDef())
+    return true;
+
+  // For exports all registers are relevant
+  MachineInstr &MI = *Op.getParent();
+  if (MI.getOpcode() == AMDGPU::EXP)
+    return true;
+
+  // For stores the stored value is also relevant
+  if (!MI.getDesc().mayStore())
+    return false;
+
+  for (MachineInstr::mop_iterator I = MI.operands_begin(),
+       E = MI.operands_end(); I != E; ++I) {
+
+    if (I->isReg() && I->isUse())
+      return Op.isIdenticalTo(*I);
+  }
+
+  return false;
+}
+
+RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
+
+  if (!Op.isReg())
+    return std::make_pair(0, 0);
+
+  unsigned Reg = Op.getReg();
+  unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
+
+  assert(Size >= 4);
+
+  RegInterval Result;
+  Result.first = TRI.getEncodingValue(Reg);
+  Result.second = Result.first + Size / 4;
+
+  return Result;
+}
+
+void SIInsertWaits::pushInstruction(MachineInstr &MI) {
+
+  // Get the hardware counter increments and sum them up
+  Counters Increment = getHwCounts(MI);
+  unsigned Sum = 0;
+
+  for (unsigned i = 0; i < 3; ++i) {
+    LastIssued.Array[i] += Increment.Array[i];
+    Sum += Increment.Array[i];
+  }
+
+  // If we don't increase anything then that's it
+  if (Sum == 0)
+    return;
+
+  // Remember which export instructions we have seen
+  if (Increment.Named.EXP) {
+    ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
+  }
+
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+
+    MachineOperand &Op = MI.getOperand(i);
+    if (!isOpRelevant(Op))
+      continue;
+
+    RegInterval Interval = getRegInterval(Op);
+    for (unsigned j = Interval.first; j < Interval.second; ++j) {
+
+      // Remember which registers we define
+      if (Op.isDef())
+        DefinedRegs[j] = LastIssued;
+
+      // and which one we are using
+      if (Op.isUse())
+        UsedRegs[j] = LastIssued;
+    }
+  }
+}
+
+bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I,
+                               const Counters &Required) {
+
+  // End of program? No need to wait on anything
+  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
+    return false;
+
+  // Figure out if the async instructions execute in order
+  bool Ordered[3];
+
+  // VM_CNT is always ordered
+  Ordered[0] = true;
+
+  // EXP_CNT is unordered if we have both EXP & VM-writes
+  Ordered[1] = ExpInstrTypesSeen != 3;
+
+  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
+  Ordered[2] = false;
+
+  // The values we are going to put into the S_WAITCNT instruction
+  Counters Counts = WaitCounts;
+
+  // Do we really need to wait?
+  bool NeedWait = false;
+
+  for (unsigned i = 0; i < 3; ++i) {
+
+    if (Required.Array[i] <= WaitedOn.Array[i])
+      continue;
+
+    NeedWait = true;
+
+    if (Ordered[i]) {
+      unsigned Value = LastIssued.Array[i] - Required.Array[i];
+
+      // adjust the value to the real hardware possibilities
+      Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
+
+    } else
+      Counts.Array[i] = 0;
+
+    // Remember what we have waited on
+    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+  }
+
+  if (!NeedWait)
+    return false;
+
+  // Reset EXP_CNT instruction types
+  if (Counts.Named.EXP == 0)
+    ExpInstrTypesSeen = 0;
+
+  // Build the wait instruction
+  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+          .addImm((Counts.Named.VM & 0xF) |
+                  ((Counts.Named.EXP & 0x7) << 4) |
+                  ((Counts.Named.LGKM & 0x7) << 8));
+
+  return true;
+}
+
+/// \brief helper function for handleOperands
+static void increaseCounters(Counters &Dst, const Counters &Src) {
+
+  for (unsigned i = 0; i < 3; ++i)
+    Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
+}
+
+Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
+
+  Counters Result = ZeroCounts;
+
+  // For each register affected by this
+  // instruction increase the result sequence
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+
+    MachineOperand &Op = MI.getOperand(i);
+    RegInterval Interval = getRegInterval(Op);
+    for (unsigned j = Interval.first; j < Interval.second; ++j) {
+
+      if (Op.isDef())
+        increaseCounters(Result, UsedRegs[j]);
+
+      if (Op.isUse())
+        increaseCounters(Result, DefinedRegs[j]);
+    }
+  }
+
+  return Result;
+}
+
+bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
+
+  bool Changes = false;
+
+  MRI = &MF.getRegInfo();
+
+  WaitedOn = ZeroCounts;
+  LastIssued = ZeroCounts;
+  ExpInstrTypesSeen = 0;
+  memset(&UsedRegs, 0, sizeof(UsedRegs));
+  memset(&DefinedRegs, 0, sizeof(DefinedRegs));
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+
+      Changes |= insertWait(MBB, I, handleOperands(*I));
+      pushInstruction(*I);
+    }
+
+    // Wait for everything at the end of the MBB
+    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+  }
+
+  return Changes;
+}
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index 631f6c00cc..783cd9f4cb 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -55,7 +55,9 @@ public: namespace SIInstrFlags { enum Flags { // First 4 bits are the instruction encoding - NEED_WAIT = 1 << 4 + VM_CNT = 1 << 4, + EXP_CNT = 1 << 5, + LGKM_CNT = 1 << 6 }; } diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 873a451e99..8ff2d6db16 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -42,11 +42,14 @@ class InstSI pattern> : AMDGPUInst { field bits<4> EncodingType = 0; - field bits<1> NeedWait = 0; + field bits<1> VM_CNT = 0; + field bits<1> EXP_CNT = 0; + field bits<1> LGKM_CNT = 0; let TSFlags{3-0} = EncodingType; - let TSFlags{4} = NeedWait; - + let TSFlags{4} = VM_CNT; + let TSFlags{5} = EXP_CNT; + let TSFlags{6} = LGKM_CNT; } class Enc32 pattern> : @@ -140,8 +143,7 @@ def EXP : Enc64< let Inst{63-56} = VSRC3; let EncodingType = 0; //SIInstrEncodingType::EXP - let NeedWait = 1; - let usesCustomInserter = 1; + let EXP_CNT = 1; } class MIMG op, dag outs, dag ins, string asm, list pattern> : @@ -174,11 +176,10 @@ class MIMG op, dag outs, dag ins, string asm, list pattern> : let Inst{47-40} = VDATA; let Inst{52-48} = SRSRC; let Inst{57-53} = SSAMP; - let EncodingType = 2; //SIInstrEncodingType::MIMG - let NeedWait = 1; - let usesCustomInserter = 1; + let VM_CNT = 1; + let EXP_CNT = 1; } class MTBUF op, dag outs, dag ins, string asm, list pattern> : @@ -215,8 +216,9 @@ class MTBUF op, dag outs, dag ins, string asm, list pattern> : let Inst{63-56} = SOFFSET; let EncodingType = 3; 
//SIInstrEncodingType::MTBUF - let NeedWait = 1; - let usesCustomInserter = 1; + let VM_CNT = 1; + let EXP_CNT = 1; + let neverHasSideEffects = 1; } @@ -252,8 +254,9 @@ class MUBUF op, dag outs, dag ins, string asm, list pattern> : let Inst{63-56} = SOFFSET; let EncodingType = 4; //SIInstrEncodingType::MUBUF - let NeedWait = 1; - let usesCustomInserter = 1; + let VM_CNT = 1; + let EXP_CNT = 1; + let neverHasSideEffects = 1; } @@ -276,8 +279,7 @@ class SMRD op, dag outs, dag ins, string asm, list pattern> : let Inst{31-27} = 0x18; //encoding let EncodingType = 5; //SIInstrEncodingType::SMRD - let NeedWait = 1; - let usesCustomInserter = 1; + let LGKM_CNT = 1; } class SOP1 op, dag outs, dag ins, string asm, list pattern> : -- cgit v1.2.3-18-g5258