author | Tom Stellard <thomas.stellard@amd.com> | 2012-12-11 21:25:42 +0000 |
---|---|---|
committer | Tom Stellard <thomas.stellard@amd.com> | 2012-12-11 21:25:42 +0000 |
commit | f98f2ce29e6e2996fa58f38979143eceaa818335 (patch) | |
tree | 86dffe7414e6657874db8ac36e5ddcf7d41b2d9c /lib/Target/R600/SILowerControlFlow.cpp | |
parent | 57ac1f458a754f30cf500410b438fb260f9b8fe5 (diff) | |
Add R600 backend
A new backend supporting AMD GPUs: Radeon HD2XXX - HD7XXX
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169915 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/R600/SILowerControlFlow.cpp')
-rw-r--r-- | lib/Target/R600/SILowerControlFlow.cpp | 191 |
1 file changed, 191 insertions, 0 deletions
```diff
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
new file mode 100644
index 0000000000..277b647f67
--- /dev/null
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -0,0 +1,191 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass lowers the pseudo control flow instructions (SI_IF_NZ,
+/// ELSE, ENDIF) to predicated instructions.
+///
+/// All control flow (except loops) is handled using predicated instructions
+/// and a predicate stack. Each Scalar ALU controls the operations of 64
+/// Vector ALUs. The Scalar ALU can update the predicate for any of the
+/// Vector ALUs by writing to the 64-bit EXEC register (each bit corresponds
+/// to a single Vector ALU). Typically, for predicates, a Vector ALU will
+/// write to its bit of the VCC register (like EXEC, VCC is 64 bits wide, one
+/// bit per Vector ALU) and then the Scalar ALU will AND the VCC register
+/// with EXEC to update the predicates.
+///
+/// For example:
+/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
+/// SI_IF_NZ %VCC
+///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
+/// ELSE
+///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
+/// ENDIF
+///
+/// becomes:
+///
+/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
+/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label0            // This instruction is an
+///                                   // optimization which allows us to
+///                                   // branch if all the bits of
+///                                   // EXEC are zero.
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
+///
+/// label0:
+/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0 // Restore the exec mask for the ELSE
+///                                   // block
+/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label1            // Use our branch optimization
+///                                   // instruction again.
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
+/// label1:
+/// %EXEC = S_OR_B64 %EXEC, %SGPR0    // Re-enable saved exec mask bits
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class SILowerControlFlowPass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const TargetInstrInfo *TII;
+  std::vector<unsigned> PredicateStack;
+  std::vector<unsigned> UnusedRegisters;
+
+  unsigned allocReg();
+  void freeReg(unsigned Reg);
+
+public:
+  SILowerControlFlowPass(TargetMachine &tm) :
+    MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const {
+    return "SI Lower control flow instructions";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SILowerControlFlowPass::ID = 0;
+
+FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
+  return new SILowerControlFlowPass(tm);
+}
+
+bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
+
+  // Find all the unused registers that can be used for the predicate stack.
+  for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(),
+       S = AMDGPU::SReg_64RegClass.end();
+       I != S; ++I) {
+    unsigned Reg = *I;
+    if (!MF.getRegInfo().isPhysRegUsed(Reg)) {
+      UnusedRegisters.insert(UnusedRegisters.begin(), Reg);
+    }
+  }
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+       BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+         I != MBB.end(); I = Next) {
+      Next = llvm::next(I);
+      MachineInstr &MI = *I;
+      unsigned Reg;
+      switch (MI.getOpcode()) {
+        default: break;
+        case AMDGPU::SI_IF_NZ:
+          Reg = allocReg();
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                  TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
+                  .addOperand(MI.getOperand(0)); // VCC
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
+                  Reg)
+                  .addReg(Reg)
+                  .addReg(AMDGPU::EXEC);
+          MI.eraseFromParent();
+          PredicateStack.push_back(Reg);
+          break;
+
+        case AMDGPU::ELSE:
+          Reg = PredicateStack.back();
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                  TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Reg)
+                  .addReg(Reg);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
+                  AMDGPU::EXEC)
+                  .addReg(Reg)
+                  .addReg(AMDGPU::EXEC);
+          MI.eraseFromParent();
+          break;
+
+        case AMDGPU::ENDIF:
+          Reg = PredicateStack.back();
+          PredicateStack.pop_back();
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64),
+                  AMDGPU::EXEC)
+                  .addReg(AMDGPU::EXEC)
+                  .addReg(Reg);
+          freeReg(Reg);
+
+          if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL &&
+              PredicateStack.empty()) {
+            // If the exec mask is non-zero, skip the next two instructions
+            BuildMI(MBB, I, MBB.findDebugLoc(I),
+                    TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+                    .addImm(3)
+                    .addReg(AMDGPU::EXEC);
+
+            // Exec mask is zero: Export to NULL target...
+            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP))
+                    .addImm(0)
+                    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+                    .addImm(0)
+                    .addImm(1)
+                    .addImm(1)
+                    .addReg(AMDGPU::SREG_LIT_0)
+                    .addReg(AMDGPU::SREG_LIT_0)
+                    .addReg(AMDGPU::SREG_LIT_0)
+                    .addReg(AMDGPU::SREG_LIT_0);
+
+            // ... and terminate wavefront
+            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM));
+          }
+          MI.eraseFromParent();
+          break;
+      }
+    }
+  }
+  return true;
+}
+
+unsigned SILowerControlFlowPass::allocReg() {
+
+  assert(!UnusedRegisters.empty() &&
+         "Ran out of registers for predicate stack");
+  unsigned Reg = UnusedRegisters.back();
+  UnusedRegisters.pop_back();
+  return Reg;
+}
+
+void SILowerControlFlowPass::freeReg(unsigned Reg) {
+
+  UnusedRegisters.push_back(Reg);
+}
```
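
The file header above describes the lowering entirely as 64-bit exec-mask arithmetic (S_AND_SAVEEXEC_B64, S_XOR_B64, S_OR_SAVEEXEC_B64, S_OR_B64). As a rough illustration of what those instructions compute, the following minimal host-side C++ sketch models the same predicate-stack bookkeeping with plain `uint64_t` masks. It is not part of this patch, and the names (`ExecModel`, `enterIf`, `enterElse`, `endIf`, `predStack`) are hypothetical; it only mirrors the sequence of mask operations the pass emits for SI_IF_NZ, ELSE, and ENDIF.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical host-side model (not part of the patch) of the exec-mask
// bookkeeping described in the file header. Each bit of `exec` enables one
// of the 64 vector lanes of a wavefront.
struct ExecModel {
  uint64_t exec = ~0ULL;            // all 64 lanes active on entry
  std::vector<uint64_t> predStack;  // plays the role of PredicateStack

  // SI_IF_NZ %VCC:
  //   %SGPR0 = S_AND_SAVEEXEC_B64 %VCC   (save EXEC, then EXEC &= VCC)
  //   %SGPR0 = S_XOR_B64 %SGPR0, %EXEC   (lanes still owed the ELSE block)
  void enterIf(uint64_t vcc) {
    uint64_t reg = exec;
    exec &= vcc;
    reg ^= exec;
    predStack.push_back(reg);
  }

  // ELSE:
  //   %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0  (save EXEC, then EXEC |= SGPR0)
  //   %EXEC  = S_XOR_B64 %SGPR0, %EXEC   (run only the ELSE lanes)
  void enterElse() {
    uint64_t &reg = predStack.back();
    uint64_t saved = exec;
    exec |= reg;
    reg = saved;
    exec ^= reg;
  }

  // ENDIF:
  //   %EXEC = S_OR_B64 %EXEC, %SGPR0     (re-enable the saved lanes)
  void endIf() {
    exec |= predStack.back();
    predStack.pop_back();
  }
};

int main() {
  ExecModel m;
  m.enterIf(0x0F);                 // lanes 0-3 took the condition
  assert(m.exec == 0x0FULL);       // IF block runs on lanes 0-3
  m.enterElse();
  assert(m.exec == ~0x0FULL);      // ELSE block runs on the remaining lanes
  m.endIf();
  assert(m.exec == ~0ULL);         // all lanes active again
  return 0;
}
```

Under these assumptions, the real pass differs only in that the "stack slots" are unused SGPR pairs allocated by `allocReg()` rather than host memory, and the mask arithmetic is performed by the Scalar ALU on the EXEC register itself.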