diff options
author | Tom Stellard <thomas.stellard@amd.com> | 2012-12-19 22:10:31 +0000 |
---|---|---|
committer | Tom Stellard <thomas.stellard@amd.com> | 2012-12-19 22:10:31 +0000 |
commit | 6b7d99d47321ebb478b22afd2e317fe89d2149db (patch) | |
tree | 548126753462b8985bf846b022d39b2c51e96f96 /lib/Target/R600/SILowerControlFlow.cpp | |
parent | 6eebe47060eec7e3a4ae95d4b4835869108f9c07 (diff) |
R600: New control flow for SI v2
This patch replaces the control flow handling with a new
pass which structurize the graph before transforming it to
machine instruction. This has a couple of different advantages
and currently fixes 20 piglit tests without a single regression.
It is now a general purpose transformation that could be not
only be used for SI/R6xx, but also for other hardware
implementations that use a form of structurized control flow.
v2: further cleanup, fixes and documentation
Patch by: Christian König
Signed-off-by: Christian König <deathsimple@vodafone.de>
Reviewed-by: Tom Stellard <thomas.stellard@amd.com>
Tested-by: Michel Dänzer <michel.daenzer@amd.com>
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170591 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/R600/SILowerControlFlow.cpp')
-rw-r--r-- | lib/Target/R600/SILowerControlFlow.cpp | 279 |
1 files changed, 190 insertions, 89 deletions
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index 277b647f67..1abcb8805a 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -8,10 +8,10 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This pass lowers the pseudo control flow instructions (SI_IF_NZ, ELSE, ENDIF) -/// to predicated instructions. +/// \brief This pass lowers the pseudo control flow instructions to real +/// machine instructions. /// -/// All control flow (except loops) is handled using predicated instructions and +/// All control flow is handled using predicated instructions and /// a predicate stack. Each Scalar ALU controls the operations of 64 Vector /// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs /// by writting to the 64-bit EXEC register (each bit corresponds to a @@ -22,17 +22,17 @@ /// /// For example: /// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 -/// SI_IF_NZ %VCC +/// %SGPR0 = SI_IF %VCC /// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 -/// ELSE +/// %SGPR0 = SI_ELSE %SGPR0 /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 -/// ENDIF +/// SI_END_CF %SGPR0 /// /// becomes: /// /// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask /// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask -/// S_CBRANCH_EXECZ label0 // This instruction is an +/// S_CBRANCH_EXECZ label0 // This instruction is an optional /// // optimization which allows us to /// // branch if all the bits of /// // EXEC are zero. @@ -45,7 +45,7 @@ /// // instruction again. /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block /// label1: -/// %EXEC = S_OR_B64 %EXEC, %SGPR2 // Re-enable saved exec mask bits +/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -65,11 +65,14 @@ class SILowerControlFlowPass : public MachineFunctionPass { private: static char ID; const TargetInstrInfo *TII; - std::vector<unsigned> PredicateStack; - std::vector<unsigned> UnusedRegisters; - unsigned allocReg(); - void freeReg(unsigned Reg); + void If(MachineInstr &MI); + void Else(MachineInstr &MI); + void Break(MachineInstr &MI); + void IfBreak(MachineInstr &MI); + void ElseBreak(MachineInstr &MI); + void Loop(MachineInstr &MI); + void EndCf(MachineInstr &MI); public: SILowerControlFlowPass(TargetMachine &tm) : @@ -91,101 +94,199 @@ FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { return new SILowerControlFlowPass(tm); } +void SILowerControlFlowPass::If(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg) + .addReg(Vcc); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Else(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) + .addReg(Src); // Saved EXEC + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Dst); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Break(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Vcc) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Saved = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Saved) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Loop(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Src = MI.getOperand(0).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addOperand(MI.getOperand(1)) + .addReg(AMDGPU::EXEC); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::EndCf(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + + BuildMI(MBB, MBB.getFirstNonPHI(), DL, + TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + MI.eraseFromParent(); +} + bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { - // Find all the unused registers that can be used for the predicate stack. - for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(), - S = AMDGPU::SReg_64RegClass.end(); - I != S; ++I) { - unsigned Reg = *I; - if (!MF.getRegInfo().isPhysRegUsed(Reg)) { - UnusedRegisters.insert(UnusedRegisters.begin(), Reg); - } - } + bool HaveCf = false; - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); - I != MBB.end(); I = Next) { + I != MBB.end(); I = Next) { + Next = llvm::next(I); MachineInstr &MI = *I; - unsigned Reg; switch (MI.getOpcode()) { default: break; - case AMDGPU::SI_IF_NZ: - Reg = allocReg(); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), - Reg) - .addOperand(MI.getOperand(0)); // VCC - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64), - Reg) - .addReg(Reg) - .addReg(AMDGPU::EXEC); - MI.eraseFromParent(); - PredicateStack.push_back(Reg); + case AMDGPU::SI_IF: + If(MI); + break; + + case AMDGPU::SI_ELSE: + Else(MI); + break; + + case AMDGPU::SI_BREAK: + Break(MI); + break; + + case AMDGPU::SI_IF_BREAK: + IfBreak(MI); + break; + + case AMDGPU::SI_ELSE_BREAK: + ElseBreak(MI); break; - case AMDGPU::ELSE: - Reg = PredicateStack.back(); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_SAVEEXEC_B64), - Reg) - .addReg(Reg); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64), - AMDGPU::EXEC) - .addReg(Reg) - .addReg(AMDGPU::EXEC); - MI.eraseFromParent(); + case AMDGPU::SI_LOOP: + Loop(MI); break; - case AMDGPU::ENDIF: - Reg = PredicateStack.back(); - PredicateStack.pop_back(); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Reg); - freeReg(Reg); - - if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL && - PredicateStack.empty()) { - // If the exec mask is non-zero, skip the next two instructions - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3) - .addReg(AMDGPU::EXEC); - - // Exec mask is zero: Export to NULL target... - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP)) - .addImm(0) - .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addImm(0) - .addImm(1) - .addImm(1) - .addReg(AMDGPU::SREG_LIT_0) - .addReg(AMDGPU::SREG_LIT_0) - .addReg(AMDGPU::SREG_LIT_0) - .addReg(AMDGPU::SREG_LIT_0); - - // ... and terminate wavefront - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM)); - } - MI.eraseFromParent(); + case AMDGPU::SI_END_CF: + HaveCf = true; + EndCf(MI); break; } } } - return true; -} -unsigned SILowerControlFlowPass::allocReg() { + // TODO: What is this good for? + unsigned ShaderType = MF.getInfo<SIMachineFunctionInfo>()->ShaderType; + if (HaveCf && ShaderType == ShaderType::PIXEL) { + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { - assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack"); - unsigned Reg = UnusedRegisters.back(); - UnusedRegisters.pop_back(); - return Reg; -} + MachineBasicBlock &MBB = *BI; + if (MBB.succ_empty()) { -void SILowerControlFlowPass::freeReg(unsigned Reg) { + MachineInstr &MI = *MBB.getFirstNonPHI(); + DebugLoc DL = MI.getDebugLoc(); - UnusedRegisters.push_back(Reg); + // If the exec mask is non-zero, skip the next two instructions + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(3) + .addReg(AMDGPU::EXEC); + + // Exec mask is zero: Export to NULL target... + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::SREG_LIT_0) + .addReg(AMDGPU::SREG_LIT_0) + .addReg(AMDGPU::SREG_LIT_0) + .addReg(AMDGPU::SREG_LIT_0); + + // ... and terminate wavefront + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ENDPGM)); + } + } + } + + return true; } |