diff options
author | Alexander Kornienko <alexfh@google.com> | 2013-03-14 10:51:38 +0000 |
---|---|---|
committer | Alexander Kornienko <alexfh@google.com> | 2013-03-14 10:51:38 +0000 |
commit | 647735c781c5b37061ee03d6e9e6c7dda92218e2 (patch) | |
tree | 5a5e56606d41060263048b5a5586b3d2380898ba /lib/Target/R600 | |
parent | 6aed25d93d1cfcde5809a73ffa7dc1b0d6396f66 (diff) | |
parent | f635ef401786c84df32090251a8cf45981ecca33 (diff) |
Updating branches/google/stable to r176857
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/google/stable@177040 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/R600')
107 files changed, 23432 insertions, 0 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h new file mode 100644 index 0000000000..e099a9fc31 --- /dev/null +++ b/lib/Target/R600/AMDGPU.h @@ -0,0 +1,49 @@ +//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_H +#define AMDGPU_H + +#include "AMDGPUTargetMachine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class FunctionPass; +class AMDGPUTargetMachine; + +// R600 Passes +FunctionPass* createR600KernelParametersPass(const DataLayout *TD); +FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); + +// SI Passes +FunctionPass *createSIAnnotateControlFlowPass(); +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); +FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); +FunctionPass *createSIInsertWaits(TargetMachine &tm); + +// Passes common to R600 and SI +Pass *createAMDGPUStructurizeCFGPass(); +FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); +FunctionPass* createAMDGPUIndirectAddressingPass(TargetMachine &tm); + +} // End namespace llvm + +namespace ShaderType { + enum Type { + PIXEL = 0, + VERTEX = 1, + GEOMETRY = 2, + COMPUTE = 3 + }; +} + +#endif // AMDGPU_H diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td new file mode 100644 index 0000000000..1a26c77d6b --- /dev/null +++ b/lib/Target/R600/AMDGPU.td @@ -0,0 +1,41 @@ +//===-- AMDIL.td - AMDIL Tablegen files --*- tablegen -*-------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// + +// Include AMDIL TD files +include "AMDILBase.td" + + +def AMDGPUInstrInfo : InstrInfo { + let guessInstructionProperties = 1; +} + +//===----------------------------------------------------------------------===// +// Declare the target which we are implementing +//===----------------------------------------------------------------------===// +def AMDGPUAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + int Variant = 0; + bit isMCAsmWriter = 1; +} + +def AMDGPU : Target { + // Pull in Instruction Info: + let InstructionSet = AMDGPUInstrInfo; + let AssemblyWriters = [AMDGPUAsmWriter]; +} + +// Include AMDGPU TD files +include "R600Schedule.td" +include "SISchedule.td" +include "Processors.td" +include "AMDGPUInstrInfo.td" +include "AMDGPUIntrinsics.td" +include "AMDGPURegisterInfo.td" +include "AMDGPUInstructions.td" +include "AMDGPUCallingConv.td" diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp new file mode 100644 index 0000000000..f6001445f4 --- /dev/null +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -0,0 +1,145 @@ +//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// The AMDGPUAsmPrinter is used to print both assembly string and also binary +/// code. When passed an MCAsmStreamer it prints assembly and when passed +/// an MCObjectStreamer it outputs binary code. 
+// +//===----------------------------------------------------------------------===// +// + + +#include "AMDGPUAsmPrinter.h" +#include "AMDGPU.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +using namespace llvm; + + +static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm, + MCStreamer &Streamer) { + return new AMDGPUAsmPrinter(tm, Streamer); +} + +extern "C" void LLVMInitializeR600AsmPrinter() { + TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); +} + +/// We need to override this function so we can avoid +/// the call to EmitFunctionHeader(), which the MCPureStreamer can't handle. +bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); + if (STM.dumpCode()) { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + MF.dump(); +#endif + } + SetupMachineFunction(MF); + if (OutStreamer.hasRawTextSupport()) { + OutStreamer.EmitRawText("@" + MF.getName() + ":"); + } + OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); + if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + EmitProgramInfo(MF); + } + EmitFunctionBody(); + return false; +} + +void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { + unsigned MaxSGPR = 0; + unsigned MaxVGPR = 0; + bool VCCUsed = false; + const SIRegisterInfo * RI = + static_cast<const SIRegisterInfo*>(TM.getRegisterInfo()); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + MachineOperand & MO = MI.getOperand(op_idx); + unsigned maxUsed; + unsigned width = 0; 
+ bool isSGPR = false; + unsigned reg; + unsigned hwReg; + if (!MO.isReg()) { + continue; + } + reg = MO.getReg(); + if (reg == AMDGPU::VCC) { + VCCUsed = true; + continue; + } + switch (reg) { + default: break; + case AMDGPU::EXEC: + case AMDGPU::M0: + continue; + } + + if (AMDGPU::SReg_32RegClass.contains(reg)) { + isSGPR = true; + width = 1; + } else if (AMDGPU::VReg_32RegClass.contains(reg)) { + isSGPR = false; + width = 1; + } else if (AMDGPU::SReg_64RegClass.contains(reg)) { + isSGPR = true; + width = 2; + } else if (AMDGPU::VReg_64RegClass.contains(reg)) { + isSGPR = false; + width = 2; + } else if (AMDGPU::SReg_128RegClass.contains(reg)) { + isSGPR = true; + width = 4; + } else if (AMDGPU::VReg_128RegClass.contains(reg)) { + isSGPR = false; + width = 4; + } else if (AMDGPU::SReg_256RegClass.contains(reg)) { + isSGPR = true; + width = 8; + } else if (AMDGPU::VReg_256RegClass.contains(reg)) { + isSGPR = false; + width = 8; + } else if (AMDGPU::VReg_512RegClass.contains(reg)) { + isSGPR = false; + width = 16; + } else { + assert(!"Unknown register class"); + } + hwReg = RI->getEncodingValue(reg) & 0xff; + maxUsed = hwReg + width - 1; + if (isSGPR) { + MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; + } else { + MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; + } + } + } + } + if (VCCUsed) { + MaxSGPR += 2; + } + SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>(); + OutStreamer.EmitIntValue(MaxSGPR + 1, 4); + OutStreamer.EmitIntValue(MaxVGPR + 1, 4); + OutStreamer.EmitIntValue(MFI->PSInputAddr, 4); +} diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h new file mode 100644 index 0000000000..3812282b17 --- /dev/null +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -0,0 +1,44 @@ +//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Assembly printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_ASMPRINTER_H +#define AMDGPU_ASMPRINTER_H + +#include "llvm/CodeGen/AsmPrinter.h" + +namespace llvm { + +class AMDGPUAsmPrinter : public AsmPrinter { + +public: + explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "AMDGPU Assembly Printer"; + } + + /// \brief Emit register usage information so that the GPU driver + /// can correctly set up the GPU state. + void EmitProgramInfo(MachineFunction &MF); + + /// Implemented in AMDGPUMCInstLower.cpp + virtual void EmitInstruction(const MachineInstr *MI); +}; + +} // End namespace llvm + +#endif //AMDGPU_ASMPRINTER_H diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td new file mode 100644 index 0000000000..45ae37ef0c --- /dev/null +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -0,0 +1,42 @@ +//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the AMD Radeon GPUs. 
+// +//===----------------------------------------------------------------------===// + +// Inversion of CCIfInReg +class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {} + +// Calling convention for SI +def CC_SI : CallingConv<[ + + CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[ + SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, + SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15 + ]>>>, + + CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow< + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR12, SGPR15 ] + >>>, + + CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[ + VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, + VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, + VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 + ]>>> + +]>; + +def CC_AMDGPU : CallingConv<[ + CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().device()"# + "->getGeneration() == AMDGPUDeviceInfo::HD7XXX", CCDelegateTo<CC_SI>> +]>; diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp new file mode 100644 index 0000000000..50297d1f60 --- /dev/null +++ b/lib/Target/R600/AMDGPUConvertToISA.cpp @@ -0,0 +1,62 @@ +//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass lowers AMDIL machine instructions to the appropriate +/// hardware instructions. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +namespace { + +class AMDGPUConvertToISAPass : public MachineFunctionPass { + +private: + static char ID; + TargetMachine &TM; + +public: + AMDGPUConvertToISAPass(TargetMachine &tm) : + MachineFunctionPass(ID), TM(tm) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const {return "AMDGPU Convert to ISA";} + +}; + +} // End anonymous namespace + +char AMDGPUConvertToISAPass::ID = 0; + +FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) { + return new AMDGPUConvertToISAPass(tm); +} + +bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) { + const AMDGPUInstrInfo * TII = + static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo()); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + TII->convertToISA(MI, MF, MBB.findDebugLoc(I)); + } + } + return false; +} diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp new file mode 100644 index 0000000000..815d6f71c3 --- /dev/null +++ b/lib/Target/R600/AMDGPUFrameLowering.cpp @@ -0,0 +1,122 @@ +//===----------------------- AMDGPUFrameLowering.cpp ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +// +// Interface to describe a layout of a stack frame on a AMDIL target machine +// +//===----------------------------------------------------------------------===// +#include "AMDGPUFrameLowering.h" +#include "AMDGPURegisterInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Instructions.h" + +using namespace llvm; +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, + int LAO, unsigned TransAl) + : TargetFrameLowering(D, StackAl, LAO, TransAl) { } + +AMDGPUFrameLowering::~AMDGPUFrameLowering() { } + +unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { + + // XXX: Hardcoding to 1 for now. + // + // I think the StackWidth should stored as metadata associated with the + // MachineFunction. This metadata can either be added by a frontend, or + // calculated by a R600 specific LLVM IR pass. + // + // The StackWidth determines how stack objects are laid out in memory. + // For a vector stack variable, like: int4 stack[2], the data will be stored + // in the following ways depending on the StackWidth. + // + // StackWidth = 1: + // + // T0.X = stack[0].x + // T1.X = stack[0].y + // T2.X = stack[0].z + // T3.X = stack[0].w + // T4.X = stack[1].x + // T5.X = stack[1].y + // T6.X = stack[1].z + // T7.X = stack[1].w + // + // StackWidth = 2: + // + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T1.X = stack[0].z + // T1.Y = stack[0].w + // T2.X = stack[1].x + // T2.Y = stack[1].y + // T3.X = stack[1].z + // T3.Y = stack[1].w + // + // StackWidth = 4: + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T0.Z = stack[0].z + // T0.W = stack[0].w + // T1.X = stack[1].x + // T1.Y = stack[1].y + // T1.Z = stack[1].z + // T1.W = stack[1].w + return 1; +} + +/// \returns The number of registers allocated for \p FI. 
+int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned Offset = 0; + int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; + + for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { + const AllocaInst *Alloca = MFI->getObjectAllocation(i); + unsigned ArrayElements; + const Type *AllocaType = Alloca->getAllocatedType(); + const Type *ElementType; + + if (AllocaType->isArrayTy()) { + ArrayElements = AllocaType->getArrayNumElements(); + ElementType = AllocaType->getArrayElementType(); + } else { + ArrayElements = 1; + ElementType = AllocaType; + } + + unsigned VectorElements; + if (ElementType->isVectorTy()) { + VectorElements = ElementType->getVectorNumElements(); + } else { + VectorElements = 1; + } + + Offset += (VectorElements / getStackWidth(MF)) * ArrayElements; + } + return Offset; +} + +const TargetFrameLowering::SpillSlot * +AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { + NumEntries = 0; + return 0; +} +void +AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const { +} +void +AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { +} + +bool +AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { + return false; +} diff --git a/lib/Target/R600/AMDGPUFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h new file mode 100644 index 0000000000..cf5742ee09 --- /dev/null +++ b/lib/Target/R600/AMDGPUFrameLowering.h @@ -0,0 +1,44 @@ +//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface to describe a layout of a stack frame on a AMDIL target +/// machine. 
+// +//===----------------------------------------------------------------------===// +#ifndef AMDILFRAME_LOWERING_H +#define AMDILFRAME_LOWERING_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +/// \brief Information about the stack frame layout on the AMDGPU targets. +/// +/// It holds the direction of the stack growth, the known stack alignment on +/// entry to each function, and the offset to the locals area. +/// See TargetFrameInfo for more comments. +class AMDGPUFrameLowering : public TargetFrameLowering { +public: + AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1); + virtual ~AMDGPUFrameLowering(); + + /// \returns The number of 32-bit sub-registers that are used when storing + /// values to the stack. + virtual unsigned getStackWidth(const MachineFunction &MF) const; + virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const; + virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const; + virtual void emitPrologue(MachineFunction &MF) const; + virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + virtual bool hasFP(const MachineFunction &MF) const; +}; +} // namespace llvm +#endif // AMDILFRAME_LOWERING_H diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp new file mode 100644 index 0000000000..5995b6f5e8 --- /dev/null +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -0,0 +1,412 @@ +//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This is the parent TargetLowering class for hardware code gen +/// targets. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUISelLowering.h" +#include "AMDGPURegisterInfo.h" +#include "AMDILIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" + +using namespace llvm; + +#include "AMDGPUGenCallingConv.inc" + +AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : + TargetLowering(TM, new TargetLoweringObjectFileELF()) { + + // Initialize target lowering borrowed from AMDIL + InitAMDILLowering(); + + // We need to custom lower some of the intrinsics + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + // Library functions. These default to Expand, but we have instructions + // for them. + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setOperationAction(ISD::FPOW, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + + // Lower floating point store/load to integer store/load to reduce the number + // of patterns in tablegen. 
+ setOperationAction(ISD::STORE, MVT::f32, Promote); + AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); + + setOperationAction(ISD::STORE, MVT::v4f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::LOAD, MVT::f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); + + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Custom); + setOperationAction(ISD::UREM, MVT::i32, Expand); +} + +//===---------------------------------------------------------------------===// +// TargetLowering Callbacks +//===---------------------------------------------------------------------===// + +void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const { + + State.AnalyzeFormalArguments(Ins, CC_AMDGPU); +} + +SDValue AMDGPUTargetLowering::LowerReturn( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc DL, SelectionDAG &DAG) const { + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); +} + +//===---------------------------------------------------------------------===// +// Target specific lowering +//===---------------------------------------------------------------------===// + +SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) + const { + switch (Op.getOpcode()) { + default: + Op.getNode()->dump(); + assert(0 && "Custom lowering code for this" + "instruction is not implemented yet!"); + break; + // AMDIL DAG lowering + case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::SREM: return LowerSREM(Op, DAG); + case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + // AMDGPU DAG lowering + case 
ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); + } + return Op; +} + +SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + switch (IntrinsicID) { + default: return Op; + case AMDGPUIntrinsic::AMDIL_abs: + return LowerIntrinsicIABS(Op, DAG); + case AMDGPUIntrinsic::AMDIL_exp: + return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_lrp: + return LowerIntrinsicLRP(Op, DAG); + case AMDGPUIntrinsic::AMDIL_fraction: + return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDIL_max: + return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_imax: + return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umax: + return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDIL_min: + return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_imin: + return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umin: + return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDIL_round_nearest: + return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); + } +} + +///IABS(a) = SMAX(sub(0, a), a) +SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, + SelectionDAG &DAG) const { + + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), + Op.getOperand(1)); + + return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1)); +} + +/// Linear 
Interpolation +/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) +SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, + DAG.getConstantFP(1.0f, MVT::f32), + Op.getOperand(1)); + SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, + Op.getOperand(3)); + return DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), + OneSubAC); +} + +/// \brief Generate Min/Max node +SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue True = Op.getOperand(2); + SDValue False = Op.getOperand(3); + SDValue CC = Op.getOperand(4); + + if (VT != MVT::f32 || + !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { + return SDValue(); + } + + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + switch (CCOpcode) { + case ISD::SETOEQ: + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETNE: + case ISD::SETUEQ: + case ISD::SETEQ: + case ISD::SETFALSE: + case ISD::SETFALSE2: + case ISD::SETTRUE: + case ISD::SETTRUE2: + case ISD::SETUO: + case ISD::SETO: + assert(0 && "Operation should already be optimised !"); + case ISD::SETULE: + case ISD::SETULT: + case ISD::SETOLE: + case ISD::SETOLT: + case ISD::SETLE: + case ISD::SETLT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); + else + return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); + } + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGE: + case ISD::SETOGE: + case ISD::SETUGT: + case ISD::SETOGT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); + else + return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); + } + case ISD::SETCC_INVALID: + assert(0 && "Invalid setcc condcode !"); + } + 
return Op; +} + + + +SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + SDValue Num = Op.getOperand(0); + SDValue Den = Op.getOperand(1); + + SmallVector<SDValue, 8> Results; + + // RCP = URECIP(Den) = 2^32 / Den + e + // e is rounding error. + SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); + + // RCP_LO = umulo(RCP, Den) */ + SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den); + + // RCP_HI = mulhu (RCP, Den) */ + SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); + + // NEG_RCP_LO = -RCP_LO + SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), + RCP_LO); + + // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) + SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), + NEG_RCP_LO, RCP_LO, + ISD::SETEQ); + // Calculate the rounding error from the URECIP instruction + // E = mulhu(ABS_RCP_LO, RCP) + SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); + + // RCP_A_E = RCP + E + SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); + + // RCP_S_E = RCP - E + SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); + + // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) + SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), + RCP_A_E, RCP_S_E, + ISD::SETEQ); + // Quotient = mulhu(Tmp0, Num) + SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); + + // Num_S_Remainder = Quotient * Den + SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den); + + // Remainder = Num - Num_S_Remainder + SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); + + // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) + SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, + DAG.getConstant(-1, VT), + DAG.getConstant(0, VT), + ISD::SETGE); + // Remainder_GE_Zero = (Remainder >= 0 ? 
-1 : 0) + SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder, + DAG.getConstant(0, VT), + DAG.getConstant(-1, VT), + DAG.getConstant(0, VT), + ISD::SETGE); + // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero + SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, + Remainder_GE_Zero); + + // Calculate Division result: + + // Quotient_A_One = Quotient + 1 + SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, + DAG.getConstant(1, VT)); + + // Quotient_S_One = Quotient - 1 + SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, + DAG.getConstant(1, VT)); + + // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) + SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), + Quotient, Quotient_A_One, ISD::SETEQ); + + // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) + Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), + Quotient_S_One, Div, ISD::SETEQ); + + // Calculate Rem result: + + // Remainder_S_Den = Remainder - Den + SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); + + // Remainder_A_Den = Remainder + Den + SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); + + // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) + SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), + Remainder, Remainder_S_Den, ISD::SETEQ); + + // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) + Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), + Remainder_A_Den, Rem, ISD::SETEQ); + SDValue Ops[2]; + Ops[0] = Div; + Ops[1] = Rem; + return DAG.getMergeValues(Ops, 2, DL); +} + +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { + return CFP->isExactlyValue(1.0); + } + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + return C->isAllOnesValue(); + } + return false; +} + +bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { + return CFP->getValueAPF().isZero(); + } + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + return C->isNullValue(); + } + return false; +} + +SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned VirtualRegister; + if (!MRI.isLiveIn(Reg)) { + VirtualRegister = MRI.createVirtualRegister(RC); + MRI.addLiveIn(Reg, VirtualRegister); + } else { + VirtualRegister = MRI.getLiveInVirtReg(Reg); + } + return DAG.getRegister(VirtualRegister, VT); +} + +#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; + +const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return 0; + // AMDIL DAG nodes + NODE_NAME_CASE(CALL); + NODE_NAME_CASE(UMUL); + NODE_NAME_CASE(DIV_INF); + NODE_NAME_CASE(RET_FLAG); + NODE_NAME_CASE(BRANCH_COND); + + // AMDGPU DAG nodes + NODE_NAME_CASE(DWORDADDR) + NODE_NAME_CASE(FRACT) + NODE_NAME_CASE(FMAX) + NODE_NAME_CASE(SMAX) + NODE_NAME_CASE(UMAX) + NODE_NAME_CASE(FMIN) + NODE_NAME_CASE(SMIN) + 
NODE_NAME_CASE(UMIN) + NODE_NAME_CASE(URECIP) + NODE_NAME_CASE(EXPORT) + NODE_NAME_CASE(CONST_ADDRESS) + NODE_NAME_CASE(REGISTER_LOAD) + NODE_NAME_CASE(REGISTER_STORE) + } +} diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h new file mode 100644 index 0000000000..f31b6466bd --- /dev/null +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -0,0 +1,140 @@ +//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition of the TargetLowering class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUISELLOWERING_H +#define AMDGPUISELLOWERING_H + +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class MachineRegisterInfo; + +class AMDGPUTargetLowering : public TargetLowering { +private: + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + +protected: + + /// \brief Helper function that adds Reg to the LiveIn list of the DAG's + /// MachineFunction. + /// + /// \returns a RegisterSDNode representing Reg. 
+ SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const; + + bool isHWTrueValue(SDValue Op) const; + bool isHWFalseValue(SDValue Op) const; + + void AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const; + +public: + AMDGPUTargetLowering(TargetMachine &TM); + + virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc DL, SelectionDAG &DAG) const; + virtual SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + CLI.Callee.dump(); + llvm_unreachable("Undefined function"); + } + + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const; + virtual const char* getTargetNodeName(unsigned Opcode) const; + + virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const { + return N; + } + +// Functions defined in AMDILISelLowering.cpp +public: + + /// \brief Determine which of the bits specified in \p Mask are known to be + /// either zero or one and return them in the \p KnownZero and \p KnownOne + /// bitsets. + virtual void computeMaskedBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const; + + virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, unsigned Intrinsic) const; + + /// We want to mark f32/f64 floating point values as legal. + bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + + /// We don't want to shrink f64/f32 constants. 
+ bool ShouldShrinkFPConstant(EVT VT) const; + +private: + void InitAMDILLowering(); + SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; +}; + +namespace AMDGPUISD { + +enum { + // AMDIL ISD Opcodes + FIRST_NUMBER = ISD::BUILTIN_OP_END, + CALL, // Function call based on a single integer + UMUL, // 32bit unsigned multiplication + DIV_INF, // Divide with infinity returned on zero divisor + RET_FLAG, + BRANCH_COND, + // End AMDIL ISD Opcodes + BITALIGN, + DWORDADDR, + FRACT, + FMAX, + SMAX, + UMAX, + FMIN, + SMIN, + UMIN, + URECIP, + EXPORT, + CONST_ADDRESS, + REGISTER_LOAD, + REGISTER_STORE, + LAST_AMDGPU_ISD_NUMBER +}; + + +} // End namespace AMDGPUISD + +} // End namespace llvm + +#endif // AMDGPUISELLOWERING_H diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp new file mode 100644 index 0000000000..ed6c8ec55d --- /dev/null +++ b/lib/Target/R600/AMDGPUIndirectAddressing.cpp @@ -0,0 +1,343 @@ +//===-- AMDGPUIndirectAddressing.cpp - Indirect Addressing Support --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// Instructions can use indirect addressing to index the register file as if it +/// were memory. This pass lowers RegisterLoad and RegisterStore instructions +/// to either a COPY or a MOV that uses indirect addressing. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace { + +class AMDGPUIndirectAddressingPass : public MachineFunctionPass { + +private: + static char ID; + const AMDGPUInstrInfo *TII; + + bool regHasExplicitDef(MachineRegisterInfo &MRI, unsigned Reg) const; + +public: + AMDGPUIndirectAddressingPass(TargetMachine &tm) : + MachineFunctionPass(ID), + TII(static_cast<const AMDGPUInstrInfo*>(tm.getInstrInfo())) + { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { return "R600 Handle indirect addressing"; } + +}; + +} // End anonymous namespace + +char AMDGPUIndirectAddressingPass::ID = 0; + +FunctionPass *llvm::createAMDGPUIndirectAddressingPass(TargetMachine &tm) { + return new AMDGPUIndirectAddressingPass(tm); +} + +bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + int IndirectBegin = TII->getIndirectIndexBegin(MF); + int IndirectEnd = TII->getIndirectIndexEnd(MF); + + if (IndirectBegin == -1) { + // No indirect addressing, we can skip this pass + assert(IndirectEnd == -1); + return false; + } + + // The map keeps track of the indirect address that is represented by + // each virtual register. 
The key is the register and the value is the + // indirect address it uses. + std::map<unsigned, unsigned> RegisterAddressMap; + + // First pass - Lower all of the RegisterStore instructions and track which + // registers are live. + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + // This map keeps track of the current live indirect registers. + // The key is the address and the value is the register + std::map<unsigned, unsigned> LiveAddressRegisterMap; + MachineBasicBlock &MBB = *BB; + + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); + I != MBB.end(); I = Next) { + Next = llvm::next(I); + MachineInstr &MI = *I; + + if (!TII->isRegisterStore(MI)) { + continue; + } + + // Lower RegisterStore + + unsigned RegIndex = MI.getOperand(2).getImm(); + unsigned Channel = MI.getOperand(3).getImm(); + unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel); + const TargetRegisterClass *IndirectStoreRegClass = + TII->getIndirectAddrStoreRegClass(MI.getOperand(0).getReg()); + + if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) { + // Direct register access. + unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass); + + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), DstReg) + .addOperand(MI.getOperand(0)); + + RegisterAddressMap[DstReg] = Address; + LiveAddressRegisterMap[Address] = DstReg; + } else { + // Indirect register access. 
+ MachineInstrBuilder MOV = TII->buildIndirectWrite(BB, I, + MI.getOperand(0).getReg(), // Value + Address, + MI.getOperand(1).getReg()); // Offset + for (int i = IndirectBegin; i <= IndirectEnd; ++i) { + unsigned Addr = TII->calculateIndirectAddress(i, Channel); + unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass); + MOV.addReg(DstReg, RegState::Define | RegState::Implicit); + RegisterAddressMap[DstReg] = Addr; + LiveAddressRegisterMap[Addr] = DstReg; + } + } + MI.eraseFromParent(); + } + + // Update the live-ins of the successor blocks + for (MachineBasicBlock::succ_iterator Succ = MBB.succ_begin(), + SuccEnd = MBB.succ_end(); + SuccEnd != Succ; ++Succ) { + std::map<unsigned, unsigned>::const_iterator Key, KeyEnd; + for (Key = LiveAddressRegisterMap.begin(), + KeyEnd = LiveAddressRegisterMap.end(); KeyEnd != Key; ++Key) { + (*Succ)->addLiveIn(Key->second); + } + } + } + + // Second pass - Lower the RegisterLoad instructions + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + // Key is the address and the value is the register + std::map<unsigned, unsigned> LiveAddressRegisterMap; + MachineBasicBlock &MBB = *BB; + + MachineBasicBlock::livein_iterator LI = MBB.livein_begin(); + while (LI != MBB.livein_end()) { + std::vector<unsigned> PhiRegisters; + + // Make sure this live-in is used for indirect addressing + if (RegisterAddressMap.find(*LI) == RegisterAddressMap.end()) { + ++LI; + continue; + } + + unsigned Address = RegisterAddressMap[*LI]; + LiveAddressRegisterMap[Address] = *LI; + PhiRegisters.push_back(*LI); + + // Check if there are other live-in registers which map to the same + // indirect address. 
+ for (MachineBasicBlock::livein_iterator LJ = llvm::next(LI), + LE = MBB.livein_end(); + LJ != LE; ++LJ) { + unsigned Reg = *LJ; + if (RegisterAddressMap.find(Reg) == RegisterAddressMap.end()) { + continue; + } + + if (RegisterAddressMap[Reg] == Address) { + PhiRegisters.push_back(Reg); + } + } + + if (PhiRegisters.size() == 1) { + // We don't need to insert a Phi instruction, so we can just add the + // registers to the live list for the block. + LiveAddressRegisterMap[Address] = *LI; + MBB.removeLiveIn(*LI); + } else { + // We need to insert a PHI, because we have the same address being + // written in multiple predecessor blocks. + const TargetRegisterClass *PhiDstClass = + TII->getIndirectAddrStoreRegClass(*(PhiRegisters.begin())); + unsigned PhiDstReg = MRI.createVirtualRegister(PhiDstClass); + MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(), + MBB.findDebugLoc(MBB.begin()), + TII->get(AMDGPU::PHI), PhiDstReg); + + for (std::vector<unsigned>::const_iterator RI = PhiRegisters.begin(), + RE = PhiRegisters.end(); + RI != RE; ++RI) { + unsigned Reg = *RI; + MachineInstr *DefInst = MRI.getVRegDef(Reg); + assert(DefInst); + MachineBasicBlock *RegBlock = DefInst->getParent(); + Phi.addReg(Reg); + Phi.addMBB(RegBlock); + MBB.removeLiveIn(Reg); + } + RegisterAddressMap[PhiDstReg] = Address; + LiveAddressRegisterMap[Address] = PhiDstReg; + } + LI = MBB.livein_begin(); + } + + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); + I != MBB.end(); I = Next) { + Next = llvm::next(I); + MachineInstr &MI = *I; + + if (!TII->isRegisterLoad(MI)) { + if (MI.getOpcode() == AMDGPU::PHI) { + continue; + } + // Check for indirect register defs + for (unsigned OpIdx = 0, NumOperands = MI.getNumOperands(); + OpIdx < NumOperands; ++OpIdx) { + MachineOperand &MO = MI.getOperand(OpIdx); + if (MO.isReg() && MO.isDef() && + RegisterAddressMap.find(MO.getReg()) != RegisterAddressMap.end()) { + unsigned Reg = MO.getReg(); + unsigned LiveAddress = 
RegisterAddressMap[Reg]; + // Chain the live-ins (find() must be checked against this map's own end()) + if (LiveAddressRegisterMap.find(LiveAddress) != + LiveAddressRegisterMap.end()) { + MI.addOperand(MachineOperand::CreateReg( + LiveAddressRegisterMap[LiveAddress], + false, // isDef + true, // isImp + true)); // isKill + } + LiveAddressRegisterMap[LiveAddress] = Reg; + } + } + continue; + } + + const TargetRegisterClass *SuperIndirectRegClass = + TII->getSuperIndirectRegClass(); + const TargetRegisterClass *IndirectLoadRegClass = + TII->getIndirectAddrLoadRegClass(); + unsigned IndirectReg = MRI.createVirtualRegister(SuperIndirectRegClass); + + unsigned RegIndex = MI.getOperand(2).getImm(); + unsigned Channel = MI.getOperand(3).getImm(); + unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel); + + if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) { + // Direct register access + unsigned Reg = LiveAddressRegisterMap[Address]; + unsigned AddrReg = IndirectLoadRegClass->getRegister(Address); + + if (regHasExplicitDef(MRI, Reg)) { + // If the register we are reading from has an explicit def, then that + // means it was written via a direct register access (i.e. COPY + // or other instruction that doesn't use indirect addressing). In + // this case we know where the value has been stored, so we can just + // issue a copy. + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), + MI.getOperand(0).getReg()) + .addReg(Reg); + } else { + // If the register we are reading has an implicit def, then that + // means it was written by an indirect register access (i.e. an + // instruction that uses indirect addressing). + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), + MI.getOperand(0).getReg()) + .addReg(AddrReg) + .addReg(Reg, RegState::Implicit); + } + } else { + // Indirect register access + + // Note on REG_SEQUENCE instructions: You can't actually use the register + // it defines unless you have an instruction that takes the defined + // register class as an operand. 
+ + MachineInstrBuilder Sequence = BuildMI(MBB, I, MBB.findDebugLoc(I), + TII->get(AMDGPU::REG_SEQUENCE), + IndirectReg); + for (int i = IndirectBegin; i <= IndirectEnd; ++i) { + unsigned Addr = TII->calculateIndirectAddress(i, Channel); + if (LiveAddressRegisterMap.find(Addr) == LiveAddressRegisterMap.end()) { + continue; + } + unsigned Reg = LiveAddressRegisterMap[Addr]; + + // We only need to use REG_SEQUENCE for explicit defs, since the + // register coalescer won't do anything with the implicit defs. + if (!regHasExplicitDef(MRI, Reg)) { + continue; + } + + // Insert a REG_SEQUENCE instruction to force the register allocator + // to allocate the virtual register to the correct physical register. + Sequence.addReg(LiveAddressRegisterMap[Addr]); + Sequence.addImm(TII->getRegisterInfo().getIndirectSubReg(Addr)); + } + MachineInstrBuilder Mov = TII->buildIndirectRead(BB, I, + MI.getOperand(0).getReg(), // Value + Address, + MI.getOperand(1).getReg()); // Offset + + + + Mov.addReg(IndirectReg, RegState::Implicit | RegState::Kill); + Mov.addReg(LiveAddressRegisterMap[Address], RegState::Implicit); + + } + MI.eraseFromParent(); + } + } + return true; // Instructions were inserted/erased above, so report the function as modified. +} + +bool AMDGPUIndirectAddressingPass::regHasExplicitDef(MachineRegisterInfo &MRI, + unsigned Reg) const { + MachineInstr *DefInstr = MRI.getVRegDef(Reg); + + if (!DefInstr) { + return false; + } + + if (DefInstr->getOpcode() == AMDGPU::PHI) { + bool Explicit = false; + for (MachineInstr::const_mop_iterator I = DefInstr->operands_begin(), + E = DefInstr->operands_end(); + I != E; ++I) { + const MachineOperand &MO = *I; + if (!MO.isReg() || MO.isDef()) { + continue; + } + + Explicit = Explicit || regHasExplicitDef(MRI, MO.getReg()); + } + return Explicit; + } + + return DefInstr->getOperand(0).isReg() && + DefInstr->getOperand(0).getReg() == Reg; +} diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp new file mode 100644 index 0000000000..30f736c84c --- /dev/null +++ 
b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -0,0 +1,267 @@ +//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implementation of the TargetInstrInfo class that is common to all +/// AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "AMDIL.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +#define GET_INSTRINFO_CTOR +#define GET_INSTRMAP_INFO +#include "AMDGPUGenInstrInfo.inc" + +using namespace llvm; + +AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm) + : AMDGPUGenInstrInfo(0,0), RI(tm, *this), TM(tm) { } + +const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { + return RI; +} + +bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { +// TODO: Implement this function + return false; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +unsigned 
AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} + +MachineInstr * +AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { +// TODO: Implement this function + return NULL; +} +bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter, + MachineBasicBlock &MBB) const { + while (iter != MBB.end()) { + switch (iter->getOpcode()) { + default: + break; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: + case AMDGPU::BRANCH: + return true; + }; + ++iter; + } + return false; +} + +MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) { + MachineBasicBlock::iterator tmp = MBB->end(); + if (!MBB->size()) { + return MBB->end(); + } + while (--tmp) { + if (tmp->getOpcode() == AMDGPU::ENDLOOP + || tmp->getOpcode() == AMDGPU::ENDIF + || tmp->getOpcode() == AMDGPU::ELSE) { + if (tmp == MBB->begin()) { + return tmp; + } else { + continue; + } + } else { + return ++tmp; + } + } + return MBB->end(); +} + +void +AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + assert(!"Not Implemented"); +} + +void +AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + assert(!"Not Implemented"); +} + +MachineInstr * +AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const { +// TODO: Implement 
this function + return 0; +} +MachineInstr* +AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr *LoadMI) const { + // TODO: Implement this function + return 0; +} +bool +AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const { + // TODO: Implement this function + return false; +} +bool +AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, + bool UnfoldStore, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + // TODO: Implement this function + return false; +} + +bool +AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode*> &NewNodes) const { + // TODO: Implement this function + return false; +} + +unsigned +AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex) const { + // TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const { + assert(Offset2 > Offset1 + && "Second offset should be larger than first offset!"); + // If we have less than 16 loads in a row, and the offsets are within 16, + // then schedule together. 
+ // TODO: Make the loads schedule near if it fits in a cacheline + return (NumLoads < 16 && (Offset2 - Offset1) < 16); +} + +bool +AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) + const { + // TODO: Implement this function + return true; +} +void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + // TODO: Implement this function +} + +bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const { + // TODO: Implement this function + return false; +} +bool +AMDGPUInstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) + const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const { + // TODO: Implement this function + return MI->getDesc().isPredicable(); +} + +bool +AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + // TODO: Implement this function + return true; +} + +bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; +} + +bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; +} + + +void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF, + DebugLoc DL) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const AMDGPURegisterInfo & RI = getRegisterInfo(); + + for (unsigned i = 0; i < MI.getNumOperands(); i++) { + MachineOperand &MO = MI.getOperand(i); + // Convert dst regclass to one that is supported by the ISA + if (MO.isReg() && MO.isDef()) { + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg()); + 
const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass); + + assert(newRegClass); + + MRI.setRegClass(MO.getReg(), newRegClass); + } + } + } +} diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h new file mode 100644 index 0000000000..3909e4e105 --- /dev/null +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -0,0 +1,206 @@ +//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Contains the definition of a TargetInstrInfo class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUINSTRUCTIONINFO_H +#define AMDGPUINSTRUCTIONINFO_H + +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include <map> + +#define GET_INSTRINFO_HEADER +#define GET_INSTRINFO_ENUM +#include "AMDGPUGenInstrInfo.inc" + +#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT +#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT +#define OPCODE_IS_ZERO AMDGPU::PRED_SETE +#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE + +namespace llvm { + +class AMDGPUTargetMachine; +class MachineFunction; +class MachineInstr; +class MachineInstrBuilder; + +class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { +private: + const AMDGPURegisterInfo RI; + bool getNextBranchInstr(MachineBasicBlock::iterator &iter, + MachineBasicBlock &MBB) const; +protected: + TargetMachine &TM; +public: + explicit AMDGPUInstrInfo(TargetMachine &tm); + + virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const; + + unsigned 
isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + bool hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const; + unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + bool hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const; + + MachineInstr * + convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const = 0; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + +protected: + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr *LoadMI) const; +public: + bool canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const; + bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr *> &NewMIs) const; + bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode *> &NewNodes) const; + unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, + bool 
UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = 0) const; + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const; + + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const; + bool isPredicated(const MachineInstr *MI) const; + bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const; + bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const; + bool isPredicable(MachineInstr *MI) const; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + + // Helper functions that check the opcode for status information + bool isLoadInst(llvm::MachineInstr *MI) const; + bool isExtLoadInst(llvm::MachineInstr *MI) const; + bool isSWSExtLoadInst(llvm::MachineInstr *MI) const; + bool isSExtLoadInst(llvm::MachineInstr *MI) const; + bool isZExtLoadInst(llvm::MachineInstr *MI) const; + bool isAExtLoadInst(llvm::MachineInstr *MI) const; + bool isStoreInst(llvm::MachineInstr *MI) const; + bool isTruncStoreInst(llvm::MachineInstr *MI) const; + bool isRegisterStore(const MachineInstr &MI) const; + bool isRegisterLoad(const MachineInstr &MI) const; + +//===---------------------------------------------------------------------===// +// Pure virtual functions to be implemented by sub-classes. +//===---------------------------------------------------------------------===// + + virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg, + int64_t Imm) const = 0; + virtual unsigned getIEQOpcode() const = 0; + virtual bool isMov(unsigned opcode) const = 0; + + /// \returns the smallest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. 
+ virtual int getIndirectIndexBegin(const MachineFunction &MF) const = 0; + + /// \returns the largest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + virtual int getIndirectIndexEnd(const MachineFunction &MF) const = 0; + + /// \brief Calculate the "Indirect Address" for the given \p RegIndex and + /// \p Channel. + /// + /// We model indirect addressing using a virtual address space that can be + /// accessed with loads and stores. The "Indirect Address" is the memory + /// address in this virtual address space that maps to the given \p RegIndex + /// and \p Channel. + virtual unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const = 0; + + /// \returns The register class to be used for storing values to an + /// "Indirect Address". + virtual const TargetRegisterClass *getIndirectAddrStoreRegClass( + unsigned SourceReg) const = 0; + + /// \returns The register class to be used for loading values from + /// an "Indirect Address". + virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const = 0; + + /// \brief Build instruction(s) for an indirect register write. + /// + /// \returns The instruction that performs the indirect register write + virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const = 0; + + /// \brief Build instruction(s) for an indirect register read. + /// + /// \returns The instruction that performs the indirect register read + virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const = 0; + + /// \returns the register class whose sub registers are the set of all + /// possible registers that can be used for indirect addressing. 
+ virtual const TargetRegisterClass *getSuperIndirectRegClass() const = 0; + + + /// \brief Convert the AMDIL MachineInstr to a supported ISA + /// MachineInstr + virtual void convertToISA(MachineInstr & MI, MachineFunction &MF, + DebugLoc DL) const; + +}; + +} // End llvm namespace + +#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) +#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) + +#endif // AMDGPUINSTRUCTIONINFO_H diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td new file mode 100644 index 0000000000..b66ae879dc --- /dev/null +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -0,0 +1,82 @@ +//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains DAG node definitions for the AMDGPU target. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Profiles +//===----------------------------------------------------------------------===// + +def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> +]>; + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Nodes +// + +// out = (((a << 32) | b) >> c) +// +// Can be used to optimize rotl: +// rotl(a, b) = bitalign(a, a, 32 - b) +def AMDGPUbitalign : SDNode<"AMDGPUISD::BITALIGN", AMDGPUDTIntTernaryOp>; + +// The argument to this node is a dword address. 
+def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; + +// out = a - floor(a) +def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; + +// out = max(a, b) a and b are floats +def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = max(a, b) a and b are signed ints +def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = max(a, b) a and b are unsigned ints +def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a and b are floats +def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a and b are signed ints +def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a and b are unsigned ints +def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// urecip - This operation is a helper for integer division, it returns the +// result of 1 / a as a fractional unsigned integer. 
// DAG node for AMDGPUISD::REGISTER_LOAD.
//
// Type profile: one result and two operands. Constraint index 0 is the
// result, so index 1 (pointer-typed) is the first operand and index 2
// (integer-typed) is the second operand. The RegisterLoad pseudo instruction
// matches these as the address and an immediate channel respectively.
//
// The node carries a chain (SDNPHasChain) and is flagged as possibly reading
// memory (SDNPMayLoad).
def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
                          SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
                          [SDNPHasChain, SDNPMayLoad]>;
// Base class for every AMDGPU instruction definition.
//
// Forwards the output/input operand lists, the assembly string and the
// selection pattern to the generic Instruction record, places the record in
// the "AMDGPU" namespace and gives it the NullALU itinerary by default.
class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction {
  // Per-instruction markers; both default to 0 and are set to 1 by the
  // RegisterLoad / RegisterStore pseudo instructions.
  field bit isRegisterLoad = 0;
  field bit isRegisterStore = 0;

  let Namespace = "AMDGPU";
  let OutOperandList = outs;
  let InOperandList = ins;
  let AsmString = asm;
  let Pattern = pattern;
  let Itinerary = NullALU;

  // Export the markers through TSFlags. The bit positions match the C++
  // masks AMDGPU_FLAG_REGISTER_LOAD (1 << 63) and AMDGPU_FLAG_REGISTER_STORE
  // (1 << 62) defined in AMDGPUInstrInfo.h.
  let TSFlags{63} = isRegisterLoad;
  let TSFlags{62} = isRegisterStore;
}
// Named constants stored as 32-bit integers. Each value is the IEEE-754
// single-precision bit pattern of the quantity its name describes.
class Constants {
int TWO_PI = 0x40c90fdb;      // 2 * pi   ~= 6.2831855
int PI = 0x40490fdb;          // pi       ~= 3.1415927
int TWO_PI_INV = 0x3e22f983;  // 1/(2*pi) ~= 0.15915494
}
/* Insert element pattern */
// Selects a vector_insert at the constant position sub_idx into an
// INSERT_SUBREG on the matching sub-register index.
//
//   elem_type / elem_class : value type and register class of the element
//   vec_type  / vec_class  : value type and register class of the vector
//   sub_idx                : element position matched in the source pattern
//   sub_reg                : sub-register index written in the result pattern
class Insert_Element <ValueType elem_type, ValueType vec_type,
                      RegisterClass elem_class, RegisterClass vec_class,
                      int sub_idx, SubRegIndex sub_reg> : Pat <

  (vec_type (vector_insert (vec_type vec_class:$vec),
                           (elem_type elem_class:$elem), sub_idx)),
  (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg)
>;
// bitconvert pattern
// Selects a bitconvert from source type st to destination type dt, both held
// in register class rc, to the source register itself: the result pattern
// names only $src0 and no instruction.
class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat <
  (dt (bitconvert (st rc:$src0))),
  (dt rc:$src0)
>;
a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td new file mode 100644 index 0000000000..eecb25b04f --- /dev/null +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -0,0 +1,60 @@ +//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines intrinsics that are used by all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "AMDGPU", isTarget = 1 in { + + def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; + def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; + + def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; + def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; + def int_AMDGPU_kilp : Intrinsic<[], [], []>; + def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_rcp : 
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, 
// Intrinsics declared under the "TGSI" target prefix.
let TargetPrefix = "TGSI", isTarget = 1 in {

  // Takes three float operands and returns one float; IntrNoMem marks it as
  // not accessing memory.
  def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>;
}
+// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPUMCInstLower.h" +#include "AMDGPUAsmPrinter.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/Constants.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx): + Ctx(ctx) +{ } + +void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + switch (MO.getType()) { + default: + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_FPImmediate: { + const APFloat &FloatValue = MO.getFPImm()->getValueAPF(); + assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle && + "Only floating point immediates are supported at the moment."); + MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat()); + break; + } + case MachineOperand::MO_Immediate: + MCOp = MCOperand::CreateImm(MO.getImm()); + break; + case MachineOperand::MO_Register: + MCOp = MCOperand::CreateReg(MO.getReg()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( + MO.getMBB()->getSymbol(), Ctx)); + } + OutMI.addOperand(MCOp); + } +} + +void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { + AMDGPUMCInstLower MCInstLowering(OutContext); + + if (MI->isBundle()) { + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator I = MI; + ++I; + while (I != MBB->end() && I->isInsideBundle()) { + MCInst MCBundleInst; + const MachineInstr *BundledInst = I; + MCInstLowering.lower(BundledInst, MCBundleInst); + OutStreamer.EmitInstruction(MCBundleInst); + ++I; + } + } 
else { + MCInst TmpInst; + MCInstLowering.lower(MI, TmpInst); + OutStreamer.EmitInstruction(TmpInst); + } +} diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h new file mode 100644 index 0000000000..d7d538e925 --- /dev/null +++ b/lib/Target/R600/AMDGPUMCInstLower.h @@ -0,0 +1,34 @@ +//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_MCINSTLOWER_H +#define AMDGPU_MCINSTLOWER_H + +namespace llvm { + +class MCInst; +class MCContext; +class MachineInstr; + +class AMDGPUMCInstLower { + + MCContext &Ctx; + +public: + AMDGPUMCInstLower(MCContext &ctx); + + /// \brief Lower a MachineInstr to an MCInst + void lower(const MachineInstr *MI, MCInst &OutMI) const; + +}; + +} // End namespace llvm + +#endif //AMDGPU_MCINSTLOWER_H diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp new file mode 100644 index 0000000000..fe994d2d05 --- /dev/null +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -0,0 +1,75 @@ +//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" + +using namespace llvm; + +AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm, + const TargetInstrInfo &tii) +: AMDGPUGenRegisterInfo(0), + TM(tm), + TII(tii) + { } + +//===----------------------------------------------------------------------===// +// Function handling callbacks - Functions are a seldom used feature of GPUS, so +// they are not supported at this time. +//===----------------------------------------------------------------------===// + +const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; + +const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) + const { + return &CalleeSavedReg; +} + +void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const { + assert(!"Subroutines not supported yet"); +} + +unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { + assert(!"Subroutines not supported yet"); + return 0; +} + +unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { + + switch(IndirectIndex) { + case 0: return AMDGPU::sub0; + case 1: return AMDGPU::sub1; + case 2: return AMDGPU::sub2; + case 3: return AMDGPU::sub3; + case 4: return AMDGPU::sub4; + case 5: return AMDGPU::sub5; + case 6: return AMDGPU::sub6; + case 7: return AMDGPU::sub7; + case 8: return AMDGPU::sub8; + case 9: return AMDGPU::sub9; + case 10: return AMDGPU::sub10; + case 11: return AMDGPU::sub11; + case 12: return AMDGPU::sub12; + case 13: return AMDGPU::sub13; + case 14: return AMDGPU::sub14; + case 15: return AMDGPU::sub15; + default: llvm_unreachable("indirect index out of range"); + } +} + +#define GET_REGINFO_TARGET_DESC +#include "AMDGPUGenRegisterInfo.inc" diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h 
new file mode 100644 index 0000000000..1fc88e7455 --- /dev/null +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -0,0 +1,66 @@ +//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief TargetRegisterInfo interface that is implemented by all hw codegen +/// targets. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUREGISTERINFO_H +#define AMDGPUREGISTERINFO_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" + +namespace llvm { + +class AMDGPUTargetMachine; +class TargetInstrInfo; + +struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { + TargetMachine &TM; + const TargetInstrInfo &TII; + static const uint16_t CalleeSavedReg; + + AMDGPURegisterInfo(TargetMachine &tm, const TargetInstrInfo &tii); + + virtual BitVector getReservedRegs(const MachineFunction &MF) const { + assert(!"Unimplemented"); return BitVector(); + } + + /// \param RC is an AMDIL reg class. + /// + /// \returns The ISA reg class that is equivalent to \p RC. 
+ virtual const TargetRegisterClass * getISARegClass( + const TargetRegisterClass * RC) const { + assert(!"Unimplemented"); return NULL; + } + + virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { + assert(!"Unimplemented"); return NULL; + } + + const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const; + unsigned getFrameRegister(const MachineFunction &MF) const; + + unsigned getIndirectSubReg(unsigned IndirectIndex) const; + +}; + +} // End namespace llvm + +#endif // AMDGPUREGISTERINFO_H diff --git a/lib/Target/R600/AMDGPURegisterInfo.td b/lib/Target/R600/AMDGPURegisterInfo.td new file mode 100644 index 0000000000..b5aca0347f --- /dev/null +++ b/lib/Target/R600/AMDGPURegisterInfo.td @@ -0,0 +1,25 @@ +//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Tablegen register definitions common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +let Namespace = "AMDGPU" in { + +foreach Index = 0-15 in { + def sub#Index : SubRegIndex; +} + +def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">; + +} + +include "R600RegisterInfo.td" +include "SIRegisterInfo.td" diff --git a/lib/Target/R600/AMDGPUStructurizeCFG.cpp b/lib/Target/R600/AMDGPUStructurizeCFG.cpp new file mode 100644 index 0000000000..b723433c16 --- /dev/null +++ b/lib/Target/R600/AMDGPUStructurizeCFG.cpp @@ -0,0 +1,894 @@ +//===-- AMDGPUStructurizeCFG.cpp - ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// The pass implemented in this file transforms the programs control flow +/// graph into a form that's suitable for code generation on hardware that +/// implements control flow by execution masking. This currently includes all +/// AMD GPUs but may as well be useful for other types of hardware. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RegionIterator.h" +#include "llvm/Analysis/RegionPass.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Support/PatternMatch.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +namespace { + +// Definition of the complex types used in this pass. + +typedef std::pair<BasicBlock *, Value *> BBValuePair; + +typedef SmallVector<RegionNode*, 8> RNVector; +typedef SmallVector<BasicBlock*, 8> BBVector; +typedef SmallVector<BranchInst*, 8> BranchVector; +typedef SmallVector<BBValuePair, 2> BBValueVector; + +typedef SmallPtrSet<BasicBlock *, 8> BBSet; + +typedef DenseMap<PHINode *, BBValueVector> PhiMap; +typedef DenseMap<DomTreeNode *, unsigned> DTN2UnsignedMap; +typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap; +typedef DenseMap<BasicBlock *, Value *> BBPredicates; +typedef DenseMap<BasicBlock *, BBPredicates> PredMap; +typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap; +typedef DenseMap<BasicBlock *, BBVector> BB2BBVecMap; + +// The name for newly created blocks. 
+ +static const char *FlowBlockName = "Flow"; + +/// @brief Find the nearest common dominator for multiple BasicBlocks +/// +/// Helper class for AMDGPUStructurizeCFG +/// TODO: Maybe move into common code +class NearestCommonDominator { + + DominatorTree *DT; + + DTN2UnsignedMap IndexMap; + + BasicBlock *Result; + unsigned ResultIndex; + bool ExplicitMentioned; + +public: + /// \brief Start a new query + NearestCommonDominator(DominatorTree *DomTree) { + DT = DomTree; + Result = 0; + } + + /// \brief Add BB to the resulting dominator + void addBlock(BasicBlock *BB, bool Remember = true) { + + DomTreeNode *Node = DT->getNode(BB); + + if (Result == 0) { + unsigned Numbering = 0; + for (;Node;Node = Node->getIDom()) + IndexMap[Node] = ++Numbering; + Result = BB; + ResultIndex = 1; + ExplicitMentioned = Remember; + return; + } + + for (;Node;Node = Node->getIDom()) + if (IndexMap.count(Node)) + break; + else + IndexMap[Node] = 0; + + assert(Node && "Dominator tree invalid!"); + + unsigned Numbering = IndexMap[Node]; + if (Numbering > ResultIndex) { + Result = Node->getBlock(); + ResultIndex = Numbering; + ExplicitMentioned = Remember && (Result == BB); + } else if (Numbering == ResultIndex) { + ExplicitMentioned |= Remember; + } + } + + /// \brief Is "Result" one of the BBs added with "Remember" = True? + bool wasResultExplicitMentioned() { + return ExplicitMentioned; + } + + /// \brief Get the query result + BasicBlock *getResult() { + return Result; + } +}; + +/// @brief Transforms the control flow graph on one single entry/exit region +/// at a time. 
+/// +/// After the transform all "If"/"Then"/"Else" style control flow looks like +/// this: +/// +/// \verbatim +/// 1 +/// || +/// | | +/// 2 | +/// | / +/// |/ +/// 3 +/// || Where: +/// | | 1 = "If" block, calculates the condition +/// 4 | 2 = "Then" subregion, runs if the condition is true +/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow +/// |/ 4 = "Else" optional subregion, runs if the condition is false +/// 5 5 = "End" block, also rejoins the control flow +/// \endverbatim +/// +/// Control flow is expressed as a branch where the true exit goes into the +/// "Then"/"Else" region, while the false exit skips the region. +/// The condition for the optional "Else" region is expressed as a PHI node. +/// The incoming values of the PHI node are true for the "If" edge and false +/// for the "Then" edge. +/// +/// In addition, even complicated loops look like this: +/// +/// \verbatim +/// 1 +/// || +/// | | +/// 2 ^ Where: +/// | / 1 = "Entry" block +/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block +/// 3 3 = "Flow" block, with back edge to entry block +/// | +/// \endverbatim +/// +/// The back edge of the "Flow" block is always on the false side of the branch +/// while the true side continues the general flow. So the loop condition +/// consists of a network of PHI nodes where the true incoming values express +/// breaks and the false values express continue states. 
+class AMDGPUStructurizeCFG : public RegionPass { + + static char ID; + + Type *Boolean; + ConstantInt *BoolTrue; + ConstantInt *BoolFalse; + UndefValue *BoolUndef; + + Function *Func; + Region *ParentRegion; + + DominatorTree *DT; + + RNVector Order; + BBSet Visited; + + BBPhiMap DeletedPhis; + BB2BBVecMap AddedPhis; + + PredMap Predicates; + BranchVector Conditions; + + BB2BBMap Loops; + PredMap LoopPreds; + BranchVector LoopConds; + + RegionNode *PrevNode; + + void orderNodes(); + + void analyzeLoops(RegionNode *N); + + Value *invert(Value *Condition); + + Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert); + + void gatherPredicates(RegionNode *N); + + void collectInfos(); + + void insertConditions(bool Loops); + + void delPhiValues(BasicBlock *From, BasicBlock *To); + + void addPhiValues(BasicBlock *From, BasicBlock *To); + + void setPhiValues(); + + void killTerminator(BasicBlock *BB); + + void changeExit(RegionNode *Node, BasicBlock *NewExit, + bool IncludeDominator); + + BasicBlock *getNextFlow(BasicBlock *Dominator); + + BasicBlock *needPrefix(bool NeedEmpty); + + BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed); + + void setPrevNode(BasicBlock *BB); + + bool dominatesPredicates(BasicBlock *BB, RegionNode *Node); + + bool isPredictableTrue(RegionNode *Node); + + void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd); + + void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd); + + void createFlow(); + + void rebuildSSA(); + +public: + AMDGPUStructurizeCFG(): + RegionPass(ID) { + + initializeRegionInfoPass(*PassRegistry::getPassRegistry()); + } + + using Pass::doInitialization; + virtual bool doInitialization(Region *R, RGPassManager &RGM); + + virtual bool runOnRegion(Region *R, RGPassManager &RGM); + + virtual const char *getPassName() const { + return "AMDGPU simplify control flow"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + 
RegionPass::getAnalysisUsage(AU); + } + +}; + +} // end anonymous namespace + +char AMDGPUStructurizeCFG::ID = 0; + +/// \brief Initialize the types and constants used in the pass +bool AMDGPUStructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) { + LLVMContext &Context = R->getEntry()->getContext(); + + Boolean = Type::getInt1Ty(Context); + BoolTrue = ConstantInt::getTrue(Context); + BoolFalse = ConstantInt::getFalse(Context); + BoolUndef = UndefValue::get(Boolean); + + return false; +} + +/// \brief Build up the general order of nodes +void AMDGPUStructurizeCFG::orderNodes() { + scc_iterator<Region *> I = scc_begin(ParentRegion), + E = scc_end(ParentRegion); + for (Order.clear(); I != E; ++I) { + std::vector<RegionNode *> &Nodes = *I; + Order.append(Nodes.begin(), Nodes.end()); + } +} + +/// \brief Determine the end of the loops +void AMDGPUStructurizeCFG::analyzeLoops(RegionNode *N) { + + if (N->isSubRegion()) { + // Test for exit as back edge + BasicBlock *Exit = N->getNodeAs<Region>()->getExit(); + if (Visited.count(Exit)) + Loops[Exit] = N->getEntry(); + + } else { + // Test for sucessors as back edge + BasicBlock *BB = N->getNodeAs<BasicBlock>(); + BranchInst *Term = cast<BranchInst>(BB->getTerminator()); + + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + + if (Visited.count(Succ)) + Loops[Succ] = BB; + } + } +} + +/// \brief Invert the given condition +Value *AMDGPUStructurizeCFG::invert(Value *Condition) { + + // First: Check if it's a constant + if (Condition == BoolTrue) + return BoolFalse; + + if (Condition == BoolFalse) + return BoolTrue; + + if (Condition == BoolUndef) + return BoolUndef; + + // Second: If the condition is already inverted, return the original value + if (match(Condition, m_Not(m_Value(Condition)))) + return Condition; + + // Third: Check all the users for an invert + BasicBlock *Parent = cast<Instruction>(Condition)->getParent(); + for (Value::use_iterator I = 
Condition->use_begin(), + E = Condition->use_end(); I != E; ++I) { + + Instruction *User = dyn_cast<Instruction>(*I); + if (!User || User->getParent() != Parent) + continue; + + if (match(*I, m_Not(m_Specific(Condition)))) + return *I; + } + + // Last option: Create a new instruction + return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); +} + +/// \brief Build the condition for one edge +Value *AMDGPUStructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx, + bool Invert) { + Value *Cond = Invert ? BoolFalse : BoolTrue; + if (Term->isConditional()) { + Cond = Term->getCondition(); + + if (Idx != Invert) + Cond = invert(Cond); + } + return Cond; +} + +/// \brief Analyze the predecessors of each block and build up predicates +void AMDGPUStructurizeCFG::gatherPredicates(RegionNode *N) { + + RegionInfo *RI = ParentRegion->getRegionInfo(); + BasicBlock *BB = N->getEntry(); + BBPredicates &Pred = Predicates[BB]; + BBPredicates &LPred = LoopPreds[BB]; + + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + PI != PE; ++PI) { + + // Ignore it if it's a branch from outside into our region entry + if (!ParentRegion->contains(*PI)) + continue; + + Region *R = RI->getRegionFor(*PI); + if (R == ParentRegion) { + + // It's a top level block in our region + BranchInst *Term = cast<BranchInst>((*PI)->getTerminator()); + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + if (Succ != BB) + continue; + + if (Visited.count(*PI)) { + // Normal forward edge + if (Term->isConditional()) { + // Try to treat it like an ELSE block + BasicBlock *Other = Term->getSuccessor(!i); + if (Visited.count(Other) && !Loops.count(Other) && + !Pred.count(Other) && !Pred.count(*PI)) { + + Pred[Other] = BoolFalse; + Pred[*PI] = BoolTrue; + continue; + } + } + Pred[*PI] = buildCondition(Term, i, false); + + } else { + // Back edge + LPred[*PI] = buildCondition(Term, i, true); + } + } + + } else { + + // It's an 
exit from a sub region + while(R->getParent() != ParentRegion) + R = R->getParent(); + + // Edge from inside a subregion to its entry, ignore it + if (R == N) + continue; + + BasicBlock *Entry = R->getEntry(); + if (Visited.count(Entry)) + Pred[Entry] = BoolTrue; + else + LPred[Entry] = BoolFalse; + } + } +} + +/// \brief Collect various loop and predicate infos +void AMDGPUStructurizeCFG::collectInfos() { + + // Reset predicate + Predicates.clear(); + + // and loop infos + Loops.clear(); + LoopPreds.clear(); + + // Reset the visited nodes + Visited.clear(); + + for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); + OI != OE; ++OI) { + + // Analyze all the conditions leading to a node + gatherPredicates(*OI); + + // Remember that we've seen this node + Visited.insert((*OI)->getEntry()); + + // Find the last back edges + analyzeLoops(*OI); + } +} + +/// \brief Insert the missing branch conditions +void AMDGPUStructurizeCFG::insertConditions(bool Loops) { + BranchVector &Conds = Loops ? LoopConds : Conditions; + Value *Default = Loops ? BoolTrue : BoolFalse; + SSAUpdater PhiInserter; + + for (BranchVector::iterator I = Conds.begin(), + E = Conds.end(); I != E; ++I) { + + BranchInst *Term = *I; + assert(Term->isConditional()); + + BasicBlock *Parent = Term->getParent(); + BasicBlock *SuccTrue = Term->getSuccessor(0); + BasicBlock *SuccFalse = Term->getSuccessor(1); + + PhiInserter.Initialize(Boolean, ""); + PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default); + PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default); + + BBPredicates &Preds = Loops ? 
LoopPreds[SuccFalse] : Predicates[SuccTrue]; + + NearestCommonDominator Dominator(DT); + Dominator.addBlock(Parent, false); + + Value *ParentValue = 0; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + + if (PI->first == Parent) { + ParentValue = PI->second; + break; + } + PhiInserter.AddAvailableValue(PI->first, PI->second); + Dominator.addBlock(PI->first); + } + + if (ParentValue) { + Term->setCondition(ParentValue); + } else { + if (!Dominator.wasResultExplicitMentioned()) + PhiInserter.AddAvailableValue(Dominator.getResult(), Default); + + Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent)); + } + } +} + +/// \brief Remove all PHI values coming from "From" into "To" and remember +/// them in DeletedPhis +void AMDGPUStructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { + PhiMap &Map = DeletedPhis[To]; + for (BasicBlock::iterator I = To->begin(), E = To->end(); + I != E && isa<PHINode>(*I);) { + + PHINode &Phi = cast<PHINode>(*I++); + while (Phi.getBasicBlockIndex(From) != -1) { + Value *Deleted = Phi.removeIncomingValue(From, false); + Map[&Phi].push_back(std::make_pair(From, Deleted)); + } + } +} + +/// \brief Add a dummy PHI value as soon as we knew the new predecessor +void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { + for (BasicBlock::iterator I = To->begin(), E = To->end(); + I != E && isa<PHINode>(*I);) { + + PHINode &Phi = cast<PHINode>(*I++); + Value *Undef = UndefValue::get(Phi.getType()); + Phi.addIncoming(Undef, From); + } + AddedPhis[To].push_back(From); +} + +/// \brief Add the real PHI value as soon as everything is set up +void AMDGPUStructurizeCFG::setPhiValues() { + + SSAUpdater Updater; + for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end(); + AI != AE; ++AI) { + + BasicBlock *To = AI->first; + BBVector &From = AI->second; + + if (!DeletedPhis.count(To)) + continue; + + PhiMap &Map = DeletedPhis[To]; + for (PhiMap::iterator PI = 
Map.begin(), PE = Map.end(); + PI != PE; ++PI) { + + PHINode *Phi = PI->first; + Value *Undef = UndefValue::get(Phi->getType()); + Updater.Initialize(Phi->getType(), ""); + Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); + Updater.AddAvailableValue(To, Undef); + + NearestCommonDominator Dominator(DT); + Dominator.addBlock(To, false); + for (BBValueVector::iterator VI = PI->second.begin(), + VE = PI->second.end(); VI != VE; ++VI) { + + Updater.AddAvailableValue(VI->first, VI->second); + Dominator.addBlock(VI->first); + } + + if (!Dominator.wasResultExplicitMentioned()) + Updater.AddAvailableValue(Dominator.getResult(), Undef); + + for (BBVector::iterator FI = From.begin(), FE = From.end(); + FI != FE; ++FI) { + + int Idx = Phi->getBasicBlockIndex(*FI); + assert(Idx != -1); + Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(*FI)); + } + } + + DeletedPhis.erase(To); + } + assert(DeletedPhis.empty()); +} + +/// \brief Remove phi values from all successors and then remove the terminator. 
+void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) { + TerminatorInst *Term = BB->getTerminator(); + if (!Term) + return; + + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); + SI != SE; ++SI) { + + delPhiValues(BB, *SI); + } + + Term->eraseFromParent(); +} + +/// \brief Let node exit(s) point to NewExit +void AMDGPUStructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit, + bool IncludeDominator) { + + if (Node->isSubRegion()) { + Region *SubRegion = Node->getNodeAs<Region>(); + BasicBlock *OldExit = SubRegion->getExit(); + BasicBlock *Dominator = 0; + + // Find all the edges from the sub region to the exit + for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit); + I != E;) { + + BasicBlock *BB = *I++; + if (!SubRegion->contains(BB)) + continue; + + // Modify the edges to point to the new exit + delPhiValues(BB, OldExit); + BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit); + addPhiValues(BB, NewExit); + + // Find the new dominator (if requested) + if (IncludeDominator) { + if (!Dominator) + Dominator = BB; + else + Dominator = DT->findNearestCommonDominator(Dominator, BB); + } + } + + // Change the dominator (if requested) + if (Dominator) + DT->changeImmediateDominator(NewExit, Dominator); + + // Update the region info + SubRegion->replaceExit(NewExit); + + } else { + BasicBlock *BB = Node->getNodeAs<BasicBlock>(); + killTerminator(BB); + BranchInst::Create(NewExit, BB); + addPhiValues(BB, NewExit); + if (IncludeDominator) + DT->changeImmediateDominator(NewExit, BB); + } +} + +/// \brief Create a new flow node and update dominator tree and region info +BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Dominator) { + LLVMContext &Context = Func->getContext(); + BasicBlock *Insert = Order.empty() ? 
ParentRegion->getExit() : + Order.back()->getEntry(); + BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, + Func, Insert); + DT->addNewBlock(Flow, Dominator); + ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); + return Flow; +} + +/// \brief Create a new or reuse the previous node as flow node +BasicBlock *AMDGPUStructurizeCFG::needPrefix(bool NeedEmpty) { + + BasicBlock *Entry = PrevNode->getEntry(); + + if (!PrevNode->isSubRegion()) { + killTerminator(Entry); + if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end()) + return Entry; + + } + + // create a new flow node + BasicBlock *Flow = getNextFlow(Entry); + + // and wire it up + changeExit(PrevNode, Flow, true); + PrevNode = ParentRegion->getBBNode(Flow); + return Flow; +} + +/// \brief Returns the region exit if possible, otherwise just a new flow node +BasicBlock *AMDGPUStructurizeCFG::needPostfix(BasicBlock *Flow, + bool ExitUseAllowed) { + + if (Order.empty() && ExitUseAllowed) { + BasicBlock *Exit = ParentRegion->getExit(); + DT->changeImmediateDominator(Exit, Flow); + addPhiValues(Flow, Exit); + return Exit; + } + return getNextFlow(Flow); +} + +/// \brief Set the previous node +void AMDGPUStructurizeCFG::setPrevNode(BasicBlock *BB) { + PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0; +} + +/// \brief Does BB dominate all the predicates of Node ? +bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) { + BBPredicates &Preds = Predicates[Node->getEntry()]; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + + if (!DT->dominates(BB, PI->first)) + return false; + } + return true; +} + +/// \brief Can we predict that this node will always be called? 
+bool AMDGPUStructurizeCFG::isPredictableTrue(RegionNode *Node) { + + BBPredicates &Preds = Predicates[Node->getEntry()]; + bool Dominated = false; + + // Regionentry is always true + if (PrevNode == 0) + return true; + + for (BBPredicates::iterator I = Preds.begin(), E = Preds.end(); + I != E; ++I) { + + if (I->second != BoolTrue) + return false; + + if (!Dominated && DT->dominates(I->first, PrevNode->getEntry())) + Dominated = true; + } + + // TODO: The dominator check is too strict + return Dominated; +} + +/// Take one node from the order vector and wire it up +void AMDGPUStructurizeCFG::wireFlow(bool ExitUseAllowed, + BasicBlock *LoopEnd) { + + RegionNode *Node = Order.pop_back_val(); + Visited.insert(Node->getEntry()); + + if (isPredictableTrue(Node)) { + // Just a linear flow + if (PrevNode) { + changeExit(PrevNode, Node->getEntry(), true); + } + PrevNode = Node; + + } else { + // Insert extra prefix node (or reuse last one) + BasicBlock *Flow = needPrefix(false); + + // Insert extra postfix node (or use exit instead) + BasicBlock *Entry = Node->getEntry(); + BasicBlock *Next = needPostfix(Flow, ExitUseAllowed); + + // let it point to entry and next block + Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow)); + addPhiValues(Flow, Entry); + DT->changeImmediateDominator(Entry, Flow); + + PrevNode = Node; + while (!Order.empty() && !Visited.count(LoopEnd) && + dominatesPredicates(Entry, Order.back())) { + handleLoops(false, LoopEnd); + } + + changeExit(PrevNode, Next, false); + setPrevNode(Next); + } +} + +void AMDGPUStructurizeCFG::handleLoops(bool ExitUseAllowed, + BasicBlock *LoopEnd) { + RegionNode *Node = Order.back(); + BasicBlock *LoopStart = Node->getEntry(); + + if (!Loops.count(LoopStart)) { + wireFlow(ExitUseAllowed, LoopEnd); + return; + } + + if (!isPredictableTrue(Node)) + LoopStart = needPrefix(true); + + LoopEnd = Loops[Node->getEntry()]; + wireFlow(false, LoopEnd); + while (!Visited.count(LoopEnd)) { + handleLoops(false, 
LoopEnd); + } + + // Create an extra loop end node + LoopEnd = needPrefix(false); + BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed); + LoopConds.push_back(BranchInst::Create(Next, LoopStart, + BoolUndef, LoopEnd)); + addPhiValues(LoopEnd, LoopStart); + setPrevNode(Next); +} + +/// After this function control flow looks like it should be, but +/// branches and PHI nodes only have undefined conditions. +void AMDGPUStructurizeCFG::createFlow() { + + BasicBlock *Exit = ParentRegion->getExit(); + bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit); + + DeletedPhis.clear(); + AddedPhis.clear(); + Conditions.clear(); + LoopConds.clear(); + + PrevNode = 0; + Visited.clear(); + + while (!Order.empty()) { + handleLoops(EntryDominatesExit, 0); + } + + if (PrevNode) + changeExit(PrevNode, Exit, EntryDominatesExit); + else + assert(EntryDominatesExit); +} + +/// Handle a rare case where the disintegrated nodes instructions +/// no longer dominate all their uses. Not sure if this is really necessary +void AMDGPUStructurizeCFG::rebuildSSA() { + SSAUpdater Updater; + for (Region::block_iterator I = ParentRegion->block_begin(), + E = ParentRegion->block_end(); + I != E; ++I) { + + BasicBlock *BB = *I; + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); + II != IE; ++II) { + + bool Initialized = false; + for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) { + + Next = I->getNext(); + + Instruction *User = cast<Instruction>(I->getUser()); + if (User->getParent() == BB) { + continue; + + } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) { + if (UserPN->getIncomingBlock(*I) == BB) + continue; + } + + if (DT->dominates(II, User)) + continue; + + if (!Initialized) { + Value *Undef = UndefValue::get(II->getType()); + Updater.Initialize(II->getType(), ""); + Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); + Updater.AddAvailableValue(BB, II); + Initialized = true; + } + Updater.RewriteUseAfterInsertions(*I); + } + } + } +} + +/// 
\brief Run the transformation for each region found +bool AMDGPUStructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { + if (R->isTopLevelRegion()) + return false; + + Func = R->getEntry()->getParent(); + ParentRegion = R; + + DT = &getAnalysis<DominatorTree>(); + + orderNodes(); + collectInfos(); + createFlow(); + insertConditions(false); + insertConditions(true); + setPhiValues(); + rebuildSSA(); + + // Cleanup + Order.clear(); + Visited.clear(); + DeletedPhis.clear(); + AddedPhis.clear(); + Predicates.clear(); + Conditions.clear(); + Loops.clear(); + LoopPreds.clear(); + LoopConds.clear(); + + return true; +} + +/// \brief Create the pass +Pass *llvm::createAMDGPUStructurizeCFGPass() { + return new AMDGPUStructurizeCFG(); +} diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp new file mode 100644 index 0000000000..0f356a1c3f --- /dev/null +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -0,0 +1,87 @@ +//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implements the AMDGPU specific subclass of TargetSubtarget. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUSubtarget.h" + +using namespace llvm; + +#define GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "AMDGPUGenSubtargetInfo.inc" + +AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : + AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) { + InstrItins = getInstrItineraryForCPU(CPU); + + memset(CapsOverride, 0, sizeof(*CapsOverride) + * AMDGPUDeviceInfo::MaxNumberCapabilities); + // Default card + StringRef GPU = CPU; + Is64bit = false; + DefaultSize[0] = 64; + DefaultSize[1] = 1; + DefaultSize[2] = 1; + ParseSubtargetFeatures(GPU, FS); + DevName = GPU; + Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit); +} + +AMDGPUSubtarget::~AMDGPUSubtarget() { + delete Device; +} + +bool +AMDGPUSubtarget::isOverride(AMDGPUDeviceInfo::Caps caps) const { + assert(caps < AMDGPUDeviceInfo::MaxNumberCapabilities && + "Caps index is out of bounds!"); + return CapsOverride[caps]; +} +bool +AMDGPUSubtarget::is64bit() const { + return Is64bit; +} +bool +AMDGPUSubtarget::isTargetELF() const { + return false; +} +size_t +AMDGPUSubtarget::getDefaultSize(uint32_t dim) const { + if (dim >= 3) { // DefaultSize holds only three entries (indices 0..2) + return 1; + } else { + return DefaultSize[dim]; + } +} + +std::string +AMDGPUSubtarget::getDataLayout() const { + if (!Device) { + return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16" + "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32" + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" + "-v96:128:128-v128:128:128-v192:256:256-v256:256:256" + "-v512:512:512-v1024:1024:1024-v2048:2048:2048-a0:0:64"); + } + return Device->getDataLayout(); +} + +std::string +AMDGPUSubtarget::getDeviceName() const { + return DevName; +} +const AMDGPUDevice * +AMDGPUSubtarget::device() const { + return Device; +} diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h new file mode 
100644 index 0000000000..1973fc6d54 --- /dev/null +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -0,0 +1,65 @@ +//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUSUBTARGET_H +#define AMDGPUSUBTARGET_H +#include "AMDILDevice.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +#define GET_SUBTARGETINFO_HEADER +#include "AMDGPUGenSubtargetInfo.inc" + +#define MAX_CB_SIZE (1 << 16) + +namespace llvm { + +class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { +private: + bool CapsOverride[AMDGPUDeviceInfo::MaxNumberCapabilities]; + const AMDGPUDevice *Device; + size_t DefaultSize[3]; + std::string DevName; + bool Is64bit; + bool Is32on64bit; + bool DumpCode; + bool R600ALUInst; + + InstrItineraryData InstrItins; + +public: + AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS); + virtual ~AMDGPUSubtarget(); + + const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + virtual void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + bool isOverride(AMDGPUDeviceInfo::Caps) const; + bool is64bit() const; + + // Helper functions to simplify if statements + bool isTargetELF() const; + const AMDGPUDevice* device() const; + std::string getDataLayout() const; + std::string getDeviceName() const; + virtual size_t getDefaultSize(uint32_t dim) const; + bool dumpCode() const { return DumpCode; } + bool r600ALUEncoding() const { return R600ALUInst; } + +}; + +} // End namespace llvm + +#endif // AMDGPUSUBTARGET_H diff --git 
a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp new file mode 100644 index 0000000000..0185747544 --- /dev/null +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -0,0 +1,162 @@ +//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU target machine contains all of the hardware specific +/// information needed to emit code for R600 and SI GPUs. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "AMDGPU.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" +#include "R600MachineScheduler.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/PassManager.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include <llvm/CodeGen/Passes.h> + +using namespace llvm; + +extern "C" void LLVMInitializeR600Target() { + // Register the target + RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget); +} + +static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { + return new ScheduleDAGMI(C, new R600SchedStrategy()); +} + +static MachineSchedRegistry +SchedCustomRegistry("r600", "Run R600's custom scheduler", + createR600MachineScheduler); + +AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + TargetOptions Options, + 
Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OptLevel +) +: + LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), + Subtarget(TT, CPU, FS), + Layout(Subtarget.getDataLayout()), + FrameLowering(TargetFrameLowering::StackGrowsUp, + Subtarget.device()->getStackAlignment(), 0), + IntrinsicInfo(this), + InstrItins(&Subtarget.getInstrItineraryData()) { + // TLInfo uses InstrInfo so it must be initialized after. + if (Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + InstrInfo = new R600InstrInfo(*this); + TLInfo = new R600TargetLowering(*this); + } else { + InstrInfo = new SIInstrInfo(*this); + TLInfo = new SITargetLowering(*this); + } +} + +AMDGPUTargetMachine::~AMDGPUTargetMachine() { +} + +namespace { +class AMDGPUPassConfig : public TargetPassConfig { +public: + AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + enablePass(&MachineSchedulerID); + MachineSchedRegistry::setDefault(createR600MachineScheduler); + } + } + + AMDGPUTargetMachine &getAMDGPUTargetMachine() const { + return getTM<AMDGPUTargetMachine>(); + } + + virtual bool addPreISel(); + virtual bool addInstSelector(); + virtual bool addPreRegAlloc(); + virtual bool addPostRegAlloc(); + virtual bool addPreSched2(); + virtual bool addPreEmitPass(); +}; +} // End of anonymous namespace + +TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) { + return new AMDGPUPassConfig(this, PM); +} + +bool +AMDGPUPassConfig::addPreISel() { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + addPass(createAMDGPUStructurizeCFGPass()); + addPass(createSIAnnotateControlFlowPass()); + } + return false; +} + +bool AMDGPUPassConfig::addInstSelector() { + addPass(createAMDGPUPeepholeOpt(*TM)); + 
addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + // The callbacks this pass uses are not implemented yet on SI. + addPass(createAMDGPUIndirectAddressingPass(*TM)); + } + return false; +} + +bool AMDGPUPassConfig::addPreRegAlloc() { + addPass(createAMDGPUConvertToISAPass(*TM)); + return false; +} + +bool AMDGPUPassConfig::addPostRegAlloc() { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + addPass(createSIInsertWaits(*TM)); + } + return false; +} + +bool AMDGPUPassConfig::addPreSched2() { + + addPass(&IfConverterID); + return false; +} + +bool AMDGPUPassConfig::addPreEmitPass() { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + addPass(createAMDGPUCFGPreparationPass(*TM)); + addPass(createAMDGPUCFGStructurizerPass(*TM)); + addPass(createR600ExpandSpecialInstrsPass(*TM)); + addPass(&FinalizeMachineBundlesID); + } else { + addPass(createSILowerControlFlowPass(*TM)); + } + + return false; +} + diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h new file mode 100644 index 0000000000..2afe7873a9 --- /dev/null +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -0,0 +1,70 @@ +//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU TargetMachine interface definition for hw codegen targets. 
+// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_TARGET_MACHINE_H +#define AMDGPU_TARGET_MACHINE_H + +#include "AMDGPUFrameLowering.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "AMDILIntrinsicInfo.h" +#include "R600ISelLowering.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/IR/DataLayout.h" + +namespace llvm { + +MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT); + +class AMDGPUTargetMachine : public LLVMTargetMachine { + + AMDGPUSubtarget Subtarget; + const DataLayout Layout; + AMDGPUFrameLowering FrameLowering; + AMDGPUIntrinsicInfo IntrinsicInfo; + const AMDGPUInstrInfo * InstrInfo; + AMDGPUTargetLowering * TLInfo; + const InstrItineraryData* InstrItins; + +public: + AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, // (TT, CPU, FS): same order as the out-of-line definition + TargetOptions Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); + ~AMDGPUTargetMachine(); + virtual const AMDGPUFrameLowering* getFrameLowering() const { + return &FrameLowering; + } + virtual const AMDGPUIntrinsicInfo* getIntrinsicInfo() const { + return &IntrinsicInfo; + } + virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;} + virtual const AMDGPUSubtarget *getSubtargetImpl() const {return &Subtarget; } + virtual const AMDGPURegisterInfo *getRegisterInfo() const { + return &InstrInfo->getRegisterInfo(); + } + virtual AMDGPUTargetLowering * getTargetLowering() const { + return TLInfo; + } + virtual const InstrItineraryData* getInstrItineraryData() const { + return InstrItins; + } + virtual const DataLayout* getDataLayout() const { return &Layout; } + virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); +}; + +} // End namespace llvm + +#endif // AMDGPU_TARGET_MACHINE_H diff --git a/lib/Target/R600/AMDIL.h b/lib/Target/R600/AMDIL.h new file mode 100644 index 0000000000..39ab664d10 --- /dev/null +++ b/lib/Target/R600/AMDIL.h @@ -0,0 +1,121 @@ +//===-- AMDIL.h - 
Top-level interface for AMDIL representation --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// This file contains the entry points for global functions defined in the LLVM +/// AMDGPU back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDIL_H +#define AMDIL_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Target/TargetMachine.h" + +#define ARENA_SEGMENT_RESERVED_UAVS 12 +#define DEFAULT_ARENA_UAV_ID 8 +#define DEFAULT_RAW_UAV_ID 7 +#define GLOBAL_RETURN_RAW_UAV_ID 11 +#define HW_MAX_NUM_CB 8 +#define MAX_NUM_UNIQUE_UAVS 8 +#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8 +#define OPENCL_MAX_READ_IMAGES 128 +#define OPENCL_MAX_WRITE_IMAGES 8 +#define OPENCL_MAX_SAMPLERS 16 + +// The next two values can never be zero, as zero is the ID that is +// used to assert against. +#define DEFAULT_LDS_ID 1 +#define DEFAULT_GDS_ID 1 +#define DEFAULT_SCRATCH_ID 1 +#define DEFAULT_VEC_SLOTS 8 + +#define OCL_DEVICE_RV710 0x0001 +#define OCL_DEVICE_RV730 0x0002 +#define OCL_DEVICE_RV770 0x0004 +#define OCL_DEVICE_CEDAR 0x0008 +#define OCL_DEVICE_REDWOOD 0x0010 +#define OCL_DEVICE_JUNIPER 0x0020 +#define OCL_DEVICE_CYPRESS 0x0040 +#define OCL_DEVICE_CAICOS 0x0080 +#define OCL_DEVICE_TURKS 0x0100 +#define OCL_DEVICE_BARTS 0x0200 +#define OCL_DEVICE_CAYMAN 0x0400 +#define OCL_DEVICE_ALL 0x3FFF + +/// The number of function ID's that are reserved for +/// internal compiler usage. +const unsigned int RESERVED_FUNCS = 1024; + +namespace llvm { +class AMDGPUInstrPrinter; +class FunctionPass; +class MCAsmInfo; +class raw_ostream; +class Target; +class TargetMachine; + +// Instruction selection passes. 
+FunctionPass* + createAMDGPUISelDag(TargetMachine &TM); +FunctionPass* + createAMDGPUPeepholeOpt(TargetMachine &TM); + +// Pre emit passes. +FunctionPass* + createAMDGPUCFGPreparationPass(TargetMachine &TM); +FunctionPass* + createAMDGPUCFGStructurizerPass(TargetMachine &TM); + +extern Target TheAMDGPUTarget; +} // end namespace llvm; + +// Include device information enumerations +#include "AMDILDeviceInfo.h" + +namespace llvm { +/// OpenCL uses address spaces to differentiate between +/// various memory regions on the hardware. On the CPU +/// all of the address spaces point to the same memory, +/// however on the GPU, each address space points to +/// a separate piece of memory that is unique from other +/// memory locations. +namespace AMDGPUAS { +enum AddressSpaces { + PRIVATE_ADDRESS = 0, ///< Address space for private memory. + GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). + CONSTANT_ADDRESS = 2, ///< Address space for constant memory + LOCAL_ADDRESS = 3, ///< Address space for local memory. + REGION_ADDRESS = 4, ///< Address space for region memory. + ADDRESS_NONE = 5, ///< Address space for unknown memory. 
+ PARAM_D_ADDRESS = 6, ///< Address space for directly addressable parameter memory (CONST0) + PARAM_I_ADDRESS = 7, ///< Address space for indirectly addressable parameter memory (VTX1) + CONSTANT_BUFFER_0 = 8, + CONSTANT_BUFFER_1 = 9, + CONSTANT_BUFFER_2 = 10, + CONSTANT_BUFFER_3 = 11, + CONSTANT_BUFFER_4 = 12, + CONSTANT_BUFFER_5 = 13, + CONSTANT_BUFFER_6 = 14, + CONSTANT_BUFFER_7 = 15, + CONSTANT_BUFFER_8 = 16, + CONSTANT_BUFFER_9 = 17, + CONSTANT_BUFFER_10 = 18, + CONSTANT_BUFFER_11 = 19, + CONSTANT_BUFFER_12 = 20, + CONSTANT_BUFFER_13 = 21, + CONSTANT_BUFFER_14 = 22, + CONSTANT_BUFFER_15 = 23, + LAST_ADDRESS = 24 +}; + +} // namespace AMDGPUAS + +} // end namespace llvm +#endif // AMDIL_H diff --git a/lib/Target/R600/AMDIL7XXDevice.cpp b/lib/Target/R600/AMDIL7XXDevice.cpp new file mode 100644 index 0000000000..ea6ac34f57 --- /dev/null +++ b/lib/Target/R600/AMDIL7XXDevice.cpp @@ -0,0 +1,115 @@ +//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +// \file +//==-----------------------------------------------------------------------===// +#include "AMDIL7XXDevice.h" +#include "AMDGPUSubtarget.h" +#include "AMDILDevice.h" + +using namespace llvm; + +AMDGPU7XXDevice::AMDGPU7XXDevice(AMDGPUSubtarget *ST) : AMDGPUDevice(ST) { + setCaps(); + std::string name = mSTM->getDeviceName(); + if (name == "rv710") { + DeviceFlag = OCL_DEVICE_RV710; + } else if (name == "rv730") { + DeviceFlag = OCL_DEVICE_RV730; + } else { + DeviceFlag = OCL_DEVICE_RV770; + } +} + +AMDGPU7XXDevice::~AMDGPU7XXDevice() { +} + +void AMDGPU7XXDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::LocalMem); +} + +size_t AMDGPU7XXDevice::getMaxLDSSize() const { + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return MAX_LDS_SIZE_700; + } + return 0; +} + +size_t AMDGPU7XXDevice::getWavefrontSize() const { + return AMDGPUDevice::HalfWavefrontSize; +} + +uint32_t AMDGPU7XXDevice::getGeneration() const { + return AMDGPUDeviceInfo::HD4XXX; +} + +uint32_t AMDGPU7XXDevice::getResourceID(uint32_t DeviceID) const { + switch (DeviceID) { + default: + assert(0 && "ID type passed in is unknown!"); + break; + case GLOBAL_ID: + case CONSTANT_ID: + case RAW_UAV_ID: + case ARENA_UAV_ID: + break; + case LDS_ID: + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return DEFAULT_LDS_ID; + } + break; + case SCRATCH_ID: + if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) { + return DEFAULT_SCRATCH_ID; + } + break; + case GDS_ID: + assert(0 && "GDS UAV ID is not supported on this chip"); + if (usesHardware(AMDGPUDeviceInfo::RegionMem)) { + return DEFAULT_GDS_ID; + } + break; + }; + + return 0; +} + +uint32_t AMDGPU7XXDevice::getMaxNumUAVs() const { + return 1; +} + +AMDGPU770Device::AMDGPU770Device(AMDGPUSubtarget *ST): AMDGPU7XXDevice(ST) { + setCaps(); +} + +AMDGPU770Device::~AMDGPU770Device() { +} + +void AMDGPU770Device::setCaps() { + if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) { + mSWBits.set(AMDGPUDeviceInfo::FMA); + 
mHWBits.set(AMDGPUDeviceInfo::DoubleOps); + } + mSWBits.set(AMDGPUDeviceInfo::BarrierDetect); + mHWBits.reset(AMDGPUDeviceInfo::LongOps); + mSWBits.set(AMDGPUDeviceInfo::LongOps); + mSWBits.set(AMDGPUDeviceInfo::LocalMem); +} + +size_t AMDGPU770Device::getWavefrontSize() const { + return AMDGPUDevice::WavefrontSize; +} + +AMDGPU710Device::AMDGPU710Device(AMDGPUSubtarget *ST) : AMDGPU7XXDevice(ST) { +} + +AMDGPU710Device::~AMDGPU710Device() { +} + +size_t AMDGPU710Device::getWavefrontSize() const { + return AMDGPUDevice::QuarterWavefrontSize; +} diff --git a/lib/Target/R600/AMDIL7XXDevice.h b/lib/Target/R600/AMDIL7XXDevice.h new file mode 100644 index 0000000000..1cf4ca415a --- /dev/null +++ b/lib/Target/R600/AMDIL7XXDevice.h @@ -0,0 +1,72 @@ +//==-- AMDIL7XXDevice.h - Define 7XX Device Device for AMDIL ---*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +/// \file +/// \brief Interface for the subtarget data classes. +/// +/// This file will define the interface that each generation needs to +/// implement in order to correctly answer queries on the capabilities of the +/// specific hardware. +//===----------------------------------------------------------------------===// +#ifndef AMDIL7XXDEVICEIMPL_H +#define AMDIL7XXDEVICEIMPL_H +#include "AMDILDevice.h" + +namespace llvm { +class AMDGPUSubtarget; + +//===----------------------------------------------------------------------===// +// 7XX generation of devices and their respective sub classes +//===----------------------------------------------------------------------===// + +/// \brief The AMDGPU7XXDevice class represents the generic 7XX device. +/// +/// All 7XX devices are derived from this class. 
The AMDGPU7XX device will only +/// support the minimal features that are required to be considered OpenCL 1.0 +/// compliant and nothing more. +class AMDGPU7XXDevice : public AMDGPUDevice { +public: + AMDGPU7XXDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPU7XXDevice(); + virtual size_t getMaxLDSSize() const; + virtual size_t getWavefrontSize() const; + virtual uint32_t getGeneration() const; + virtual uint32_t getResourceID(uint32_t DeviceID) const; + virtual uint32_t getMaxNumUAVs() const; + +protected: + virtual void setCaps(); +}; + +/// \brief The AMDGPU770Device class represents the RV770 chip and its +/// derivative cards. +/// +/// The difference between this device and the base class is this device +/// adds support for double precision and has a larger wavefront size. +class AMDGPU770Device : public AMDGPU7XXDevice { +public: + AMDGPU770Device(AMDGPUSubtarget *ST); + virtual ~AMDGPU770Device(); + virtual size_t getWavefrontSize() const; +private: + virtual void setCaps(); +}; + +/// \brief The AMDGPU710Device class derives from the 7XX base class. +/// +/// This class is a smaller derivative, so we need to override some of the +/// functions in order to correctly specify this information. +class AMDGPU710Device : public AMDGPU7XXDevice { +public: + AMDGPU710Device(AMDGPUSubtarget *ST); + virtual ~AMDGPU710Device(); + virtual size_t getWavefrontSize() const; +}; + +} // namespace llvm +#endif // AMDIL7XXDEVICEIMPL_H diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td new file mode 100644 index 0000000000..c12cedcf7f --- /dev/null +++ b/lib/Target/R600/AMDILBase.td @@ -0,0 +1,85 @@ +//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +// Dummy Instruction itineraries for pseudo instructions +def ALU_NULL : FuncUnit; +def NullALU : InstrItinClass; + +//===----------------------------------------------------------------------===// +// AMDIL Subtarget features. +//===----------------------------------------------------------------------===// +def FeatureFP64 : SubtargetFeature<"fp64", + "CapsOverride[AMDGPUDeviceInfo::DoubleOps]", + "true", + "Enable 64bit double precision operations">; +def FeatureByteAddress : SubtargetFeature<"byte_addressable_store", + "CapsOverride[AMDGPUDeviceInfo::ByteStores]", + "true", + "Enable byte addressable stores">; +def FeatureBarrierDetect : SubtargetFeature<"barrier_detect", + "CapsOverride[AMDGPUDeviceInfo::BarrierDetect]", + "true", + "Enable duplicate barrier detection(HD5XXX or later).">; +def FeatureImages : SubtargetFeature<"images", + "CapsOverride[AMDGPUDeviceInfo::Images]", + "true", + "Enable image functions">; +def FeatureMultiUAV : SubtargetFeature<"multi_uav", + "CapsOverride[AMDGPUDeviceInfo::MultiUAV]", + "true", + "Generate multiple UAV code(HD5XXX family or later)">; +def FeatureMacroDB : SubtargetFeature<"macrodb", + "CapsOverride[AMDGPUDeviceInfo::MacroDB]", + "true", + "Use internal macrodb, instead of macrodb in driver">; +def FeatureNoAlias : SubtargetFeature<"noalias", + "CapsOverride[AMDGPUDeviceInfo::NoAlias]", + "true", + "assert that all kernel argument pointers are not aliased">; +def FeatureNoInline : SubtargetFeature<"no-inline", + "CapsOverride[AMDGPUDeviceInfo::NoInline]", + "true", + "specify whether to not inline functions">; + +def Feature64BitPtr : SubtargetFeature<"64BitPtr", + "Is64bit", + "false", + "Specify if 64bit addressing should be used.">; + +def 
Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr", + "Is32on64bit", + "false", + "Specify if 64bit sized pointers with 32bit addressing should be used.">; +def FeatureDebug : SubtargetFeature<"debug", + "CapsOverride[AMDGPUDeviceInfo::Debug]", + "true", + "Debug mode is enabled, so disable hardware accelerated address spaces.">; +def FeatureDumpCode : SubtargetFeature <"DumpCode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter">; + +def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", + "R600ALUInst", + "false", + "Older version of ALU instructions encoding.">; + + +//===----------------------------------------------------------------------===// +// Register File, Calling Conv, Instruction Descriptions +//===----------------------------------------------------------------------===// + + +include "AMDILRegisterInfo.td" +include "AMDILInstrInfo.td" + diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp new file mode 100644 index 0000000000..b0cd0f9756 --- /dev/null +++ b/lib/Target/R600/AMDILCFGStructurizer.cpp @@ -0,0 +1,3051 @@ +//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//==-----------------------------------------------------------------------===// + +#define DEBUGME 0 +#define DEBUG_TYPE "structcfg" + +#include "AMDGPUInstrInfo.h" +#include "AMDIL.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DominatorInternals.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +// TODO: move-begin. + +//===----------------------------------------------------------------------===// +// +// Statistics for CFGStructurizer. +// +//===----------------------------------------------------------------------===// + +STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " + "matched"); +STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " + "matched"); +STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break " + "pattern matched"); +STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " + "pattern matched"); +STATISTIC(numLoopPatternMatch, "CFGStructurizer number of loop pattern " + "matched"); +STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); +STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); + +//===----------------------------------------------------------------------===// +// +// Miscellaneous utility for CFGStructurizer. 
+// +//===----------------------------------------------------------------------===// +namespace llvmCFGStruct { +#define SHOWNEWINSTR(i) \ + if (DEBUGME) errs() << "New instr: " << *i << "\n" + +#define SHOWNEWBLK(b, msg) \ +if (DEBUGME) { \ + errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + errs() << "\n"; \ +} + +#define SHOWBLK_DETAIL(b, msg) \ +if (DEBUGME) { \ + if (b) { \ + errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + b->print(errs()); \ + errs() << "\n"; \ + } \ +} + +#define INVALIDSCCNUM -1 +#define INVALIDREGNUM 0 + +template<class LoopinfoT> +void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) { + for (typename LoopinfoT::iterator iter = LoopInfo.begin(), + iterEnd = LoopInfo.end(); + iter != iterEnd; ++iter) { + (*iter)->print(OS, 0); + } +} + +template<class NodeT> +void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) { + size_t sz = Src.size(); + for (size_t i = 0; i < sz/2; ++i) { + NodeT *t = Src[i]; + Src[i] = Src[sz - i - 1]; + Src[sz - i - 1] = t; + } +} + +} //end namespace llvmCFGStruct + +//===----------------------------------------------------------------------===// +// +// supporting data structure for CFGStructurizer +// +//===----------------------------------------------------------------------===// + +namespace llvmCFGStruct { +template<class PassT> +struct CFGStructTraits { +}; + +template <class InstrT> +class BlockInformation { +public: + bool isRetired; + int sccNum; + //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr; + //Instructions defining the corresponding successor. + BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {} +}; + +template <class BlockT, class InstrT, class RegiT> +class LandInformation { +public: + BlockT *landBlk; + std::set<RegiT> breakInitRegs; //Registers that need to "reg = 0", before + //WHILELOOP(thisloop) init before entering + //thisloop. 
+ std::set<RegiT> contInitRegs; //Registers that need to "reg = 0", after + //WHILELOOP(thisloop) init after entering + //thisloop. + std::set<RegiT> endbranchInitRegs; //Init before entering this loop, at loop + //land block, branch cond on this reg. + std::set<RegiT> breakOnRegs; //registers that need to "if (reg) break + //endif" after ENDLOOP(thisloop) break + //outerLoopOf(thisLoop). + std::set<RegiT> contOnRegs; //registers that need to "if (reg) continue + //endif" after ENDLOOP(thisloop) continue on + //outerLoopOf(thisLoop). + LandInformation() : landBlk(NULL) {} +}; + +} //end of namespace llvmCFGStruct + +//===----------------------------------------------------------------------===// +// +// CFGStructurizer +// +//===----------------------------------------------------------------------===// + +namespace llvmCFGStruct { +// bixia TODO: port it to BasicBlock, not just MachineBasicBlock. +template<class PassT> +class CFGStructurizer { +public: + typedef enum { + Not_SinglePath = 0, + SinglePath_InPath = 1, + SinglePath_NotInPath = 2 + } PathToKind; + +public: + typedef typename PassT::InstructionType InstrT; + typedef typename PassT::FunctionType FuncT; + typedef typename PassT::DominatortreeType DomTreeT; + typedef typename PassT::PostDominatortreeType PostDomTreeT; + typedef typename PassT::DomTreeNodeType DomTreeNodeT; + typedef typename PassT::LoopinfoType LoopInfoT; + + typedef GraphTraits<FuncT *> FuncGTraits; + //typedef FuncGTraits::nodes_iterator BlockIterator; + typedef typename FuncT::iterator BlockIterator; + + typedef typename FuncGTraits::NodeType BlockT; + typedef GraphTraits<BlockT *> BlockGTraits; + typedef GraphTraits<Inverse<BlockT *> > InvBlockGTraits; + //typedef BlockGTraits::succ_iterator InstructionIterator; + typedef typename BlockT::iterator InstrIterator; + + typedef CFGStructTraits<PassT> CFGTraits; + typedef BlockInformation<InstrT> BlockInfo; + typedef std::map<BlockT *, BlockInfo *> BlockInfoMap; + + typedef int RegiT; + 
typedef typename PassT::LoopType LoopT; + typedef LandInformation<BlockT, InstrT, RegiT> LoopLandInfo; + typedef std::map<LoopT *, LoopLandInfo *> LoopLandInfoMap; + //landing info for loop break + typedef SmallVector<BlockT *, 32> BlockTSmallerVector; + +public: + CFGStructurizer(); + ~CFGStructurizer(); + + /// Perform the CFG structurization + bool run(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri); + + /// Perform the CFG preparation + bool prepare(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri); + +private: + void reversePredicateSetter(typename BlockT::iterator); + void orderBlocks(); + void printOrderedBlocks(llvm::raw_ostream &OS); + int patternMatch(BlockT *CurBlock); + int patternMatchGroup(BlockT *CurBlock); + + int serialPatternMatch(BlockT *CurBlock); + int ifPatternMatch(BlockT *CurBlock); + int switchPatternMatch(BlockT *CurBlock); + int loopendPatternMatch(BlockT *CurBlock); + int loopPatternMatch(BlockT *CurBlock); + + int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader); + int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader); + //int loopWithoutBreak(BlockT *); + + void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop, + BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock); + void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop, + BlockT *ContBlock, LoopT *contLoop); + bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block); + int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock); + int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock); + int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock, BlockT **LandBlockPtr); + void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock, BlockT *LandBlock, + bool Detail = false); + PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock, + bool AllowSideEntry = true); + BlockT *singlePathEnd(BlockT *srcBlock, 
BlockT *DstBlock, + bool AllowSideEntry = true); + int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock); + void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock); + + void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock, + BlockT *TrueBlock, BlockT *FalseBlock, + BlockT *LandBlock); + void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand); + void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock, + BlockT *ExitLandBlock, RegiT SetReg); + void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock, + RegiT SetReg); + BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep, + std::set<BlockT*> &ExitBlockSet, + BlockT *ExitLandBlk); + BlockT *addLoopEndbranchBlock(LoopT *LoopRep, + BlockTSmallerVector &ExitingBlocks, + BlockTSmallerVector &ExitBlocks); + BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep); + void removeUnconditionalBranch(BlockT *SrcBlock); + void removeRedundantConditionalBranch(BlockT *SrcBlock); + void addDummyExitBlock(SmallVector<BlockT *, DEFAULT_VEC_SLOTS> &RetBlocks); + + void removeSuccessor(BlockT *SrcBlock); + BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock); + BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock); + + void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock, + InstrIterator InsertPos); + + void recordSccnum(BlockT *SrcBlock, int SCCNum); + int getSCCNum(BlockT *srcBlk); + + void retireBlock(BlockT *DstBlock, BlockT *SrcBlock); + bool isRetiredBlock(BlockT *SrcBlock); + bool isActiveLoophead(BlockT *CurBlock); + bool needMigrateBlock(BlockT *Block); + + BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock, + BlockTSmallerVector &exitBlocks, + std::set<BlockT*> &ExitBlockSet); + void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL); + BlockT *getLoopLandBlock(LoopT *LoopRep); + LoopLandInfo *getLoopLandInfo(LoopT *LoopRep); + + void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum); + void 
addLoopContOnReg(LoopT *LoopRep, RegiT RegNum); + void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum); + void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum); + void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum); + + bool hasBackEdge(BlockT *curBlock); + unsigned getLoopDepth (LoopT *LoopRep); + int countActiveBlock( + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterStart, + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterEnd); + BlockT *findNearestCommonPostDom(std::set<BlockT *>&); + BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2); + +private: + DomTreeT *domTree; + PostDomTreeT *postDomTree; + LoopInfoT *loopInfo; + PassT *passRep; + FuncT *funcRep; + + BlockInfoMap blockInfoMap; + LoopLandInfoMap loopLandInfoMap; + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks; + const AMDGPURegisterInfo *TRI; + +}; //template class CFGStructurizer + +template<class PassT> CFGStructurizer<PassT>::CFGStructurizer() + : domTree(NULL), postDomTree(NULL), loopInfo(NULL) { +} + +template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() { + for (typename BlockInfoMap::iterator I = blockInfoMap.begin(), + E = blockInfoMap.end(); I != E; ++I) { + delete I->second; + } +} + +template<class PassT> +bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass, + const AMDGPURegisterInfo * tri) { + passRep = &pass; + funcRep = &func; + TRI = tri; + + bool changed = false; + + //FIXME: if not reducible flow graph, make it so ??? 
+ + if (DEBUGME) { + errs() << "AMDGPUCFGStructurizer::prepare\n"; + } + + loopInfo = CFGTraits::getLoopInfo(pass); + if (DEBUGME) { + errs() << "LoopInfo:\n"; + PrintLoopinfo(*loopInfo, errs()); + } + + orderBlocks(); + if (DEBUGME) { + errs() << "Ordered blocks:\n"; + printOrderedBlocks(errs()); + } + + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks; + + for (typename LoopInfoT::iterator iter = loopInfo->begin(), + iterEnd = loopInfo->end(); + iter != iterEnd; ++iter) { + LoopT* loopRep = (*iter); + BlockTSmallerVector exitingBlks; + loopRep->getExitingBlocks(exitingBlks); + + if (exitingBlks.size() == 0) { + BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep); + if (dummyExitBlk != NULL) + retBlks.push_back(dummyExitBlk); + } + } + + // Remove unconditional branch instr. + // Add dummy exit block iff there are multiple returns. + + for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end(); + iterBlk != iterEndBlk; + ++iterBlk) { + BlockT *curBlk = *iterBlk; + removeUnconditionalBranch(curBlk); + removeRedundantConditionalBranch(curBlk); + if (CFGTraits::isReturnBlock(curBlk)) { + retBlks.push_back(curBlk); + } + assert(curBlk->succ_size() <= 2); + } //for + + if (retBlks.size() >= 2) { + addDummyExitBlock(retBlks); + changed = true; + } + + return changed; +} //CFGStructurizer::prepare + +template<class PassT> +bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass, + const AMDGPURegisterInfo * tri) { + passRep = &pass; + funcRep = &func; + TRI = tri; + + //Assume reducible CFG... 
+ if (DEBUGME) { + errs() << "AMDGPUCFGStructurizer::run\n"; + func.viewCFG(); + } + + domTree = CFGTraits::getDominatorTree(pass); + if (DEBUGME) { + domTree->print(errs(), (const llvm::Module*)0); + } + + postDomTree = CFGTraits::getPostDominatorTree(pass); + if (DEBUGME) { + postDomTree->print(errs()); + } + + loopInfo = CFGTraits::getLoopInfo(pass); + if (DEBUGME) { + errs() << "LoopInfo:\n"; + PrintLoopinfo(*loopInfo, errs()); + } + + orderBlocks(); +#ifdef STRESSTEST + //Use the worse block ordering to test the algorithm. + ReverseVector(orderedBlks); +#endif + + if (DEBUGME) { + errs() << "Ordered blocks:\n"; + printOrderedBlocks(errs()); + } + int numIter = 0; + bool finish = false; + BlockT *curBlk; + bool makeProgress = false; + int numRemainedBlk = countActiveBlock(orderedBlks.begin(), + orderedBlks.end()); + + do { + ++numIter; + if (DEBUGME) { + errs() << "numIter = " << numIter + << ", numRemaintedBlk = " << numRemainedBlk << "\n"; + } + + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + iterBlk = orderedBlks.begin(); + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + iterBlkEnd = orderedBlks.end(); + + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + sccBeginIter = iterBlk; + BlockT *sccBeginBlk = NULL; + int sccNumBlk = 0; // The number of active blocks, init to a + // maximum possible number. + int sccNumIter; // Number of iteration in this SCC. + + while (iterBlk != iterBlkEnd) { + curBlk = *iterBlk; + + if (sccBeginBlk == NULL) { + sccBeginIter = iterBlk; + sccBeginBlk = curBlk; + sccNumIter = 0; + sccNumBlk = numRemainedBlk; // Init to maximum possible number. + if (DEBUGME) { + errs() << "start processing SCC" << getSCCNum(sccBeginBlk); + errs() << "\n"; + } + } + + if (!isRetiredBlock(curBlk)) { + patternMatch(curBlk); + } + + ++iterBlk; + + bool contNextScc = true; + if (iterBlk == iterBlkEnd + || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) { + // Just finish one scc. 
+ ++sccNumIter; + int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk); + if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) { + if (DEBUGME) { + errs() << "Can't reduce SCC " << getSCCNum(curBlk) + << ", sccNumIter = " << sccNumIter; + errs() << "doesn't make any progress\n"; + } + contNextScc = true; + } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) { + sccNumBlk = sccRemainedNumBlk; + iterBlk = sccBeginIter; + contNextScc = false; + if (DEBUGME) { + errs() << "repeat processing SCC" << getSCCNum(curBlk) + << "sccNumIter = " << sccNumIter << "\n"; + func.viewCFG(); + } + } else { + // Finish the current scc. + contNextScc = true; + } + } else { + // Continue on next component in the current scc. + contNextScc = false; + } + + if (contNextScc) { + sccBeginBlk = NULL; + } + } //while, "one iteration" over the function. + + BlockT *entryBlk = FuncGTraits::nodes_begin(&func); + if (entryBlk->succ_size() == 0) { + finish = true; + if (DEBUGME) { + errs() << "Reduce to one block\n"; + } + } else { + int newnumRemainedBlk + = countActiveBlock(orderedBlks.begin(), orderedBlks.end()); + // consider cloned blocks ?? + if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) { + makeProgress = true; + numRemainedBlk = newnumRemainedBlk; + } else { + makeProgress = false; + if (DEBUGME) { + errs() << "No progress\n"; + } + } + } + } while (!finish && makeProgress); + + // Misc wrap up to maintain the consistency of the Function representation. + CFGTraits::wrapup(FuncGTraits::nodes_begin(&func)); + + // Detach retired Block, release memory. 
+ for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(), + iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) { + if ((*iterMap).second && (*iterMap).second->isRetired) { + assert(((*iterMap).first)->getNumber() != -1); + if (DEBUGME) { + errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n"; + } + (*iterMap).first->eraseFromParent(); //Remove from the parent Function. + } + delete (*iterMap).second; + } + blockInfoMap.clear(); + + // clear loopLandInfoMap + for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(), + iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) { + delete (*iterMap).second; + } + loopLandInfoMap.clear(); + + if (DEBUGME) { + func.viewCFG(); + } + + if (!finish) { + assert(!"IRREDUCIBL_CF"); + } + + return true; +} //CFGStructurizer::run + +/// Print the ordered Blocks. +/// +template<class PassT> +void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) { + size_t i = 0; + for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end(); + iterBlk != iterBlkEnd; + ++iterBlk, ++i) { + os << "BB" << (*iterBlk)->getNumber(); + os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")"; + if (i != 0 && i % 10 == 0) { + os << "\n"; + } else { + os << " "; + } + } +} //printOrderedBlocks + +/// Compute the reversed DFS post order of Blocks +/// +template<class PassT> void CFGStructurizer<PassT>::orderBlocks() { + int sccNum = 0; + BlockT *bb; + for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep), + sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) { + std::vector<BlockT *> &sccNext = *sccIter; + for (typename std::vector<BlockT *>::const_iterator + blockIter = sccNext.begin(), blockEnd = sccNext.end(); + blockIter != blockEnd; ++blockIter) { + bb = *blockIter; + orderedBlks.push_back(bb); + recordSccnum(bb, sccNum); + } + } + + //walk through all the block in 
func to check for unreachable + for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep), + blockEnd1 = FuncGTraits::nodes_end(funcRep); + blockIter1 != blockEnd1; ++blockIter1) { + BlockT *bb = &(*blockIter1); + sccNum = getSCCNum(bb); + if (sccNum == INVALIDSCCNUM) { + errs() << "unreachable block BB" << bb->getNumber() << "\n"; + } + } +} //orderBlocks + +template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) { + int numMatch = 0; + int curMatch; + + if (DEBUGME) { + errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n"; + } + + while ((curMatch = patternMatchGroup(curBlk)) > 0) { + numMatch += curMatch; + } + + if (DEBUGME) { + errs() << "End patternMatch BB" << curBlk->getNumber() + << ", numMatch = " << numMatch << "\n"; + } + + return numMatch; +} //patternMatch + +template<class PassT> +int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) { + int numMatch = 0; + numMatch += serialPatternMatch(curBlk); + numMatch += ifPatternMatch(curBlk); + numMatch += loopendPatternMatch(curBlk); + numMatch += loopPatternMatch(curBlk); + return numMatch; +}//patternMatchGroup + +template<class PassT> +int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) { + if (curBlk->succ_size() != 1) { + return 0; + } + + BlockT *childBlk = *curBlk->succ_begin(); + if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) { + return 0; + } + + mergeSerialBlock(curBlk, childBlk); + ++numSerialPatternMatch; + return 1; +} //serialPatternMatch + +template<class PassT> +int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) { + //two edges + if (curBlk->succ_size() != 2) { + return 0; + } + + if (hasBackEdge(curBlk)) { + return 0; + } + + InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk); + if (branchInstr == NULL) { + return 0; + } + + assert(CFGTraits::isCondBranch(branchInstr)); + + BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr); + BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, 
branchInstr); + BlockT *landBlk; + int cloned = 0; + + // TODO: Simplify + if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1 + && *trueBlk->succ_begin() == *falseBlk->succ_begin()) { + landBlk = *trueBlk->succ_begin(); + } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) { + landBlk = NULL; + } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) { + landBlk = falseBlk; + falseBlk = NULL; + } else if (falseBlk->succ_size() == 1 + && *falseBlk->succ_begin() == trueBlk) { + landBlk = trueBlk; + trueBlk = NULL; + } else if (falseBlk->succ_size() == 1 + && isSameloopDetachedContbreak(trueBlk, falseBlk)) { + landBlk = *falseBlk->succ_begin(); + } else if (trueBlk->succ_size() == 1 + && isSameloopDetachedContbreak(falseBlk, trueBlk)) { + landBlk = *trueBlk->succ_begin(); + } else { + return handleJumpintoIf(curBlk, trueBlk, falseBlk); + } + + // improveSimpleJumpintoIf can handle the case where landBlk == NULL but the + // new BB created for landBlk==NULL may introduce new challenges to the + // reduction process. 
+ if (landBlk != NULL && + ((trueBlk && trueBlk->pred_size() > 1) + || (falseBlk && falseBlk->pred_size() > 1))) { + cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk); + } + + if (trueBlk && trueBlk->pred_size() > 1) { + trueBlk = cloneBlockForPredecessor(trueBlk, curBlk); + ++cloned; + } + + if (falseBlk && falseBlk->pred_size() > 1) { + falseBlk = cloneBlockForPredecessor(falseBlk, curBlk); + ++cloned; + } + + mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk); + + ++numIfPatternMatch; + + numClonedBlock += cloned; + + return 1 + cloned; +} //ifPatternMatch + +template<class PassT> +int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) { + return 0; +} //switchPatternMatch + +template<class PassT> +int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) { + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + typename std::vector<LoopT *> nestedLoops; + while (loopRep) { + nestedLoops.push_back(loopRep); + loopRep = loopRep->getParentLoop(); + } + + if (nestedLoops.size() == 0) { + return 0; + } + + // Process nested loops outside->inside, so "continue" to an outside loop won't + // be mistaken as "break" of the current loop. 
+ int num = 0; + for (typename std::vector<LoopT *>::reverse_iterator + iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend(); + iter != iterEnd; ++iter) { + loopRep = *iter; + + if (getLoopLandBlock(loopRep) != NULL) { + continue; + } + + BlockT *loopHeader = loopRep->getHeader(); + + int numBreak = loopbreakPatternMatch(loopRep, loopHeader); + + if (numBreak == -1) { + break; + } + + int numCont = loopcontPatternMatch(loopRep, loopHeader); + num += numBreak + numCont; + } + + return num; +} //loopendPatternMatch + +template<class PassT> +int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) { + if (curBlk->succ_size() != 0) { + return 0; + } + + int numLoop = 0; + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + while (loopRep && loopRep->getHeader() == curBlk) { + LoopLandInfo *loopLand = getLoopLandInfo(loopRep); + if (loopLand) { + BlockT *landBlk = loopLand->landBlk; + assert(landBlk); + if (!isRetiredBlock(landBlk)) { + mergeLooplandBlock(curBlk, loopLand); + ++numLoop; + } + } + loopRep = loopRep->getParentLoop(); + } + + numLoopPatternMatch += numLoop; + + return numLoop; +} //loopPatternMatch + +template<class PassT> +int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep, + BlockT *loopHeader) { + BlockTSmallerVector exitingBlks; + loopRep->getExitingBlocks(exitingBlks); + + if (DEBUGME) { + errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n"; + } + + if (exitingBlks.size() == 0) { + setLoopLandBlock(loopRep); + return 0; + } + + // Compute the corresponding exitBlks and exit block set. 
+ BlockTSmallerVector exitBlks; + std::set<BlockT *> exitBlkSet; + for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(), + iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) { + BlockT *exitingBlk = *iter; + BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk); + exitBlks.push_back(exitBlk); + exitBlkSet.insert(exitBlk); //non-duplicate insert + } + + assert(exitBlkSet.size() > 0); + assert(exitBlks.size() == exitingBlks.size()); + + if (DEBUGME) { + errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n"; + } + + // Find exitLandBlk. + BlockT *exitLandBlk = NULL; + int numCloned = 0; + int numSerial = 0; + + if (exitBlkSet.size() == 1) { + exitLandBlk = *exitBlkSet.begin(); + } else { + exitLandBlk = findNearestCommonPostDom(exitBlkSet); + + if (exitLandBlk == NULL) { + return -1; + } + + bool allInPath = true; + bool allNotInPath = true; + for (typename std::set<BlockT*>::const_iterator + iter = exitBlkSet.begin(), + iterEnd = exitBlkSet.end(); + iter != iterEnd; ++iter) { + BlockT *exitBlk = *iter; + + PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true); + if (DEBUGME) { + errs() << "BB" << exitBlk->getNumber() + << " to BB" << exitLandBlk->getNumber() << " PathToKind=" + << pathKind << "\n"; + } + + allInPath = allInPath && (pathKind == SinglePath_InPath); + allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath); + + if (!allInPath && !allNotInPath) { + if (DEBUGME) { + errs() << "singlePath check fail\n"; + } + return -1; + } + } // check all exit blocks + + if (allNotInPath) { + + // TODO: Simplify, maybe separate function? 
+ LoopT *parentLoopRep = loopRep->getParentLoop(); + BlockT *parentLoopHeader = NULL; + if (parentLoopRep) + parentLoopHeader = parentLoopRep->getHeader(); + + if (exitLandBlk == parentLoopHeader && + (exitLandBlk = relocateLoopcontBlock(parentLoopRep, + loopRep, + exitBlkSet, + exitLandBlk)) != NULL) { + if (DEBUGME) { + errs() << "relocateLoopcontBlock success\n"; + } + } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep, + exitingBlks, + exitBlks)) != NULL) { + if (DEBUGME) { + errs() << "insertEndbranchBlock success\n"; + } + } else { + if (DEBUGME) { + errs() << "loop exit fail\n"; + } + return -1; + } + } + + // Handle side entry to exit path. + exitBlks.clear(); + exitBlkSet.clear(); + for (typename BlockTSmallerVector::iterator iterExiting = + exitingBlks.begin(), + iterExitingEnd = exitingBlks.end(); + iterExiting != iterExitingEnd; ++iterExiting) { + BlockT *exitingBlk = *iterExiting; + BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk); + BlockT *newExitBlk = exitBlk; + + if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) { + newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk); + ++numCloned; + } + + numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk); + + exitBlks.push_back(newExitBlk); + exitBlkSet.insert(newExitBlk); + } + + for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(), + iterExitEnd = exitBlks.end(); + iterExit != iterExitEnd; ++iterExit) { + BlockT *exitBlk = *iterExit; + numSerial += serialPatternMatch(exitBlk); + } + + for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(), + iterExitEnd = exitBlks.end(); + iterExit != iterExitEnd; ++iterExit) { + BlockT *exitBlk = *iterExit; + if (exitBlk->pred_size() > 1) { + if (exitBlk != exitLandBlk) { + return -1; + } + } else { + if (exitBlk != exitLandBlk && + (exitBlk->succ_size() != 1 || + *exitBlk->succ_begin() != exitLandBlk)) { + return -1; + } + } + } + } // else + + exitLandBlk = recordLoopLandBlock(loopRep, 
exitLandBlk, exitBlks, exitBlkSet); + + // Fold break into the breaking block. Leverage across level breaks. + assert(exitingBlks.size() == exitBlks.size()); + for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(), + iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end(); + iterExit != iterExitEnd; ++iterExit, ++iterExiting) { + BlockT *exitBlk = *iterExit; + BlockT *exitingBlk = *iterExiting; + assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk); + LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk); + handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk); + } + + int numBreak = static_cast<int>(exitingBlks.size()); + numLoopbreakPatternMatch += numBreak; + numClonedBlock += numCloned; + return numBreak + numSerial + numCloned; +} //loopbreakPatternMatch + +template<class PassT> +int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep, + BlockT *loopHeader) { + int numCont = 0; + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk; + for (typename InvBlockGTraits::ChildIteratorType iter = + InvBlockGTraits::child_begin(loopHeader), + iterEnd = InvBlockGTraits::child_end(loopHeader); + iter != iterEnd; ++iter) { + BlockT *curBlk = *iter; + if (loopRep->contains(curBlk)) { + handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk), + loopHeader, loopRep); + contBlk.push_back(curBlk); + ++numCont; + } + } + + for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator + iter = contBlk.begin(), iterEnd = contBlk.end(); + iter != iterEnd; ++iter) { + (*iter)->removeSuccessor(loopHeader); + } + + numLoopcontPatternMatch += numCont; + + return numCont; +} //loopcontPatternMatch + + +template<class PassT> +bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk, + BlockT *src2Blk) { + // return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in the + // same loop with LoopLandInfo without explicitly keeping track of + // loopContBlks and loopBreakBlks, this is a method to 
get the information. + // + if (src1Blk->succ_size() == 0) { + LoopT *loopRep = loopInfo->getLoopFor(src1Blk); + if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + if (theEntry != NULL) { + if (DEBUGME) { + errs() << "isLoopContBreakBlock yes src1 = BB" + << src1Blk->getNumber() + << " src2 = BB" << src2Blk->getNumber() << "\n"; + } + return true; + } + } + } + return false; +} //isSameloopDetachedContbreak + +template<class PassT> +int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk, + BlockT *trueBlk, + BlockT *falseBlk) { + int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk); + if (num == 0) { + if (DEBUGME) { + errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; + } + num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk); + } + return num; +} + +template<class PassT> +int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk, + BlockT *trueBlk, + BlockT *falseBlk) { + int num = 0; + BlockT *downBlk; + + //trueBlk could be the common post dominator + downBlk = trueBlk; + + if (DEBUGME) { + errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber() + << " true = BB" << trueBlk->getNumber() + << ", numSucc=" << trueBlk->succ_size() + << " false = BB" << falseBlk->getNumber() << "\n"; + } + + while (downBlk) { + if (DEBUGME) { + errs() << "check down = BB" << downBlk->getNumber(); + } + + if (singlePathTo(falseBlk, downBlk) == SinglePath_InPath) { + if (DEBUGME) { + errs() << " working\n"; + } + + num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk); + num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk); + + numClonedBlock += num; + num += serialPatternMatch(*headBlk->succ_begin()); + num += serialPatternMatch(*(++headBlk->succ_begin())); + num += ifPatternMatch(headBlk); + assert(num > 0); + + break; + } + if (DEBUGME) { + errs() << " not working\n"; + } + downBlk = (downBlk->succ_size() == 1) ? 
(*downBlk->succ_begin()) : NULL; + } // walk down the postDomTree + + return num; +} //handleJumpintoIf + +template<class PassT> +void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk, + BlockT *trueBlk, + BlockT *falseBlk, + BlockT *landBlk, + bool detail) { + errs() << "head = BB" << headBlk->getNumber() + << " size = " << headBlk->size(); + if (detail) { + errs() << "\n"; + headBlk->print(errs()); + errs() << "\n"; + } + + if (trueBlk) { + errs() << ", true = BB" << trueBlk->getNumber() << " size = " + << trueBlk->size() << " numPred = " << trueBlk->pred_size(); + if (detail) { + errs() << "\n"; + trueBlk->print(errs()); + errs() << "\n"; + } + } + if (falseBlk) { + errs() << ", false = BB" << falseBlk->getNumber() << " size = " + << falseBlk->size() << " numPred = " << falseBlk->pred_size(); + if (detail) { + errs() << "\n"; + falseBlk->print(errs()); + errs() << "\n"; + } + } + if (landBlk) { + errs() << ", land = BB" << landBlk->getNumber() << " size = " + << landBlk->size() << " numPred = " << landBlk->pred_size(); + if (detail) { + errs() << "\n"; + landBlk->print(errs()); + errs() << "\n"; + } + } + + errs() << "\n"; +} //showImproveSimpleJumpintoIf + +template<class PassT> +int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk, + BlockT *trueBlk, + BlockT *falseBlk, + BlockT **plandBlk) { + bool migrateTrue = false; + bool migrateFalse = false; + + BlockT *landBlk = *plandBlk; + + assert((trueBlk == NULL || trueBlk->succ_size() <= 1) + && (falseBlk == NULL || falseBlk->succ_size() <= 1)); + + if (trueBlk == falseBlk) { + return 0; + } + + migrateTrue = needMigrateBlock(trueBlk); + migrateFalse = needMigrateBlock(falseBlk); + + if (!migrateTrue && !migrateFalse) { + return 0; + } + + // If we need to migrate either trueBlk and falseBlk, migrate the rest that + // have more than one predecessors. without doing this, its predecessor + // rather than headBlk will have undefined value in initReg. 
+ if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) { + migrateTrue = true; + } + if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) { + migrateFalse = true; + } + + if (DEBUGME) { + errs() << "before improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0); + } + + // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk + // + // new: headBlk => if () {initReg = 1; org trueBlk branch} else + // {initReg = 0; org falseBlk branch } + // => landBlk => if (initReg) {org trueBlk} else {org falseBlk} + // => org landBlk + // if landBlk->pred_size() > 2, put the about if-else inside + // if (initReg !=2) {...} + // + // add initReg = initVal to headBlk + + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + unsigned initReg = + funcRep->getRegInfo().createVirtualRegister(I32RC); + if (!migrateTrue || !migrateFalse) { + int initVal = migrateTrue ? 0 : 1; + CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal); + } + + int numNewBlk = 0; + + if (landBlk == NULL) { + landBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(landBlk); //insert to function + + if (trueBlk) { + trueBlk->addSuccessor(landBlk); + } else { + headBlk->addSuccessor(landBlk); + } + + if (falseBlk) { + falseBlk->addSuccessor(landBlk); + } else { + headBlk->addSuccessor(landBlk); + } + + numNewBlk ++; + } + + bool landBlkHasOtherPred = (landBlk->pred_size() > 2); + + //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" + typename BlockT::iterator insertPos = + CFGTraits::getInstrPos + (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDGPU::ENDIF, passRep)); + + if (landBlkHasOtherPred) { + unsigned immReg = + funcRep->getRegInfo().createVirtualRegister(I32RC); + CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2); + unsigned cmpResReg = + funcRep->getRegInfo().createVirtualRegister(I32RC); + + CFGTraits::insertCompareInstrBefore(landBlk, insertPos, 
passRep, cmpResReg, + initReg, immReg); + CFGTraits::insertCondBranchBefore(landBlk, insertPos, + AMDGPU::IF_PREDICATE_SET, passRep, + cmpResReg, DebugLoc()); + } + + CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDGPU::IF_PREDICATE_SET, + passRep, initReg, DebugLoc()); + + if (migrateTrue) { + migrateInstruction(trueBlk, landBlk, insertPos); + // need to uncondionally insert the assignment to ensure a path from its + // predecessor rather than headBlk has valid value in initReg if + // (initVal != 1). + CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1); + } + CFGTraits::insertInstrBefore(insertPos, AMDGPU::ELSE, passRep); + + if (migrateFalse) { + migrateInstruction(falseBlk, landBlk, insertPos); + // need to uncondionally insert the assignment to ensure a path from its + // predecessor rather than headBlk has valid value in initReg if + // (initVal != 0) + CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0); + } + + if (landBlkHasOtherPred) { + // add endif + CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep); + + // put initReg = 2 to other predecessors of landBlk + for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(), + predIterEnd = landBlk->pred_end(); predIter != predIterEnd; + ++predIter) { + BlockT *curBlk = *predIter; + if (curBlk != trueBlk && curBlk != falseBlk) { + CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2); + } + } //for + } + if (DEBUGME) { + errs() << "result from improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0); + } + + // update landBlk + *plandBlk = landBlk; + + return numNewBlk; +} //improveSimpleJumpintoIf + +template<class PassT> +void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk, + LoopT *exitingLoop, + BlockT *exitBlk, + LoopT *exitLoop, + BlockT *landBlk) { + if (DEBUGME) { + errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop) + << " from loop-depth = " << 
getLoopDepth(exitingLoop) << "\n"; + } + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + + RegiT initReg = INVALIDREGNUM; + if (exitingLoop != exitLoop) { + initReg = static_cast<int> + (funcRep->getRegInfo().createVirtualRegister(I32RC)); + assert(initReg != INVALIDREGNUM); + addLoopBreakInitReg(exitLoop, initReg); + while (exitingLoop != exitLoop && exitingLoop) { + addLoopBreakOnReg(exitingLoop, initReg); + exitingLoop = exitingLoop->getParentLoop(); + } + assert(exitingLoop == exitLoop); + } + + mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg); + +} //handleLoopbreak + +template<class PassT> +void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk, + LoopT *contingLoop, + BlockT *contBlk, + LoopT *contLoop) { + if (DEBUGME) { + errs() << "loopcontPattern cont = BB" << contingBlk->getNumber() + << " header = BB" << contBlk->getNumber() << "\n"; + + errs() << "Trying to continue loop-depth = " + << getLoopDepth(contLoop) + << " from loop-depth = " << getLoopDepth(contingLoop) << "\n"; + } + + RegiT initReg = INVALIDREGNUM; + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + if (contingLoop != contLoop) { + initReg = static_cast<int> + (funcRep->getRegInfo().createVirtualRegister(I32RC)); + assert(initReg != INVALIDREGNUM); + addLoopContInitReg(contLoop, initReg); + while (contingLoop && contingLoop->getParentLoop() != contLoop) { + addLoopBreakOnReg(contingLoop, initReg); //not addLoopContOnReg + contingLoop = contingLoop->getParentLoop(); + } + assert(contingLoop && contingLoop->getParentLoop() == contLoop); + addLoopContOnReg(contingLoop, initReg); + } + + settleLoopcontBlock(contingBlk, contBlk, initReg); +} //handleLoopcontBlock + +template<class PassT> +void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) { + if (DEBUGME) { + errs() << "serialPattern BB" << dstBlk->getNumber() + << " <= BB" << srcBlk->getNumber() << "\n"; + } + 
dstBlk->splice(dstBlk->end(), srcBlk, srcBlk->begin(), srcBlk->end()); + + dstBlk->removeSuccessor(srcBlk); + CFGTraits::cloneSuccessorList(dstBlk, srcBlk); + + removeSuccessor(srcBlk); + retireBlock(dstBlk, srcBlk); +} //mergeSerialBlock + +template<class PassT> +void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr, + BlockT *curBlk, + BlockT *trueBlk, + BlockT *falseBlk, + BlockT *landBlk) { + if (DEBUGME) { + errs() << "ifPattern BB" << curBlk->getNumber(); + errs() << "{ "; + if (trueBlk) { + errs() << "BB" << trueBlk->getNumber(); + } + errs() << " } else "; + errs() << "{ "; + if (falseBlk) { + errs() << "BB" << falseBlk->getNumber(); + } + errs() << " }\n "; + errs() << "landBlock: "; + if (landBlk == NULL) { + errs() << "NULL"; + } else { + errs() << "BB" << landBlk->getNumber(); + } + errs() << "\n"; + } + + int oldOpcode = branchInstr->getOpcode(); + DebugLoc branchDL = branchInstr->getDebugLoc(); + +// transform to +// if cond +// trueBlk +// else +// falseBlk +// endif +// landBlk + + typename BlockT::iterator branchInstrPos = + CFGTraits::getInstrPos(curBlk, branchInstr); + CFGTraits::insertCondBranchBefore(branchInstrPos, + CFGTraits::getBranchNzeroOpcode(oldOpcode), + passRep, + branchDL); + + if (trueBlk) { + curBlk->splice(branchInstrPos, trueBlk, trueBlk->begin(), trueBlk->end()); + curBlk->removeSuccessor(trueBlk); + if (landBlk && trueBlk->succ_size()!=0) { + trueBlk->removeSuccessor(landBlk); + } + retireBlock(curBlk, trueBlk); + } + CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ELSE, passRep); + + if (falseBlk) { + curBlk->splice(branchInstrPos, falseBlk, falseBlk->begin(), + falseBlk->end()); + curBlk->removeSuccessor(falseBlk); + if (landBlk && falseBlk->succ_size() != 0) { + falseBlk->removeSuccessor(landBlk); + } + retireBlock(curBlk, falseBlk); + } + CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep); + + branchInstr->eraseFromParent(); + + if (landBlk && trueBlk && falseBlk) { + 
curBlk->addSuccessor(landBlk); + } + +} //mergeIfthenelseBlock + +template<class PassT> +void CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk, + LoopLandInfo *loopLand) { + BlockT *landBlk = loopLand->landBlk; + + if (DEBUGME) { + errs() << "loopPattern header = BB" << dstBlk->getNumber() + << " land = BB" << landBlk->getNumber() << "\n"; + } + + // Loop contInitRegs are init at the beginning of the loop. + for (typename std::set<RegiT>::const_iterator iter = + loopLand->contInitRegs.begin(), + iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) { + CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0); + } + + /* we last inserterd the DebugLoc in the + * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current dstBlk. + * search for the DebugLoc in the that statement. + * if not found, we have to insert the empty/default DebugLoc */ + InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk); + DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc(); + + CFGTraits::insertInstrBefore(dstBlk, AMDGPU::WHILELOOP, passRep, DLBreak); + // Loop breakInitRegs are init before entering the loop. + for (typename std::set<RegiT>::const_iterator iter = + loopLand->breakInitRegs.begin(), + iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) { + CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0); + } + // Loop endbranchInitRegs are init before entering the loop. + for (typename std::set<RegiT>::const_iterator iter = + loopLand->endbranchInitRegs.begin(), + iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) { + CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0); + } + + /* we last inserterd the DebugLoc in the continue statement in the current dstBlk + * search for the DebugLoc in the continue statement. 
+ * if not found, we have to insert the empty/default DebugLoc */ + InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk); + DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc(); + + CFGTraits::insertInstrEnd(dstBlk, AMDGPU::ENDLOOP, passRep, DLContinue); + // Loop breakOnRegs are check after the ENDLOOP: break the loop outside this + // loop. + for (typename std::set<RegiT>::const_iterator iter = + loopLand->breakOnRegs.begin(), + iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) { + CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::PREDICATED_BREAK, passRep, + *iter); + } + + // Loop contOnRegs are check after the ENDLOOP: cont the loop outside this + // loop. + for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(), + iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) { + CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::CONTINUE_LOGICALNZ_i32, + passRep, *iter); + } + + dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end()); + + for (typename BlockT::succ_iterator iter = landBlk->succ_begin(), + iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) { + dstBlk->addSuccessor(*iter); // *iter's predecessor is also taken care of. 
+ } + + removeSuccessor(landBlk); + retireBlock(dstBlk, landBlk); +} //mergeLooplandBlock + +template<class PassT> +void CFGStructurizer<PassT>::reversePredicateSetter(typename BlockT::iterator I) { + while (I--) { + if (I->getOpcode() == AMDGPU::PRED_X) { + switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) { + case OPCODE_IS_ZERO_INT: + static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO_INT); + return; + case OPCODE_IS_NOT_ZERO_INT: + static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO_INT); + return; + case OPCODE_IS_ZERO: + static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO); + return; + case OPCODE_IS_NOT_ZERO: + static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO); + return; + default: + assert(0 && "PRED_X Opcode invalid!"); + } + } + } +} + +template<class PassT> +void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk, + BlockT *exitBlk, + BlockT *exitLandBlk, + RegiT setReg) { + if (DEBUGME) { + errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber() + << " exit = BB" << exitBlk->getNumber() + << " land = BB" << exitLandBlk->getNumber() << "\n"; + } + + InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk); + assert(branchInstr && CFGTraits::isCondBranch(branchInstr)); + + DebugLoc DL = branchInstr->getDebugLoc(); + + BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr); + + // transform exitingBlk to + // if ( ) { + // exitBlk (if exitBlk != exitLandBlk) + // setReg = 1 + // break + // }endif + // successor = {orgSuccessor(exitingBlk) - exitBlk} + + typename BlockT::iterator branchInstrPos = + CFGTraits::getInstrPos(exitingBlk, branchInstr); + + if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) { + //break_logical + + if (trueBranch != exitBlk) { + reversePredicateSetter(branchInstrPos); + } + CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL); + } else { + if (trueBranch != 
exitBlk) { + reversePredicateSetter(branchInstr); + } + CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL); + if (exitBlk != exitLandBlk) { + //splice is insert-before ... + exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(), + exitBlk->end()); + } + if (setReg != INVALIDREGNUM) { + CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1); + } + CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::BREAK, passRep); + } //if_logical + + //now branchInst can be erase safely + branchInstr->eraseFromParent(); + + //now take care of successors, retire blocks + exitingBlk->removeSuccessor(exitBlk); + if (exitBlk != exitLandBlk) { + //splice is insert-before ... + exitBlk->removeSuccessor(exitLandBlk); + retireBlock(exitingBlk, exitBlk); + } + +} //mergeLoopbreakBlock + +template<class PassT> +void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk, + BlockT *contBlk, + RegiT setReg) { + if (DEBUGME) { + errs() << "settleLoopcontBlock conting = BB" + << contingBlk->getNumber() + << ", cont = BB" << contBlk->getNumber() << "\n"; + } + + InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk); + if (branchInstr) { + assert(CFGTraits::isCondBranch(branchInstr)); + typename BlockT::iterator branchInstrPos = + CFGTraits::getInstrPos(contingBlk, branchInstr); + BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr); + int oldOpcode = branchInstr->getOpcode(); + DebugLoc DL = branchInstr->getDebugLoc(); + + // transform contingBlk to + // if () { + // move instr after branchInstr + // continue + // or + // setReg = 1 + // break + // }endif + // successor = {orgSuccessor(contingBlk) - loopHeader} + + bool useContinueLogical = + (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr); + + if (useContinueLogical == false) { + int branchOpcode = + trueBranch == contBlk ? 
CFGTraits::getBranchNzeroOpcode(oldOpcode) + : CFGTraits::getBranchZeroOpcode(oldOpcode); + + CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL); + + if (setReg != INVALIDREGNUM) { + CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1); + // insertEnd to ensure phi-moves, if exist, go before the continue-instr. + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, DL); + } else { + // insertEnd to ensure phi-moves, if exist, go before the continue-instr. + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, DL); + } + + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::ENDIF, passRep, DL); + } else { + int branchOpcode = + trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode) + : CFGTraits::getContinueZeroOpcode(oldOpcode); + + CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL); + } + + branchInstr->eraseFromParent(); + } else { + // if we've arrived here then we've already erased the branch instruction + // travel back up the basic block to see the last reference of our debug location + // we've just inserted that reference here so it should be representative + if (setReg != INVALIDREGNUM) { + CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1); + // insertEnd to ensure phi-moves, if exist, go before the continue-instr. + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk)); + } else { + // insertEnd to ensure phi-moves, if exist, go before the continue-instr. + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk)); + } + } //else + +} //settleLoopcontBlock + +// BBs in exitBlkSet are determined as in break-path for loopRep, +// before we can put code for BBs as inside loop-body for loopRep +// check whether those BBs are determined as cont-BB for parentLoopRep +// earlier. 
+// If so, generate a new BB newBlk +// (1) set newBlk common successor of BBs in exitBlkSet +// (2) change the continue-instr in BBs in exitBlkSet to break-instr +// (3) generate continue-instr in newBlk +// +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep, + LoopT *loopRep, + std::set<BlockT *> &exitBlkSet, + BlockT *exitLandBlk) { + std::set<BlockT *> endBlkSet; + + + + for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(), + iterEnd = exitBlkSet.end(); + iter != iterEnd; ++iter) { + BlockT *exitBlk = *iter; + BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk); + + if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL) + return NULL; + + endBlkSet.insert(endBlk); + } + + BlockT *newBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(newBlk); //insert to function + CFGTraits::insertInstrEnd(newBlk, AMDGPU::CONTINUE, passRep); + SHOWNEWBLK(newBlk, "New continue block: "); + + for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(), + iterEnd = endBlkSet.end(); + iter != iterEnd; ++iter) { + BlockT *endBlk = *iter; + InstrT *contInstr = CFGTraits::getContinueInstr(endBlk); + if (contInstr) { + contInstr->eraseFromParent(); + } + endBlk->addSuccessor(newBlk); + if (DEBUGME) { + errs() << "Add new continue Block to BB" + << endBlk->getNumber() << " successors\n"; + } + } + + return newBlk; +} //relocateLoopcontBlock + + +// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as +// LoopLandBlock. This BB branch on the loop endBranchInit register to the +// pathes corresponding to the loop exiting branches. 
+ +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep, + BlockTSmallerVector &exitingBlks, + BlockTSmallerVector &exitBlks) { + const AMDGPUInstrInfo *tii = + static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo()); + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + + RegiT endBranchReg = static_cast<int> + (funcRep->getRegInfo().createVirtualRegister(I32RC)); + assert(endBranchReg >= 0); + + // reg = 0 before entering the loop + addLoopEndbranchInitReg(loopRep, endBranchReg); + + uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size()); + assert(numBlks >=2 && numBlks == exitBlks.size()); + + BlockT *preExitingBlk = exitingBlks[0]; + BlockT *preExitBlk = exitBlks[0]; + BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(preBranchBlk); //insert to function + SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: "); + + BlockT *newLandBlk = preBranchBlk; + + CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk, + newLandBlk); + preExitingBlk->removeSuccessor(preExitBlk); + preExitingBlk->addSuccessor(newLandBlk); + + //it is redundant to add reg = 0 to exitingBlks[0] + + // For 1..n th exiting path (the last iteration handles two pathes) create the + // branch to the previous path and the current path. + for (uint32_t i = 1; i < numBlks; ++i) { + BlockT *curExitingBlk = exitingBlks[i]; + BlockT *curExitBlk = exitBlks[i]; + BlockT *curBranchBlk; + + if (i == numBlks - 1) { + curBranchBlk = curExitBlk; + } else { + curBranchBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(curBranchBlk); //insert to function + SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: "); + } + + // Add reg = i to exitingBlks[i]. + CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep, + endBranchReg, i); + + // Remove the edge (exitingBlks[i] exitBlks[i]) add new edge + // (exitingBlks[i], newLandBlk). 
+ CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk, + newLandBlk); + curExitingBlk->removeSuccessor(curExitBlk); + curExitingBlk->addSuccessor(newLandBlk); + + // add to preBranchBlk the branch instruction: + // if (endBranchReg == preVal) + // preExitBlk + // else + // curBranchBlk + // + // preValReg = i - 1 + + DebugLoc DL; + RegiT preValReg = static_cast<int> + (funcRep->getRegInfo().createVirtualRegister(I32RC)); + + preBranchBlk->insert(preBranchBlk->begin(), + tii->getMovImmInstr(preBranchBlk->getParent(), preValReg, + i - 1)); + + // condResReg = (endBranchReg == preValReg) + RegiT condResReg = static_cast<int> + (funcRep->getRegInfo().createVirtualRegister(I32RC)); + BuildMI(preBranchBlk, DL, tii->get(tii->getIEQOpcode()), condResReg) + .addReg(endBranchReg).addReg(preValReg); + + BuildMI(preBranchBlk, DL, tii->get(AMDGPU::BRANCH_COND_i32)) + .addMBB(preExitBlk).addReg(condResReg); + + preBranchBlk->addSuccessor(preExitBlk); + preBranchBlk->addSuccessor(curBranchBlk); + + // Update preExitingBlk, preExitBlk, preBranchBlk. + preExitingBlk = curExitingBlk; + preExitBlk = curExitBlk; + preBranchBlk = curBranchBlk; + + } //end for 1 .. 
n blocks + + return newLandBlk; +} //addLoopEndbranchBlock + +template<class PassT> +typename CFGStructurizer<PassT>::PathToKind +CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk, + bool allowSideEntry) { + assert(dstBlk); + + if (srcBlk == dstBlk) { + return SinglePath_InPath; + } + + while (srcBlk && srcBlk->succ_size() == 1) { + srcBlk = *srcBlk->succ_begin(); + if (srcBlk == dstBlk) { + return SinglePath_InPath; + } + + if (!allowSideEntry && srcBlk->pred_size() > 1) { + return Not_SinglePath; + } + } + + if (srcBlk && srcBlk->succ_size()==0) { + return SinglePath_NotInPath; + } + + return Not_SinglePath; +} //singlePathTo + +// If there is a single path from srcBlk to dstBlk, return the last block before +// dstBlk If there is a single path from srcBlk->end without dstBlk, return the +// last block in the path Otherwise, return NULL +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk, + bool allowSideEntry) { + assert(dstBlk); + + if (srcBlk == dstBlk) { + return srcBlk; + } + + if (srcBlk->succ_size() == 0) { + return srcBlk; + } + + while (srcBlk && srcBlk->succ_size() == 1) { + BlockT *preBlk = srcBlk; + + srcBlk = *srcBlk->succ_begin(); + if (srcBlk == NULL) { + return preBlk; + } + + if (!allowSideEntry && srcBlk->pred_size() > 1) { + return NULL; + } + } + + if (srcBlk && srcBlk->succ_size()==0) { + return srcBlk; + } + + return NULL; + +} //singlePathEnd + +template<class PassT> +int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk, + BlockT *dstBlk) { + int cloned = 0; + assert(preBlk->isSuccessor(srcBlk)); + while (srcBlk && srcBlk != dstBlk) { + assert(srcBlk->succ_size() == 1); + if (srcBlk->pred_size() > 1) { + srcBlk = cloneBlockForPredecessor(srcBlk, preBlk); + ++cloned; + } + + preBlk = srcBlk; + srcBlk = *srcBlk->succ_begin(); + } + + return cloned; +} //cloneOnSideEntryTo + +template<class PassT> +typename 
CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk, + BlockT *predBlk) { + assert(predBlk->isSuccessor(curBlk) && + "succBlk is not a prececessor of curBlk"); + + BlockT *cloneBlk = CFGTraits::clone(curBlk); //clone instructions + CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk); + //srcBlk, oldBlk, newBlk + + predBlk->removeSuccessor(curBlk); + predBlk->addSuccessor(cloneBlk); + + // add all successor to cloneBlk + CFGTraits::cloneSuccessorList(cloneBlk, curBlk); + + numClonedInstr += curBlk->size(); + + if (DEBUGME) { + errs() << "Cloned block: " << "BB" + << curBlk->getNumber() << "size " << curBlk->size() << "\n"; + } + + SHOWNEWBLK(cloneBlk, "result of Cloned block: "); + + return cloneBlk; +} //cloneBlockForPredecessor + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep, + BlockT *exitingBlk) { + BlockT *exitBlk = NULL; + + for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(), + iterSuccEnd = exitingBlk->succ_end(); + iterSucc != iterSuccEnd; ++iterSucc) { + BlockT *curBlk = *iterSucc; + if (!loopRep->contains(curBlk)) { + assert(exitBlk == NULL); + exitBlk = curBlk; + } + } + + assert(exitBlk != NULL); + + return exitBlk; +} //exitingBlock2ExitBlock + +template<class PassT> +void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk, + BlockT *dstBlk, + InstrIterator insertPos) { + InstrIterator spliceEnd; + //look for the input branchinstr, not the AMDGPU branchinstr + InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk); + if (branchInstr == NULL) { + if (DEBUGME) { + errs() << "migrateInstruction don't see branch instr\n" ; + } + spliceEnd = srcBlk->end(); + } else { + if (DEBUGME) { + errs() << "migrateInstruction see branch instr\n" ; + branchInstr->dump(); + } + spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr); + } + if (DEBUGME) { + errs() << "migrateInstruction before 
splice dstSize = " << dstBlk->size() + << "srcSize = " << srcBlk->size() << "\n"; + } + + //splice insert before insertPos + dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd); + + if (DEBUGME) { + errs() << "migrateInstruction after splice dstSize = " << dstBlk->size() + << "srcSize = " << srcBlk->size() << "\n"; + } +} //migrateInstruction + +// normalizeInfiniteLoopExit change +// B1: +// uncond_br LoopHeader +// +// to +// B1: +// cond_br 1 LoopHeader dummyExit +// and return the newly added dummy exit block +// +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) { + BlockT *loopHeader; + BlockT *loopLatch; + loopHeader = LoopRep->getHeader(); + loopLatch = LoopRep->getLoopLatch(); + BlockT *dummyExitBlk = NULL; + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + if (loopHeader!=NULL && loopLatch!=NULL) { + InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch); + if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) { + dummyExitBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(dummyExitBlk); //insert to function + SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); + + if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n"; + + typename BlockT::iterator insertPos = + CFGTraits::getInstrPos(loopLatch, branchInstr); + unsigned immReg = + funcRep->getRegInfo().createVirtualRegister(I32RC); + CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1); + InstrT *newInstr = + CFGTraits::insertInstrBefore(insertPos, AMDGPU::BRANCH_COND_i32, passRep); + MachineInstrBuilder MIB(*funcRep, newInstr); + MIB.addMBB(loopHeader); + MIB.addReg(immReg, false); + + SHOWNEWINSTR(newInstr); + + branchInstr->eraseFromParent(); + loopLatch->addSuccessor(dummyExitBlk); + } + } + + return dummyExitBlk; +} //normalizeInfiniteLoopExit + +template<class PassT> +void 
CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) { + InstrT *branchInstr; + + // I saw two unconditional branch in one basic block in example + // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. + while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk)) + && CFGTraits::isUncondBranch(branchInstr)) { + if (DEBUGME) { + errs() << "Removing unconditional branch instruction" ; + branchInstr->dump(); + } + branchInstr->eraseFromParent(); + } +} //removeUnconditionalBranch + +template<class PassT> +void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) { + if (srcBlk->succ_size() == 2) { + BlockT *blk1 = *srcBlk->succ_begin(); + BlockT *blk2 = *(++srcBlk->succ_begin()); + + if (blk1 == blk2) { + InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk); + assert(branchInstr && CFGTraits::isCondBranch(branchInstr)); + if (DEBUGME) { + errs() << "Removing unneeded conditional branch instruction" ; + branchInstr->dump(); + } + branchInstr->eraseFromParent(); + SHOWNEWBLK(blk1, "Removing redundant successor"); + srcBlk->removeSuccessor(blk1); + } + } +} //removeRedundantConditionalBranch + +template<class PassT> +void CFGStructurizer<PassT>::addDummyExitBlock(SmallVector<BlockT*, + DEFAULT_VEC_SLOTS> &retBlks) { + BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(dummyExitBlk); //insert to function + CFGTraits::insertInstrEnd(dummyExitBlk, AMDGPU::RETURN, passRep); + + for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator iter = + retBlks.begin(), + iterEnd = retBlks.end(); iter != iterEnd; ++iter) { + BlockT *curBlk = *iter; + InstrT *curInstr = CFGTraits::getReturnInstr(curBlk); + if (curInstr) { + curInstr->eraseFromParent(); + } + curBlk->addSuccessor(dummyExitBlk); + if (DEBUGME) { + errs() << "Add dummyExitBlock to BB" << curBlk->getNumber() + << " successors\n"; + } + } //for + + SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: "); +} 
//addDummyExitBlock + +template<class PassT> +void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) { + while (srcBlk->succ_size()) { + srcBlk->removeSuccessor(*srcBlk->succ_begin()); + } +} + +template<class PassT> +void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) { + BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk]; + + if (srcBlkInfo == NULL) { + srcBlkInfo = new BlockInfo(); + } + + srcBlkInfo->sccNum = sccNum; +} + +template<class PassT> +int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) { + BlockInfo *srcBlkInfo = blockInfoMap[srcBlk]; + return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM; +} + +template<class PassT> +void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) { + if (DEBUGME) { + errs() << "Retiring BB" << srcBlk->getNumber() << "\n"; + } + + BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk]; + + if (srcBlkInfo == NULL) { + srcBlkInfo = new BlockInfo(); + } + + srcBlkInfo->isRetired = true; + assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0 + && "can't retire block yet"); +} + +template<class PassT> +bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) { + BlockInfo *srcBlkInfo = blockInfoMap[srcBlk]; + return (srcBlkInfo && srcBlkInfo->isRetired); +} + +template<class PassT> +bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) { + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + while (loopRep && loopRep->getHeader() == curBlk) { + LoopLandInfo *loopLand = getLoopLandInfo(loopRep); + + if(loopLand == NULL) + return true; + + BlockT *landBlk = loopLand->landBlk; + assert(landBlk); + if (!isRetiredBlock(landBlk)) { + return true; + } + + loopRep = loopRep->getParentLoop(); + } + + return false; +} //isActiveLoophead + +template<class PassT> +bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) { + const unsigned blockSizeThreshold = 30; + const unsigned cloneInstrThreshold = 100; + + bool multiplePreds = blk && (blk->pred_size() > 1); + + if(!multiplePreds) + return 
false; + + unsigned blkSize = blk->size(); + return ((blkSize > blockSizeThreshold) + && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold)); +} //needMigrateBlock + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk, + BlockTSmallerVector &exitBlks, + std::set<BlockT *> &exitBlkSet) { + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks; //in exit path blocks + + for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(), + predIterEnd = landBlk->pred_end(); + predIter != predIterEnd; ++predIter) { + BlockT *curBlk = *predIter; + if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) { + inpathBlks.push_back(curBlk); + } + } //for + + //if landBlk has predecessors that are not in the given loop, + //create a new block + BlockT *newLandBlk = landBlk; + if (inpathBlks.size() != landBlk->pred_size()) { + newLandBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(newLandBlk); //insert to function + newLandBlk->addSuccessor(landBlk); + for (typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::iterator iter = + inpathBlks.begin(), + iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) { + BlockT *curBlk = *iter; + CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk); + //srcBlk, oldBlk, newBlk + curBlk->removeSuccessor(landBlk); + curBlk->addSuccessor(newLandBlk); + } + for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) { + if (exitBlks[i] == landBlk) { + exitBlks[i] = newLandBlk; + } + } + SHOWNEWBLK(newLandBlk, "NewLandingBlock: "); + } + + setLoopLandBlock(loopRep, newLandBlk); + + return newLandBlk; +} // recordLoopbreakLand + +template<class PassT> +void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + assert(theEntry->landBlk == NULL); + + if (blk == NULL) { + blk = 
funcRep->CreateMachineBasicBlock(); + funcRep->push_back(blk); //insert to function + SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: "); + } + + theEntry->landBlk = blk; + + if (DEBUGME) { + errs() << "setLoopLandBlock loop-header = BB" + << loopRep->getHeader()->getNumber() + << " landing-block = BB" << blk->getNumber() << "\n"; + } +} // setLoopLandBlock + +template<class PassT> +void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + + theEntry->breakOnRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopBreakOnReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopBreakOnReg + +template<class PassT> +void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->contOnRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopContOnReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopContOnReg + +template<class PassT> +void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->breakInitRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopBreakInitReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopBreakInitReg + +template<class PassT> +void CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->contInitRegs.insert(regNum); + + if (DEBUGME) { + errs() << 
"addLoopContInitReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopContInitReg + +template<class PassT> +void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep, + RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->endbranchInitRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopEndbranchInitReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopEndbranchInitReg + +template<class PassT> +typename CFGStructurizer<PassT>::LoopLandInfo * +CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + return theEntry; +} // getLoopLandInfo + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + return theEntry ? theEntry->landBlk : NULL; +} // getLoopLandBlock + + +template<class PassT> +bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) { + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + if (loopRep == NULL) + return false; + + BlockT *loopHeader = loopRep->getHeader(); + + return curBlk->isSuccessor(loopHeader); + +} //hasBackEdge + +template<class PassT> +unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) { + return loopRep ? 
loopRep->getLoopDepth() : 0; +} //getLoopDepth + +template<class PassT> +int CFGStructurizer<PassT>::countActiveBlock +(typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterStart, + typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterEnd) { + int count = 0; + while (iterStart != iterEnd) { + if (!isRetiredBlock(*iterStart)) { + ++count; + } + ++iterStart; + } + + return count; +} //countActiveBlock + +// This is work around solution for findNearestCommonDominator not avaiable to +// post dom a proper fix should go to Dominators.h. + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT* +CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) { + + if (postDomTree->dominates(blk1, blk2)) { + return blk1; + } + if (postDomTree->dominates(blk2, blk1)) { + return blk2; + } + + DomTreeNodeT *node1 = postDomTree->getNode(blk1); + DomTreeNodeT *node2 = postDomTree->getNode(blk2); + + // Handle newly cloned node. + if (node1 == NULL && blk1->succ_size() == 1) { + return findNearestCommonPostDom(*blk1->succ_begin(), blk2); + } + if (node2 == NULL && blk2->succ_size() == 1) { + return findNearestCommonPostDom(blk1, *blk2->succ_begin()); + } + + if (node1 == NULL || node2 == NULL) { + return NULL; + } + + node1 = node1->getIDom(); + while (node1) { + if (postDomTree->dominates(node1, node2)) { + return node1->getBlock(); + } + node1 = node1->getIDom(); + } + + return NULL; +} + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::findNearestCommonPostDom +(typename std::set<BlockT *> &blks) { + BlockT *commonDom; + typename std::set<BlockT *>::const_iterator iter = blks.begin(); + typename std::set<BlockT *>::const_iterator iterEnd = blks.end(); + for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) { + BlockT *curBlk = *iter; + if (curBlk != commonDom) { + commonDom = findNearestCommonPostDom(curBlk, commonDom); + } + } + + if (DEBUGME) { + errs() << 
"Common post dominator for exit blocks is "; + if (commonDom) { + errs() << "BB" << commonDom->getNumber() << "\n"; + } else { + errs() << "NULL\n"; + } + } + + return commonDom; +} //findNearestCommonPostDom + +} //end namespace llvm + +//todo: move-end + + +//===----------------------------------------------------------------------===// +// +// CFGStructurizer for AMDGPU +// +//===----------------------------------------------------------------------===// + + +using namespace llvmCFGStruct; + +namespace llvm { +class AMDGPUCFGStructurizer : public MachineFunctionPass { +public: + typedef MachineInstr InstructionType; + typedef MachineFunction FunctionType; + typedef MachineBasicBlock BlockType; + typedef MachineLoopInfo LoopinfoType; + typedef MachineDominatorTree DominatortreeType; + typedef MachinePostDominatorTree PostDominatortreeType; + typedef MachineDomTreeNode DomTreeNodeType; + typedef MachineLoop LoopType; + +protected: + TargetMachine &TM; + const TargetInstrInfo *TII; + const AMDGPURegisterInfo *TRI; + +public: + AMDGPUCFGStructurizer(char &pid, TargetMachine &tm); + const TargetInstrInfo *getTargetInstrInfo() const; + +private: + +}; + +} //end of namespace llvm +AMDGPUCFGStructurizer::AMDGPUCFGStructurizer(char &pid, TargetMachine &tm) +: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()), + TRI(static_cast<const AMDGPURegisterInfo *>(tm.getRegisterInfo())) { +} + +const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const { + return TII; +} +//===----------------------------------------------------------------------===// +// +// CFGPrepare +// +//===----------------------------------------------------------------------===// + + +using namespace llvmCFGStruct; + +namespace llvm { +class AMDGPUCFGPrepare : public AMDGPUCFGStructurizer { +public: + static char ID; + +public: + AMDGPUCFGPrepare(TargetMachine &tm); + + virtual const char *getPassName() const; + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + + bool 
runOnMachineFunction(MachineFunction &F); + +private: + +}; + +char AMDGPUCFGPrepare::ID = 0; +} //end of namespace llvm + +AMDGPUCFGPrepare::AMDGPUCFGPrepare(TargetMachine &tm) + : AMDGPUCFGStructurizer(ID, tm ) { +} +const char *AMDGPUCFGPrepare::getPassName() const { + return "AMD IL Control Flow Graph Preparation Pass"; +} + +void AMDGPUCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addRequired<MachineFunctionAnalysis>(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); +} + +//===----------------------------------------------------------------------===// +// +// CFGPerform +// +//===----------------------------------------------------------------------===// + + +using namespace llvmCFGStruct; + +namespace llvm { +class AMDGPUCFGPerform : public AMDGPUCFGStructurizer { +public: + static char ID; + +public: + AMDGPUCFGPerform(TargetMachine &tm); + virtual const char *getPassName() const; + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnMachineFunction(MachineFunction &F); + +private: + +}; + +char AMDGPUCFGPerform::ID = 0; +} //end of namespace llvm + + AMDGPUCFGPerform::AMDGPUCFGPerform(TargetMachine &tm) +: AMDGPUCFGStructurizer(ID, tm) { +} + +const char *AMDGPUCFGPerform::getPassName() const { + return "AMD IL Control Flow Graph structurizer Pass"; +} + +void AMDGPUCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addRequired<MachineFunctionAnalysis>(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); +} + +//===----------------------------------------------------------------------===// +// +// CFGStructTraits<AMDGPUCFGStructurizer> +// +//===----------------------------------------------------------------------===// + +namespace llvmCFGStruct { +// this class is tailor to 
the AMDGPU backend +template<> +struct CFGStructTraits<AMDGPUCFGStructurizer> { + typedef int RegiT; + + static int getBranchNzeroOpcode(int oldOpcode) { + switch(oldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; + default: + assert(0 && "internal error"); + } + return -1; + } + + static int getBranchZeroOpcode(int oldOpcode) { + switch(oldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; + default: + assert(0 && "internal error"); + } + return -1; + } + + static int getContinueNzeroOpcode(int oldOpcode) { + switch(oldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; + default: + assert(0 && "internal error"); + }; + return -1; + } + + static int getContinueZeroOpcode(int oldOpcode) { + switch(oldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; + default: + assert(0 && "internal error"); + } + return -1; + } + + static MachineBasicBlock *getTrueBranch(MachineInstr *instr) { + return instr->getOperand(0).getMBB(); + } + + static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) { + instr->getOperand(0).setMBB(blk); + } + + static MachineBasicBlock * + getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) { + assert(blk->succ_size() == 2); + MachineBasicBlock *trueBranch = getTrueBranch(instr); + MachineBasicBlock::succ_iterator iter = blk->succ_begin(); + MachineBasicBlock::succ_iterator iterNext = iter; + ++iterNext; + + return (*iter == trueBranch) ? 
*iterNext : *iter; + } + + static bool isCondBranch(MachineInstr *instr) { + switch (instr->getOpcode()) { + case AMDGPU::JUMP_COND: + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: + break; + default: + return false; + } + return true; + } + + static bool isUncondBranch(MachineInstr *instr) { + switch (instr->getOpcode()) { + case AMDGPU::JUMP: + case AMDGPU::BRANCH: + return true; + default: + return false; + } + return true; + } + + static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) { + //get DebugLoc from the first MachineBasicBlock instruction with debug info + DebugLoc DL; + for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end(); ++iter) { + MachineInstr *instr = &(*iter); + if (instr->getDebugLoc().isUnknown() == false) { + DL = instr->getDebugLoc(); + } + } + return DL; + } + + static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) { + MachineBasicBlock::reverse_iterator iter = blk->rbegin(); + MachineInstr *instr = &*iter; + if (instr && (isCondBranch(instr) || isUncondBranch(instr))) { + return instr; + } + return NULL; + } + + // The correct naming for this is getPossibleLoopendBlockBranchInstr. + // + // BB with backward-edge could have move instructions after the branch + // instruction. Such move instruction "belong to" the loop backward-edge. 
+ // + static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) { + const AMDGPUInstrInfo * TII = static_cast<const AMDGPUInstrInfo *>( + blk->getParent()->getTarget().getInstrInfo()); + + for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(), + iterEnd = blk->rend(); iter != iterEnd; ++iter) { + // FIXME: Simplify + MachineInstr *instr = &*iter; + if (instr) { + if (isCondBranch(instr) || isUncondBranch(instr)) { + return instr; + } else if (!TII->isMov(instr->getOpcode())) { + break; + } + } + } + return NULL; + } + + static MachineInstr *getReturnInstr(MachineBasicBlock *blk) { + MachineBasicBlock::reverse_iterator iter = blk->rbegin(); + if (iter != blk->rend()) { + MachineInstr *instr = &(*iter); + if (instr->getOpcode() == AMDGPU::RETURN) { + return instr; + } + } + return NULL; + } + + static MachineInstr *getContinueInstr(MachineBasicBlock *blk) { + MachineBasicBlock::reverse_iterator iter = blk->rbegin(); + if (iter != blk->rend()) { + MachineInstr *instr = &(*iter); + if (instr->getOpcode() == AMDGPU::CONTINUE) { + return instr; + } + } + return NULL; + } + + static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) { + for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) { + MachineInstr *instr = &(*iter); + if (instr->getOpcode() == AMDGPU::PREDICATED_BREAK) { + return instr; + } + } + return NULL; + } + + static bool isReturnBlock(MachineBasicBlock *blk) { + MachineInstr *instr = getReturnInstr(blk); + bool isReturn = (blk->succ_size() == 0); + if (instr) { + assert(isReturn); + } else if (isReturn) { + if (DEBUGME) { + errs() << "BB" << blk->getNumber() + <<" is return block without RETURN instr\n"; + } + } + + return isReturn; + } + + static MachineBasicBlock::iterator + getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) { + assert(instr->getParent() == blk && "instruction doesn't belong to block"); + MachineBasicBlock::iterator iter = blk->begin(); + MachineBasicBlock::iterator iterEnd 
= blk->end(); + while (&(*iter) != instr && iter != iterEnd) { + ++iter; + } + + assert(iter != iterEnd); + return iter; + }//getInstrPos + + static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode, + AMDGPUCFGStructurizer *passRep) { + return insertInstrBefore(blk,newOpcode,passRep,DebugLoc()); + } //insertInstrBefore + + static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode, + AMDGPUCFGStructurizer *passRep, DebugLoc DL) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL); + + MachineBasicBlock::iterator res; + if (blk->begin() != blk->end()) { + blk->insert(blk->begin(), newInstr); + } else { + blk->push_back(newInstr); + } + + SHOWNEWINSTR(newInstr); + + return newInstr; + } //insertInstrBefore + + static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode, + AMDGPUCFGStructurizer *passRep) { + insertInstrEnd(blk,newOpcode,passRep,DebugLoc()); + } //insertInstrEnd + + static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode, + AMDGPUCFGStructurizer *passRep, DebugLoc DL) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineInstr *newInstr = blk->getParent() + ->CreateMachineInstr(tii->get(newOpcode), DL); + + blk->push_back(newInstr); + //assume the instruction doesn't take any reg operand ... + + SHOWNEWINSTR(newInstr); + } //insertInstrEnd + + static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos, + int newOpcode, + AMDGPUCFGStructurizer *passRep) { + MachineInstr *oldInstr = &(*instrPos); + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineBasicBlock *blk = oldInstr->getParent(); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), + DebugLoc()); + + blk->insert(instrPos, newInstr); + //assume the instruction doesn't take any reg operand ... 
+ + SHOWNEWINSTR(newInstr); + return newInstr; + } //insertInstrBefore + + static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos, + int newOpcode, + AMDGPUCFGStructurizer *passRep, + DebugLoc DL) { + MachineInstr *oldInstr = &(*instrPos); + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineBasicBlock *blk = oldInstr->getParent(); + MachineFunction *MF = blk->getParent(); + MachineInstr *newInstr = MF->CreateMachineInstr(tii->get(newOpcode), DL); + + blk->insert(instrPos, newInstr); + MachineInstrBuilder MIB(*MF, newInstr); + MIB.addReg(oldInstr->getOperand(1).getReg(), false); + + SHOWNEWINSTR(newInstr); + //erase later oldInstr->eraseFromParent(); + } //insertCondBranchBefore + + static void insertCondBranchBefore(MachineBasicBlock *blk, + MachineBasicBlock::iterator insertPos, + int newOpcode, + AMDGPUCFGStructurizer *passRep, + RegiT regNum, + DebugLoc DL) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineFunction *MF = blk->getParent(); + + MachineInstr *newInstr = MF->CreateMachineInstr(tii->get(newOpcode), DL); + + //insert before + blk->insert(insertPos, newInstr); + MachineInstrBuilder(*MF, newInstr).addReg(regNum, false); + + SHOWNEWINSTR(newInstr); + } //insertCondBranchBefore + + static void insertCondBranchEnd(MachineBasicBlock *blk, + int newOpcode, + AMDGPUCFGStructurizer *passRep, + RegiT regNum) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineFunction *MF = blk->getParent(); + MachineInstr *newInstr = + MF->CreateMachineInstr(tii->get(newOpcode), DebugLoc()); + + blk->push_back(newInstr); + MachineInstrBuilder(*MF, newInstr).addReg(regNum, false); + + SHOWNEWINSTR(newInstr); + } //insertCondBranchEnd + + + static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos, + AMDGPUCFGStructurizer *passRep, + RegiT regNum, int regVal) { + MachineInstr *oldInstr = &(*instrPos); + const AMDGPUInstrInfo *tii = + static_cast<const AMDGPUInstrInfo 
*>(passRep->getTargetInstrInfo()); + MachineBasicBlock *blk = oldInstr->getParent(); + MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum, + regVal); + blk->insert(instrPos, newInstr); + + SHOWNEWINSTR(newInstr); + } //insertAssignInstrBefore + + static void insertAssignInstrBefore(MachineBasicBlock *blk, + AMDGPUCFGStructurizer *passRep, + RegiT regNum, int regVal) { + const AMDGPUInstrInfo *tii = + static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo()); + + MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum, + regVal); + if (blk->begin() != blk->end()) { + blk->insert(blk->begin(), newInstr); + } else { + blk->push_back(newInstr); + } + + SHOWNEWINSTR(newInstr); + + } //insertInstrBefore + + static void insertCompareInstrBefore(MachineBasicBlock *blk, + MachineBasicBlock::iterator instrPos, + AMDGPUCFGStructurizer *passRep, + RegiT dstReg, RegiT src1Reg, + RegiT src2Reg) { + const AMDGPUInstrInfo *tii = + static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo()); + MachineFunction *MF = blk->getParent(); + MachineInstr *newInstr = + MF->CreateMachineInstr(tii->get(tii->getIEQOpcode()), DebugLoc()); + + MachineInstrBuilder MIB(*MF, newInstr); + MIB.addReg(dstReg, RegState::Define); //set target + MIB.addReg(src1Reg); //set src value + MIB.addReg(src2Reg); //set src value + + blk->insert(instrPos, newInstr); + SHOWNEWINSTR(newInstr); + + } //insertCompareInstrBefore + + static void cloneSuccessorList(MachineBasicBlock *dstBlk, + MachineBasicBlock *srcBlk) { + for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(), + iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) { + dstBlk->addSuccessor(*iter); // *iter's predecessor is also taken care of + } + } //cloneSuccessorList + + static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) { + MachineFunction *func = srcBlk->getParent(); + MachineBasicBlock *newBlk = func->CreateMachineBasicBlock(); + func->push_back(newBlk); //insert to function 
+ for (MachineBasicBlock::iterator iter = srcBlk->begin(), + iterEnd = srcBlk->end(); + iter != iterEnd; ++iter) { + MachineInstr *instr = func->CloneMachineInstr(iter); + newBlk->push_back(instr); + } + return newBlk; + } + + //MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose because + //the AMDGPU instruction is not recognized as terminator fix this and retire + //this routine + static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk, + MachineBasicBlock *oldBlk, + MachineBasicBlock *newBlk) { + MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk); + if (branchInstr && isCondBranch(branchInstr) && + getTrueBranch(branchInstr) == oldBlk) { + setTrueBranch(branchInstr, newBlk); + } + } + + static void wrapup(MachineBasicBlock *entryBlk) { + assert((!entryBlk->getParent()->getJumpTableInfo() + || entryBlk->getParent()->getJumpTableInfo()->isEmpty()) + && "found a jump table"); + + //collect continue right before endloop + SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr; + MachineBasicBlock::iterator pre = entryBlk->begin(); + MachineBasicBlock::iterator iterEnd = entryBlk->end(); + MachineBasicBlock::iterator iter = pre; + while (iter != iterEnd) { + if (pre->getOpcode() == AMDGPU::CONTINUE + && iter->getOpcode() == AMDGPU::ENDLOOP) { + contInstr.push_back(pre); + } + pre = iter; + ++iter; + } //end while + + //delete continue right before endloop + for (unsigned i = 0; i < contInstr.size(); ++i) { + contInstr[i]->eraseFromParent(); + } + + // TODO to fix up jump table so later phase won't be confused. if + // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but + // there isn't such an interface yet. 
alternatively, replace all the other + // blocks in the jump table with the entryBlk //} + + } //wrapup + + static MachineDominatorTree *getDominatorTree(AMDGPUCFGStructurizer &pass) { + return &pass.getAnalysis<MachineDominatorTree>(); + } + + static MachinePostDominatorTree* + getPostDominatorTree(AMDGPUCFGStructurizer &pass) { + return &pass.getAnalysis<MachinePostDominatorTree>(); + } + + static MachineLoopInfo *getLoopInfo(AMDGPUCFGStructurizer &pass) { + return &pass.getAnalysis<MachineLoopInfo>(); + } +}; // template class CFGStructTraits +} //end of namespace llvm + +// createAMDGPUCFGPreparationPass- Returns a pass +FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm + ) { + return new AMDGPUCFGPrepare(tm ); +} + +bool AMDGPUCFGPrepare::runOnMachineFunction(MachineFunction &func) { + return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().prepare(func, + *this, + TRI); +} + +// createAMDGPUCFGStructurizerPass- Returns a pass +FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm + ) { + return new AMDGPUCFGPerform(tm ); +} + +bool AMDGPUCFGPerform::runOnMachineFunction(MachineFunction &func) { + return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().run(func, + *this, + TRI); +} diff --git a/lib/Target/R600/AMDILDevice.cpp b/lib/Target/R600/AMDILDevice.cpp new file mode 100644 index 0000000000..db8e01ea40 --- /dev/null +++ b/lib/Target/R600/AMDILDevice.cpp @@ -0,0 +1,132 @@ +//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// +#include "AMDILDevice.h" +#include "AMDGPUSubtarget.h" + +using namespace llvm; +// Default implementation for all of the classes. 
+AMDGPUDevice::AMDGPUDevice(AMDGPUSubtarget *ST) : mSTM(ST) { + mHWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities); + mSWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities); + setCaps(); + DeviceFlag = OCL_DEVICE_ALL; +} + +AMDGPUDevice::~AMDGPUDevice() { + mHWBits.clear(); + mSWBits.clear(); +} + +size_t AMDGPUDevice::getMaxGDSSize() const { + return 0; +} + +uint32_t +AMDGPUDevice::getDeviceFlag() const { + return DeviceFlag; +} + +size_t AMDGPUDevice::getMaxNumCBs() const { + if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) { + return HW_MAX_NUM_CB; + } + + return 0; +} + +size_t AMDGPUDevice::getMaxCBSize() const { + if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) { + return MAX_CB_SIZE; + } + + return 0; +} + +size_t AMDGPUDevice::getMaxScratchSize() const { + return 65536; +} + +uint32_t AMDGPUDevice::getStackAlignment() const { + return 16; +} + +void AMDGPUDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::HalfOps); + mSWBits.set(AMDGPUDeviceInfo::ByteOps); + mSWBits.set(AMDGPUDeviceInfo::ShortOps); + mSWBits.set(AMDGPUDeviceInfo::HW64BitDivMod); + if (mSTM->isOverride(AMDGPUDeviceInfo::NoInline)) { + mSWBits.set(AMDGPUDeviceInfo::NoInline); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::MacroDB)) { + mSWBits.set(AMDGPUDeviceInfo::MacroDB); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) { + mSWBits.set(AMDGPUDeviceInfo::ConstantMem); + } else { + mHWBits.set(AMDGPUDeviceInfo::ConstantMem); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) { + mSWBits.set(AMDGPUDeviceInfo::PrivateMem); + } else { + mHWBits.set(AMDGPUDeviceInfo::PrivateMem); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::BarrierDetect)) { + mSWBits.set(AMDGPUDeviceInfo::BarrierDetect); + } + mSWBits.set(AMDGPUDeviceInfo::ByteLDSOps); + mSWBits.set(AMDGPUDeviceInfo::LongOps); +} + +AMDGPUDeviceInfo::ExecutionMode +AMDGPUDevice::getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const { + if (mHWBits[Caps]) { + assert(!mSWBits[Caps] && "Cannot set both SW and HW caps"); + return 
AMDGPUDeviceInfo::Hardware; + } + + if (mSWBits[Caps]) { + assert(!mHWBits[Caps] && "Cannot set both SW and HW caps"); + return AMDGPUDeviceInfo::Software; + } + + return AMDGPUDeviceInfo::Unsupported; + +} + +bool AMDGPUDevice::isSupported(AMDGPUDeviceInfo::Caps Mode) const { + return getExecutionMode(Mode) != AMDGPUDeviceInfo::Unsupported; +} + +bool AMDGPUDevice::usesHardware(AMDGPUDeviceInfo::Caps Mode) const { + return getExecutionMode(Mode) == AMDGPUDeviceInfo::Hardware; +} + +bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const { + return getExecutionMode(Mode) == AMDGPUDeviceInfo::Software; +} + +std::string +AMDGPUDevice::getDataLayout() const { + std::string DataLayout = std::string( + "e" + "-p:32:32:32" + "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32" + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128" + "-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048" + "-n32:64" + ); + + if (usesHardware(AMDGPUDeviceInfo::DoubleOps)) { + DataLayout.append("-f64:64:64"); + } + + return DataLayout; +} diff --git a/lib/Target/R600/AMDILDevice.h b/lib/Target/R600/AMDILDevice.h new file mode 100644 index 0000000000..97df98cafb --- /dev/null +++ b/lib/Target/R600/AMDILDevice.h @@ -0,0 +1,117 @@ +//===---- AMDILDevice.h - Define Device Data for AMDGPU -----*- C++ -*------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface for the subtarget data classes. +// +/// This file will define the interface that each generation needs to +/// implement in order to correctly answer queries on the capabilities of the +/// specific hardware. 
+//===----------------------------------------------------------------------===// +#ifndef AMDILDEVICEIMPL_H +#define AMDILDEVICEIMPL_H +#include "AMDIL.h" +#include "llvm/ADT/BitVector.h" + +namespace llvm { + class AMDGPUSubtarget; + class MCStreamer; +//===----------------------------------------------------------------------===// +// Interface for data that is specific to a single device +//===----------------------------------------------------------------------===// +class AMDGPUDevice { +public: + AMDGPUDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPUDevice(); + + // Enum values for the various memory types. + enum { + RAW_UAV_ID = 0, + ARENA_UAV_ID = 1, + LDS_ID = 2, + GDS_ID = 3, + SCRATCH_ID = 4, + CONSTANT_ID = 5, + GLOBAL_ID = 6, + MAX_IDS = 7 + } IO_TYPE_IDS; + + /// \returns The max LDS size that the hardware supports. Size is in + /// bytes. + virtual size_t getMaxLDSSize() const = 0; + + /// \returns The max GDS size that the hardware supports if the GDS is + /// supported by the hardware. Size is in bytes. + virtual size_t getMaxGDSSize() const; + + /// \returns The max number of hardware constant address spaces that + /// are supported by this device. + virtual size_t getMaxNumCBs() const; + + /// \returns The max number of bytes a single hardware constant buffer + /// can support. Size is in bytes. + virtual size_t getMaxCBSize() const; + + /// \returns The max number of bytes allowed by the hardware scratch + /// buffer. Size is in bytes. + virtual size_t getMaxScratchSize() const; + + /// \brief Get the flag that corresponds to the device. + virtual uint32_t getDeviceFlag() const; + + /// \returns The number of work-items that exist in a single hardware + /// wavefront. + virtual size_t getWavefrontSize() const = 0; + + /// \brief Get the generational name of this specific device. + virtual uint32_t getGeneration() const = 0; + + /// \brief Get the stack alignment of this specific device. 
+ virtual uint32_t getStackAlignment() const; + + /// \brief Get the resource ID for this specific device. + virtual uint32_t getResourceID(uint32_t DeviceID) const = 0; + + /// \brief Get the max number of UAV's for this device. + virtual uint32_t getMaxNumUAVs() const = 0; + + + // API utilizing more detailed capabilities of each family of + // cards. If a capability is supported, then either usesHardware or + // usesSoftware returned true. If usesHardware returned true, then + // usesSoftware must return false for the same capability. Hardware + // execution means that the feature is done natively by the hardware + // and is not emulated by the softare. Software execution means + // that the feature could be done in the hardware, but there is + // software that emulates it with possibly using the hardware for + // support since the hardware does not fully comply with OpenCL + // specs. + + bool isSupported(AMDGPUDeviceInfo::Caps Mode) const; + bool usesHardware(AMDGPUDeviceInfo::Caps Mode) const; + bool usesSoftware(AMDGPUDeviceInfo::Caps Mode) const; + virtual std::string getDataLayout() const; + static const unsigned int MAX_LDS_SIZE_700 = 16384; + static const unsigned int MAX_LDS_SIZE_800 = 32768; + static const unsigned int WavefrontSize = 64; + static const unsigned int HalfWavefrontSize = 32; + static const unsigned int QuarterWavefrontSize = 16; +protected: + virtual void setCaps(); + BitVector mHWBits; + llvm::BitVector mSWBits; + AMDGPUSubtarget *mSTM; + uint32_t DeviceFlag; +private: + AMDGPUDeviceInfo::ExecutionMode + getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const; +}; + +} // namespace llvm +#endif // AMDILDEVICEIMPL_H diff --git a/lib/Target/R600/AMDILDeviceInfo.cpp b/lib/Target/R600/AMDILDeviceInfo.cpp new file mode 100644 index 0000000000..9605fbe633 --- /dev/null +++ b/lib/Target/R600/AMDILDeviceInfo.cpp @@ -0,0 +1,94 @@ +//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===// +// +// The LLVM Compiler 
Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Function that creates DeviceInfo from a device name and other information. +// +//==-----------------------------------------------------------------------===// +#include "AMDILDevices.h" +#include "AMDGPUSubtarget.h" + +using namespace llvm; +namespace llvm { +namespace AMDGPUDeviceInfo { + +AMDGPUDevice* getDeviceFromName(const std::string &deviceName, + AMDGPUSubtarget *ptr, + bool is64bit, bool is64on32bit) { + if (deviceName.c_str()[2] == '7') { + switch (deviceName.c_str()[3]) { + case '1': + return new AMDGPU710Device(ptr); + case '7': + return new AMDGPU770Device(ptr); + default: + return new AMDGPU7XXDevice(ptr); + } + } else if (deviceName == "cypress") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUCypressDevice(ptr); + } else if (deviceName == "juniper") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUEvergreenDevice(ptr); + } else if (deviceName == "redwood") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPURedwoodDevice(ptr); + } else if (deviceName == "cedar") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUCedarDevice(ptr); + } else if (deviceName == "barts" || deviceName == "turks") { +#if DEBUG + assert(!is64bit && 
"This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUNIDevice(ptr); + } else if (deviceName == "cayman") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUCaymanDevice(ptr); + } else if (deviceName == "caicos") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUNIDevice(ptr); + } else if (deviceName == "SI") { + return new AMDGPUSIDevice(ptr); + } else { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPU7XXDevice(ptr); + } +} +} // End namespace AMDGPUDeviceInfo +} // End namespace llvm diff --git a/lib/Target/R600/AMDILDeviceInfo.h b/lib/Target/R600/AMDILDeviceInfo.h new file mode 100644 index 0000000000..4b2c3a53c7 --- /dev/null +++ b/lib/Target/R600/AMDILDeviceInfo.h @@ -0,0 +1,88 @@ +//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// +#ifndef AMDILDEVICEINFO_H +#define AMDILDEVICEINFO_H + + +#include <string> + +namespace llvm { + class AMDGPUDevice; + class AMDGPUSubtarget; + namespace AMDGPUDeviceInfo { + /// Each Capabilities can be executed using a hardware instruction, + /// emulated with a sequence of software instructions, or not + /// supported at all. 
+ enum ExecutionMode { + Unsupported = 0, ///< Unsupported feature on the card(Default value) + /// This is the execution mode that is set if the feature is emulated in + /// software. + Software, + /// This execution mode is set if the feature exists natively in hardware + Hardware + }; + + enum Caps { + HalfOps = 0x1, ///< Half float is supported or not. + DoubleOps = 0x2, ///< Double is supported or not. + ByteOps = 0x3, ///< Byte(char) is support or not. + ShortOps = 0x4, ///< Short is supported or not. + LongOps = 0x5, ///< Long is supported or not. + Images = 0x6, ///< Images are supported or not. + ByteStores = 0x7, ///< ByteStores available(!HD4XXX). + ConstantMem = 0x8, ///< Constant/CB memory. + LocalMem = 0x9, ///< Local/LDS memory. + PrivateMem = 0xA, ///< Scratch/Private/Stack memory. + RegionMem = 0xB, ///< OCL GDS Memory Extension. + FMA = 0xC, ///< Use HW FMA or SW FMA. + ArenaSegment = 0xD, ///< Use for Arena UAV per pointer 12-1023. + MultiUAV = 0xE, ///< Use for UAV per Pointer 0-7. + Reserved0 = 0xF, ///< ReservedFlag + NoAlias = 0x10, ///< Cached loads. + Signed24BitOps = 0x11, ///< Peephole Optimization. + /// Debug mode implies that no hardware features or optimizations + /// are performned and that all memory access go through a single + /// uav(Arena on HD5XXX/HD6XXX and Raw on HD4XXX). + Debug = 0x12, + CachedMem = 0x13, ///< Cached mem is available or not. + BarrierDetect = 0x14, ///< Detect duplicate barriers. + Reserved1 = 0x15, ///< Reserved flag + ByteLDSOps = 0x16, ///< Flag to specify if byte LDS ops are available. + ArenaVectors = 0x17, ///< Flag to specify if vector loads from arena work. + TmrReg = 0x18, ///< Flag to specify if Tmr register is supported. + NoInline = 0x19, ///< Flag to specify that no inlining should occur. + MacroDB = 0x1A, ///< Flag to specify that backend handles macrodb. + HW64BitDivMod = 0x1B, ///< Flag for backend to generate 64bit div/mod. 
+ ArenaUAV = 0x1C, ///< Flag to specify that arena uav is supported. + PrivateUAV = 0x1D, ///< Flag to specify that private memory uses uav's. + /// If more capabilities are required, then + /// this number needs to be increased. + /// All capabilities must come before this + /// number. + MaxNumberCapabilities = 0x20 + }; + /// These have to be in order with the older generations + /// having the lower number enumerations. + enum Generation { + HD4XXX = 0, ///< 7XX based devices. + HD5XXX, ///< Evergreen based devices. + HD6XXX, ///< NI/Evergreen+ based devices. + HD7XXX, ///< Southern Islands based devices. + HDTEST, ///< Experimental feature testing device. + HDNUMGEN + }; + + + AMDGPUDevice* + getDeviceFromName(const std::string &name, AMDGPUSubtarget *ptr, + bool is64bit = false, bool is64on32bit = false); + } // namespace AMDILDeviceInfo +} // namespace llvm +#endif // AMDILDEVICEINFO_H diff --git a/lib/Target/R600/AMDILDevices.h b/lib/Target/R600/AMDILDevices.h new file mode 100644 index 0000000000..636fa6d359 --- /dev/null +++ b/lib/Target/R600/AMDILDevices.h @@ -0,0 +1,19 @@ +//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//==-----------------------------------------------------------------------===// +#ifndef AMDIL_DEVICES_H +#define AMDIL_DEVICES_H +// Include all of the device specific header files +#include "AMDIL7XXDevice.h" +#include "AMDILDevice.h" +#include "AMDILEvergreenDevice.h" +#include "AMDILNIDevice.h" +#include "AMDILSIDevice.h" + +#endif // AMDIL_DEVICES_H diff --git a/lib/Target/R600/AMDILEvergreenDevice.cpp b/lib/Target/R600/AMDILEvergreenDevice.cpp new file mode 100644 index 0000000000..c5213a0410 --- /dev/null +++ b/lib/Target/R600/AMDILEvergreenDevice.cpp @@ -0,0 +1,169 @@ +//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// +#include "AMDILEvergreenDevice.h" + +using namespace llvm; + +AMDGPUEvergreenDevice::AMDGPUEvergreenDevice(AMDGPUSubtarget *ST) +: AMDGPUDevice(ST) { + setCaps(); + std::string name = ST->getDeviceName(); + if (name == "cedar") { + DeviceFlag = OCL_DEVICE_CEDAR; + } else if (name == "redwood") { + DeviceFlag = OCL_DEVICE_REDWOOD; + } else if (name == "cypress") { + DeviceFlag = OCL_DEVICE_CYPRESS; + } else { + DeviceFlag = OCL_DEVICE_JUNIPER; + } +} + +AMDGPUEvergreenDevice::~AMDGPUEvergreenDevice() { +} + +size_t AMDGPUEvergreenDevice::getMaxLDSSize() const { + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return MAX_LDS_SIZE_800; + } else { + return 0; + } +} +size_t AMDGPUEvergreenDevice::getMaxGDSSize() const { + if (usesHardware(AMDGPUDeviceInfo::RegionMem)) { + return MAX_LDS_SIZE_800; + } else { + return 0; + } +} +uint32_t AMDGPUEvergreenDevice::getMaxNumUAVs() const { + return 12; +} + +uint32_t AMDGPUEvergreenDevice::getResourceID(uint32_t id) const { + switch(id) { + default: + assert(0 && "ID type passed in is unknown!"); + 
break; + case CONSTANT_ID: + case RAW_UAV_ID: + return GLOBAL_RETURN_RAW_UAV_ID; + case GLOBAL_ID: + case ARENA_UAV_ID: + return DEFAULT_ARENA_UAV_ID; + case LDS_ID: + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return DEFAULT_LDS_ID; + } else { + return DEFAULT_ARENA_UAV_ID; + } + case GDS_ID: + if (usesHardware(AMDGPUDeviceInfo::RegionMem)) { + return DEFAULT_GDS_ID; + } else { + return DEFAULT_ARENA_UAV_ID; + } + case SCRATCH_ID: + if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) { + return DEFAULT_SCRATCH_ID; + } else { + return DEFAULT_ARENA_UAV_ID; + } + }; + return 0; +} + +size_t AMDGPUEvergreenDevice::getWavefrontSize() const { + return AMDGPUDevice::WavefrontSize; +} + +uint32_t AMDGPUEvergreenDevice::getGeneration() const { + return AMDGPUDeviceInfo::HD5XXX; +} + +void AMDGPUEvergreenDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::ArenaSegment); + mHWBits.set(AMDGPUDeviceInfo::ArenaUAV); + mHWBits.set(AMDGPUDeviceInfo::HW64BitDivMod); + mSWBits.reset(AMDGPUDeviceInfo::HW64BitDivMod); + mSWBits.set(AMDGPUDeviceInfo::Signed24BitOps); + if (mSTM->isOverride(AMDGPUDeviceInfo::ByteStores)) { + mHWBits.set(AMDGPUDeviceInfo::ByteStores); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) { + mSWBits.set(AMDGPUDeviceInfo::LocalMem); + mSWBits.set(AMDGPUDeviceInfo::RegionMem); + } else { + mHWBits.set(AMDGPUDeviceInfo::LocalMem); + mHWBits.set(AMDGPUDeviceInfo::RegionMem); + } + mHWBits.set(AMDGPUDeviceInfo::Images); + if (mSTM->isOverride(AMDGPUDeviceInfo::NoAlias)) { + mHWBits.set(AMDGPUDeviceInfo::NoAlias); + } + mHWBits.set(AMDGPUDeviceInfo::CachedMem); + if (mSTM->isOverride(AMDGPUDeviceInfo::MultiUAV)) { + mHWBits.set(AMDGPUDeviceInfo::MultiUAV); + } + mHWBits.set(AMDGPUDeviceInfo::ByteLDSOps); + mSWBits.reset(AMDGPUDeviceInfo::ByteLDSOps); + mHWBits.set(AMDGPUDeviceInfo::ArenaVectors); + mHWBits.set(AMDGPUDeviceInfo::LongOps); + mSWBits.reset(AMDGPUDeviceInfo::LongOps); + mHWBits.set(AMDGPUDeviceInfo::TmrReg); +} + 
+AMDGPUCypressDevice::AMDGPUCypressDevice(AMDGPUSubtarget *ST) + : AMDGPUEvergreenDevice(ST) { + setCaps(); +} + +AMDGPUCypressDevice::~AMDGPUCypressDevice() { +} + +void AMDGPUCypressDevice::setCaps() { + if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) { + mHWBits.set(AMDGPUDeviceInfo::DoubleOps); + mHWBits.set(AMDGPUDeviceInfo::FMA); + } +} + + +AMDGPUCedarDevice::AMDGPUCedarDevice(AMDGPUSubtarget *ST) + : AMDGPUEvergreenDevice(ST) { + setCaps(); +} + +AMDGPUCedarDevice::~AMDGPUCedarDevice() { +} + +void AMDGPUCedarDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::FMA); +} + +size_t AMDGPUCedarDevice::getWavefrontSize() const { + return AMDGPUDevice::QuarterWavefrontSize; +} + +AMDGPURedwoodDevice::AMDGPURedwoodDevice(AMDGPUSubtarget *ST) + : AMDGPUEvergreenDevice(ST) { + setCaps(); +} + +AMDGPURedwoodDevice::~AMDGPURedwoodDevice() { +} + +void AMDGPURedwoodDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::FMA); +} + +size_t AMDGPURedwoodDevice::getWavefrontSize() const { + return AMDGPUDevice::HalfWavefrontSize; +} diff --git a/lib/Target/R600/AMDILEvergreenDevice.h b/lib/Target/R600/AMDILEvergreenDevice.h new file mode 100644 index 0000000000..ea90f774a8 --- /dev/null +++ b/lib/Target/R600/AMDILEvergreenDevice.h @@ -0,0 +1,93 @@ +//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface for the subtarget data classes. +/// +/// This file will define the interface that each generation needs to +/// implement in order to correctly answer queries on the capabilities of the +/// specific hardware. 
+//===----------------------------------------------------------------------===// +#ifndef AMDILEVERGREENDEVICE_H +#define AMDILEVERGREENDEVICE_H +#include "AMDGPUSubtarget.h" +#include "AMDILDevice.h" + +namespace llvm { + class AMDGPUSubtarget; +//===----------------------------------------------------------------------===// +// Evergreen generation of devices and their respective sub classes +//===----------------------------------------------------------------------===// + + +/// \brief The AMDGPUEvergreenDevice is the base device class for all of the Evergreen +/// series of cards. +/// +/// This class contains information required to differentiate +/// the Evergreen device from the generic AMDGPUDevice. This device represents +/// that capabilities of the 'Juniper' cards, also known as the HD57XX. +class AMDGPUEvergreenDevice : public AMDGPUDevice { +public: + AMDGPUEvergreenDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPUEvergreenDevice(); + virtual size_t getMaxLDSSize() const; + virtual size_t getMaxGDSSize() const; + virtual size_t getWavefrontSize() const; + virtual uint32_t getGeneration() const; + virtual uint32_t getMaxNumUAVs() const; + virtual uint32_t getResourceID(uint32_t) const; +protected: + virtual void setCaps(); +}; + +/// The AMDGPUCypressDevice is similiar to the AMDGPUEvergreenDevice, except it has +/// support for double precision operations. This device is used to represent +/// both the Cypress and Hemlock cards, which are commercially known as HD58XX +/// and HD59XX cards. +class AMDGPUCypressDevice : public AMDGPUEvergreenDevice { +public: + AMDGPUCypressDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPUCypressDevice(); +private: + virtual void setCaps(); +}; + + +/// \brief The AMDGPUCedarDevice is the class that represents all of the 'Cedar' based +/// devices. +/// +/// This class differs from the base AMDGPUEvergreenDevice in that the +/// device is a ~quarter of the 'Juniper'. 
These are commercially known as the +/// HD54XX and HD53XX series of cards. +class AMDGPUCedarDevice : public AMDGPUEvergreenDevice { +public: + AMDGPUCedarDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPUCedarDevice(); + virtual size_t getWavefrontSize() const; +private: + virtual void setCaps(); +}; + +/// \brief The AMDGPURedwoodDevice is the class the represents all of the 'Redwood' based +/// devices. +/// +/// This class differs from the base class, in that these devices are +/// considered about half of a 'Juniper' device. These are commercially known as +/// the HD55XX and HD56XX series of cards. +class AMDGPURedwoodDevice : public AMDGPUEvergreenDevice { +public: + AMDGPURedwoodDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPURedwoodDevice(); + virtual size_t getWavefrontSize() const; +private: + virtual void setCaps(); +}; + +} // namespace llvm +#endif // AMDILEVERGREENDEVICE_H diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp new file mode 100644 index 0000000000..0c7880d232 --- /dev/null +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -0,0 +1,626 @@ +//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Defines an instruction selector for the AMDGPU target. 
+// +//===----------------------------------------------------------------------===// +#include "AMDGPUInstrInfo.h" +#include "AMDGPUISelLowering.h" // For AMDGPUISD +#include "AMDGPURegisterInfo.h" +#include "AMDILDevices.h" +#include "R600InstrInfo.h" +#include "SIISelLowering.h" +#include "llvm/ADT/ValueMap.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Support/Compiler.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include <list> +#include <queue> + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +namespace { +/// AMDGPU specific code to select AMDGPU machine instructions for +/// SelectionDAG operations. +class AMDGPUDAGToDAGISel : public SelectionDAGISel { + // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can + // make the right decision when generating code for different targets. 
+ const AMDGPUSubtarget &Subtarget; +public: + AMDGPUDAGToDAGISel(TargetMachine &TM); + virtual ~AMDGPUDAGToDAGISel(); + + SDNode *Select(SDNode *N); + virtual const char *getPassName() const; + virtual void PostprocessISelDAG(); + +private: + inline SDValue getSmallIPtrImm(unsigned Imm); + bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); + + // Complex pattern selectors + bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); + bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); + bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); + + static bool checkType(const Value *ptr, unsigned int addrspace); + static const Value *getBasePointerValue(const Value *V); + + static bool isGlobalStore(const StoreSDNode *N); + static bool isPrivateStore(const StoreSDNode *N); + static bool isLocalStore(const StoreSDNode *N); + static bool isRegionStore(const StoreSDNode *N); + + static bool isCPLoad(const LoadSDNode *N); + static bool isConstantLoad(const LoadSDNode *N, int cbID); + static bool isGlobalLoad(const LoadSDNode *N); + static bool isParamLoad(const LoadSDNode *N); + static bool isPrivateLoad(const LoadSDNode *N); + static bool isLocalLoad(const LoadSDNode *N); + static bool isRegionLoad(const LoadSDNode *N); + + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, + SDValue &BaseReg, SDValue& Offset); + bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); + + // Include the pieces autogenerated from the target description. +#include "AMDGPUGenDAGISel.inc" +}; +} // end anonymous namespace + +/// \brief This pass converts a legalized DAG into a AMDGPU-specific +// DAG, ready for instruction scheduling. 
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM + ) { + return new AMDGPUDAGToDAGISel(TM); +} + +AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM + ) + : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) { +} + +AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { +} + +SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); +} + +bool AMDGPUDAGToDAGISel::SelectADDRParam( + SDValue Addr, SDValue& R1, SDValue& R2) { + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + R2 = CurDAG->getTargetConstant(0, MVT::i32); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i32); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i32); + } + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + return SelectADDRParam(Addr, R1, R2); +} + + +bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); + R2 = CurDAG->getTargetConstant(0, MVT::i64); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i64); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i64); + } + return true; +} + +SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { + 
unsigned int Opc = N->getOpcode(); + if (N->isMachineOpcode()) { + return NULL; // Already selected. + } + switch (Opc) { + default: break; + case ISD::BUILD_VECTOR: { + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + break; + } + // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG + // that adds a 128 bits reg copy when going through TwoAddressInstructions + // pass. We want to avoid 128 bits copies as much as possible because they + // can't be bundled by our scheduler. + SDValue RegSeqArgs[9] = { + CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32), + SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32), + SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32), + SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32), + SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32) + }; + bool IsRegSeq = true; + for (unsigned i = 0; i < N->getNumOperands(); i++) { + if (dyn_cast<RegisterSDNode>(N->getOperand(i))) { + IsRegSeq = false; + break; + } + RegSeqArgs[2 * i + 1] = N->getOperand(i); + } + if (!IsRegSeq) + break; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), + RegSeqArgs, 2 * N->getNumOperands() + 1); + } + case ISD::ConstantFP: + case ISD::Constant: { + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + // XXX: Custom immediate lowering not implemented yet. 
Instead we use + // pseudo instructions defined in SIInstructions.td + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + break; + } + const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo()); + + uint64_t ImmValue = 0; + unsigned ImmReg = AMDGPU::ALU_LITERAL_X; + + if (N->getOpcode() == ISD::ConstantFP) { + // XXX: 64-bit Immediates not supported yet + assert(N->getValueType(0) != MVT::f64); + + ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N); + APFloat Value = C->getValueAPF(); + float FloatValue = Value.convertToFloat(); + if (FloatValue == 0.0) { + ImmReg = AMDGPU::ZERO; + } else if (FloatValue == 0.5) { + ImmReg = AMDGPU::HALF; + } else if (FloatValue == 1.0) { + ImmReg = AMDGPU::ONE; + } else { + ImmValue = Value.bitcastToAPInt().getZExtValue(); + } + } else { + // XXX: 64-bit Immediates not supported yet + assert(N->getValueType(0) != MVT::i64); + + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N); + if (C->getZExtValue() == 0) { + ImmReg = AMDGPU::ZERO; + } else if (C->getZExtValue() == 1) { + ImmReg = AMDGPU::ONE_INT; + } else { + ImmValue = C->getZExtValue(); + } + } + + for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use); + Use != SDNode::use_end(); Use = Next) { + Next = llvm::next(Use); + std::vector<SDValue> Ops; + for (unsigned i = 0; i < Use->getNumOperands(); ++i) { + Ops.push_back(Use->getOperand(i)); + } + + if (!Use->isMachineOpcode()) { + if (ImmReg == AMDGPU::ALU_LITERAL_X) { + // We can only use literal constants (e.g. AMDGPU::ZERO, + // AMDGPU::ONE, etc) in machine opcodes. + continue; + } + } else { + if (!TII->isALUInstr(Use->getMachineOpcode()) || + (TII->get(Use->getMachineOpcode()).TSFlags & + R600_InstFlag::VECTOR)) { + continue; + } + + int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), R600Operands::IMM); + assert(ImmIdx != -1); + + // subtract one from ImmIdx, because the DST operand is usually index + // 0 for MachineInstrs, but we have no DST in the Ops vector. 
+ ImmIdx--; + + // Check that we aren't already using an immediate. + // XXX: It's possible for an instruction to have more than one + // immediate operand, but this is not supported yet. + if (ImmReg == AMDGPU::ALU_LITERAL_X) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx)); + assert(C); + + if (C->getZExtValue() != 0) { + // This instruction is already using an immediate. + continue; + } + + // Set the immediate value + Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32); + } + } + // Set the immediate register + Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32); + + CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands()); + } + break; + } + } + SDNode *Result = SelectCode(N); + + // Fold operands of selected node + + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + const R600InstrInfo *TII = + static_cast<const R600InstrInfo*>(TM.getInstrInfo()); + if (Result && Result->isMachineOpcode() && + !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR) + && TII->isALUInstr(Result->getMachineOpcode())) { + // Fold FNEG/FABS/CONST_ADDRESS + // TODO: Isel can generate multiple MachineInst, we need to recursively + // parse Result + bool IsModified = false; + do { + std::vector<SDValue> Ops; + for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end(); + I != E; ++I) + Ops.push_back(*I); + IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops); + if (IsModified) { + Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size()); + } + } while (IsModified); + + // If node has a single use which is CLAMP_R600, folds it + if (Result->hasOneUse() && Result->isMachineOpcode()) { + SDNode *PotentialClamp = *Result->use_begin(); + if (PotentialClamp->isMachineOpcode() && + PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) { + unsigned ClampIdx = + 
TII->getOperandIdx(Result->getMachineOpcode(), R600Operands::CLAMP); + std::vector<SDValue> Ops; + unsigned NumOp = Result->getNumOperands(); + for (unsigned i = 0; i < NumOp; ++i) { + Ops.push_back(Result->getOperand(i)); + } + Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32); + Result = CurDAG->SelectNodeTo(PotentialClamp, + Result->getMachineOpcode(), PotentialClamp->getVTList(), + Ops.data(), NumOp); + } + } + } + } + + return Result; +} + +bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, + const R600InstrInfo *TII, std::vector<SDValue> &Ops) { + int OperandIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0), + TII->getOperandIdx(Opcode, R600Operands::SRC1), + TII->getOperandIdx(Opcode, R600Operands::SRC2) + }; + int SelIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL), + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL), + TII->getOperandIdx(Opcode, R600Operands::SRC2_SEL) + }; + int NegIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG), + TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG), + TII->getOperandIdx(Opcode, R600Operands::SRC2_NEG) + }; + int AbsIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS), + TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS), + -1 + }; + + for (unsigned i = 0; i < 3; i++) { + if (OperandIdx[i] < 0) + return false; + SDValue Operand = Ops[OperandIdx[i] - 1]; + switch (Operand.getOpcode()) { + case AMDGPUISD::CONST_ADDRESS: { + if (i == 2) + break; + SDValue CstOffset; + if (!Operand.getValueType().isVector() && + SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) { + Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32); + Ops[SelIdx[i] - 1] = CstOffset; + return true; + } + } + break; + case ISD::FNEG: + if (NegIdx[i] < 0) + break; + Ops[OperandIdx[i] - 1] = Operand.getOperand(0); + Ops[NegIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32); + return true; + case ISD::FABS: + if (AbsIdx[i] < 0) + break; + Ops[OperandIdx[i] - 1] = 
Operand.getOperand(0); + Ops[AbsIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32); + return true; + case ISD::BITCAST: + Ops[OperandIdx[i] - 1] = Operand.getOperand(0); + return true; + default: + break; + } + } + return false; +} + +bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) { + if (!ptr) { + return false; + } + Type *ptrType = ptr->getType(); + return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace; +} + +const Value * AMDGPUDAGToDAGISel::getBasePointerValue(const Value *V) { + if (!V) { + return NULL; + } + const Value *ret = NULL; + ValueMap<const Value *, bool> ValueBitMap; + std::queue<const Value *, std::list<const Value *> > ValueQueue; + ValueQueue.push(V); + while (!ValueQueue.empty()) { + V = ValueQueue.front(); + if (ValueBitMap.find(V) == ValueBitMap.end()) { + ValueBitMap[V] = true; + if (dyn_cast<Argument>(V) && dyn_cast<PointerType>(V->getType())) { + ret = V; + break; + } else if (dyn_cast<GlobalVariable>(V)) { + ret = V; + break; + } else if (dyn_cast<Constant>(V)) { + const ConstantExpr *CE = dyn_cast<ConstantExpr>(V); + if (CE) { + ValueQueue.push(CE->getOperand(0)); + } + } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) { + ret = AI; + break; + } else if (const Instruction *I = dyn_cast<Instruction>(V)) { + uint32_t numOps = I->getNumOperands(); + for (uint32_t x = 0; x < numOps; ++x) { + ValueQueue.push(I->getOperand(x)); + } + } else { + assert(!"Found a Value that we didn't know how to handle!"); + } + } + ValueQueue.pop(); + } + return ret; +} + +bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { + return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)); +} + +bool AMDGPUDAGToDAGISel::isLocalStore(const 
StoreSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) { + if (checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)) { + return true; + } + MachineMemOperand *MMO = N->getMemOperand(); + const Value *V = MMO->getValue(); + const Value *BV = getBasePointerValue(V); + if (MMO + && MMO->getValue() + && ((V && dyn_cast<GlobalValue>(V)) + || (BV && dyn_cast<GlobalValue>( + getBasePointerValue(MMO->getValue()))))) { + return checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS); + } else { + return false; + } +} + +bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) { + MachineMemOperand *MMO = N->getMemOperand(); + if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) { + if (MMO) { + const Value *V = MMO->getValue(); + const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V); + if (PSV && PSV == PseudoSourceValue::getConstantPool()) { + return true; + } + } + } + return false; +} + +bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) { + if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) { + // Check to make sure we are not a constant pool load or a constant load + // that is marked as a private load + if (isCPLoad(N) || isConstantLoad(N, -1)) { + return false; + } + } + if 
(!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) { + return true; + } + return false; +} + +const char *AMDGPUDAGToDAGISel::getPassName() const { + return "AMDGPU DAG->DAG Pattern Instruction Selection"; +} + +#ifdef DEBUGTMP +#undef INT64_C +#endif +#undef DEBUGTMP + +///==== AMDGPU Functions ====/// + +bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, + SDValue& IntPtr) { + if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { + IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, + SDValue& BaseReg, SDValue &Offset) { + if (!dyn_cast<ConstantSDNode>(Addr)) { + BaseReg = Addr; + Offset = CurDAG->getIntPtrConstant(0, true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode * IMMOffset; + + if (Addr.getOpcode() == ISD::ADD + && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) + && isInt<16>(IMMOffset->getZExtValue())) { + + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32); + return true; + // If the pointer address is constant, we can move it to the offset field. 
+ } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr)) + && isInt<16>(IMMOffset->getZExtValue())) { + Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), + CurDAG->getEntryNode().getDebugLoc(), + AMDGPU::ZERO, MVT::i32); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32); + return true; + } + + // Default case, no offset + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *C; + + if ((C = dyn_cast<ConstantSDNode>(Addr))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32); + } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && + (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32); + } else { + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + } + + return true; +} + +void AMDGPUDAGToDAGISel::PostprocessISelDAG() { + + // Go over all selected nodes and try to fold them a bit more + const AMDGPUTargetLowering& Lowering = ((const AMDGPUTargetLowering&)TLI); + for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), + E = CurDAG->allnodes_end(); I != E; ++I) { + + MachineSDNode *Node = dyn_cast<MachineSDNode>(I); + if (!Node) + continue; + + SDNode *ResNode = Lowering.PostISelFolding(Node, *CurDAG); + if (ResNode != Node) + ReplaceUses(Node, ResNode); + } +} + diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp new file mode 100644 index 0000000000..922cac12b9 --- /dev/null +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -0,0 +1,647 @@ +//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief TargetLowering functions borrowed from AMDIL. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUISelLowering.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "AMDILDevices.h" +#include "AMDILIntrinsicInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; +//===----------------------------------------------------------------------===// +// TargetLowering Implementation Help Functions End +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TargetLowering Class Implementation Begins +//===----------------------------------------------------------------------===// +void AMDGPUTargetLowering::InitAMDILLowering() { + int types[] = { + (int)MVT::i8, + (int)MVT::i16, + (int)MVT::i32, + (int)MVT::f32, + (int)MVT::f64, + (int)MVT::i64, + (int)MVT::v2i8, + (int)MVT::v4i8, + (int)MVT::v2i16, + (int)MVT::v4i16, + (int)MVT::v4f32, + (int)MVT::v4i32, + (int)MVT::v2f32, + (int)MVT::v2i32, + (int)MVT::v2f64, + (int)MVT::v2i64 + }; + + int IntTypes[] = { + (int)MVT::i8, + (int)MVT::i16, + (int)MVT::i32, + (int)MVT::i64 + }; + + int FloatTypes[] = { + (int)MVT::f32, + (int)MVT::f64 + }; + + int VectorTypes[] = { + (int)MVT::v2i8, + 
(int)MVT::v4i8, + (int)MVT::v2i16, + (int)MVT::v4i16, + (int)MVT::v4f32, + (int)MVT::v4i32, + (int)MVT::v2f32, + (int)MVT::v2i32, + (int)MVT::v2f64, + (int)MVT::v2i64 + }; + size_t NumTypes = sizeof(types) / sizeof(*types); + size_t NumFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes); + size_t NumIntTypes = sizeof(IntTypes) / sizeof(*IntTypes); + size_t NumVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes); + + const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>(); + // These are the current register classes that are + // supported + + for (unsigned int x = 0; x < NumTypes; ++x) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x]; + + //FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types + // We cannot sextinreg, expand to shifts + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); + setOperationAction(ISD::SUBE, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::BRCOND, VT, Custom); + setOperationAction(ISD::BR_JT, VT, Expand); + setOperationAction(ISD::BRIND, VT, Expand); + // TODO: Implement custom UREM/SREM routines + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + if (VT != MVT::i64 && VT != MVT::v2i64) { + setOperationAction(ISD::SDIV, VT, Custom); + } + } + for (unsigned int x = 0; x < NumFloatTypes; ++x) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x]; + + // IL does not have these operations for floating point types + setOperationAction(ISD::FP_ROUND_INREG, VT, Expand); + setOperationAction(ISD::SETOLT, VT, Expand); + setOperationAction(ISD::SETOGE, VT, Expand); + setOperationAction(ISD::SETOGT, VT, Expand); + setOperationAction(ISD::SETOLE, VT, Expand); + setOperationAction(ISD::SETULT, VT, Expand); + setOperationAction(ISD::SETUGE, VT, Expand); + 
setOperationAction(ISD::SETUGT, VT, Expand); + setOperationAction(ISD::SETULE, VT, Expand); + } + + for (unsigned int x = 0; x < NumIntTypes; ++x) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x]; + + // GPU also does not have divrem function for signed or unsigned + setOperationAction(ISD::SDIVREM, VT, Expand); + + // GPU does not have [S|U]MUL_LOHI functions as a single instruction + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + + // GPU doesn't have a rotl, rotr, or byteswap instruction + setOperationAction(ISD::ROTR, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + + // GPU doesn't have any counting operators + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + } + + for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii]; + + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + // setOperationAction(ISD::VSETCC, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + + } + if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) { + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::v2i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::v2i64, Expand); + setOperationAction(ISD::ADD, MVT::v2i64, Expand); + setOperationAction(ISD::SREM, MVT::v2i64, Expand); + setOperationAction(ISD::Constant , MVT::i64 , Legal); + setOperationAction(ISD::SDIV, MVT::v2i64, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand); + } + if 
(STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) { + // we support loading/storing v2f64 but not operations on the type + setOperationAction(ISD::FADD, MVT::v2f64, Expand); + setOperationAction(ISD::FSUB, MVT::v2f64, Expand); + setOperationAction(ISD::FMUL, MVT::v2f64, Expand); + setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::ConstantFP , MVT::f64 , Legal); + // We want to expand vector conversions into their scalar + // counterparts. + setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::FABS, MVT::f64, Expand); + setOperationAction(ISD::FABS, MVT::v2f64, Expand); + } + // TODO: Fix the UDIV24 algorithm so it works for these + // types correctly. This needs vector comparisons + // for this to work correctly. + setOperationAction(ISD::UDIV, MVT::v2i8, Expand); + setOperationAction(ISD::UDIV, MVT::v4i8, Expand); + setOperationAction(ISD::UDIV, MVT::v2i16, Expand); + setOperationAction(ISD::UDIV, MVT::v4i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom); + setOperationAction(ISD::SUBC, MVT::Other, Expand); + setOperationAction(ISD::ADDE, MVT::Other, Expand); + setOperationAction(ISD::ADDC, MVT::Other, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); + + + // Use the default implementation. 
+ setOperationAction(ISD::ConstantFP , MVT::f32 , Legal); + setOperationAction(ISD::Constant , MVT::i32 , Legal); + + setSchedulingPreference(Sched::RegPressure); + setPow2DivIsCheap(false); + setSelectIsExpensive(true); + setJumpIsExpensive(true); + + MaxStoresPerMemcpy = 4096; + MaxStoresPerMemmove = 4096; + MaxStoresPerMemset = 4096; + +} + +bool +AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, unsigned Intrinsic) const { + return false; +} + +// The backend supports 32 and 64 bit floating point immediates +bool +AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 + || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { + return true; + } else { + return false; + } +} + +bool +AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { + if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 + || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { + return false; + } else { + return true; + } +} + + +// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to +// be zero. Op is expected to be a target specific node. Used by DAG +// combiner. 
+ +void +AMDGPUTargetLowering::computeMaskedBitsForTargetNode( + const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + APInt KnownZero2; + APInt KnownOne2; + KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything + switch (Op.getOpcode()) { + default: break; + case ISD::SELECT_CC: + DAG.ComputeMaskedBits( + Op.getOperand(1), + KnownZero, + KnownOne, + Depth + 1 + ); + DAG.ComputeMaskedBits( + Op.getOperand(0), + KnownZero2, + KnownOne2 + ); + assert((KnownZero & KnownOne) == 0 + && "Bits known to be one AND zero?"); + assert((KnownZero2 & KnownOne2) == 0 + && "Bits known to be one AND zero?"); + // Only known if known in both the LHS and RHS + KnownOne &= KnownOne2; + KnownZero &= KnownZero2; + break; + }; +} + +//===----------------------------------------------------------------------===// +// Other Lowering Hooks +//===----------------------------------------------------------------------===// + +SDValue +AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { + EVT OVT = Op.getValueType(); + SDValue DST; + if (OVT.getScalarType() == MVT::i64) { + DST = LowerSDIV64(Op, DAG); + } else if (OVT.getScalarType() == MVT::i32) { + DST = LowerSDIV32(Op, DAG); + } else if (OVT.getScalarType() == MVT::i16 + || OVT.getScalarType() == MVT::i8) { + DST = LowerSDIV24(Op, DAG); + } else { + DST = SDValue(Op.getNode(), 0); + } + return DST; +} + +SDValue +AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const { + EVT OVT = Op.getValueType(); + SDValue DST; + if (OVT.getScalarType() == MVT::i64) { + DST = LowerSREM64(Op, DAG); + } else if (OVT.getScalarType() == MVT::i32) { + DST = LowerSREM32(Op, DAG); + } else if (OVT.getScalarType() == MVT::i16) { + DST = LowerSREM16(Op, DAG); + } else if (OVT.getScalarType() == MVT::i8) { + DST = LowerSREM8(Op, DAG); + } else { + DST = SDValue(Op.getNode(), 0); + } + return DST; +} + +SDValue 
+AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { + SDValue Data = Op.getOperand(0); + VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1)); + DebugLoc DL = Op.getDebugLoc(); + EVT DVT = Data.getValueType(); + EVT BVT = BaseType->getVT(); + unsigned baseBits = BVT.getScalarType().getSizeInBits(); + unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1; + unsigned shiftBits = srcBits - baseBits; + if (srcBits < 32) { + // If the op is less than 32 bits, then it needs to extend to 32bits + // so it can properly keep the upper bits valid. + EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1); + Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data); + shiftBits = 32 - baseBits; + DVT = IVT; + } + SDValue Shift = DAG.getConstant(shiftBits, DVT); + // Shift left by 'Shift' bits. + Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift); + // Signed shift Right by 'Shift' bits. + Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift); + if (srcBits < 32) { + // Once the sign extension is done, the op needs to be converted to + // its original type. + Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType()); + } + return Data; +} +EVT +AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const { + int iSize = (size * numEle); + int vEle = (iSize >> ((size == 64) ? 
6 : 5)); + if (!vEle) { + vEle = 1; + } + if (size == 64) { + if (vEle == 1) { + return EVT(MVT::i64); + } else { + return EVT(MVT::getVectorVT(MVT::i64, vEle)); + } + } else { + if (vEle == 1) { + return EVT(MVT::i32); + } else { + return EVT(MVT::getVectorVT(MVT::i32, vEle)); + } + } +} + +SDValue +AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Jump = Op.getOperand(2); + SDValue Result; + Result = DAG.getNode( + AMDGPUISD::BRANCH_COND, + Op.getDebugLoc(), + Op.getValueType(), + Chain, Jump, Cond); + return Result; +} + +SDValue +AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + MVT INTTY; + MVT FLTTY; + if (!OVT.isVector()) { + INTTY = MVT::i32; + FLTTY = MVT::f32; + } else if (OVT.getVectorNumElements() == 2) { + INTTY = MVT::v2i32; + FLTTY = MVT::v2f32; + } else if (OVT.getVectorNumElements() == 4) { + INTTY = MVT::v4i32; + FLTTY = MVT::v4f32; + } + unsigned bitsize = OVT.getScalarType().getSizeInBits(); + // char|short jq = ia ^ ib; + SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, INTTY); + + // int ia = (int)LHS; + SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); + + // int ib, (int)RHS; + SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); + + // float fa = (float)ia; + SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); + + // float fb = (float)ib; + SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); + + // float fq = native_divide(fa, fb); + SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb); + + // fq = trunc(fq); + 
fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); + + // float fqneg = -fq; + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); + + // float fr = mad(fqneg, fb, fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, + DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); + + // int iq = (int)fq; + SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); + + // fr = fabs(fr); + fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); + + // fb = fabs(fb); + fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); + + // int cv = fr >= fb; + SDValue cv; + if (INTTY == MVT::i32) { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } else { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } + // jq = (cv ? jq : 0); + jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, + DAG.getConstant(0, OVT)); + // dst = iq + jq; + iq = DAG.getSExtOrTrunc(iq, DL, OVT); + iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); + return iq; +} + +SDValue +AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSDIV32 function generates equivalent to the following IL. 
+ // mov r0, LHS + // mov r1, RHS + // ilt r10, r0, 0 + // ilt r11, r1, 0 + // iadd r0, r0, r10 + // iadd r1, r1, r11 + // ixor r0, r0, r10 + // ixor r1, r1, r11 + // udiv r0, r0, r1 + // ixor r10, r10, r11 + // iadd r0, r0, r10 + // ixor DST, r0, r10 + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // ilt r10, r0, 0 + SDValue r10 = DAG.getSelectCC(DL, + r0, DAG.getConstant(0, OVT), + DAG.getConstant(-1, MVT::i32), + DAG.getConstant(0, MVT::i32), + ISD::SETLT); + + // ilt r11, r1, 0 + SDValue r11 = DAG.getSelectCC(DL, + r1, DAG.getConstant(0, OVT), + DAG.getConstant(-1, MVT::i32), + DAG.getConstant(0, MVT::i32), + ISD::SETLT); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // iadd r1, r1, r11 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + + // ixor r0, r0, r10 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + + // ixor r1, r1, r11 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + + // udiv r0, r0, r1 + r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); + + // ixor r10, r10, r11 + r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // ixor DST, r0, r10 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + return DST; +} + +SDValue +AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(Op.getNode(), 0); +} + +SDValue +AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + MVT INTTY = MVT::i32; + if (OVT == MVT::v2i8) { + INTTY = MVT::v2i32; + } else if (OVT == MVT::v4i8) { + INTTY = MVT::v4i32; + } + SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); + SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); + LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); + LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); + return LHS; +} + +SDValue +AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) 
const { + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + MVT INTTY = MVT::i32; + if (OVT == MVT::v2i16) { + INTTY = MVT::v2i32; + } else if (OVT == MVT::v4i16) { + INTTY = MVT::v4i32; + } + SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); + SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); + LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); + LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); + return LHS; +} + +SDValue +AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSREM32 function generates equivalent to the following IL. + // mov r0, LHS + // mov r1, RHS + // ilt r10, r0, 0 + // ilt r11, r1, 0 + // iadd r0, r0, r10 + // iadd r1, r1, r11 + // ixor r0, r0, r10 + // ixor r1, r1, r11 + // udiv r20, r0, r1 + // umul r20, r20, r1 + // sub r0, r0, r20 + // iadd r0, r0, r10 + // ixor DST, r0, r10 + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // ilt r10, r0, 0 + SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT); + + // ilt r11, r1, 0 + SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // iadd r1, r1, r11 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + + // ixor r0, r0, r10 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + + // ixor r1, r1, r11 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + + // udiv r20, r0, r1 + SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1); + + // umul r20, r20, r1 + r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1); + + // sub r0, r0, r20 + r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // ixor DST, r0, r10 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + return DST; +} + +SDValue 
+AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(Op.getNode(), 0); +} diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td new file mode 100644 index 0000000000..110f147651 --- /dev/null +++ b/lib/Target/R600/AMDILInstrInfo.td @@ -0,0 +1,207 @@ +//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file describes the AMDIL instructions in TableGen format. +// +//===----------------------------------------------------------------------===// +// AMDIL Instruction Predicate Definitions +// Predicate that is set to true if the hardware supports double precision +// divide +def HasHWDDiv : Predicate<"Subtarget.device()" + "->getGeneration() > AMDGPUDeviceInfo::HD4XXX && " + "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">; + +// Predicate that is set to true if the hardware supports double, but not double +// precision divide in hardware +def HasSWDDiv : Predicate<"Subtarget.device()" + "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&" + "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">; + +// Predicate that is set to true if the hardware support 24bit signed +// math ops. Otherwise a software expansion to 32bit math ops is used instead. 
+def HasHWSign24Bit : Predicate<"Subtarget.device()" + "->getGeneration() > AMDGPUDeviceInfo::HD5XXX">; + +// Predicate that is set to true if 64bit operations are supported or not +def HasHW64Bit : Predicate<"Subtarget.device()" + "->usesHardware(AMDGPUDeviceInfo::LongOps)">; +def HasSW64Bit : Predicate<"Subtarget.device()" + "->usesSoftware(AMDGPUDeviceInfo::LongOps)">; + +// Predicate that is set to true if the timer register is supported +def HasTmrRegister : Predicate<"Subtarget.device()" + "->isSupported(AMDGPUDeviceInfo::TmrReg)">; +// Predicate that is true if we are at least evergreen series +def HasDeviceIDInst : Predicate<"Subtarget.device()" + "->getGeneration() >= AMDGPUDeviceInfo::HD5XXX">; + +// Predicate that is true if we have region address space. +def hasRegionAS : Predicate<"Subtarget.device()" + "->usesHardware(AMDGPUDeviceInfo::RegionMem)">; + +// Predicate that is true if the region address space is not supported. +def noRegionAS : Predicate<"!Subtarget.device()" + "->isSupported(AMDGPUDeviceInfo::RegionMem)">; + + +// Predicate that is set to true if 64bit Mul is supported in the IL or not +def HasHW64Mul : Predicate<"Subtarget.calVersion()" + ">= CAL_VERSION_SC_139" + "&& Subtarget.device()" + "->getGeneration() >=" + "AMDGPUDeviceInfo::HD5XXX">; +def HasSW64Mul : Predicate<"Subtarget.calVersion()" + "< CAL_VERSION_SC_139">; +// Predicate that is set to true if 64bit Div/Mod is supported in the IL or not +def HasHW64DivMod : Predicate<"Subtarget.device()" + "->usesHardware(AMDGPUDeviceInfo::HW64BitDivMod)">; +def HasSW64DivMod : Predicate<"Subtarget.device()" + "->usesSoftware(AMDGPUDeviceInfo::HW64BitDivMod)">; + +// Predicate that is set to true if 64bit pointers are used. 
+def Has64BitPtr : Predicate<"Subtarget.is64bit()">; +def Has32BitPtr : Predicate<"!Subtarget.is64bit()">; +//===--------------------------------------------------------------------===// +// Custom Operands +//===--------------------------------------------------------------------===// +def brtarget : Operand<OtherVT>; + +//===--------------------------------------------------------------------===// +// Custom Selection DAG Type Profiles +//===--------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// Generic Profile Types +//===----------------------------------------------------------------------===// + +def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> + ]>; +def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3> + ]>; +def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [ + SDTCisEltOfVec<1, 0> + ]>; + +//===----------------------------------------------------------------------===// +// Flow Control Profile Types +//===----------------------------------------------------------------------===// +// Branch instruction where second and third are basic blocks +def SDTIL_BRCond : SDTypeProfile<0, 2, [ + SDTCisVT<0, OtherVT> + ]>; + +//===--------------------------------------------------------------------===// +// Custom Selection DAG Nodes +//===--------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// Flow Control DAG Nodes +//===----------------------------------------------------------------------===// +def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; + +//===----------------------------------------------------------------------===// +// Call/Return DAG Nodes +//===----------------------------------------------------------------------===// +def 
IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; + +//===--------------------------------------------------------------------===// +// Instructions +//===--------------------------------------------------------------------===// +// Floating point math functions +def IL_div_inf : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>; + +//===----------------------------------------------------------------------===// +// Integer functions +//===----------------------------------------------------------------------===// +def IL_umul : SDNode<"AMDGPUISD::UMUL" , SDTIntBinOp, + [SDNPCommutative, SDNPAssociative]>; + +//===--------------------------------------------------------------------===// +// Custom Pattern DAG Nodes +//===--------------------------------------------------------------------===// +def global_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +//===----------------------------------------------------------------------===// +// Load pattern fragments +//===----------------------------------------------------------------------===// +// Global address space loads +def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; +// Constant address space loads +def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +//===----------------------------------------------------------------------===// +// Complex addressing mode patterns +//===----------------------------------------------------------------------===// +def ADDR : ComplexPattern<i32, 2, "SelectADDR", [], []>; +def ADDRF : ComplexPattern<i32, 2, "SelectADDR", [frameindex], []>; +def ADDR64 : ComplexPattern<i64, 2, "SelectADDR64", [], []>; +def ADDR64F : ComplexPattern<i64, 2, "SelectADDR64", [frameindex], []>; + 
+//===----------------------------------------------------------------------===// +// Instruction format classes +//===----------------------------------------------------------------------===// +class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +//===--------------------------------------------------------------------===// +// Multiclass Instruction formats +//===--------------------------------------------------------------------===// +// Multiclass that handles branch instructions +multiclass BranchConditional<SDNode Op> { + def _i32 : ILFormat<(outs), + (ins brtarget:$target, GPRI32:$src0), + "; i32 Pseudo branch instruction", + [(Op bb:$target, GPRI32:$src0)]>; + def _f32 : ILFormat<(outs), + (ins brtarget:$target, GPRF32:$src0), + "; f32 Pseudo branch instruction", + [(Op bb:$target, GPRF32:$src0)]>; +} + +// Only scalar types should generate flow control +multiclass BranchInstr<string name> { + def _i32 : ILFormat<(outs), (ins GPRI32:$src), + !strconcat(name, " $src"), []>; + def _f32 : ILFormat<(outs), (ins GPRF32:$src), + !strconcat(name, " $src"), []>; +} +// Only scalar types should generate flow control +multiclass BranchInstr2<string name> { + def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1), + !strconcat(name, " $src0, $src1"), []>; + def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1), + !strconcat(name, " $src0, $src1"), []>; +} + +//===--------------------------------------------------------------------===// +// Intrinsics support +//===--------------------------------------------------------------------===// +include "AMDILIntrinsics.td" diff --git 
a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDILIntrinsicInfo.cpp new file mode 100644 index 0000000000..4ddb057d80 --- /dev/null +++ b/lib/Target/R600/AMDILIntrinsicInfo.cpp @@ -0,0 +1,79 @@ +//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Implementation of the IntrinsicInfo class. +// +//===-----------------------------------------------------------------------===// + +#include "AMDILIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "AMDIL.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN +#include "AMDGPUGenIntrinsics.inc" +#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN + +/// Constructs the intrinsic info object; \p tm is not used. +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm) + : TargetIntrinsicInfo() { +} + +/// \brief Returns the name of target intrinsic \p IntrID from the generated +/// name table. +/// +/// IDs below Intrinsic::num_intrinsics are not AMDGPU intrinsics and yield an +/// empty string. \p Tys and \p numTys are ignored. +std::string +AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, + unsigned int numTys) const { + static const char* const names[] = { +#define GET_INTRINSIC_NAME_TABLE +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_NAME_TABLE + }; + + if (IntrID < Intrinsic::num_intrinsics) { + // Not a target intrinsic. Return an empty string explicitly: a literal 0 + // here would construct the std::string from a null pointer. + return std::string(); + } + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics + && "Invalid intrinsic ID"); + + std::string Result(names[IntrID - Intrinsic::num_intrinsics]); + return Result; +} + +/// \brief Maps \p Name to an AMDGPU intrinsic ID through +/// getIntrinsicForGCCBuiltin; returns 0 when that lookup finds nothing. +/// +/// The generated GET_FUNCTION_RECOGNIZER code that runs first is not visible +/// here. \p Len is not used by the visible code. +unsigned int +AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const { +#define GET_FUNCTION_RECOGNIZER +#include "AMDGPUGenIntrinsics.inc" +#undef GET_FUNCTION_RECOGNIZER + AMDGPUIntrinsic::ID IntrinsicID + = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; + IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); + + if (IntrinsicID != 
(AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { + return IntrinsicID; + } + return 0; +} + +/// \brief Reports whether intrinsic \p id is overloaded. The body consists +/// solely of the generated GET_INTRINSIC_OVERLOAD_TABLE code. +bool +AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { + // Overload Table +#define GET_INTRINSIC_OVERLOAD_TABLE +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_OVERLOAD_TABLE +} + +/// Not implemented: always reaches llvm_unreachable. +Function* +AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + Type **Tys, + unsigned numTys) const { + llvm_unreachable("Not implemented"); +} diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDILIntrinsicInfo.h new file mode 100644 index 0000000000..35559e23fc --- /dev/null +++ b/lib/Target/R600/AMDILIntrinsicInfo.h @@ -0,0 +1,49 @@ +//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. 
+// +//===-----------------------------------------------------------------------===// +#ifndef AMDIL_INTRINSICS_H +#define AMDIL_INTRINSICS_H + +#include "llvm/IR/Intrinsics.h" +#include "llvm/Target/TargetIntrinsicInfo.h" + +namespace llvm { +class TargetMachine; + +namespace AMDGPUIntrinsic { +// last_non_AMDGPU_intrinsic is Intrinsic::num_intrinsics - 1, so the generated +// enumerators that follow presumably start at Intrinsic::num_intrinsics. +enum ID { + last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, +#define GET_INTRINSIC_ENUM_VALUES +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_ENUM_VALUES + , num_AMDGPU_intrinsics +}; + +} // end namespace AMDGPUIntrinsic + +class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { +public: + AMDGPUIntrinsicInfo(TargetMachine *tm); + std::string getName(unsigned int IntrId, Type **Tys = 0, + unsigned int numTys = 0) const; + unsigned int lookupName(const char *Name, unsigned int Len) const; + bool isOverloaded(unsigned int IID) const; + Function *getDeclaration(Module *M, unsigned int ID, + Type **Tys = 0, + unsigned int numTys = 0) const; +}; + +} // end namespace llvm + +#endif // AMDIL_INTRINSICS_H + diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td new file mode 100644 index 0000000000..6ec3559af2 --- /dev/null +++ b/lib/Target/R600/AMDILIntrinsics.td @@ -0,0 +1,232 @@ +//===- AMDILIntrinsics.td - Defines AMDIL Intrinsics -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file defines all of the amdil-specific intrinsics +// +//===---------------------------------------------------------------===// +//===--------------------------------------------------------------------===// +// Intrinsic classes +// Generic versions of the above classes but for Target specific intrinsics +// instead of SDNode patterns. 
+//===--------------------------------------------------------------------===// +let TargetPrefix = "AMDIL", isTarget = 1 in { + class VoidIntLong : + Intrinsic<[llvm_i64_ty], [], []>; + class VoidIntInt : + Intrinsic<[llvm_i32_ty], [], []>; + class VoidIntBool : + Intrinsic<[llvm_i32_ty], [], []>; + class UnaryIntInt : + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class UnaryIntFloat : + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class ConvertIntFTOI : + Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; + class ConvertIntITOF : + Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>; + class UnaryIntNoRetInt : + Intrinsic<[], [llvm_anyint_ty], []>; + class UnaryIntNoRetFloat : + Intrinsic<[], [llvm_anyfloat_ty], []>; + class BinaryIntInt : + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class BinaryIntFloat : + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class BinaryIntNoRetInt : + Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>; + class BinaryIntNoRetFloat : + Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>; + class TernaryIntInt : + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class TernaryIntFloat : + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class QuaternaryIntInt : + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class UnaryAtomicInt : + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + class BinaryAtomicInt : + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + class TernaryAtomicInt : + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; + class UnaryAtomicIntNoRet : + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], 
[IntrReadWriteArgMem]>; + class BinaryAtomicIntNoRet : + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + class TernaryAtomicIntNoRet : + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; +} + +let TargetPrefix = "AMDIL", isTarget = 1 in { + def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt; + + def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">, + TernaryIntInt; + def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">, + TernaryIntInt; + def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">, + UnaryIntInt; + def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">, + UnaryIntInt; + def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">, + UnaryIntInt; + def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">, + UnaryIntInt; + def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">, + UnaryIntInt; + def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">, + TernaryIntInt; + def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">, + TernaryIntInt; + def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">, + QuaternaryIntInt; + def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">, + TernaryIntInt; + def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">, + BinaryIntInt; + def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">, + BinaryIntInt; + def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">, + BinaryIntInt; + def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">, + BinaryIntInt; + def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">, + BinaryIntInt; + def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">, + BinaryIntInt; + def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">, + BinaryIntInt; + def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">, + BinaryIntInt; + def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">, + BinaryIntInt; + def int_AMDIL_min_i32 : 
GCCBuiltin<"__amdil_imin">, + BinaryIntInt; + def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">, + BinaryIntInt; + def int_AMDIL_min : GCCBuiltin<"__amdil_min">, + BinaryIntFloat; + def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">, + BinaryIntInt; + def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">, + BinaryIntInt; + def int_AMDIL_max : GCCBuiltin<"__amdil_max">, + BinaryIntFloat; + def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">, + TernaryIntInt; + def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">, + TernaryIntInt; + def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">, + TernaryIntInt; + def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">, + UnaryIntFloat; + def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">, + TernaryIntFloat; + def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">, + UnaryIntFloat; + def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">, + UnaryIntFloat; + def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">, + UnaryIntFloat; + def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">, + UnaryIntFloat; + def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">, + UnaryIntFloat; + def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">, + UnaryIntFloat; + def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">, + UnaryIntFloat; + def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">, + UnaryIntFloat; + def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">, + UnaryIntFloat; + def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">, + UnaryIntFloat; + def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">, + UnaryIntFloat; + def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">, + UnaryIntFloat; + def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat; + def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat; + def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt; + def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">, + UnaryIntFloat; + def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">, + UnaryIntFloat; + def int_AMDIL_exp : 
GCCBuiltin<"__amdil_exp">, + UnaryIntFloat; + def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">, + UnaryIntFloat; + def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">, + UnaryIntFloat; + def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">, + UnaryIntFloat; + def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">, + UnaryIntFloat; + def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">, + UnaryIntFloat; + def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">, + TernaryIntFloat; + def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">, + UnaryIntFloat; + def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">, + UnaryIntFloat; + def int_AMDIL_length : GCCBuiltin<"__amdil_length">, + UnaryIntFloat; + def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">, + TernaryIntFloat; + def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">, + Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty, + llvm_v4i32_ty, llvm_i32_ty], []>; + + def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">, + Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>; + def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">, + Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>; + def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">, + Intrinsic<[llvm_double_ty], [llvm_double_ty], []>; + def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">, + ConvertIntITOF; + def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_i32_rpi : GCCBuiltin<"__amdil_float_to_int_rpi">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">, + ConvertIntFTOI; + def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">, + Intrinsic<[llvm_i32_ty], 
[llvm_v4f32_ty], []>; + def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">, + ConvertIntITOF; + def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">, + ConvertIntITOF; + def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">, + ConvertIntITOF; + def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">, + ConvertIntITOF; + def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">, + Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, + llvm_v2f32_ty, llvm_float_ty], []>; + def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">, + Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, + llvm_v2f32_ty], []>; + def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">, + Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, + llvm_v4f32_ty], []>; + def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">, + Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, + llvm_v4f32_ty], []>; +} diff --git a/lib/Target/R600/AMDILNIDevice.cpp b/lib/Target/R600/AMDILNIDevice.cpp new file mode 100644 index 0000000000..47c3f7f209 --- /dev/null +++ b/lib/Target/R600/AMDILNIDevice.cpp @@ -0,0 +1,65 @@ +//===-- AMDILNIDevice.cpp - Device Info for Northern Islands devices ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//==-----------------------------------------------------------------------===// +#include "AMDILNIDevice.h" +#include "AMDGPUSubtarget.h" +#include "AMDILEvergreenDevice.h" + +using namespace llvm; + +// Picks DeviceFlag from the subtarget's device name; any name other than +// "caicos", "turks" or "cayman" falls back to OCL_DEVICE_BARTS. +AMDGPUNIDevice::AMDGPUNIDevice(AMDGPUSubtarget *ST) + : AMDGPUEvergreenDevice(ST) { + std::string name = ST->getDeviceName(); + if (name == "caicos") { + DeviceFlag = OCL_DEVICE_CAICOS; + } else if (name == "turks") { + DeviceFlag = OCL_DEVICE_TURKS; + } else if (name == "cayman") { + DeviceFlag = OCL_DEVICE_CAYMAN; + } else { + DeviceFlag = OCL_DEVICE_BARTS; + } +} +AMDGPUNIDevice::~AMDGPUNIDevice() { +} + +// Returns MAX_LDS_SIZE_900 when usesHardware(LocalMem) is true, otherwise 0. +size_t +AMDGPUNIDevice::getMaxLDSSize() const { + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return MAX_LDS_SIZE_900; + } else { + return 0; + } +} + +uint32_t +AMDGPUNIDevice::getGeneration() const { + return AMDGPUDeviceInfo::HD6XXX; +} + + +AMDGPUCaymanDevice::AMDGPUCaymanDevice(AMDGPUSubtarget *ST) + : AMDGPUNIDevice(ST) { + setCaps(); +} + +AMDGPUCaymanDevice::~AMDGPUCaymanDevice() { +} + +// Sets DoubleOps and FMA in mHWBits only if mSTM->isOverride(DoubleOps) is +// true; always moves Signed24BitOps from mSWBits to mHWBits and sets +// ArenaSegment in mSWBits. +void +AMDGPUCaymanDevice::setCaps() { + if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) { + mHWBits.set(AMDGPUDeviceInfo::DoubleOps); + mHWBits.set(AMDGPUDeviceInfo::FMA); + } + mHWBits.set(AMDGPUDeviceInfo::Signed24BitOps); + mSWBits.reset(AMDGPUDeviceInfo::Signed24BitOps); + mSWBits.set(AMDGPUDeviceInfo::ArenaSegment); +} + diff --git a/lib/Target/R600/AMDILNIDevice.h b/lib/Target/R600/AMDILNIDevice.h new file mode 100644 index 0000000000..24a640845e --- /dev/null +++ b/lib/Target/R600/AMDILNIDevice.h @@ -0,0 +1,57 @@ +//===------- AMDILNIDevice.h - Define NI Device for AMDIL -*- C++ -*------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +/// \file +/// \brief Interface for the subtarget data classes. 
+/// +/// This file will define the interface that each generation needs to +/// implement in order to correctly answer queries on the capabilities of the +/// specific hardware. +//===---------------------------------------------------------------------===// +#ifndef AMDILNIDEVICE_H +#define AMDILNIDEVICE_H +#include "AMDGPUSubtarget.h" +#include "AMDILEvergreenDevice.h" + +namespace llvm { + +class AMDGPUSubtarget; +//===---------------------------------------------------------------------===// +// NI generation of devices and their respective sub classes +//===---------------------------------------------------------------------===// + +/// \brief The AMDGPUNIDevice is the base class for all Northern Islands series of +/// cards. +/// +/// It is very similar to the AMDGPUEvergreenDevice, with the major +/// exception being differences in wavefront size and hardware capabilities. The +/// NI devices are all 64 wide wavefronts and also add support for signed 24 bit +/// integer operations. +class AMDGPUNIDevice : public AMDGPUEvergreenDevice { +public: + AMDGPUNIDevice(AMDGPUSubtarget*); + virtual ~AMDGPUNIDevice(); + virtual size_t getMaxLDSSize() const; + virtual uint32_t getGeneration() const; +}; + +/// Just as the AMDGPUCypressDevice is the double capable version of the +/// AMDGPUEvergreenDevice, the AMDGPUCaymanDevice is the double capable version +/// of the AMDGPUNIDevice. The other major difference is that the Cayman Device +/// has 4 wide ALUs, whereas the rest of the NI family is 5 wide. 
+class AMDGPUCaymanDevice: public AMDGPUNIDevice { +public: + AMDGPUCaymanDevice(AMDGPUSubtarget*); + virtual ~AMDGPUCaymanDevice(); +private: + virtual void setCaps(); +}; + +static const unsigned int MAX_LDS_SIZE_900 = AMDGPUDevice::MAX_LDS_SIZE_800; +} // namespace llvm +#endif // AMDILNIDEVICE_H diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp new file mode 100644 index 0000000000..3a28038666 --- /dev/null +++ b/lib/Target/R600/AMDILPeepholeOptimizer.cpp @@ -0,0 +1,1215 @@ +//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// + +#define DEBUG_TYPE "PeepholeOpt" +#ifdef DEBUG +#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) +#else +#define DEBUGME 0 +#endif + +#include "AMDILDevices.h" +#include "AMDGPUInstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Constants.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" + +#include <sstream> + +#if 0 +STATISTIC(PointerAssignments, "Number of dynamic pointer " + "assigments discovered"); +STATISTIC(PointerSubtract, "Number of pointer subtractions discovered"); +#endif + +using namespace llvm; +// The Peephole optimization pass is used to do simple last minute optimizations +// that are required for correct code or to remove redundant functions +namespace { + +class OpaqueType; + +class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass { 
+public: + TargetMachine &TM; + static char ID; + AMDGPUPeepholeOpt(TargetMachine &tm); + ~AMDGPUPeepholeOpt(); + const char *getPassName() const; + bool runOnFunction(Function &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + void getAnalysisUsage(AnalysisUsage &AU) const; +protected: +private: + // Function to initiate all of the instruction level optimizations. + bool instLevelOptimizations(BasicBlock::iterator *inst); + // Quick check to see if we need to dump all of the pointers into the + // arena. If this is correct, then we set all pointers to exist in arena. This + // is a workaround for aliasing of pointers in a struct/union. + bool dumpAllIntoArena(Function &F); + // Because I don't want to invalidate any pointers while in the + // safeNestedForEachFunction. I push atomic conversions to a vector and handle + // it later. This function does the conversions if required. + void doAtomicConversionIfNeeded(Function &F); + // Because __amdil_is_constant cannot be properly evaluated if + // optimizations are disabled, the call's are placed in a vector + // and evaluated after the __amdil_image* functions are evaluated + // which should allow the __amdil_is_constant function to be + // evaluated correctly. + void doIsConstCallConversionIfNeeded(); + bool mChanged; + bool mDebug; + bool mConvertAtomics; + CodeGenOpt::Level optLevel; + // Run a series of tests to see if we can optimize a CALL instruction. + bool optimizeCallInst(BasicBlock::iterator *bbb); + // A peephole optimization to optimize bit extract sequences. + bool optimizeBitExtract(Instruction *inst); + // A peephole optimization to optimize bit insert sequences. + bool optimizeBitInsert(Instruction *inst); + bool setupBitInsert(Instruction *base, + Instruction *&src, + Constant *&mask, + Constant *&shift); + // Expand the bit field insert instruction on versions of OpenCL that + // don't support it. 
+ bool expandBFI(CallInst *CI); + // Expand the bit field mask instruction on version of OpenCL that + // don't support it. + bool expandBFM(CallInst *CI); + // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in + // this case we need to expand them. These functions check for 24bit functions + // and then expand. + bool isSigned24BitOps(CallInst *CI); + void expandSigned24BitOps(CallInst *CI); + // One optimization that can occur is that if the required workgroup size is + // specified then the result of get_local_size is known at compile time and + // can be returned accordingly. + bool isRWGLocalOpt(CallInst *CI); + // On northern island cards, the division is slightly less accurate than on + // previous generations, so we need to utilize a more accurate division. So we + // can translate the accurate divide to a normal divide on all other cards. + bool convertAccurateDivide(CallInst *CI); + void expandAccurateDivide(CallInst *CI); + // If the alignment is set incorrectly, it can produce really inefficient + // code. This checks for this scenario and fixes it if possible. + bool correctMisalignedMemOp(Instruction *inst); + + // If we are in no opt mode, then we need to make sure that + // local samplers are properly propagated as constant propagation + // doesn't occur and we need to know the value of kernel defined + // samplers at compile time. + bool propagateSamplerInst(CallInst *CI); + + // Helper functions + + // Group of functions that recursively calculate the size of a structure based + // on it's sub-types. 
+ size_t getTypeSize(Type * const T, bool dereferencePtr = false); + size_t getTypeSize(StructType * const ST, bool dereferencePtr = false); + size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false); + size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false); + size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false); + size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false); + size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false); + size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false); + + LLVMContext *mCTX; + Function *mF; + const AMDGPUSubtarget *mSTM; + SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs; + SmallVector<CallInst *, 16> isConstVec; +}; // class AMDGPUPeepholeOpt + char AMDGPUPeepholeOpt::ID = 0; + +// A template function that has two levels of looping before calling the +// function with a pointer to the current iterator. +template<class InputIterator, class SecondIterator, class Function> +Function safeNestedForEach(InputIterator First, InputIterator Last, + SecondIterator S, Function F) { + for ( ; First != Last; ++First) { + SecondIterator sf, sl; + for (sf = First->begin(), sl = First->end(); + sf != sl; ) { + if (!F(&sf)) { + ++sf; + } + } + } + return F; +} + +} // anonymous namespace + +namespace llvm { + FunctionPass * + createAMDGPUPeepholeOpt(TargetMachine &tm) { + return new AMDGPUPeepholeOpt(tm); + } +} // llvm namespace + +AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm) + : FunctionPass(ID), TM(tm) { + mDebug = DEBUGME; + optLevel = TM.getOptLevel(); + +} + +AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() { +} + +const char * +AMDGPUPeepholeOpt::getPassName() const { + return "AMDGPU PeepHole Optimization Pass"; +} + +bool +containsPointerType(Type *Ty) { + if (!Ty) { + return false; + } + switch(Ty->getTypeID()) { + default: + return false; + case Type::StructTyID: { + const StructType *ST = dyn_cast<StructType>(Ty); + for 
(StructType::element_iterator stb = ST->element_begin(), + ste = ST->element_end(); stb != ste; ++stb) { + if (!containsPointerType(*stb)) { + continue; + } + return true; + } + break; + } + case Type::VectorTyID: + case Type::ArrayTyID: + return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType()); + case Type::PointerTyID: + return true; + }; + return false; +} + +bool +AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) { + bool dumpAll = false; + for (Function::const_arg_iterator cab = F.arg_begin(), + cae = F.arg_end(); cab != cae; ++cab) { + const Argument *arg = cab; + const PointerType *PT = dyn_cast<PointerType>(arg->getType()); + if (!PT) { + continue; + } + Type *DereferencedType = PT->getElementType(); + if (!dyn_cast<StructType>(DereferencedType) + ) { + continue; + } + if (!containsPointerType(DereferencedType)) { + continue; + } + // FIXME: Because a pointer inside of a struct/union may be aliased to + // another pointer we need to take the conservative approach and place all + // pointers into the arena until more advanced detection is implemented. + dumpAll = true; + } + return dumpAll; +} +void +AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() { + if (isConstVec.empty()) { + return; + } + for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) { + CallInst *CI = isConstVec[x]; + Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); + Type *aType = Type::getInt32Ty(*mCTX); + Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) + : ConstantInt::get(aType, 0); + CI->replaceAllUsesWith(Val); + CI->eraseFromParent(); + } + isConstVec.clear(); +} +void +AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) { + // Don't do anything if we don't have any atomic operations. 
+ if (atomicFuncs.empty()) { + return; + } + // Change the function name for the atomic if it is required + uint32_t size = atomicFuncs.size(); + for (uint32_t x = 0; x < size; ++x) { + atomicFuncs[x].first->setOperand( + atomicFuncs[x].first->getNumOperands()-1, + atomicFuncs[x].second); + + } + mChanged = true; + if (mConvertAtomics) { + return; + } +} + +bool +AMDGPUPeepholeOpt::runOnFunction(Function &MF) { + mChanged = false; + mF = &MF; + mSTM = &TM.getSubtarget<AMDGPUSubtarget>(); + if (mDebug) { + MF.dump(); + } + mCTX = &MF.getType()->getContext(); + mConvertAtomics = true; + safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), + std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations), + this)); + + doAtomicConversionIfNeeded(MF); + doIsConstCallConversionIfNeeded(); + + if (mDebug) { + MF.dump(); + } + return mChanged; +} + +bool +AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) { + Instruction *inst = (*bbb); + CallInst *CI = dyn_cast<CallInst>(inst); + if (!CI) { + return false; + } + if (isSigned24BitOps(CI)) { + expandSigned24BitOps(CI); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + if (propagateSamplerInst(CI)) { + return false; + } + if (expandBFI(CI) || expandBFM(CI)) { + ++(*bbb); + CI->eraseFromParent(); + return true; + } + if (convertAccurateDivide(CI)) { + expandAccurateDivide(CI); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + + StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName(); + if (calleeName.startswith("__amdil_is_constant")) { + // If we do not have optimizations, then this + // cannot be properly evaluated, so we add the + // call instruction to a vector and process + // them at the end of processing after the + // samplers have been correctly handled. 
+ if (optLevel == CodeGenOpt::None) { + isConstVec.push_back(CI); + return false; + } else { + Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); + Type *aType = Type::getInt32Ty(*mCTX); + Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) + : ConstantInt::get(aType, 0); + CI->replaceAllUsesWith(Val); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + } + + if (calleeName.equals("__amdil_is_asic_id_i32")) { + ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0)); + Type *aType = Type::getInt32Ty(*mCTX); + Value *Val = CV; + if (Val) { + Val = ConstantInt::get(aType, + mSTM->device()->getDeviceFlag() & CV->getZExtValue()); + } else { + Val = ConstantInt::get(aType, 0); + } + CI->replaceAllUsesWith(Val); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1)); + if (!F) { + return false; + } + if (F->getName().startswith("__atom") && !CI->getNumUses() + && F->getName().find("_xchg") == StringRef::npos) { + std::string buffer(F->getName().str() + "_noret"); + F = dyn_cast<Function>( + F->getParent()->getOrInsertFunction(buffer, F->getFunctionType())); + atomicFuncs.push_back(std::make_pair(CI, F)); + } + + if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment) + && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) { + return false; + } + if (!mConvertAtomics) { + return false; + } + StringRef name = F->getName(); + if (name.startswith("__atom") && name.find("_g") != StringRef::npos) { + mConvertAtomics = false; + } + return false; +} + +bool +AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, + Instruction *&src, + Constant *&mask, + Constant *&shift) { + if (!base) { + if (mDebug) { + dbgs() << "Null pointer passed into function.\n"; + } + return false; + } + bool andOp = false; + if (base->getOpcode() == Instruction::Shl) { + shift = dyn_cast<Constant>(base->getOperand(1)); + } else if (base->getOpcode() == Instruction::And) { + mask = 
dyn_cast<Constant>(base->getOperand(1)); + andOp = true; + } else { + if (mDebug) { + dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n"; + } + // If the base is neither a Shl or a And, we don't fit any of the patterns above. + return false; + } + src = dyn_cast<Instruction>(base->getOperand(0)); + if (!src) { + if (mDebug) { + dbgs() << "Failed setup since the base operand is not an instruction!\n"; + } + return false; + } + // If we find an 'and' operation, then we don't need to + // find the next operation as we already know the + // bits that are valid at this point. + if (andOp) { + return true; + } + if (src->getOpcode() == Instruction::Shl && !shift) { + shift = dyn_cast<Constant>(src->getOperand(1)); + src = dyn_cast<Instruction>(src->getOperand(0)); + } else if (src->getOpcode() == Instruction::And && !mask) { + mask = dyn_cast<Constant>(src->getOperand(1)); + } + if (!mask && !shift) { + if (mDebug) { + dbgs() << "Failed setup since both mask and shift are NULL!\n"; + } + // Did not find a constant mask or a shift. + return false; + } + return true; +} +bool +AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) { + if (!inst) { + return false; + } + if (!inst->isBinaryOp()) { + return false; + } + if (inst->getOpcode() != Instruction::Or) { + return false; + } + if (optLevel == CodeGenOpt::None) { + return false; + } + // We want to do an optimization on a sequence of ops that in the end equals a + // single ISA instruction. 
+ // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F) + // Some simplified versions of this pattern are as follows: + // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0 + // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E + // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B + // (A & B) | (D << F) when (1 << F) >= B + // (A << C) | (D & E) when (1 << C) >= E + if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { + // The HD4XXX hardware doesn't support the ubit_insert instruction. + return false; + } + Type *aType = inst->getType(); + bool isVector = aType->isVectorTy(); + int numEle = 1; + // This optimization only works on 32bit integers. + if (aType->getScalarType() + != Type::getInt32Ty(inst->getContext())) { + return false; + } + if (isVector) { + const VectorType *VT = dyn_cast<VectorType>(aType); + numEle = VT->getNumElements(); + // We currently cannot support more than 4 elements in a intrinsic and we + // cannot support Vec3 types. + if (numEle > 4 || numEle == 3) { + return false; + } + } + // TODO: Handle vectors. + if (isVector) { + if (mDebug) { + dbgs() << "!!! Vectors are not supported yet!\n"; + } + return false; + } + Instruction *LHSSrc = NULL, *RHSSrc = NULL; + Constant *LHSMask = NULL, *RHSMask = NULL; + Constant *LHSShift = NULL, *RHSShift = NULL; + Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0)); + Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1)); + if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) { + if (mDebug) { + dbgs() << "Found an OR Operation that failed setup!\n"; + inst->dump(); + if (LHS) { LHS->dump(); } + if (LHSSrc) { LHSSrc->dump(); } + if (LHSMask) { LHSMask->dump(); } + if (LHSShift) { LHSShift->dump(); } + } + // There was an issue with the setup for BitInsert. 
+ return false; + } + if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) { + if (mDebug) { + dbgs() << "Found an OR Operation that failed setup!\n"; + inst->dump(); + if (RHS) { RHS->dump(); } + if (RHSSrc) { RHSSrc->dump(); } + if (RHSMask) { RHSMask->dump(); } + if (RHSShift) { RHSShift->dump(); } + } + // There was an issue with the setup for BitInsert. + return false; + } + if (mDebug) { + dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n"; + dbgs() << "Op: "; inst->dump(); + dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; } + } + Constant *offset = NULL; + Constant *width = NULL; + uint32_t lhsMaskVal = 0, rhsMaskVal = 0; + uint32_t lhsShiftVal = 0, rhsShiftVal = 0; + uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0; + uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0; + lhsMaskVal = (LHSMask + ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0); + rhsMaskVal = (RHSMask + ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0); + lhsShiftVal = (LHSShift + ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0); + rhsShiftVal = (RHSShift + ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0); + lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal; + rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal; + lhsMaskOffset = lhsMaskVal ? 
CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal; + rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal; + // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks). + if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) { + return false; + } + if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) { + offset = ConstantInt::get(aType, lhsMaskOffset, false); + width = ConstantInt::get(aType, lhsMaskWidth, false); + RHSSrc = RHS; + if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) { + return false; + } + if (!LHSShift) { + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, + "MaskShr", LHS); + } else if (lhsShiftVal != lhsMaskOffset) { + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, + "MaskShr", LHS); + } + if (mDebug) { + dbgs() << "Optimizing LHS!\n"; + } + } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) { + offset = ConstantInt::get(aType, rhsMaskOffset, false); + width = ConstantInt::get(aType, rhsMaskWidth, false); + LHSSrc = RHSSrc; + RHSSrc = LHS; + if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) { + return false; + } + if (!RHSShift) { + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, + "MaskShr", RHS); + } else if (rhsShiftVal != rhsMaskOffset) { + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, + "MaskShr", RHS); + } + if (mDebug) { + dbgs() << "Optimizing RHS!\n"; + } + } else { + if (mDebug) { + dbgs() << "Failed constraint 3!\n"; + } + return false; + } + if (mDebug) { + dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; } + dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; } + dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; } + dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; } + } + if (!offset || !width) { + if (mDebug) { + dbgs() << "Either width or offset are NULL, failed detection!\n"; 
+ } + return false; + } + // Lets create the function signature. + std::vector<Type *> callTypes; + callTypes.push_back(aType); + callTypes.push_back(aType); + callTypes.push_back(aType); + callTypes.push_back(aType); + FunctionType *funcType = FunctionType::get(aType, callTypes, false); + std::string name = "__amdil_ubit_insert"; + if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; } + Function *Func = + dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> + getOrInsertFunction(StringRef(name), funcType)); + Value *Operands[4] = { + width, + offset, + LHSSrc, + RHSSrc + }; + CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt"); + if (mDebug) { + dbgs() << "Old Inst: "; + inst->dump(); + dbgs() << "New Inst: "; + CI->dump(); + dbgs() << "\n\n"; + } + CI->insertBefore(inst); + inst->replaceAllUsesWith(CI); + return true; +} + +bool +AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) { + if (!inst) { + return false; + } + if (!inst->isBinaryOp()) { + return false; + } + if (inst->getOpcode() != Instruction::And) { + return false; + } + if (optLevel == CodeGenOpt::None) { + return false; + } + // We want to do some simple optimizations on Shift right/And patterns. The + // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a + // value smaller than 32 and C is a mask. If C is a constant value, then the + // following transformation can occur. For signed integers, it turns into the + // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned + // integers, it turns into the function call dst = + // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract + // can be found in Section 7.9 of the ATI IL spec of the stream SDK for + // Evergreen hardware. + if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { + // This does not work on HD4XXX hardware. 
+ return false; + } + Type *aType = inst->getType(); + bool isVector = aType->isVectorTy(); + + // XXX Support vector types + if (isVector) { + return false; + } + int numEle = 1; + // This only works on 32bit integers + if (aType->getScalarType() + != Type::getInt32Ty(inst->getContext())) { + return false; + } + if (isVector) { + const VectorType *VT = dyn_cast<VectorType>(aType); + numEle = VT->getNumElements(); + // We currently cannot support more than 4 elements in a intrinsic and we + // cannot support Vec3 types. + if (numEle > 4 || numEle == 3) { + return false; + } + } + BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0)); + // If the first operand is not a shift instruction, then we can return as it + // doesn't match this pattern. + if (!ShiftInst || !ShiftInst->isShift()) { + return false; + } + // If we are a shift left, then we need don't match this pattern. + if (ShiftInst->getOpcode() == Instruction::Shl) { + return false; + } + bool isSigned = ShiftInst->isArithmeticShift(); + Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1)); + Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1)); + // Lets make sure that the shift value and the and mask are constant integers. 
+ if (!AndMask || !ShrVal) { + return false; + } + Constant *newMaskConst; + Constant *shiftValConst; + if (isVector) { + // Handle the vector case + std::vector<Constant *> maskVals; + std::vector<Constant *> shiftVals; + ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask); + ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal); + Type *scalarType = AndMaskVec->getType()->getScalarType(); + assert(AndMaskVec->getNumOperands() == + ShrValVec->getNumOperands() && "cannot have a " + "combination where the number of elements to a " + "shift and an and are different!"); + for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) { + ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x)); + ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x)); + if (!AndCI || !ShiftIC) { + return false; + } + uint32_t maskVal = (uint32_t)AndCI->getZExtValue(); + if (!isMask_32(maskVal)) { + return false; + } + maskVal = (uint32_t)CountTrailingOnes_32(maskVal); + uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue(); + // If the mask or shiftval is greater than the bitcount, then break out. + if (maskVal >= 32 || shiftVal >= 32) { + return false; + } + // If the mask val is greater than the the number of original bits left + // then this optimization is invalid. + if (maskVal > (32 - shiftVal)) { + return false; + } + maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned)); + shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned)); + } + newMaskConst = ConstantVector::get(maskVals); + shiftValConst = ConstantVector::get(shiftVals); + } else { + // Handle the scalar case + uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue(); + // This must be a mask value where all lower bits are set to 1 and then any + // bit higher is set to 0. 
+ if (!isMask_32(maskVal)) { + return false; + } + maskVal = (uint32_t)CountTrailingOnes_32(maskVal); + // Count the number of bits set in the mask, this is the width of the + // resulting bit set that is extracted from the source value. + uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue(); + // If the mask or shift val is greater than the bitcount, then break out. + if (maskVal >= 32 || shiftVal >= 32) { + return false; + } + // If the mask val is greater than the the number of original bits left then + // this optimization is invalid. + if (maskVal > (32 - shiftVal)) { + return false; + } + newMaskConst = ConstantInt::get(aType, maskVal, isSigned); + shiftValConst = ConstantInt::get(aType, shiftVal, isSigned); + } + // Lets create the function signature. + std::vector<Type *> callTypes; + callTypes.push_back(aType); + callTypes.push_back(aType); + callTypes.push_back(aType); + FunctionType *funcType = FunctionType::get(aType, callTypes, false); + std::string name = "llvm.AMDGPU.bit.extract.u32"; + if (isVector) { + name += ".v" + itostr(numEle) + "i32"; + } else { + name += "."; + } + // Lets create the function. 
+ Function *Func = + dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> + getOrInsertFunction(StringRef(name), funcType)); + Value *Operands[3] = { + ShiftInst->getOperand(0), + shiftValConst, + newMaskConst + }; + // Lets create the Call with the operands + CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt"); + CI->setDoesNotAccessMemory(); + CI->insertBefore(inst); + inst->replaceAllUsesWith(CI); + return true; +} + +bool +AMDGPUPeepholeOpt::expandBFI(CallInst *CI) { + if (!CI) { + return false; + } + Value *LHS = CI->getOperand(CI->getNumOperands() - 1); + if (!LHS->getName().startswith("__amdil_bfi")) { + return false; + } + Type* type = CI->getOperand(0)->getType(); + Constant *negOneConst = NULL; + if (type->isVectorTy()) { + std::vector<Constant *> negOneVals; + negOneConst = ConstantInt::get(CI->getContext(), + APInt(32, StringRef("-1"), 10)); + for (size_t x = 0, + y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { + negOneVals.push_back(negOneConst); + } + negOneConst = ConstantVector::get(negOneVals); + } else { + negOneConst = ConstantInt::get(CI->getContext(), + APInt(32, StringRef("-1"), 10)); + } + // __amdil_bfi => (A & B) | (~A & C) + BinaryOperator *lhs = + BinaryOperator::Create(Instruction::And, CI->getOperand(0), + CI->getOperand(1), "bfi_and", CI); + BinaryOperator *rhs = + BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst, + "bfi_not", CI); + rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2), + "bfi_and", CI); + lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI); + CI->replaceAllUsesWith(lhs); + return true; +} + +bool +AMDGPUPeepholeOpt::expandBFM(CallInst *CI) { + if (!CI) { + return false; + } + Value *LHS = CI->getOperand(CI->getNumOperands() - 1); + if (!LHS->getName().startswith("__amdil_bfm")) { + return false; + } + // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f) + Constant *newMaskConst = NULL; + Constant *newShiftConst 
= NULL; + Type* type = CI->getOperand(0)->getType(); + if (type->isVectorTy()) { + std::vector<Constant*> newMaskVals, newShiftVals; + newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); + newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); + for (size_t x = 0, + y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { + newMaskVals.push_back(newMaskConst); + newShiftVals.push_back(newShiftConst); + } + newMaskConst = ConstantVector::get(newMaskVals); + newShiftConst = ConstantVector::get(newShiftVals); + } else { + newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); + newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); + } + BinaryOperator *lhs = + BinaryOperator::Create(Instruction::And, CI->getOperand(0), + newMaskConst, "bfm_mask", CI); + lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst, + lhs, "bfm_shl", CI); + lhs = BinaryOperator::Create(Instruction::Sub, lhs, + newShiftConst, "bfm_sub", CI); + BinaryOperator *rhs = + BinaryOperator::Create(Instruction::And, CI->getOperand(1), + newMaskConst, "bfm_mask", CI); + lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI); + CI->replaceAllUsesWith(lhs); + return true; +} + +bool +AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) { + Instruction *inst = (*bbb); + if (optimizeCallInst(bbb)) { + return true; + } + if (optimizeBitExtract(inst)) { + return false; + } + if (optimizeBitInsert(inst)) { + return false; + } + if (correctMisalignedMemOp(inst)) { + return false; + } + return false; +} +bool +AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) { + LoadInst *linst = dyn_cast<LoadInst>(inst); + StoreInst *sinst = dyn_cast<StoreInst>(inst); + unsigned alignment; + Type* Ty = inst->getType(); + if (linst) { + alignment = linst->getAlignment(); + Ty = inst->getType(); + } else if (sinst) { + alignment = sinst->getAlignment(); + Ty = sinst->getValueOperand()->getType(); + } else { + return false; + } + unsigned 
size = getTypeSize(Ty); + if (size == alignment || size < alignment) { + return false; + } + if (!Ty->isStructTy()) { + return false; + } + if (alignment < 4) { + if (linst) { + linst->setAlignment(0); + return true; + } else if (sinst) { + sinst->setAlignment(0); + return true; + } + } + return false; +} +bool +AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) { + if (!CI) { + return false; + } + Value *LHS = CI->getOperand(CI->getNumOperands() - 1); + std::string namePrefix = LHS->getName().substr(0, 14); + if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24" + && namePrefix != "__amdil__imul24_high") { + return false; + } + if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) { + return false; + } + return true; +} + +void +AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) { + assert(isSigned24BitOps(CI) && "Must be a " + "signed 24 bit operation to call this function!"); + Value *LHS = CI->getOperand(CI->getNumOperands()-1); + // On 7XX and 8XX we do not have signed 24bit, so we need to + // expand it to the following: + // imul24 turns into 32bit imul + // imad24 turns into 32bit imad + // imul24_high turns into 32bit imulhigh + if (LHS->getName().substr(0, 14) == "__amdil_imad24") { + Type *aType = CI->getOperand(0)->getType(); + bool isVector = aType->isVectorTy(); + int numEle = isVector ? 
dyn_cast<VectorType>(aType)->getNumElements() : 1; + std::vector<Type*> callTypes; + callTypes.push_back(CI->getOperand(0)->getType()); + callTypes.push_back(CI->getOperand(1)->getType()); + callTypes.push_back(CI->getOperand(2)->getType()); + FunctionType *funcType = + FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); + std::string name = "__amdil_imad"; + if (isVector) { + name += "_v" + itostr(numEle) + "i32"; + } else { + name += "_i32"; + } + Function *Func = dyn_cast<Function>( + CI->getParent()->getParent()->getParent()-> + getOrInsertFunction(StringRef(name), funcType)); + Value *Operands[3] = { + CI->getOperand(0), + CI->getOperand(1), + CI->getOperand(2) + }; + CallInst *nCI = CallInst::Create(Func, Operands, "imad24"); + nCI->insertBefore(CI); + CI->replaceAllUsesWith(nCI); + } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") { + BinaryOperator *mulOp = + BinaryOperator::Create(Instruction::Mul, CI->getOperand(0), + CI->getOperand(1), "imul24", CI); + CI->replaceAllUsesWith(mulOp); + } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") { + Type *aType = CI->getOperand(0)->getType(); + + bool isVector = aType->isVectorTy(); + int numEle = isVector ? 
dyn_cast<VectorType>(aType)->getNumElements() : 1; + std::vector<Type*> callTypes; + callTypes.push_back(CI->getOperand(0)->getType()); + callTypes.push_back(CI->getOperand(1)->getType()); + FunctionType *funcType = + FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); + std::string name = "__amdil_imul_high"; + if (isVector) { + name += "_v" + itostr(numEle) + "i32"; + } else { + name += "_i32"; + } + Function *Func = dyn_cast<Function>( + CI->getParent()->getParent()->getParent()-> + getOrInsertFunction(StringRef(name), funcType)); + Value *Operands[2] = { + CI->getOperand(0), + CI->getOperand(1) + }; + CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high"); + nCI->insertBefore(CI); + CI->replaceAllUsesWith(nCI); + } +} + +bool +AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) { + return (CI != NULL + && CI->getOperand(CI->getNumOperands() - 1)->getName() + == "__amdil_get_local_size_int"); +} + +bool +AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) { + if (!CI) { + return false; + } + if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX + && (mSTM->getDeviceName() == "cayman")) { + return false; + } + return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) + == "__amdil_improved_div"; +} + +void +AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) { + assert(convertAccurateDivide(CI) + && "expanding accurate divide can only happen if it is expandable!"); + BinaryOperator *divOp = + BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0), + CI->getOperand(1), "fdiv32", CI); + CI->replaceAllUsesWith(divOp); +} + +bool +AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) { + if (optLevel != CodeGenOpt::None) { + return false; + } + + if (!CI) { + return false; + } + + unsigned funcNameIdx = 0; + funcNameIdx = CI->getNumOperands() - 1; + StringRef calleeName = CI->getOperand(funcNameIdx)->getName(); + if (calleeName != "__amdil_image2d_read_norm" + && calleeName != "__amdil_image2d_read_unnorm" + && 
calleeName != "__amdil_image3d_read_norm" + && calleeName != "__amdil_image3d_read_unnorm") { + return false; + } + + unsigned samplerIdx = 2; + samplerIdx = 1; + Value *sampler = CI->getOperand(samplerIdx); + LoadInst *lInst = dyn_cast<LoadInst>(sampler); + if (!lInst) { + return false; + } + + if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return false; + } + + GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand()); + // If we are loading from what is not a global value, then we + // fail and return. + if (!gv) { + return false; + } + + // If we don't have an initializer or we have an initializer and + // the initializer is not a 32bit integer, we fail. + if (!gv->hasInitializer() + || !gv->getInitializer()->getType()->isIntegerTy(32)) { + return false; + } + + // Now that we have the global variable initializer, lets replace + // all uses of the load instruction with the samplerVal and + // reparse the __amdil_is_constant() function. + Constant *samplerVal = gv->getInitializer(); + lInst->replaceAllUsesWith(samplerVal); + return true; +} + +bool +AMDGPUPeepholeOpt::doInitialization(Module &M) { + return false; +} + +bool +AMDGPUPeepholeOpt::doFinalization(Module &M) { + return false; +} + +void +AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineFunctionAnalysis>(); + FunctionPass::getAnalysisUsage(AU); + AU.setPreservesAll(); +} + +size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) { + size_t size = 0; + if (!T) { + return size; + } + switch (T->getTypeID()) { + case Type::X86_FP80TyID: + case Type::FP128TyID: + case Type::PPC_FP128TyID: + case Type::LabelTyID: + assert(0 && "These types are not supported by this backend"); + default: + case Type::FloatTyID: + case Type::DoubleTyID: + size = T->getPrimitiveSizeInBits() >> 3; + break; + case Type::PointerTyID: + size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr); + break; + case Type::IntegerTyID: + 
size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr); + break; + case Type::StructTyID: + size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr); + break; + case Type::ArrayTyID: + size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr); + break; + case Type::FunctionTyID: + size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr); + break; + case Type::VectorTyID: + size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr); + break; + }; + return size; +} + +size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST, + bool dereferencePtr) { + size_t size = 0; + if (!ST) { + return size; + } + Type *curType; + StructType::element_iterator eib; + StructType::element_iterator eie; + for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) { + curType = *eib; + size += getTypeSize(curType, dereferencePtr); + } + return size; +} + +size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT, + bool dereferencePtr) { + return IT ? (IT->getBitWidth() >> 3) : 0; +} + +size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT, + bool dereferencePtr) { + assert(0 && "Should not be able to calculate the size of an function type"); + return 0; +} + +size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT, + bool dereferencePtr) { + return (size_t)(AT ? (getTypeSize(AT->getElementType(), + dereferencePtr) * AT->getNumElements()) + : 0); +} + +size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT, + bool dereferencePtr) { + return VT ? 
(VT->getBitWidth() >> 3) : 0; +} + +size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT, + bool dereferencePtr) { + if (!PT) { + return 0; + } + Type *CT = PT->getElementType(); + if (CT->getTypeID() == Type::StructTyID && + PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + return getTypeSize(dyn_cast<StructType>(CT)); + } else if (dereferencePtr) { + size_t size = 0; + for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) { + size += getTypeSize(PT->getContainedType(x), dereferencePtr); + } + return size; + } else { + return 4; + } +} + +size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT, + bool dereferencePtr) { + //assert(0 && "Should not be able to calculate the size of an opaque type"); + return 4; +} diff --git a/lib/Target/R600/AMDILRegisterInfo.td b/lib/Target/R600/AMDILRegisterInfo.td new file mode 100644 index 0000000000..b9d033432e --- /dev/null +++ b/lib/Target/R600/AMDILRegisterInfo.td @@ -0,0 +1,107 @@ +//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +// +// Declarations that describe the AMDIL register file +// +//===----------------------------------------------------------------------===// + +class AMDILReg<bits<16> num, string n> : Register<n> { + field bits<16> Value; + let Value = num; + let Namespace = "AMDGPU"; +} + +// We will start with 8 registers for each class before expanding to more +// Since the swizzle is added based on the register class, we can leave it +// off here and just specify different registers for different register classes +def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>; +def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>; +def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>; +def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>; +def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>; +def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>; +def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>; +def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>; +def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>; +def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>; +def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>; +def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>; +def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>; +def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>; +def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>; +def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>; +def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>; +def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>; +def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>; +def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>; + +// All registers between 1000 and 1024 are reserved and cannot be used +// unless commented in this section +// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local ID's +// r1020 is used to hold the frame index for local arrays +// r1019 is used to hold the dynamic stack allocation pointer +// r1018 is used as a temporary register for handwritten code +// r1017 is used as a 
temporary register for handwritten code +// r1016 is used as a temporary register for load/store code +// r1015 is used as a temporary register for data segment offset +// r1014 is used as a temporary register for store code +// r1013 is used as the section data pointer register +// r1012-r1010 and r1001-r1008 are used for temporary I/O registers +// r1009 is used as the frame pointer register +// r999 is used as the mem register. +// r998 is used as the return address register. +//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>; +//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>; +//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>; +//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>; +//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>; +//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>; +def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>; +def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>; +def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>; +def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>; +def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>; +def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>; +def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>; +def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>; +def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>; +def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>; +def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>; +def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>; +def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>; +def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>; +def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>; +def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>; +def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>; +def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>; +def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>; +def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>; +def RA 
: AMDILReg<998, "r998">, DwarfRegNum<[998]>; +def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>; +def GPRI16 : RegisterClass<"AMDGPU", [i16], 16, + (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { + let AltOrders = [(add (sequence "R%u", 1, 20))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRI32 : RegisterClass<"AMDGPU", [i32], 32, + (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { + let AltOrders = [(add (sequence "R%u", 1, 20))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRF32 : RegisterClass<"AMDGPU", [f32], 32, + (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { + let AltOrders = [(add (sequence "R%u", 1, 20))]; + let AltOrderSelect = [{ + return 1; + }]; + } diff --git a/lib/Target/R600/AMDILSIDevice.cpp b/lib/Target/R600/AMDILSIDevice.cpp new file mode 100644 index 0000000000..0d1de3d11e --- /dev/null +++ b/lib/Target/R600/AMDILSIDevice.cpp @@ -0,0 +1,48 @@ +//===-- AMDILSIDevice.cpp - Device Info for Southern Islands GPUs ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//==-----------------------------------------------------------------------===// +#include "AMDILSIDevice.h" +#include "AMDGPUSubtarget.h" +#include "AMDILEvergreenDevice.h" +#include "AMDILNIDevice.h" + +using namespace llvm; + +AMDGPUSIDevice::AMDGPUSIDevice(AMDGPUSubtarget *ST) + : AMDGPUEvergreenDevice(ST) { +} +AMDGPUSIDevice::~AMDGPUSIDevice() { +} + +size_t +AMDGPUSIDevice::getMaxLDSSize() const { + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return MAX_LDS_SIZE_900; + } else { + return 0; + } +} + +uint32_t +AMDGPUSIDevice::getGeneration() const { + return AMDGPUDeviceInfo::HD7XXX; +} + +std::string +AMDGPUSIDevice::getDataLayout() const { + return std::string( + "e" + "-p:64:64:64" + "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64" + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128" + "-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" + "-v2048:2048:2048" + "-n32:64" + ); +} diff --git a/lib/Target/R600/AMDILSIDevice.h b/lib/Target/R600/AMDILSIDevice.h new file mode 100644 index 0000000000..5b2cb25022 --- /dev/null +++ b/lib/Target/R600/AMDILSIDevice.h @@ -0,0 +1,39 @@ +//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface for the subtarget data classes. +/// +/// This file will define the interface that each generation needs to +/// implement in order to correctly answer queries on the capabilities of the +/// specific hardware. 
+//===---------------------------------------------------------------------===// +#ifndef AMDILSIDEVICE_H +#define AMDILSIDEVICE_H +#include "AMDILEvergreenDevice.h" + +namespace llvm { +class AMDGPUSubtarget; +//===---------------------------------------------------------------------===// +// SI generation of devices and their respective sub classes +//===---------------------------------------------------------------------===// + +/// \brief The AMDGPUSIDevice is the base class for all Southern Island series +/// of cards. +class AMDGPUSIDevice : public AMDGPUEvergreenDevice { +public: + AMDGPUSIDevice(AMDGPUSubtarget*); + virtual ~AMDGPUSIDevice(); + virtual size_t getMaxLDSSize() const; + virtual uint32_t getGeneration() const; + virtual std::string getDataLayout() const; +}; + +} // namespace llvm +#endif // AMDILSIDEVICE_H diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt new file mode 100644 index 0000000000..63c59e1cb5 --- /dev/null +++ b/lib/Target/R600/CMakeLists.txt @@ -0,0 +1,56 @@ +set(LLVM_TARGET_DEFINITIONS AMDGPU.td) + +tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) +tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) +tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) +tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic) +tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter -mc-emitter) +tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) +tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) +add_public_tablegen_target(AMDGPUCommonTableGen) + +add_llvm_target(R600CodeGen + AMDIL7XXDevice.cpp + AMDILCFGStructurizer.cpp + AMDILDevice.cpp + AMDILDeviceInfo.cpp + AMDILEvergreenDevice.cpp + AMDILIntrinsicInfo.cpp + AMDILISelDAGToDAG.cpp + AMDILISelLowering.cpp + AMDILNIDevice.cpp + AMDILPeepholeOptimizer.cpp + AMDILSIDevice.cpp + AMDGPUAsmPrinter.cpp + AMDGPUFrameLowering.cpp + 
AMDGPUIndirectAddressing.cpp + AMDGPUMCInstLower.cpp + AMDGPUSubtarget.cpp + AMDGPUStructurizeCFG.cpp + AMDGPUTargetMachine.cpp + AMDGPUISelLowering.cpp + AMDGPUConvertToISA.cpp + AMDGPUInstrInfo.cpp + AMDGPURegisterInfo.cpp + R600ExpandSpecialInstrs.cpp + R600InstrInfo.cpp + R600ISelLowering.cpp + R600MachineFunctionInfo.cpp + R600MachineScheduler.cpp + R600RegisterInfo.cpp + SIAnnotateControlFlow.cpp + SIInsertWaits.cpp + SIInstrInfo.cpp + SIISelLowering.cpp + SILowerControlFlow.cpp + SIMachineFunctionInfo.cpp + SIRegisterInfo.cpp + ) + +add_dependencies(LLVMR600CodeGen intrinsics_gen) + +add_subdirectory(InstPrinter) +add_subdirectory(TargetInfo) +add_subdirectory(MCTargetDesc) diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp new file mode 100644 index 0000000000..10547a5988 --- /dev/null +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -0,0 +1,172 @@ +//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCExpr.h" + +using namespace llvm; + +void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot) { + printInstruction(MI, OS); + + printAnnotation(OS, Annot); +} + +void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + switch (Op.getReg()) { + // This is the default predicate state, so we don't need to print it. 
+ case AMDGPU::PRED_SEL_OFF: break; + default: O << getRegisterName(Op.getReg()); break; + } + } else if (Op.isImm()) { + O << Op.getImm(); + } else if (Op.isFPImm()) { + O << Op.getFPImm(); + } else if (Op.isExpr()) { + const MCExpr *Exp = Op.getExpr(); + Exp->print(O); + } else { + assert(!"unknown operand type in printOperand"); + } +} + +void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + + if (Imm == 2) { + O << "P0"; + } else if (Imm == 1) { + O << "P20"; + } else if (Imm == 0) { + O << "P10"; + } else { + assert(!"Invalid interpolation parameter slot"); + } +} + +void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printOperand(MI, OpNo, O); + O << ", "; + printOperand(MI, OpNo + 1, O); +} + +void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, + raw_ostream &O, StringRef Asm) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm()); + if (Op.getImm() == 1) { + O << Asm; + } +} + +void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "|"); +} + +void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "_SAT"); +} + +void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + union Literal { + float f; + int32_t i; + } L; + + L.i = MI->getOperand(OpNo).getImm(); + O << L.i << "(" << L.f << ")"; +} + +void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, " *"); +} + +void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "-"); +} + +void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + switch (MI->getOperand(OpNo).getImm()) { + default: break; + case 1: + O << " * 2.0"; + break; + case 2: + O << " * 4.0"; + 
break; + case 3: + O << " / 2.0"; + break; + } +} + +void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "+"); +} + +void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "ExecMask,"); +} + +void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "Pred,"); +} + +void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.getImm() == 0) { + O << " (MASKED)"; + } +} + +void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const char * chans = "XYZW"; + int sel = MI->getOperand(OpNo).getImm(); + + int chan = sel & 3; + sel >>= 2; + + if (sel >= 512) { + sel -= 512; + int cb = sel >> 12; + sel &= 4095; + O << cb << "[" << sel << "]"; + } else if (sel >= 448) { + sel -= 448; + O << sel; + } else if (sel >= 0){ + O << sel; + } + + if (sel >= 0) + O << "." << chans[chan]; +} + +#include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h new file mode 100644 index 0000000000..767a7082cc --- /dev/null +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -0,0 +1,54 @@ +//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUINSTPRINTER_H +#define AMDGPUINSTPRINTER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class AMDGPUInstPrinter : public MCInstPrinter { +public: + AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + //Autogenerated by tblgen + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + +private: + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm); + void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); +}; + +} // End namespace llvm + +#endif // AMDGPUINSTRPRINTER_H diff --git a/lib/Target/R600/InstPrinter/CMakeLists.txt 
b/lib/Target/R600/InstPrinter/CMakeLists.txt new file mode 100644 index 0000000000..069c55ba94 --- /dev/null +++ b/lib/Target/R600/InstPrinter/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMR600AsmPrinter + AMDGPUInstPrinter.cpp + ) + +add_dependencies(LLVMR600AsmPrinter AMDGPUCommonTableGen) diff --git a/lib/Target/R600/InstPrinter/LLVMBuild.txt b/lib/Target/R600/InstPrinter/LLVMBuild.txt new file mode 100644 index 0000000000..ec0be89f10 --- /dev/null +++ b/lib/Target/R600/InstPrinter/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/R600/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = R600AsmPrinter +parent = R600 +required_libraries = MC Support +add_to_library_groups = R600 + diff --git a/lib/Target/R600/InstPrinter/Makefile b/lib/Target/R600/InstPrinter/Makefile new file mode 100644 index 0000000000..a794cc1124 --- /dev/null +++ b/lib/Target/R600/InstPrinter/Makefile @@ -0,0 +1,15 @@ +#===- lib/Target/R600/AsmPrinter/Makefile ------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. 
+LIBRARYNAME = LLVMR600AsmPrinter + +# Hack: we need to include 'main' x86 target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/LLVMBuild.txt b/lib/Target/R600/LLVMBuild.txt new file mode 100644 index 0000000000..f2a7554e52 --- /dev/null +++ b/lib/Target/R600/LLVMBuild.txt @@ -0,0 +1,32 @@ +;===- ./lib/Target/AMDIL/LLVMBuild.txt -------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = InstPrinter MCTargetDesc TargetInfo + +[component_0] +type = TargetGroup +name = R600 +parent = Target +has_asmprinter = 1 + +[component_1] +type = Library +name = R600CodeGen +parent = R600 +required_libraries = AsmPrinter CodeGen Core SelectionDAG Support Target MC R600AsmPrinter R600Desc R600Info +add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp new file mode 100644 index 0000000000..98fca43267 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -0,0 +1,90 @@ +//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +namespace { + +class AMDGPUMCObjectWriter : public MCObjectWriter { +public: + AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { } + virtual void ExecutePostLayoutBinding(MCAssembler &Asm, + const MCAsmLayout &Layout) { + //XXX: Implement if necessary. + } + virtual void RecordRelocation(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue) { + assert(!"Not implemented"); + } + + virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout); + +}; + +class AMDGPUAsmBackend : public MCAsmBackend { +public: + AMDGPUAsmBackend(const Target &T) + : MCAsmBackend() {} + + virtual AMDGPUMCObjectWriter *createObjectWriter(raw_ostream &OS) const; + virtual unsigned getNumFixupKinds() const { return 0; }; + virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value) const; + virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + return false; + } + virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const { + assert(!"Not implemented"); + } + virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; } + virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const { + return true; + } +}; + +} //End anonymous namespace + +void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm, + const MCAsmLayout &Layout) { + for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) { + Asm.writeSectionData(I, Layout); + } +} + 
+MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT, + StringRef CPU) { + return new AMDGPUAsmBackend(T); +} + +AMDGPUMCObjectWriter * AMDGPUAsmBackend::createObjectWriter( + raw_ostream &OS) const { + return new AMDGPUMCObjectWriter(OS); +} + +void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value) const { + + uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); + assert(Fixup.getKind() == FK_PCRel_4); + *Dst = (Value - 4) / 4; +} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp new file mode 100644 index 0000000000..4d3d3e7945 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -0,0 +1,85 @@ +//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCAsmInfo.h" + +using namespace llvm; +AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() { + HasSingleParameterDotFile = false; + WeakDefDirective = 0; + //===------------------------------------------------------------------===// + HasSubsectionsViaSymbols = true; + HasMachoZeroFillDirective = false; + HasMachoTBSSDirective = false; + HasStaticCtorDtorReferenceInStaticMode = false; + LinkerRequiresNonEmptyDwarfLines = true; + MaxInstLength = 16; + PCSymbol = "$"; + SeparatorString = "\n"; + CommentColumn = 40; + CommentString = ";"; + LabelSuffix = ":"; + GlobalPrefix = "@"; + PrivateGlobalPrefix = ";."; + LinkerPrivateGlobalPrefix = "!"; + InlineAsmStart = ";#ASMSTART"; + InlineAsmEnd = ";#ASMEND"; + AssemblerDialect = 0; + AllowQuotesInName = false; + AllowNameToStartWithDigit = false; + AllowPeriodsInName = false; + + //===--- Data 
Emission Directives -------------------------------------===// + ZeroDirective = ".zero"; + AsciiDirective = ".ascii\t"; + AscizDirective = ".asciz\t"; + Data8bitsDirective = ".byte\t"; + Data16bitsDirective = ".short\t"; + Data32bitsDirective = ".long\t"; + Data64bitsDirective = ".quad\t"; + GPRel32Directive = 0; + SunStyleELFSectionSwitchSyntax = true; + UsesELFSectionDirectiveForBSS = true; + HasMicrosoftFastStdCallMangling = false; + + //===--- Alignment Information ----------------------------------------===// + AlignDirective = ".align\t"; + AlignmentIsInBytes = true; + TextAlignFillValue = 0; + + //===--- Global Variable Emission Directives --------------------------===// + GlobalDirective = ".global"; + ExternDirective = ".extern"; + HasSetDirective = false; + HasAggressiveSymbolFolding = true; + COMMDirectiveAlignmentIsInBytes = false; + HasDotTypeDotSizeDirective = false; + HasNoDeadStrip = true; + HasSymbolResolver = false; + WeakRefDirective = ".weakref\t"; + LinkOnceDirective = 0; + //===--- Dwarf Emission Directives -----------------------------------===// + HasLEB128 = true; + SupportsDebugInformation = true; + ExceptionsType = ExceptionHandling::None; + DwarfUsesInlineInfoSection = false; + DwarfSectionOffsetDirective = ".offset"; + +} + +const char* +AMDGPUMCAsmInfo::getDataASDirective(unsigned int Size, unsigned int AS) const { + return 0; +} + +const MCSection* +AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const { + return 0; +} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h new file mode 100644 index 0000000000..3ad0fa6824 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -0,0 +1,30 @@ +//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUMCASMINFO_H +#define AMDGPUMCASMINFO_H + +#include "llvm/MC/MCAsmInfo.h" +namespace llvm { + +class Target; +class StringRef; + +class AMDGPUMCAsmInfo : public MCAsmInfo { +public: + explicit AMDGPUMCAsmInfo(const Target &T, StringRef &TT); + const char* getDataASDirective(unsigned int Size, unsigned int AS) const; + const MCSection* getNonexecutableStackSection(MCContext &CTX) const; +}; +} // namespace llvm +#endif // AMDGPUMCASMINFO_H diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h new file mode 100644 index 0000000000..cd3a7ce65a --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -0,0 +1,40 @@ +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief CodeEmitter interface for R600 and SI codegen. 
+// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUCODEEMITTER_H +#define AMDGPUCODEEMITTER_H + +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class MCInst; +class MCOperand; + +class AMDGPUMCCodeEmitter : public MCCodeEmitter { +public: + + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups) const; + + virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups) const { + return 0; + } +}; + +} // End namespace llvm + +#endif // AMDGPUCODEEMITTER_H diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp new file mode 100644 index 0000000000..072ee49b63 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -0,0 +1,113 @@ +//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This file provides AMDGPU specific target descriptions. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "AMDGPUMCAsmInfo.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +#define GET_INSTRINFO_MC_DESC +#include "AMDGPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "AMDGPUGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "AMDGPUGenRegisterInfo.inc" + +using namespace llvm; + +static MCInstrInfo *createAMDGPUMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitAMDGPUMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAMDGPUMCRegisterInfo(X, 0); + return X; +} + +static MCSubtargetInfo *createAMDGPUMCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS) { + MCSubtargetInfo * X = new MCSubtargetInfo(); + InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + X->InitMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCInstPrinter *createAMDGPUMCInstPrinter(const Target &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) { + return new AMDGPUInstPrinter(MAI, MII, MRI); +} + +static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) { + return createSIMCCodeEmitter(MCII, MRI, STI, Ctx); + } else { + return 
createR600MCCodeEmitter(MCII, MRI, STI, Ctx); + } +} + +static MCStreamer *createMCStreamer(const Target &T, StringRef TT, + MCContext &Ctx, MCAsmBackend &MAB, + raw_ostream &_OS, + MCCodeEmitter *_Emitter, + bool RelaxAll, + bool NoExecStack) { + return createPureStreamer(Ctx, MAB, _OS, _Emitter); +} + +extern "C" void LLVMInitializeR600TargetMC() { + + RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget); + + TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo); + + TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo); + + TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo); + + TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo); + + TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter); + + TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter); + + TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend); + + TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer); +} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h new file mode 100644 index 0000000000..363a4af3f3 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -0,0 +1,55 @@ +//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Provides AMDGPU specific target descriptions. 
+// +//===----------------------------------------------------------------------===// +// + +#ifndef AMDGPUMCTARGETDESC_H +#define AMDGPUMCTARGETDESC_H + +#include "llvm/ADT/StringRef.h" + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCRegisterInfo; +class MCSubtargetInfo; +class Target; + +extern Target TheAMDGPUTarget; + +MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx); + +MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx); + +MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT, + StringRef CPU); +} // End llvm namespace + +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" + +#define GET_INSTRINFO_ENUM +#include "AMDGPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "AMDGPUGenSubtargetInfo.inc" + +#endif // AMDGPUMCTARGETDESC_H diff --git a/lib/Target/R600/MCTargetDesc/CMakeLists.txt b/lib/Target/R600/MCTargetDesc/CMakeLists.txt new file mode 100644 index 0000000000..37e714c2e7 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,10 @@ + +add_llvm_library(LLVMR600Desc + AMDGPUAsmBackend.cpp + AMDGPUMCTargetDesc.cpp + AMDGPUMCAsmInfo.cpp + R600MCCodeEmitter.cpp + SIMCCodeEmitter.cpp + ) + +add_dependencies(LLVMR600Desc AMDGPUCommonTableGen) diff --git a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 0000000000..b1beab0bb3 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. 
+; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = R600Desc +parent = R600 +required_libraries = R600AsmPrinter R600Info MC +add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/Makefile b/lib/Target/R600/MCTargetDesc/Makefile new file mode 100644 index 0000000000..8894a7607f --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/R600/MCTargetDesc/Makefile ----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600Desc + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp new file mode 100644 index 0000000000..d20716000d --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -0,0 +1,581 @@ +//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// This code emitter outputs bytecode that is understood by the r600g driver +/// in the Mesa [1] project. 
The bytecode is very similar to the hardware's ISA, +/// but it still needs to be run through a finalizer in order to be executed +/// by the GPU. +/// +/// [1] http://www.mesa3d.org/ +// +//===----------------------------------------------------------------------===// + +#include "R600Defines.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/raw_ostream.h" +#include <stdio.h> + +#define SRC_BYTE_COUNT 11 +#define DST_BYTE_COUNT 5 + +using namespace llvm; + +namespace { + +class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { + R600MCCodeEmitter(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION; + void operator=(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION; + const MCInstrInfo &MCII; + const MCRegisterInfo &MRI; + const MCSubtargetInfo &STI; + MCContext &Ctx; + +public: + + R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, + const MCSubtargetInfo &sti, MCContext &ctx) + : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { } + + /// \brief Encode the instruction and write it to the OS. + virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// \returns the encoding for an MCOperand. 
+ virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups) const; +private: + + void EmitALUInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, + raw_ostream &OS) const; + void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const; + void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx, + raw_ostream &OS) const; + void EmitDst(const MCInst &MI, raw_ostream &OS) const; + void EmitTexInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, + raw_ostream &OS) const; + void EmitFCInstr(const MCInst &MI, raw_ostream &OS) const; + + void EmitNullBytes(unsigned int byteCount, raw_ostream &OS) const; + + void EmitByte(unsigned int byte, raw_ostream &OS) const; + + void EmitTwoBytes(uint32_t bytes, raw_ostream &OS) const; + + void Emit(uint32_t value, raw_ostream &OS) const; + void Emit(uint64_t value, raw_ostream &OS) const; + + unsigned getHWRegChan(unsigned reg) const; + unsigned getHWReg(unsigned regNo) const; + + bool isFCOp(unsigned opcode) const; + bool isTexOp(unsigned opcode) const; + bool isFlagSet(const MCInst &MI, unsigned Operand, unsigned Flag) const; + +}; + +} // End anonymous namespace + +enum RegElement { + ELEMENT_X = 0, + ELEMENT_Y, + ELEMENT_Z, + ELEMENT_W +}; + +enum InstrTypes { + INSTR_ALU = 0, + INSTR_TEX, + INSTR_FC, + INSTR_NATIVE, + INSTR_VTX, + INSTR_EXPORT +}; + +enum FCInstr { + FC_IF_PREDICATE = 0, + FC_ELSE, + FC_ENDIF, + FC_BGNLOOP, + FC_ENDLOOP, + FC_BREAK_PREDICATE, + FC_CONTINUE +}; + +enum TextureTypes { + TEXTURE_1D = 1, + TEXTURE_2D, + TEXTURE_3D, + TEXTURE_CUBE, + TEXTURE_RECT, + TEXTURE_SHADOW1D, + TEXTURE_SHADOW2D, + TEXTURE_SHADOWRECT, + TEXTURE_1D_ARRAY, + TEXTURE_2D_ARRAY, + TEXTURE_SHADOW1D_ARRAY, + TEXTURE_SHADOW2D_ARRAY +}; + +MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new R600MCCodeEmitter(MCII, MRI, STI, Ctx); +} + +void 
R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const { + if (isTexOp(MI.getOpcode())) { + EmitTexInstr(MI, Fixups, OS); + } else if (isFCOp(MI.getOpcode())){ + EmitFCInstr(MI, OS); + } else if (MI.getOpcode() == AMDGPU::RETURN || + MI.getOpcode() == AMDGPU::BUNDLE || + MI.getOpcode() == AMDGPU::KILL) { + return; + } else { + switch(MI.getOpcode()) { + case AMDGPU::RAT_WRITE_CACHELESS_32_eg: + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { + uint64_t inst = getBinaryCodeForInstr(MI, Fixups); + EmitByte(INSTR_NATIVE, OS); + Emit(inst, OS); + break; + } + case AMDGPU::CONSTANT_LOAD_eg: + case AMDGPU::VTX_READ_PARAM_8_eg: + case AMDGPU::VTX_READ_PARAM_16_eg: + case AMDGPU::VTX_READ_PARAM_32_eg: + case AMDGPU::VTX_READ_PARAM_128_eg: + case AMDGPU::VTX_READ_GLOBAL_8_eg: + case AMDGPU::VTX_READ_GLOBAL_32_eg: + case AMDGPU::VTX_READ_GLOBAL_128_eg: + case AMDGPU::TEX_VTX_CONSTBUF: + case AMDGPU::TEX_VTX_TEXBUF : { + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); + uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset + + EmitByte(INSTR_VTX, OS); + Emit(InstWord01, OS); + Emit(InstWord2, OS); + break; + } + case AMDGPU::EG_ExportSwz: + case AMDGPU::R600_ExportSwz: + case AMDGPU::EG_ExportBuf: + case AMDGPU::R600_ExportBuf: { + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); + EmitByte(INSTR_EXPORT, OS); + Emit(Inst, OS); + break; + } + + default: + EmitALUInstr(MI, Fixups, OS); + break; + } + } +} + +void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + raw_ostream &OS) const { + const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode()); + + // Emit instruction type + EmitByte(INSTR_ALU, OS); + + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); + + //older alu have different encoding for instructions with one or two src + //parameters. 
+ if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) && + !(MCDesc.TSFlags & R600_InstFlag::OP3)) { + uint64_t ISAOpCode = InstWord01 & (0x3FFULL << 39); + InstWord01 &= ~(0x3FFULL << 39); + InstWord01 |= ISAOpCode << 1; + } + + unsigned SrcNum = MCDesc.TSFlags & R600_InstFlag::OP3 ? 3 : + MCDesc.TSFlags & R600_InstFlag::OP2 ? 2 : 1; + + EmitByte(SrcNum, OS); + + const unsigned SrcOps[3][2] = { + {R600Operands::SRC0, R600Operands::SRC0_SEL}, + {R600Operands::SRC1, R600Operands::SRC1_SEL}, + {R600Operands::SRC2, R600Operands::SRC2_SEL} + }; + + for (unsigned SrcIdx = 0; SrcIdx < SrcNum; ++SrcIdx) { + unsigned RegOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][0]]; + unsigned SelOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][1]]; + EmitSrcISA(MI, RegOpIdx, SelOpIdx, OS); + } + + Emit(InstWord01, OS); + return; +} + +void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx, + raw_ostream &OS) const { + const MCOperand &MO = MI.getOperand(OpIdx); + union { + float f; + uint32_t i; + } Value; + Value.i = 0; + // Emit the source select (2 bytes). For GPRs, this is the register index. + // For other potential instruction operands, (e.g. constant registers) the + // value of the source select is defined in the r600isa docs. + if (MO.isReg()) { + unsigned reg = MO.getReg(); + EmitTwoBytes(getHWReg(reg), OS); + if (reg == AMDGPU::ALU_LITERAL_X) { + unsigned ImmOpIndex = MI.getNumOperands() - 1; + MCOperand ImmOp = MI.getOperand(ImmOpIndex); + if (ImmOp.isFPImm()) { + Value.f = ImmOp.getFPImm(); + } else { + assert(ImmOp.isImm()); + Value.i = ImmOp.getImm(); + } + } + } else { + // XXX: Handle other operand types. 
+ EmitTwoBytes(0, OS); + } + + // Emit the source channel (1 byte) + if (MO.isReg()) { + EmitByte(getHWRegChan(MO.getReg()), OS); + } else { + EmitByte(0, OS); + } + + // XXX: Emit isNegated (1 byte) + if ((!(isFlagSet(MI, OpIdx, MO_FLAG_ABS))) + && (isFlagSet(MI, OpIdx, MO_FLAG_NEG) || + (MO.isReg() && + (MO.getReg() == AMDGPU::NEG_ONE || MO.getReg() == AMDGPU::NEG_HALF)))){ + EmitByte(1, OS); + } else { + EmitByte(0, OS); + } + + // Emit isAbsolute (1 byte) + if (isFlagSet(MI, OpIdx, MO_FLAG_ABS)) { + EmitByte(1, OS); + } else { + EmitByte(0, OS); + } + + // XXX: Emit relative addressing mode (1 byte) + EmitByte(0, OS); + + // Emit kc_bank, This will be adjusted later by r600_asm + EmitByte(0, OS); + + // Emit the literal value, if applicable (4 bytes). + Emit(Value.i, OS); + +} + +void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, + unsigned SelOpIdx, raw_ostream &OS) const { + const MCOperand &RegMO = MI.getOperand(RegOpIdx); + const MCOperand &SelMO = MI.getOperand(SelOpIdx); + + union { + float f; + uint32_t i; + } InlineConstant; + InlineConstant.i = 0; + // Emit source type (1 byte) and source select (4 bytes). For GPRs type is 0 + // and select is 0 (GPR index is encoded in the instr encoding. For constants + // type is 1 and select is the original const select passed from the driver. + unsigned Reg = RegMO.getReg(); + if (Reg == AMDGPU::ALU_CONST) { + EmitByte(1, OS); + uint32_t Sel = SelMO.getImm(); + Emit(Sel, OS); + } else { + EmitByte(0, OS); + Emit((uint32_t)0, OS); + } + + if (Reg == AMDGPU::ALU_LITERAL_X) { + unsigned ImmOpIndex = MI.getNumOperands() - 1; + MCOperand ImmOp = MI.getOperand(ImmOpIndex); + if (ImmOp.isFPImm()) { + InlineConstant.f = ImmOp.getFPImm(); + } else { + assert(ImmOp.isImm()); + InlineConstant.i = ImmOp.getImm(); + } + } + + // Emit the literal value, if applicable (4 bytes). 
+ Emit(InlineConstant.i, OS); +} + +void R600MCCodeEmitter::EmitTexInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + raw_ostream &OS) const { + + unsigned Opcode = MI.getOpcode(); + bool hasOffsets = (Opcode == AMDGPU::TEX_LD); + unsigned OpOffset = hasOffsets ? 3 : 0; + int64_t Resource = MI.getOperand(OpOffset + 2).getImm(); + int64_t Sampler = MI.getOperand(OpOffset + 3).getImm(); + int64_t TextureType = MI.getOperand(OpOffset + 4).getImm(); + unsigned srcSelect[4] = {0, 1, 2, 3}; + + // Emit instruction type + EmitByte(1, OS); + + // Emit instruction + EmitByte(getBinaryCodeForInstr(MI, Fixups), OS); + + // Emit resource id + EmitByte(Resource, OS); + + // Emit source register + EmitByte(getHWReg(MI.getOperand(1).getReg()), OS); + + // XXX: Emit src isRelativeAddress + EmitByte(0, OS); + + // Emit destination register + EmitByte(getHWReg(MI.getOperand(0).getReg()), OS); + + // XXX: Emit dst isRelativeAddress + EmitByte(0, OS); + + // XXX: Emit dst select + EmitByte(0, OS); // X + EmitByte(1, OS); // Y + EmitByte(2, OS); // Z + EmitByte(3, OS); // W + + // XXX: Emit lod bias + EmitByte(0, OS); + + // XXX: Emit coord types + unsigned coordType[4] = {1, 1, 1, 1}; + + if (TextureType == TEXTURE_RECT + || TextureType == TEXTURE_SHADOWRECT) { + coordType[ELEMENT_X] = 0; + coordType[ELEMENT_Y] = 0; + } + + if (TextureType == TEXTURE_1D_ARRAY + || TextureType == TEXTURE_SHADOW1D_ARRAY) { + if (Opcode == AMDGPU::TEX_SAMPLE_C_L || Opcode == AMDGPU::TEX_SAMPLE_C_LB) { + coordType[ELEMENT_Y] = 0; + } else { + coordType[ELEMENT_Z] = 0; + srcSelect[ELEMENT_Z] = ELEMENT_Y; + } + } else if (TextureType == TEXTURE_2D_ARRAY + || TextureType == TEXTURE_SHADOW2D_ARRAY) { + coordType[ELEMENT_Z] = 0; + } + + for (unsigned i = 0; i < 4; i++) { + EmitByte(coordType[i], OS); + } + + // XXX: Emit offsets + if (hasOffsets) + for (unsigned i = 2; i < 5; i++) + EmitByte(MI.getOperand(i).getImm()<<1, OS); + else + EmitNullBytes(3, OS); + + // Emit sampler id + EmitByte(Sampler, 
OS); + + // XXX:Emit source select + if ((TextureType == TEXTURE_SHADOW1D + || TextureType == TEXTURE_SHADOW2D + || TextureType == TEXTURE_SHADOWRECT + || TextureType == TEXTURE_SHADOW1D_ARRAY) + && Opcode != AMDGPU::TEX_SAMPLE_C_L + && Opcode != AMDGPU::TEX_SAMPLE_C_LB) { + srcSelect[ELEMENT_W] = ELEMENT_Z; + } + + for (unsigned i = 0; i < 4; i++) { + EmitByte(srcSelect[i], OS); + } +} + +void R600MCCodeEmitter::EmitFCInstr(const MCInst &MI, raw_ostream &OS) const { + + // Emit instruction type + EmitByte(INSTR_FC, OS); + + // Emit SRC + unsigned NumOperands = MI.getNumOperands(); + if (NumOperands > 0) { + assert(NumOperands == 1); + EmitSrc(MI, 0, OS); + } else { + EmitNullBytes(SRC_BYTE_COUNT, OS); + } + + // Emit FC Instruction + enum FCInstr instr; + switch (MI.getOpcode()) { + case AMDGPU::PREDICATED_BREAK: + instr = FC_BREAK_PREDICATE; + break; + case AMDGPU::CONTINUE: + instr = FC_CONTINUE; + break; + case AMDGPU::IF_PREDICATE_SET: + instr = FC_IF_PREDICATE; + break; + case AMDGPU::ELSE: + instr = FC_ELSE; + break; + case AMDGPU::ENDIF: + instr = FC_ENDIF; + break; + case AMDGPU::ENDLOOP: + instr = FC_ENDLOOP; + break; + case AMDGPU::WHILELOOP: + instr = FC_BGNLOOP; + break; + default: + abort(); + break; + } + EmitByte(instr, OS); +} + +void R600MCCodeEmitter::EmitNullBytes(unsigned int ByteCount, + raw_ostream &OS) const { + + for (unsigned int i = 0; i < ByteCount; i++) { + EmitByte(0, OS); + } +} + +void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const { + OS.write((uint8_t) Byte & 0xff); +} + +void R600MCCodeEmitter::EmitTwoBytes(unsigned int Bytes, + raw_ostream &OS) const { + OS.write((uint8_t) (Bytes & 0xff)); + OS.write((uint8_t) ((Bytes >> 8) & 0xff)); +} + +void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { + for (unsigned i = 0; i < 4; i++) { + OS.write((uint8_t) ((Value >> (8 * i)) & 0xff)); + } +} + +void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { + for (unsigned i = 0; i < 8; i++) 
{ + EmitByte((Value >> (8 * i)) & 0xff, OS); + } +} + +unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const { + return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT; +} + +unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { + return MRI.getEncodingValue(RegNo) & HW_REG_MASK; +} + +uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixup) const { + if (MO.isReg()) { + if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) { + return MRI.getEncodingValue(MO.getReg()); + } else { + return getHWReg(MO.getReg()); + } + } else if (MO.isImm()) { + return MO.getImm(); + } else { + assert(0); + return 0; + } +} + +//===----------------------------------------------------------------------===// +// Encoding helper functions +//===----------------------------------------------------------------------===// + +bool R600MCCodeEmitter::isFCOp(unsigned opcode) const { + switch(opcode) { + default: return false; + case AMDGPU::PREDICATED_BREAK: + case AMDGPU::CONTINUE: + case AMDGPU::IF_PREDICATE_SET: + case AMDGPU::ELSE: + case AMDGPU::ENDIF: + case AMDGPU::ENDLOOP: + case AMDGPU::WHILELOOP: + return true; + } +} + +bool R600MCCodeEmitter::isTexOp(unsigned opcode) const { + switch(opcode) { + default: return false; + case AMDGPU::TEX_LD: + case AMDGPU::TEX_GET_TEXTURE_RESINFO: + case AMDGPU::TEX_SAMPLE: + case AMDGPU::TEX_SAMPLE_C: + case AMDGPU::TEX_SAMPLE_L: + case AMDGPU::TEX_SAMPLE_C_L: + case AMDGPU::TEX_SAMPLE_LB: + case AMDGPU::TEX_SAMPLE_C_LB: + case AMDGPU::TEX_SAMPLE_G: + case AMDGPU::TEX_SAMPLE_C_G: + case AMDGPU::TEX_GET_GRADIENTS_H: + case AMDGPU::TEX_GET_GRADIENTS_V: + case AMDGPU::TEX_SET_GRADIENTS_H: + case AMDGPU::TEX_SET_GRADIENTS_V: + return true; + } +} + +bool R600MCCodeEmitter::isFlagSet(const MCInst &MI, unsigned Operand, + unsigned Flag) const { + const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode()); + unsigned FlagIndex = GET_FLAG_OPERAND_IDX(MCDesc.TSFlags); + if (FlagIndex 
== 0) { + return false; + } + assert(MI.getOperand(FlagIndex).isImm()); + return !!((MI.getOperand(FlagIndex).getImm() >> + (NUM_MO_FLAGS * Operand)) & Flag); +} + +#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp new file mode 100644 index 0000000000..e27abccbe1 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -0,0 +1,203 @@ +//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The SI code emitter produces machine code that can be executed +/// directly on the GPU device. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +/// \brief Helper type used in encoding +typedef union { + int32_t I; + float F; +} IntFloatUnion; + +class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { + SIMCCodeEmitter(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION; + void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION; + const MCInstrInfo &MCII; + const MCRegisterInfo &MRI; + const MCSubtargetInfo &STI; + MCContext &Ctx; + + /// \brief Can this operand also contain immediate values? 
+ bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; + + /// \brief Encode an fp or int literal + uint32_t getLitEncoding(const MCOperand &MO) const; + +public: + SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, + const MCSubtargetInfo &sti, MCContext &ctx) + : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { } + + ~SIMCCodeEmitter() { } + + /// \brief Encode the instruction and write it to the OS. + virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// \returns the encoding for an MCOperand. + virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups) const; +}; + +} // End anonymous namespace + +MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new SIMCCodeEmitter(MCII, MRI, STI, Ctx); +} + +bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, + unsigned OpNo) const { + + unsigned RegClass = Desc.OpInfo[OpNo].RegClass; + return (AMDGPU::SSrc_32RegClassID == RegClass) || + (AMDGPU::SSrc_64RegClassID == RegClass) || + (AMDGPU::VSrc_32RegClassID == RegClass) || + (AMDGPU::VSrc_64RegClassID == RegClass); +} + +uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const { + + IntFloatUnion Imm; + if (MO.isImm()) + Imm.I = MO.getImm(); + else if (MO.isFPImm()) + Imm.F = MO.getFPImm(); + else + return ~0; + + if (Imm.I >= 0 && Imm.I <= 64) + return 128 + Imm.I; + + if (Imm.I >= -16 && Imm.I <= -1) + return 192 + abs(Imm.I); + + if (Imm.F == 0.5f) + return 240; + + if (Imm.F == -0.5f) + return 241; + + if (Imm.F == 1.0f) + return 242; + + if (Imm.F == -1.0f) + return 243; + + if (Imm.F == 2.0f) + return 244; + + if (Imm.F == -2.0f) + return 245; + + if (Imm.F == 4.0f) + return 246; + + if (Imm.F == -4.0f) + return 247; + + return 255; +} + +void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, 
+ SmallVectorImpl<MCFixup> &Fixups) const { + + uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + unsigned bytes = Desc.getSize(); + + for (unsigned i = 0; i < bytes; i++) { + OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); + } + + if (bytes > 4) + return; + + // Check for additional literals in SRC0/1/2 (Op 1/2/3) + for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { + + // Check if this operand should be encoded as [SV]Src + if (!isSrcOperand(Desc, i)) + continue; + + // Is this operand a literal immediate? + const MCOperand &Op = MI.getOperand(i); + if (getLitEncoding(Op) != 255) + continue; + + // Yes! Encode it + IntFloatUnion Imm; + if (Op.isImm()) + Imm.I = Op.getImm(); + else + Imm.F = Op.getFPImm(); + + for (unsigned j = 0; j < 4; j++) { + OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff)); + } + + // Only one literal value allowed + break; + } +} + +uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups) const { + if (MO.isReg()) + return MRI.getEncodingValue(MO.getReg()); + + if (MO.isExpr()) { + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = MCFixupKind(FK_PCRel_4); + Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); + return 0; + } + + // Figure out the operand number, needed for isSrcOperand check + unsigned OpNo = 0; + for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) { + if (&MO == &MI.getOperand(OpNo)) + break; + } + + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (isSrcOperand(Desc, OpNo)) { + uint32_t Enc = getLitEncoding(MO); + if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) + return Enc; + + } else if (MO.isImm()) + return MO.getImm(); + + llvm_unreachable("Encoding of this operand type is not supported yet."); + return 0; +} + diff --git a/lib/Target/R600/Makefile b/lib/Target/R600/Makefile new file mode 100644 index 0000000000..1b3ebbe8c8 --- /dev/null +++ 
b/lib/Target/R600/Makefile @@ -0,0 +1,23 @@ +##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. +LIBRARYNAME = LLVMR600CodeGen +TARGET = AMDGPU + +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \ + AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \ + AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \ + AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \ + AMDGPUGenAsmWriter.inc + +DIRS = InstPrinter TargetInfo MCTargetDesc + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td new file mode 100644 index 0000000000..868810c613 --- /dev/null +++ b/lib/Target/R600/Processors.td @@ -0,0 +1,30 @@ +//===-- Processors.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AMDIL processors supported. 
+// +//===----------------------------------------------------------------------===// + +class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features> +: Processor<Name, itin, Features>; +def : Proc<"", R600_EG_Itin, [FeatureR600ALUInst]>; +def : Proc<"r600", R600_EG_Itin, [FeatureR600ALUInst]>; +def : Proc<"rv710", R600_EG_Itin, []>; +def : Proc<"rv730", R600_EG_Itin, []>; +def : Proc<"rv770", R600_EG_Itin, [FeatureFP64]>; +def : Proc<"cedar", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; +def : Proc<"redwood", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; +def : Proc<"juniper", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; +def : Proc<"cypress", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>; +def : Proc<"barts", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; +def : Proc<"turks", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; +def : Proc<"caicos", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; +def : Proc<"cayman", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>; +def : Proc<"SI", SI_Itin, [Feature64BitPtr]>; + diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h new file mode 100644 index 0000000000..16cfcf59eb --- /dev/null +++ b/lib/Target/R600/R600Defines.h @@ -0,0 +1,97 @@ +//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef R600DEFINES_H_ +#define R600DEFINES_H_ + +#include "llvm/MC/MCRegisterInfo.h" + +// Operand Flags +#define MO_FLAG_CLAMP (1 << 0) +#define MO_FLAG_NEG (1 << 1) +#define MO_FLAG_ABS (1 << 2) +#define MO_FLAG_MASK (1 << 3) +#define MO_FLAG_PUSH (1 << 4) +#define MO_FLAG_NOT_LAST (1 << 5) +#define MO_FLAG_LAST (1 << 6) +#define NUM_MO_FLAGS 7 + +/// \brief Helper for getting the operand index for the instruction flags +/// operand. +#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3) + +namespace R600_InstFlag { + enum TIF { + TRANS_ONLY = (1 << 0), + TEX = (1 << 1), + REDUCTION = (1 << 2), + FC = (1 << 3), + TRIG = (1 << 4), + OP3 = (1 << 5), + VECTOR = (1 << 6), + //FlagOperand bits 7, 8 + NATIVE_OPERANDS = (1 << 9), + OP1 = (1 << 10), + OP2 = (1 << 11) + }; +} + +#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS) + +/// \brief Defines for extracting register infomation from register encoding +#define HW_REG_MASK 0x1ff +#define HW_CHAN_SHIFT 9 + +#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT) +#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK) + +namespace R600Operands { + enum Ops { + DST, + UPDATE_EXEC_MASK, + UPDATE_PREDICATE, + WRITE, + OMOD, + DST_REL, + CLAMP, + SRC0, + SRC0_NEG, + SRC0_REL, + SRC0_ABS, + SRC0_SEL, + SRC1, + SRC1_NEG, + SRC1_REL, + SRC1_ABS, + SRC1_SEL, + SRC2, + SRC2_NEG, + SRC2_REL, + SRC2_SEL, + LAST, + PRED_SEL, + IMM, + COUNT + }; + + const static int ALUOpTable[3][R600Operands::COUNT] = { +// W C S S S S S S S S S S S +// R O D L S R R R R S R R R R S R R R L P +// D U I M R A R C C C C R C C C C R C C C A R I +// S E U T O E M C 0 0 0 0 C 1 1 1 1 C 2 2 2 S E M +// T M P E D L P 0 N R A S 1 N R A S 2 N R S T D M + {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12}, + {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19}, + {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 
6, 7, 8, 9,-1,10,11,12,13,14,15,16,17} + }; + +} + +#endif // R600DEFINES_H_ diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp new file mode 100644 index 0000000000..f8c900f727 --- /dev/null +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -0,0 +1,297 @@ +//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Vector, Reduction, and Cube instructions need to fill the entire instruction +/// group to work correctly. This pass expands these individual instructions +/// into several instructions that will completely fill the instruction group. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +class R600ExpandSpecialInstrsPass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + bool ExpandInputPerspective(MachineInstr& MI); + bool ExpandInputConstant(MachineInstr& MI); + +public: + R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), + TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { + return "R600 Expand special instructions pass"; + } +}; + +} // End anonymous namespace + +char R600ExpandSpecialInstrsPass::ID = 0; + +FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { + 
return new R600ExpandSpecialInstrsPass(TM); +} + +bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { + + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + MachineBasicBlock::iterator I = MBB.begin(); + while (I != MBB.end()) { + MachineInstr &MI = *I; + I = llvm::next(I); + + switch (MI.getOpcode()) { + default: break; + // Expand PRED_X to one of the PRED_SET instructions. + case AMDGPU::PRED_X: { + uint64_t Flags = MI.getOperand(3).getImm(); + // The native opcode used by PRED_X is stored as an immediate in the + // third operand. + MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, + MI.getOperand(2).getImm(), // opcode + MI.getOperand(0).getReg(), // dst + MI.getOperand(1).getReg(), // src0 + AMDGPU::ZERO); // src1 + TII->addFlag(PredSet, 0, MO_FLAG_MASK); + if (Flags & MO_FLAG_PUSH) { + TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1); + } else { + TII->setImmOperand(PredSet, R600Operands::UPDATE_PREDICATE, 1); + } + MI.eraseFromParent(); + continue; + } + case AMDGPU::BREAK: { + MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, + AMDGPU::PRED_SETE_INT, + AMDGPU::PREDICATE_BIT, + AMDGPU::ZERO, + AMDGPU::ZERO); + TII->addFlag(PredSet, 0, MO_FLAG_MASK); + TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1); + + BuildMI(MBB, I, MBB.findDebugLoc(I), + TII->get(AMDGPU::PREDICATED_BREAK)) + .addReg(AMDGPU::PREDICATE_BIT); + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_PAIR_XY: { + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(2).getImm()); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + unsigned DstReg; + + if (Chan < 2) + DstReg = MI.getOperand(Chan).getReg(); + else + DstReg = Chan == 2 ? 
AMDGPU::T0_Z : AMDGPU::T0_W; + + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY, + DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); + + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan >= 2) + TII->addFlag(BMI, 0, MO_FLAG_MASK); + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_PAIR_ZW: { + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(2).getImm()); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + unsigned DstReg; + + if (Chan < 2) + DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y; + else + DstReg = MI.getOperand(Chan-2).getReg(); + + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW, + DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); + + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan < 2) + TII->addFlag(BMI, 0, MO_FLAG_MASK); + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_VEC_LOAD: { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(1).getImm()); + unsigned DstReg = MI.getOperand(0).getReg(); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0, + TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg); + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + } + + bool IsReduction = TII->isReductionOp(MI.getOpcode()); + bool IsVector = TII->isVector(MI); + bool IsCube = TII->isCubeOp(MI.getOpcode()); + if (!IsReduction && !IsVector && !IsCube) { + continue; + } + + // Expand the instruction + // + // Reduction instructions: + // T0_X = DP4 T1_XYZW, T2_XYZW + // becomes: + // T0_X = DP4 T1_X, T2_X + // T0_Y (write masked) = DP4 T1_Y, 
T2_Y + // T0_Z (write masked) = DP4 T1_Z, T2_Z + // T0_W (write masked) = DP4 T1_W, T2_W + // + // Vector instructions: + // T0_X = MULLO_INT T1_X, T2_X + // becomes: + // T0_X = MULLO_INT T1_X, T2_X + // T0_Y (write masked) = MULLO_INT T1_X, T2_X + // T0_Z (write masked) = MULLO_INT T1_X, T2_X + // T0_W (write masked) = MULLO_INT T1_X, T2_X + // + // Cube instructions: + // T0_XYZW = CUBE T1_XYZW + // becomes: + // T0_X = CUBE T1_Z, T1_Y + // T0_Y = CUBE T1_Z, T1_X + // T0_Z = CUBE T1_X, T1_Z + // T0_W = CUBE T1_Y, T1_Z + for (unsigned Chan = 0; Chan < 4; Chan++) { + unsigned DstReg = MI.getOperand( + TII->getOperandIdx(MI, R600Operands::DST)).getReg(); + unsigned Src0 = MI.getOperand( + TII->getOperandIdx(MI, R600Operands::SRC0)).getReg(); + unsigned Src1 = 0; + + // Determine the correct source registers + if (!IsCube) { + int Src1Idx = TII->getOperandIdx(MI, R600Operands::SRC1); + if (Src1Idx != -1) { + Src1 = MI.getOperand(Src1Idx).getReg(); + } + } + if (IsReduction) { + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + Src0 = TRI.getSubReg(Src0, SubRegIndex); + Src1 = TRI.getSubReg(Src1, SubRegIndex); + } else if (IsCube) { + static const int CubeSrcSwz[] = {2, 2, 0, 1}; + unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); + unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); + Src1 = TRI.getSubReg(Src0, SubRegIndex1); + Src0 = TRI.getSubReg(Src0, SubRegIndex0); + } + + // Determine the correct destination registers; + bool Mask = false; + bool NotLast = true; + if (IsCube) { + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + DstReg = TRI.getSubReg(DstReg, SubRegIndex); + } else { + // Mask the write if the original instruction does not write to + // the current Channel. 
+ Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + } + + // Set the IsLast bit + NotLast = (Chan != 3 ); + + // Add the new instruction + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::CUBE_r600_pseudo: + Opcode = AMDGPU::CUBE_r600_real; + break; + case AMDGPU::CUBE_eg_pseudo: + Opcode = AMDGPU::CUBE_eg_real; + break; + case AMDGPU::DOT4_r600_pseudo: + Opcode = AMDGPU::DOT4_r600_real; + break; + case AMDGPU::DOT4_eg_pseudo: + Opcode = AMDGPU::DOT4_eg_real; + break; + default: + break; + } + + MachineInstr *NewMI = + TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); + + if (Chan != 0) + NewMI->bundleWithPred(); + if (Mask) { + TII->addFlag(NewMI, 0, MO_FLAG_MASK); + } + if (NotLast) { + TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); + } + } + MI.eraseFromParent(); + } + } + return false; +} diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp new file mode 100644 index 0000000000..a73691dd3c --- /dev/null +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -0,0 +1,1118 @@ +//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Custom DAG lowering for R600 +// +//===----------------------------------------------------------------------===// + +#include "R600ISelLowering.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Function.h" + +using namespace llvm; + +R600TargetLowering::R600TargetLowering(TargetMachine &TM) : + AMDGPUTargetLowering(TM), + TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) { + setOperationAction(ISD::MUL, MVT::i64, Expand); + addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); + computeRegisterProperties(); + + setOperationAction(ISD::FADD, MVT::v4f32, Expand); + setOperationAction(ISD::FMUL, MVT::v4f32, Expand); + setOperationAction(ISD::FDIV, MVT::v4f32, Expand); + setOperationAction(ISD::FSUB, MVT::v4f32, Expand); + + setOperationAction(ISD::ADD, MVT::v4i32, Expand); + setOperationAction(ISD::AND, MVT::v4i32, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand); + setOperationAction(ISD::UDIV, MVT::v4i32, Expand); + setOperationAction(ISD::UREM, MVT::v4i32, Expand); + setOperationAction(ISD::SETCC, MVT::v4i32, Expand); + + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + + setOperationAction(ISD::FSUB, MVT::f32, Expand); + + 
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); + setOperationAction(ISD::FPOW, MVT::f32, Custom); + + setOperationAction(ISD::ROTL, MVT::i32, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + + setOperationAction(ISD::SETCC, MVT::i32, Expand); + setOperationAction(ISD::SETCC, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); + + setOperationAction(ISD::SELECT, MVT::i32, Custom); + setOperationAction(ISD::SELECT, MVT::f32, Custom); + + // Legalize loads and stores to the private address space. + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom); + setOperationAction(ISD::STORE, MVT::i8, Custom); + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::SELECT_CC); + + setBooleanContents(ZeroOrNegativeOneBooleanContent); + setSchedulingPreference(Sched::VLIW); +} + +MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( + MachineInstr * MI, MachineBasicBlock * BB) const { + MachineFunction * MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineBasicBlock::iterator I = 
*MI; + + switch (MI->getOpcode()) { + default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + case AMDGPU::CLAMP_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); + break; + } + + case AMDGPU::FABS_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_ABS); + break; + } + + case AMDGPU::FNEG_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_NEG); + break; + } + + case AMDGPU::MASK_WRITE: { + unsigned maskedRegister = MI->getOperand(0).getReg(); + assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); + MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); + TII->addFlag(defInstr, 0, MO_FLAG_MASK); + break; + } + + case AMDGPU::MOV_IMM_F32: + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), + MI->getOperand(1).getFPImm()->getValueAPF() + .bitcastToAPInt().getZExtValue()); + break; + case AMDGPU::MOV_IMM_I32: + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), + MI->getOperand(1).getImm()); + break; + case AMDGPU::CONST_COPY: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, + MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); + TII->setImmOperand(NewMI, R600Operands::SRC0_SEL, + MI->getOperand(1).getImm()); + break; + } + + case AMDGPU::RAT_WRITE_CACHELESS_32_eg: + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { + unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 
1 : 0; + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(EOP); // Set End of program bit + break; + } + + case AMDGPU::TXD: { + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) + .addOperand(MI->getOperand(3)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) + .addOperand(MI->getOperand(2)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); + break; + } + + case AMDGPU::TXD_SHADOW: { + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) + .addOperand(MI->getOperand(3)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) + .addOperand(MI->getOperand(2)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)) + .addReg(T0, RegState::Implicit) + 
.addReg(T1, RegState::Implicit); + break; + } + + case AMDGPU::BRANCH: + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + .addOperand(MI->getOperand(0)); + break; + + case AMDGPU::BRANCH_COND_f32: { + MachineInstr *NewMI = + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI->getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO) + .addImm(0); // Flags + TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + .addOperand(MI->getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + break; + } + + case AMDGPU::BRANCH_COND_i32: { + MachineInstr *NewMI = + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI->getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO_INT) + .addImm(0); // Flags + TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + .addOperand(MI->getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + break; + } + + case AMDGPU::EG_ExportSwz: + case AMDGPU::R600_ExportSwz: { + // Instruction is left unmodified if its not the last one of its type + bool isLastInstructionOfItsType = true; + unsigned InstExportType = MI->getOperand(1).getImm(); + for (MachineBasicBlock::iterator NextExportInst = llvm::next(I), + EndBlock = BB->end(); NextExportInst != EndBlock; + NextExportInst = llvm::next(NextExportInst)) { + if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || + NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { + unsigned CurrentInstExportType = NextExportInst->getOperand(1) + .getImm(); + if (CurrentInstExportType == InstExportType) { + isLastInstructionOfItsType = false; + break; + } + } + } + bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0; + if (!EOP && !isLastInstructionOfItsType) + return BB; + unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 
84 : 40; + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(2)) + .addOperand(MI->getOperand(3)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)) + .addImm(CfInst) + .addImm(EOP); + break; + } + case AMDGPU::RETURN: { + // RETURN instructions must have the live-out registers as implicit uses, + // otherwise they appear dead. + R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + MachineInstrBuilder MIB(*MF, MI); + for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) + MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); + return BB; + } + } + + MI->eraseFromParent(); + return BB; +} + +//===----------------------------------------------------------------------===// +// Custom DAG Lowering Operations +//===----------------------------------------------------------------------===// + +using namespace llvm::Intrinsic; +using namespace llvm::AMDGPUIntrinsic; + +SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::ROTL: return LowerROTL(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::FPOW: return LowerFPOW(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); + case ISD::INTRINSIC_VOID: { + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = + cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + switch (IntrinsicID) { + case AMDGPUIntrinsic::AMDGPU_store_output: { + MachineFunction &MF = DAG.getMachineFunction(); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + 
unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); + MFI->LiveOuts.push_back(Reg); + return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2)); + } + case AMDGPUIntrinsic::R600_store_swizzle: { + const SDValue Args[8] = { + Chain, + Op.getOperand(2), // Export Value + Op.getOperand(3), // ArrayBase + Op.getOperand(4), // Type + DAG.getConstant(0, MVT::i32), // SWZ_X + DAG.getConstant(1, MVT::i32), // SWZ_Y + DAG.getConstant(2, MVT::i32), // SWZ_Z + DAG.getConstant(3, MVT::i32) // SWZ_W + }; + return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(), + Args, 8); + } + + // default for switch(IntrinsicID) + default: break; + } + // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) + break; + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntrinsicID = + cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + EVT VT = Op.getValueType(); + DebugLoc DL = Op.getDebugLoc(); + switch(IntrinsicID) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case AMDGPUIntrinsic::R600_load_input: { + int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT); + } + + case AMDGPUIntrinsic::R600_interp_input: { + int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); + MachineSDNode *interp; + if (ijb < 0) { + interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, + MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32)); + return DAG.getTargetExtractSubreg( + TII->getRegisterInfo().getSubRegFromChannel(slot % 4), + DL, MVT::f32, SDValue(interp, 0)); + } + + if (slot % 4 < 2) + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32), + CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + 
AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32), + CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32)); + else + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32), + CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32), + CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32)); + + return SDValue(interp, slot % 2); + } + + case r600_read_ngroups_x: + return LowerImplicitParameter(DAG, VT, DL, 0); + case r600_read_ngroups_y: + return LowerImplicitParameter(DAG, VT, DL, 1); + case r600_read_ngroups_z: + return LowerImplicitParameter(DAG, VT, DL, 2); + case r600_read_global_size_x: + return LowerImplicitParameter(DAG, VT, DL, 3); + case r600_read_global_size_y: + return LowerImplicitParameter(DAG, VT, DL, 4); + case r600_read_global_size_z: + return LowerImplicitParameter(DAG, VT, DL, 5); + case r600_read_local_size_x: + return LowerImplicitParameter(DAG, VT, DL, 6); + case r600_read_local_size_y: + return LowerImplicitParameter(DAG, VT, DL, 7); + case r600_read_local_size_z: + return LowerImplicitParameter(DAG, VT, DL, 8); + + case r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_X, VT); + case r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Y, VT); + case r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Z, VT); + case r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_X, VT); + case r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Y, VT); + case r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Z, VT); + } + 
// break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) + break; + } + } // end switch(Op.getOpcode()) + return SDValue(); +} + +void R600TargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: return; + case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + return; + case ISD::LOAD: { + SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); + Results.push_back(SDValue(Node, 0)); + Results.push_back(SDValue(Node, 1)); + // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode + // function + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + return; + } + case ISD::STORE: + SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode(); + Results.push_back(SDValue(Node, 0)); + return; + } +} + +SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { + return DAG.getNode( + ISD::SETCC, + Op.getDebugLoc(), + MVT::i1, + Op, DAG.getConstantFP(0.0f, MVT::f32), + DAG.getCondCode(ISD::SETNE) + ); +} + +SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, + DebugLoc DL, + unsigned DwordOffset) const { + unsigned ByteOffset = DwordOffset * 4; + PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), + AMDGPUAS::PARAM_I_ADDRESS); + + // We shouldn't be using an offset wider than 16-bits for implicit parameters. 
+ assert(isInt<16>(ByteOffset)); + + return DAG.getLoad(VT, DL, DAG.getEntryNode(), + DAG.getConstant(ByteOffset, MVT::i32), // PTR + MachinePointerInfo(ConstantPointerNull::get(PtrType)), + false, false, false, 0); +} + +SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering()); + + FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op); + assert(FIN); + + unsigned FrameIndex = FIN->getIndex(); + unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32); +} + +SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT, + Op.getOperand(0), + Op.getOperand(0), + DAG.getNode(ISD::SUB, DL, VT, + DAG.getConstant(32, MVT::i32), + Op.getOperand(1))); +} + +bool R600TargetLowering::isZero(SDValue Op) const { + if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { + return Cst->isNullValue(); + } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){ + return CstFP->isZero(); + } else { + return false; + } +} + +SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue True = Op.getOperand(2); + SDValue False = Op.getOperand(3); + SDValue CC = Op.getOperand(4); + SDValue Temp; + + // LHS and RHS are guaranteed to be the same value type + EVT CompareVT = LHS.getValueType(); + + // Check if we can lower this to a native operation. 
+ + // Try to lower to a SET* instruction: + // + // SET* can match the following patterns: + // + // select_cc f32, f32, -1, 0, cc_any + // select_cc f32, f32, 1.0f, 0.0f, cc_any + // select_cc i32, i32, -1, 0, cc_any + // + + // Move hardware True/False values to the correct operand. + if (isHWTrueValue(False) && isHWFalseValue(True)) { + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + std::swap(False, True); + CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32)); + } + + if (isHWTrueValue(True) && isHWFalseValue(False) && + (CompareVT == VT || VT == MVT::i32)) { + // This can be matched by a SET* instruction. + return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); + } + + // Try to lower to a CND* instruction: + // + // CND* can match the following patterns: + // + // select_cc f32, 0.0, f32, f32, cc_any + // select_cc f32, 0.0, i32, i32, cc_any + // select_cc i32, 0, f32, f32, cc_any + // select_cc i32, 0, i32, i32, cc_any + // + if (isZero(LHS) || isZero(RHS)) { + SDValue Cond = (isZero(LHS) ? RHS : LHS); + SDValue Zero = (isZero(LHS) ? LHS : RHS); + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + if (CompareVT != VT) { + // Bitcast True / False to the correct types. 
This will end up being + // a nop, but it allows us to define only a single pattern in the + // .TD files for each CND* instruction rather than having to have + // one pattern for integer True/False and one for fp True/False + True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True); + False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False); + } + if (isZero(LHS)) { + CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode); + } + + switch (CCOpcode) { + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETNE: + case ISD::SETULE: + case ISD::SETULT: + case ISD::SETOLE: + case ISD::SETOLT: + case ISD::SETLE: + case ISD::SETLT: + CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); + Temp = True; + True = False; + False = Temp; + break; + default: + break; + } + SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, + Cond, Zero, + True, False, + DAG.getCondCode(CCOpcode)); + return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); + } + + + // Possible Min/Max pattern + SDValue MinMax = LowerMinMax(Op, DAG); + if (MinMax.getNode()) { + return MinMax; + } + + // If we make it this far, it means we have no native instructions to handle + // this SELECT_CC, so we must lower it. + SDValue HWTrue, HWFalse; + + if (CompareVT == MVT::f32) { + HWTrue = DAG.getConstantFP(1.0f, CompareVT); + HWFalse = DAG.getConstantFP(0.0f, CompareVT); + } else if (CompareVT == MVT::i32) { + HWTrue = DAG.getConstant(-1, CompareVT); + HWFalse = DAG.getConstant(0, CompareVT); + } + else { + assert(!"Unhandled value type in LowerSELECT_CC"); + } + + // Lower this unsupported SELECT_CC into a combination of two supported + // SELECT_CC operations. 
+ SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); + + return DAG.getNode(ISD::SELECT_CC, DL, VT, + Cond, HWFalse, + True, False, + DAG.getCondCode(ISD::SETNE)); +} + +SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + return DAG.getNode(ISD::SELECT_CC, + Op.getDebugLoc(), + Op.getValueType(), + Op.getOperand(0), + DAG.getConstant(0, MVT::i32), + Op.getOperand(1), + Op.getOperand(2), + DAG.getCondCode(ISD::SETNE)); +} + +/// LLVM generates byte-addressed pointers. For indirect addressing, we need to +/// convert these pointers to a register index. Each register holds +/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the +/// \p StackWidth, which tells us how many of the 4 sub-registers will be used +/// for indirect addressing. +SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, + unsigned StackWidth, + SelectionDAG &DAG) const { + unsigned SRLPad; + switch(StackWidth) { + case 1: + SRLPad = 2; + break; + case 2: + SRLPad = 3; + break; + case 4: + SRLPad = 4; + break; + default: llvm_unreachable("Invalid stack width"); + } + + return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr, + DAG.getConstant(SRLPad, MVT::i32)); +} + +void R600TargetLowering::getStackAddress(unsigned StackWidth, + unsigned ElemIdx, + unsigned &Channel, + unsigned &PtrIncr) const { + switch (StackWidth) { + default: + case 1: + Channel = 0; + if (ElemIdx > 0) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 2: + Channel = ElemIdx % 2; + if (ElemIdx == 2) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 4: + Channel = ElemIdx; + PtrIncr = 0; + break; + } +} + +SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + StoreSDNode *StoreNode = cast<StoreSDNode>(Op); + SDValue Chain = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Ptr = Op.getOperand(2); + + if 
(StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + Ptr->getOpcode() != AMDGPUISD::DWORDADDR) { + // Convert pointer from byte address to dword address. + Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), + DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), + Ptr, DAG.getConstant(2, MVT::i32))); + + if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { + assert(!"Truncated and indexed stores not supported yet"); + } else { + Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); + } + return Chain; + } + + EVT ValueVT = Value.getValueType(); + + if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + // Lowering for indirect addressing + + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( + getTargetMachine().getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (ValueVT.isVector()) { + unsigned NumElemVT = ValueVT.getVectorNumElements(); + EVT ElemVT = ValueVT.getVectorElementType(); + SDValue Stores[4]; + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, MVT::i32)); + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, + Value, DAG.getConstant(i, MVT::i32)); + + Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Elem, Ptr, + DAG.getTargetConstant(Channel, MVT::i32)); + } + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT); + } else { + if (ValueVT == MVT::i8) { + Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); + } + Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, + 
DAG.getTargetConstant(0, MVT::i32)); // Channel + } + + return Chain; +} + +// return (512 + (kc_bank << 12) +static int +ConstantAddressBlock(unsigned AddressSpace) { + switch (AddressSpace) { + case AMDGPUAS::CONSTANT_BUFFER_0: + return 512; + case AMDGPUAS::CONSTANT_BUFFER_1: + return 512 + 4096; + case AMDGPUAS::CONSTANT_BUFFER_2: + return 512 + 4096 * 2; + case AMDGPUAS::CONSTANT_BUFFER_3: + return 512 + 4096 * 3; + case AMDGPUAS::CONSTANT_BUFFER_4: + return 512 + 4096 * 4; + case AMDGPUAS::CONSTANT_BUFFER_5: + return 512 + 4096 * 5; + case AMDGPUAS::CONSTANT_BUFFER_6: + return 512 + 4096 * 6; + case AMDGPUAS::CONSTANT_BUFFER_7: + return 512 + 4096 * 7; + case AMDGPUAS::CONSTANT_BUFFER_8: + return 512 + 4096 * 8; + case AMDGPUAS::CONSTANT_BUFFER_9: + return 512 + 4096 * 9; + case AMDGPUAS::CONSTANT_BUFFER_10: + return 512 + 4096 * 10; + case AMDGPUAS::CONSTANT_BUFFER_11: + return 512 + 4096 * 11; + case AMDGPUAS::CONSTANT_BUFFER_12: + return 512 + 4096 * 12; + case AMDGPUAS::CONSTANT_BUFFER_13: + return 512 + 4096 * 13; + case AMDGPUAS::CONSTANT_BUFFER_14: + return 512 + 4096 * 14; + case AMDGPUAS::CONSTANT_BUFFER_15: + return 512 + 4096 * 15; + default: + return -1; + } +} + +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const +{ + EVT VT = Op.getValueType(); + DebugLoc DL = Op.getDebugLoc(); + LoadSDNode *LoadNode = cast<LoadSDNode>(Op); + SDValue Chain = Op.getOperand(0); + SDValue Ptr = Op.getOperand(1); + SDValue LoweredLoad; + + int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); + if (ConstantBlock > -1) { + SDValue Result; + if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) || + dyn_cast<Constant>(LoadNode->getSrcValue()) || + dyn_cast<ConstantSDNode>(Ptr)) { + SDValue Slots[4]; + for (unsigned i = 0; i < 4; i++) { + // We want Const position encoded with the following formula : + // (((512 + (kc_bank << 12) + const_index) << 2) + chan) + // const_index is Ptr computed by llvm using an alignment of 16. 
+ // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and + // then div by 4 at the ISel step + SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32)); + Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); + } + Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4); + } else { + // non constant ptr cant be folded, keeps it as a v4f32 load + Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, + DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)), + DAG.getConstant(LoadNode->getAddressSpace() - + AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32) + ); + } + + if (!VT.isVector()) { + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, + DAG.getConstant(0, MVT::i32)); + } + + SDValue MergedValues[2] = { + Result, + Chain + }; + return DAG.getMergeValues(MergedValues, 2, DL); + } + + if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + // Lowering for indirect addressing + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( + getTargetMachine().getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (VT.isVector()) { + unsigned NumElemVT = VT.getVectorNumElements(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Loads[4]; + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, MVT::i32)); + Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, + Chain, Ptr, + DAG.getTargetConstant(Channel, MVT::i32), + Op.getOperand(2)); + } + for (unsigned i = NumElemVT; i < 4; ++i) { + Loads[i] = 
DAG.getUNDEF(ElemVT); + } + EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); + LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4); + } else { + LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, + Chain, Ptr, + DAG.getTargetConstant(0, MVT::i32), // Channel + Op.getOperand(2)); + } + + SDValue Ops[2]; + Ops[0] = LoweredLoad; + Ops[1] = Chain; + + return DAG.getMergeValues(Ops, 2, DL); +} + +SDValue R600TargetLowering::LowerFPOW(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0)); + SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase); + return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase); +} + +/// XXX Only kernel functions are supported, so we can assume for now that +/// every function is a kernel function, but in the future we should use +/// separate calling conventions for kernel and non-kernel functions. +SDValue R600TargetLowering::LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + unsigned ParamOffsetBytes = 36; + Function::const_arg_iterator FuncArg = + DAG.getMachineFunction().getFunction()->arg_begin(); + for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) { + EVT VT = Ins[i].VT; + Type *ArgType = FuncArg->getType(); + unsigned ArgSizeInBits = ArgType->isPointerTy() ? 
+ 32 : ArgType->getPrimitiveSizeInBits(); + unsigned ArgBytes = ArgSizeInBits >> 3; + EVT ArgVT; + if (ArgSizeInBits < VT.getSizeInBits()) { + assert(!ArgType->isFloatTy() && + "Extending floating point arguments not supported yet"); + ArgVT = MVT::getIntegerVT(ArgSizeInBits); + } else { + ArgVT = VT; + } + PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), + AMDGPUAS::PARAM_I_ADDRESS); + SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(), + DAG.getConstant(ParamOffsetBytes, MVT::i32), + MachinePointerInfo(UndefValue::get(PtrTy)), + ArgVT, false, false, ArgBytes); + InVals.push_back(Arg); + ParamOffsetBytes += ArgBytes; + } + return Chain; +} + +EVT R600TargetLowering::getSetCCResultType(EVT VT) const { + if (!VT.isVector()) return MVT::i32; + return VT.changeVectorElementTypeToInteger(); +} + +//===----------------------------------------------------------------------===// +// Custom DAG Optimizations +//===----------------------------------------------------------------------===// + +SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + switch (N->getOpcode()) { + // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) + case ISD::FP_ROUND: { + SDValue Arg = N->getOperand(0); + if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { + return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0), + Arg.getOperand(0)); + } + break; + } + + // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> + // (i32 select_cc f32, f32, -1, 0 cc) + // + // Mesa's GLSL frontend generates the above pattern a lot and we can lower + // this to one of the SET*_DX10 instructions. 
+ case ISD::FP_TO_SINT: { + SDValue FNeg = N->getOperand(0); + if (FNeg.getOpcode() != ISD::FNEG) { + return SDValue(); + } + SDValue SelectCC = FNeg.getOperand(0); + if (SelectCC.getOpcode() != ISD::SELECT_CC || + SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS + SelectCC.getOperand(2).getValueType() != MVT::f32 || // True + !isHWTrueValue(SelectCC.getOperand(2)) || + !isHWFalseValue(SelectCC.getOperand(3))) { + return SDValue(); + } + + return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0), + SelectCC.getOperand(0), // LHS + SelectCC.getOperand(1), // RHS + DAG.getConstant(-1, MVT::i32), // True + DAG.getConstant(0, MVT::i32), // Flase + SelectCC.getOperand(4)); // CC + + break; + } + // Extract_vec (Build_vector) generated by custom lowering + // also needs to be customly combined + case ISD::EXTRACT_VECTOR_ELT: { + SDValue Arg = N->getOperand(0); + if (Arg.getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return Arg->getOperand(Element); + } + } + if (Arg.getOpcode() == ISD::BITCAST && + Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(), + Arg->getOperand(0).getOperand(Element)); + } + } + } + + case ISD::SELECT_CC: { + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> + // selectcc x, y, a, b, inv(cc) + // + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne -> + // selectcc x, y, a, b, cc + SDValue LHS = N->getOperand(0); + if (LHS.getOpcode() != ISD::SELECT_CC) { + return SDValue(); + } + + SDValue RHS = N->getOperand(1); + SDValue True = N->getOperand(2); + SDValue False = N->getOperand(3); + ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get(); + + if (LHS.getOperand(2).getNode() != True.getNode() || 
+ LHS.getOperand(3).getNode() != False.getNode() || + RHS.getNode() != False.getNode()) { + return SDValue(); + } + + switch (NCC) { + default: return SDValue(); + case ISD::SETNE: return LHS; + case ISD::SETEQ: { + ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get(); + LHSCC = ISD::getSetCCInverse(LHSCC, + LHS.getOperand(0).getValueType().isInteger()); + return DAG.getSelectCC(N->getDebugLoc(), + LHS.getOperand(0), + LHS.getOperand(1), + LHS.getOperand(2), + LHS.getOperand(3), + LHSCC); + } + } + } + case AMDGPUISD::EXPORT: { + SDValue Arg = N->getOperand(1); + if (Arg.getOpcode() != ISD::BUILD_VECTOR) + break; + SDValue NewBldVec[4] = { + DAG.getUNDEF(MVT::f32), + DAG.getUNDEF(MVT::f32), + DAG.getUNDEF(MVT::f32), + DAG.getUNDEF(MVT::f32) + }; + SDValue NewArgs[8] = { + N->getOperand(0), // Chain + SDValue(), + N->getOperand(2), // ArrayBase + N->getOperand(3), // Type + N->getOperand(4), // SWZ_X + N->getOperand(5), // SWZ_Y + N->getOperand(6), // SWZ_Z + N->getOperand(7) // SWZ_W + }; + for (unsigned i = 0; i < Arg.getNumOperands(); i++) { + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) { + if (C->isZero()) { + NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0 + } else if (C->isExactlyValue(1.0)) { + NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_0 + } else { + NewBldVec[i] = Arg.getOperand(i); + } + } else { + NewBldVec[i] = Arg.getOperand(i); + } + } + DebugLoc DL = N->getDebugLoc(); + NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4); + return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8); + } + } + return SDValue(); +} diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h new file mode 100644 index 0000000000..5cb4b912a1 --- /dev/null +++ b/lib/Target/R600/R600ISelLowering.h @@ -0,0 +1,75 @@ +//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===// +// +// The LLVM Compiler Infrastructure +// +// This file 
is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 DAG Lowering interface definition +// +//===----------------------------------------------------------------------===// + +#ifndef R600ISELLOWERING_H +#define R600ISELLOWERING_H + +#include "AMDGPUISelLowering.h" + +namespace llvm { + +class R600InstrInfo; + +class R600TargetLowering : public AMDGPUTargetLowering { +public: + R600TargetLowering(TargetMachine &TM); + virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock * BB) const; + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const; + virtual SDValue LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + virtual EVT getSetCCResultType(EVT VT) const; +private: + const R600InstrInfo * TII; + + /// Each OpenCL kernel has nine implicit parameters that are stored in the + /// first nine dwords of a Vertex Buffer. These implicit parameters are + /// lowered to load instructions which retreive the values from the Vertex + /// Buffer. 
+ SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, + DebugLoc DL, unsigned DwordOffset) const; + + void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, + MachineRegisterInfo & MRI, unsigned dword_offset) const; + + /// \brief Lower ROTL opcode to BITALIGN + SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + + SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, + SelectionDAG &DAG) const; + void getStackAddress(unsigned StackWidth, unsigned ElemIdx, + unsigned &Channel, unsigned &PtrIncr) const; + bool isZero(SDValue Op) const; +}; + +} // End namespace llvm; + +#endif // R600ISELLOWERING_H diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp new file mode 100644 index 0000000000..be3318a0b4 --- /dev/null +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -0,0 +1,784 @@ +//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Implementation of TargetInstrInfo. 
+// +//===----------------------------------------------------------------------===// + +#include "R600InstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +#define GET_INSTRINFO_CTOR +#include "AMDGPUGenDFAPacketizer.inc" + +using namespace llvm; + +R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm) + : AMDGPUInstrInfo(tm), + RI(tm, *this) + { } + +const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { + return RI; +} + +bool R600InstrInfo::isTrig(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; +} + +bool R600InstrInfo::isVector(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; +} + +void +R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (AMDGPU::R600_Reg128RegClass.contains(DestReg) + && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) { + for (unsigned I = 0; I < 4; I++) { + unsigned SubRegIndex = RI.getSubRegFromChannel(I); + buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + RI.getSubReg(DestReg, SubRegIndex), + RI.getSubReg(SrcReg, SubRegIndex)) + .addReg(DestReg, + RegState::Define | RegState::Implicit); + } + } else { + + // We can't copy vec4 registers + assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg) + && !AMDGPU::R600_Reg128RegClass.contains(SrcReg)); + + MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + DestReg, SrcReg); + NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0)) + .setIsKill(KillSrc); + } +} + +MachineInstr * R600InstrInfo::getMovImmInstr(MachineFunction *MF, + unsigned DstReg, int64_t Imm) const { + MachineInstr * MI = 
MF->CreateMachineInstr(get(AMDGPU::MOV), DebugLoc()); + MachineInstrBuilder MIB(*MF, MI); + MIB.addReg(DstReg, RegState::Define); + MIB.addReg(AMDGPU::ALU_LITERAL_X); + MIB.addImm(Imm); + MIB.addReg(0); // PREDICATE_BIT + + return MI; +} + +unsigned R600InstrInfo::getIEQOpcode() const { + return AMDGPU::SETE_INT; +} + +bool R600InstrInfo::isMov(unsigned Opcode) const { + + + switch(Opcode) { + default: return false; + case AMDGPU::MOV: + case AMDGPU::MOV_IMM_F32: + case AMDGPU::MOV_IMM_I32: + return true; + } +} + +// Some instructions act as place holders to emulate operations that the GPU +// hardware does automatically. This function can be used to check if +// an opcode falls into this category. +bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { + switch (Opcode) { + default: return false; + case AMDGPU::RETURN: + return true; + } +} + +bool R600InstrInfo::isReductionOp(unsigned Opcode) const { + switch(Opcode) { + default: return false; + case AMDGPU::DOT4_r600_pseudo: + case AMDGPU::DOT4_eg_pseudo: + return true; + } +} + +bool R600InstrInfo::isCubeOp(unsigned Opcode) const { + switch(Opcode) { + default: return false; + case AMDGPU::CUBE_r600_pseudo: + case AMDGPU::CUBE_r600_real: + case AMDGPU::CUBE_eg_pseudo: + case AMDGPU::CUBE_eg_real: + return true; + } +} + +bool R600InstrInfo::isALUInstr(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return ((TargetFlags & R600_InstFlag::OP1) | + (TargetFlags & R600_InstFlag::OP2) | + (TargetFlags & R600_InstFlag::OP3)); +} + +DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM, + const ScheduleDAG *DAG) const { + const InstrItineraryData *II = TM->getInstrItineraryData(); + return TM->getSubtarget<AMDGPUSubtarget>().createDFAPacketizer(II); +} + +static bool +isPredicateSetter(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::PRED_X: + return true; + default: + return false; + } +} + +static MachineInstr * 
+findFirstPredicateSetterFrom(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + while (I != MBB.begin()) { + --I; + MachineInstr *MI = I; + if (isPredicateSetter(MI->getOpcode())) + return MI; + } + + return NULL; +} + +static +bool isJump(unsigned Opcode) { + return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND; +} + +bool +R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // Most of the following comes from the ARM implementation of AnalyzeBranch + + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) { + return false; + } + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + unsigned LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || + !isJump(static_cast<MachineInstr *>(--I)->getOpcode())) { + if (LastOpc == AMDGPU::JUMP) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else if (LastOpc == AMDGPU::JUMP_COND) { + MachineInstr *predSet = I; + while (!isPredicateSetter(predSet->getOpcode())) { + predSet = --I; + } + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(predSet->getOperand(1)); + Cond.push_back(predSet->getOperand(2)); + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + return false; + } + return true; // Can't handle indirect branch. + } + + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If the block ends with a B and a Bcc, handle it. 
+ if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { + MachineInstr *predSet = --I; + while (!isPredicateSetter(predSet->getOpcode())) { + predSet = --I; + } + TBB = SecondLastInst->getOperand(0).getMBB(); + FBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(predSet->getOperand(1)); + Cond.push_back(predSet->getOperand(2)); + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + return false; + } + + // Otherwise, can't handle this. + return true; +} + +int R600InstrInfo::getBranchInstr(const MachineOperand &op) const { + const MachineInstr *MI = op.getParent(); + + switch (MI->getDesc().OpInfo->RegClass) { + default: // FIXME: fallthrough?? + case AMDGPU::GPRI32RegClassID: return AMDGPU::BRANCH_COND_i32; + case AMDGPU::GPRF32RegClassID: return AMDGPU::BRANCH_COND_f32; + }; +} + +unsigned +R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + + if (FBB == 0) { + if (Cond.empty()) { + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); + return 1; + } else { + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); + assert(PredSet && "No previous predicate !"); + addFlag(PredSet, 0, MO_FLAG_PUSH); + PredSet->getOperand(2).setImm(Cond[1].getImm()); + + BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + .addMBB(TBB) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + return 1; + } + } else { + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); + assert(PredSet && "No previous predicate !"); + addFlag(PredSet, 0, MO_FLAG_PUSH); + PredSet->getOperand(2).setImm(Cond[1].getImm()); + BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + .addMBB(TBB) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB); + return 2; + } +} + +unsigned 
+R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + + // Note : we leave PRED* instructions there. + // They may be needed when predicating instructions. + + MachineBasicBlock::iterator I = MBB.end(); + + if (I == MBB.begin()) { + return 0; + } + --I; + switch (I->getOpcode()) { + default: + return 0; + case AMDGPU::JUMP_COND: { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + I->eraseFromParent(); + break; + } + case AMDGPU::JUMP: + I->eraseFromParent(); + break; + } + I = MBB.end(); + + if (I == MBB.begin()) { + return 1; + } + --I; + switch (I->getOpcode()) { + // FIXME: only one case?? + default: + return 1; + case AMDGPU::JUMP_COND: { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + I->eraseFromParent(); + break; + } + case AMDGPU::JUMP: + I->eraseFromParent(); + break; + } + return 2; +} + +bool +R600InstrInfo::isPredicated(const MachineInstr *MI) const { + int idx = MI->findFirstPredOperandIdx(); + if (idx < 0) + return false; + + unsigned Reg = MI->getOperand(idx).getReg(); + switch (Reg) { + default: return false; + case AMDGPU::PRED_SEL_ONE: + case AMDGPU::PRED_SEL_ZERO: + case AMDGPU::PREDICATE_BIT: + return true; + } +} + +bool +R600InstrInfo::isPredicable(MachineInstr *MI) const { + // XXX: KILL* instructions can be predicated, but they must be the last + // instruction in a clause, so this means any instructions after them cannot + // be predicated. Until we have proper support for instruction clauses in the + // backend, we will mark KILL* instructions as unpredicable. 
+ + if (MI->getOpcode() == AMDGPU::KILLGT) { + return false; + } else if (isVector(*MI)) { + return false; + } else { + return AMDGPUInstrInfo::isPredicable(MI); + } +} + + +bool +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + unsigned ExtraPredCycles, + const BranchProbability &Probability) const{ + return true; +} + +bool +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, + unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, + unsigned ExtraFCycles, + const BranchProbability &Probability) const { + return true; +} + +bool +R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + const BranchProbability &Probability) + const { + return true; +} + +bool +R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const { + return false; +} + + +bool +R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + MachineOperand &MO = Cond[1]; + switch (MO.getImm()) { + case OPCODE_IS_ZERO_INT: + MO.setImm(OPCODE_IS_NOT_ZERO_INT); + break; + case OPCODE_IS_NOT_ZERO_INT: + MO.setImm(OPCODE_IS_ZERO_INT); + break; + case OPCODE_IS_ZERO: + MO.setImm(OPCODE_IS_NOT_ZERO); + break; + case OPCODE_IS_NOT_ZERO: + MO.setImm(OPCODE_IS_ZERO); + break; + default: + return true; + } + + MachineOperand &MO2 = Cond[2]; + switch (MO2.getReg()) { + case AMDGPU::PRED_SEL_ZERO: + MO2.setReg(AMDGPU::PRED_SEL_ONE); + break; + case AMDGPU::PRED_SEL_ONE: + MO2.setReg(AMDGPU::PRED_SEL_ZERO); + break; + default: + return true; + } + return false; +} + +bool +R600InstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + return isPredicateSetter(MI->getOpcode()); +} + + +bool +R600InstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const { + return false; +} + + +bool +R600InstrInfo::PredicateInstruction(MachineInstr 
*MI, + const SmallVectorImpl<MachineOperand> &Pred) const { + int PIdx = MI->findFirstPredOperandIdx(); + + if (PIdx != -1) { + MachineOperand &PMO = MI->getOperand(PIdx); + PMO.setReg(Pred[2].getReg()); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + return true; + } + + return false; +} + +unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + if (PredCost) + *PredCost = 2; + return 2; +} + +int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int Offset = 0; + + if (MFI->getNumObjects() == 0) { + return -1; + } + + if (MRI.livein_empty()) { + return 0; + } + + for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), + LE = MRI.livein_end(); + LI != LE; ++LI) { + Offset = std::max(Offset, + GET_REG_INDEX(RI.getEncodingValue(LI->first))); + } + + return Offset + 1; +} + +int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { + int Offset = 0; + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Variable sized objects are not supported + assert(!MFI->hasVarSizedObjects()); + + if (MFI->getNumObjects() == 0) { + return -1; + } + + Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1); + + return getIndirectIndexBegin(MF) + Offset; +} + +std::vector<unsigned> R600InstrInfo::getIndirectReservedRegs( + const MachineFunction &MF) const { + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering()); + std::vector<unsigned> Regs; + + unsigned StackWidth = TFL->getStackWidth(MF); + int End = getIndirectIndexEnd(MF); + + if (End == -1) { + return Regs; + } + + for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) { + unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index); + 
Regs.push_back(SuperReg); + for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); + Regs.push_back(Reg); + } + } + return Regs; +} + +unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + // XXX: Remove when we support a stack width > 2 + assert(Channel == 0); + return RegIndex; +} + +const TargetRegisterClass * R600InstrInfo::getIndirectAddrStoreRegClass( + unsigned SourceReg) const { + return &AMDGPU::R600_TReg32RegClass; +} + +const TargetRegisterClass *R600InstrInfo::getIndirectAddrLoadRegClass() const { + return &AMDGPU::TRegMemRegClass; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, OffsetReg); + setImmOperand(MOVA, R600Operands::WRITE, 0); + + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + AddrReg, ValueReg) + .addReg(AMDGPU::AR_X, RegState::Implicit); + setImmOperand(Mov, R600Operands::DST_REL, 1); + return Mov; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, + OffsetReg); + setImmOperand(MOVA, R600Operands::WRITE, 0); + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + ValueReg, + AddrReg) + .addReg(AMDGPU::AR_X, RegState::Implicit); + setImmOperand(Mov, R600Operands::SRC0_REL, 1); + + return Mov; +} + +const TargetRegisterClass *R600InstrInfo::getSuperIndirectRegClass() const { + return 
&AMDGPU::IndirectRegRegClass; +} + + +MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opcode, + unsigned DstReg, + unsigned Src0Reg, + unsigned Src1Reg) const { + MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode), + DstReg); // $dst + + if (Src1Reg) { + MIB.addImm(0) // $update_exec_mask + .addImm(0); // $update_predicate + } + MIB.addImm(1) // $write + .addImm(0) // $omod + .addImm(0) // $dst_rel + .addImm(0) // $dst_clamp + .addReg(Src0Reg) // $src0 + .addImm(0) // $src0_neg + .addImm(0) // $src0_rel + .addImm(0) // $src0_abs + .addImm(-1); // $src0_sel + + if (Src1Reg) { + MIB.addReg(Src1Reg) // $src1 + .addImm(0) // $src1_neg + .addImm(0) // $src1_rel + .addImm(0) // $src1_abs + .addImm(-1); // $src1_sel + } + + //XXX: The r600g finalizer expects this to be 1, once we've moved the + //scheduling to the backend, we can change the default to 0. + MIB.addImm(1) // $last + .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel + .addImm(0); // $literal + + return MIB; +} + +MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + unsigned DstReg, + uint64_t Imm) const { + MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, + AMDGPU::ALU_LITERAL_X); + setImmOperand(MovImm, R600Operands::IMM, Imm); + return MovImm; +} + +int R600InstrInfo::getOperandIdx(const MachineInstr &MI, + R600Operands::Ops Op) const { + return getOperandIdx(MI.getOpcode(), Op); +} + +int R600InstrInfo::getOperandIdx(unsigned Opcode, + R600Operands::Ops Op) const { + unsigned TargetFlags = get(Opcode).TSFlags; + unsigned OpTableIdx; + + if (!HAS_NATIVE_OPERANDS(TargetFlags)) { + switch (Op) { + case R600Operands::DST: return 0; + case R600Operands::SRC0: return 1; + case R600Operands::SRC1: return 2; + case R600Operands::SRC2: return 3; + default: + assert(!"Unknown operand type for instruction"); + return -1; + } + } + + if (TargetFlags & 
R600_InstFlag::OP1) { + OpTableIdx = 0; + } else if (TargetFlags & R600_InstFlag::OP2) { + OpTableIdx = 1; + } else { + assert((TargetFlags & R600_InstFlag::OP3) && "OP1, OP2, or OP3 not defined " + "for this instruction"); + OpTableIdx = 2; + } + + return R600Operands::ALUOpTable[OpTableIdx][Op]; +} + +void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op, + int64_t Imm) const { + int Idx = getOperandIdx(*MI, Op); + assert(Idx != -1 && "Operand not supported for this instruction."); + assert(MI->getOperand(Idx).isImm()); + MI->getOperand(Idx).setImm(Imm); +} + +//===----------------------------------------------------------------------===// +// Instruction flag getters/setters +//===----------------------------------------------------------------------===// + +bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const { + return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0; +} + +MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + int FlagIndex = 0; + if (Flag != 0) { + // If we pass something other than the default value of Flag to this + // function, it means we are want to set a flag on an instruction + // that uses native encoding. 
+ assert(HAS_NATIVE_OPERANDS(TargetFlags)); + bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; + switch (Flag) { + case MO_FLAG_CLAMP: + FlagIndex = getOperandIdx(*MI, R600Operands::CLAMP); + break; + case MO_FLAG_MASK: + FlagIndex = getOperandIdx(*MI, R600Operands::WRITE); + break; + case MO_FLAG_NOT_LAST: + case MO_FLAG_LAST: + FlagIndex = getOperandIdx(*MI, R600Operands::LAST); + break; + case MO_FLAG_NEG: + switch (SrcIdx) { + case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_NEG); break; + case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_NEG); break; + case 2: FlagIndex = getOperandIdx(*MI, R600Operands::SRC2_NEG); break; + } + break; + + case MO_FLAG_ABS: + assert(!IsOP3 && "Cannot set absolute value modifier for OP3 " + "instructions."); + (void)IsOP3; + switch (SrcIdx) { + case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_ABS); break; + case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_ABS); break; + } + break; + + default: + FlagIndex = -1; + break; + } + assert(FlagIndex != -1 && "Flag not supported for this instruction"); + } else { + FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags); + assert(FlagIndex != 0 && + "Instruction flags not supported for this instruction"); + } + + MachineOperand &FlagOp = MI->getOperand(FlagIndex); + assert(FlagOp.isImm()); + return FlagOp; +} + +void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + if (Flag == 0) { + return; + } + if (HAS_NATIVE_OPERANDS(TargetFlags)) { + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + if (Flag == MO_FLAG_NOT_LAST) { + clearFlag(MI, Operand, MO_FLAG_LAST); + } else if (Flag == MO_FLAG_MASK) { + clearFlag(MI, Operand, Flag); + } else { + FlagOp.setImm(1); + } + } else { + MachineOperand &FlagOp = getFlagOp(MI, Operand); + FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand))); + } +} + +void R600InstrInfo::clearFlag(MachineInstr 
*MI, unsigned Operand, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + if (HAS_NATIVE_OPERANDS(TargetFlags)) { + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + FlagOp.setImm(0); + } else { + MachineOperand &FlagOp = getFlagOp(MI); + unsigned InstFlags = FlagOp.getImm(); + InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand)); + FlagOp.setImm(InstFlags); + } +} diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h new file mode 100644 index 0000000000..efe721c00c --- /dev/null +++ b/lib/Target/R600/R600InstrInfo.h @@ -0,0 +1,200 @@ +//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for R600InstrInfo +// +//===----------------------------------------------------------------------===// + +#ifndef R600INSTRUCTIONINFO_H_ +#define R600INSTRUCTIONINFO_H_ + +#include "AMDGPUInstrInfo.h" +#include "AMDIL.h" +#include "R600Defines.h" +#include "R600RegisterInfo.h" +#include <map> + +namespace llvm { + + class AMDGPUTargetMachine; + class DFAPacketizer; + class ScheduleDAG; + class MachineFunction; + class MachineInstr; + class MachineInstrBuilder; + + class R600InstrInfo : public AMDGPUInstrInfo { + private: + const R600RegisterInfo RI; + + int getBranchInstr(const MachineOperand &op) const; + + public: + explicit R600InstrInfo(AMDGPUTargetMachine &tm); + + const R600RegisterInfo &getRegisterInfo() const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + + bool isTrig(const MachineInstr &MI) const; + bool isPlaceHolderOpcode(unsigned opcode) const; + bool isReductionOp(unsigned 
opcode) const; + bool isCubeOp(unsigned opcode) const; + + /// \returns true if this \p Opcode represents an ALU instruction. + bool isALUInstr(unsigned Opcode) const; + + /// \brief Vector instructions are instructions that must fill all + /// instruction slots within an instruction group. + bool isVector(const MachineInstr &MI) const; + + virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg, + int64_t Imm) const; + + virtual unsigned getIEQOpcode() const; + virtual bool isMov(unsigned Opcode) const; + + DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM, + const ScheduleDAG *DAG) const; + + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const; + + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const; + + unsigned RemoveBranch(MachineBasicBlock &MBB) const; + + bool isPredicated(const MachineInstr *MI) const; + + bool isPredicable(MachineInstr *MI) const; + + bool + isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + const BranchProbability &Probability) const; + + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + unsigned ExtraPredCycles, + const BranchProbability &Probability) const ; + + bool + isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, + const BranchProbability &Probability) const; + + bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const; + + bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const; + + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const; 
+ + bool PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const; + + unsigned int getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = 0) const; + + virtual int getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const { return 1;} + + /// \returns a list of all the registers that may be accessed using indirect + /// addressing. + std::vector<unsigned> getIndirectReservedRegs(const MachineFunction &MF) const; + + virtual int getIndirectIndexBegin(const MachineFunction &MF) const; + + virtual int getIndirectIndexEnd(const MachineFunction &MF) const; + + + virtual unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const; + + virtual const TargetRegisterClass *getIndirectAddrStoreRegClass( + unsigned SourceReg) const; + + virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const; + + virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const; + + virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const; + + virtual const TargetRegisterClass *getSuperIndirectRegClass() const; + + + ///buildDefaultInstruction - This function returns a MachineInstr with + /// all the instruction modifiers initialized to their default values. + /// You can use this function to avoid manually specifying each instruction + /// modifier operand when building a new instruction. + /// + /// \returns a MachineInstr with all the instruction modifiers initialized + /// to their default values. 
+ MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opcode, + unsigned DstReg, + unsigned Src0Reg, + unsigned Src1Reg = 0) const; + + MachineInstr *buildMovImm(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + unsigned DstReg, + uint64_t Imm) const; + + /// \brief Get the index of Op in the MachineInstr. + /// + /// \returns -1 if the Instruction does not contain the specified \p Op. + int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const; + + /// \brief Get the index of \p Op for the given Opcode. + /// + /// \returns -1 if the Instruction does not contain the specified \p Op. + int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const; + + /// \brief Helper function for setting instruction flag values. + void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const; + + /// \returns true if this instruction has an operand for storing target flags. + bool hasFlagOperand(const MachineInstr &MI) const; + + ///\brief Add one of the MO_FLAG* flags to the specified \p Operand. + void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; + + ///\brief Determine if the specified \p Flag is set on this \p Operand. + bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; + + /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2) + /// \param Flag The flag being set. + /// + /// \returns the operand containing the flags for this instruction. + MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0, + unsigned Flag = 0) const; + + /// \brief Clear the specified flag on the instruction. 
+ void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; +}; + +} // End llvm namespace + +#endif // R600INSTRUCTIONINFO_H_ diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td new file mode 100644 index 0000000000..c5fa3347dc --- /dev/null +++ b/lib/Target/R600/R600Instructions.td @@ -0,0 +1,1995 @@ +//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// R600 Tablegen instruction definitions +// +//===----------------------------------------------------------------------===// + +include "R600Intrinsics.td" + +class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern, + InstrItinClass itin> + : AMDGPUInst <outs, ins, asm, pattern> { + + field bits<64> Inst; + bit Trig = 0; + bit Op3 = 0; + bit isVector = 0; + bits<2> FlagOperandIdx = 0; + bit Op1 = 0; + bit Op2 = 0; + bit HasNativeOperands = 0; + + bits<11> op_code = inst; + //let Inst = inst; + let Namespace = "AMDGPU"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let Itinerary = itin; + + let TSFlags{4} = Trig; + let TSFlags{5} = Op3; + + // Vector instructions are instructions that must fill all slots in an + // instruction group + let TSFlags{6} = isVector; + let TSFlags{8-7} = FlagOperandIdx; + let TSFlags{9} = HasNativeOperands; + let TSFlags{10} = Op1; + let TSFlags{11} = Op2; +} + +class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> : + AMDGPUInst <outs, ins, asm, pattern> { + field bits<64> Inst; + + let Namespace = "AMDGPU"; +} + +def MEMxi : Operand<iPTR> { + let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index); + let PrintMethod = "printMemOperand"; +} + +def MEMrr : 
Operand<iPTR> { + let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index); +} + +// Operands for non-registers + +class InstFlag<string PM = "printOperand", int Default = 0> + : OperandWithDefaultOps <i32, (ops (i32 Default))> { + let PrintMethod = PM; +} + +// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers +def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> { + let PrintMethod = "printSel"; +} + +def LITERAL : InstFlag<"printLiteral">; + +def WRITE : InstFlag <"printWrite", 1>; +def OMOD : InstFlag <"printOMOD">; +def REL : InstFlag <"printRel">; +def CLAMP : InstFlag <"printClamp">; +def NEG : InstFlag <"printNeg">; +def ABS : InstFlag <"printAbs">; +def UEM : InstFlag <"printUpdateExecMask">; +def UP : InstFlag <"printUpdatePred">; + +// XXX: The r600g finalizer in Mesa expects last to be one in most cases. +// Once we start using the packetizer in this backend we should have this +// default to 0. +def LAST : InstFlag<"printLast", 1>; + +def FRAMEri : Operand<iPTR> { + let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index); +} + +def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>; +def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>; +def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>; +def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>; +def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>; +def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; + +class R600ALU_Word0 { + field bits<32> Word0; + + bits<11> src0; + bits<1> src0_neg; + bits<1> src0_rel; + bits<11> src1; + bits<1> src1_rel; + bits<1> src1_neg; + bits<3> index_mode = 0; + bits<2> pred_sel; + bits<1> last; + + bits<9> src0_sel = src0{8-0}; + bits<2> src0_chan = src0{10-9}; + bits<9> src1_sel = src1{8-0}; + bits<2> src1_chan = src1{10-9}; + + let Word0{8-0} = src0_sel; + let Word0{9} = src0_rel; + let Word0{11-10} = 
src0_chan; + let Word0{12} = src0_neg; + let Word0{21-13} = src1_sel; + let Word0{22} = src1_rel; + let Word0{24-23} = src1_chan; + let Word0{25} = src1_neg; + let Word0{28-26} = index_mode; + let Word0{30-29} = pred_sel; + let Word0{31} = last; +} + +class R600ALU_Word1 { + field bits<32> Word1; + + bits<11> dst; + bits<3> bank_swizzle = 0; + bits<1> dst_rel; + bits<1> clamp; + + bits<7> dst_sel = dst{6-0}; + bits<2> dst_chan = dst{10-9}; + + let Word1{20-18} = bank_swizzle; + let Word1{27-21} = dst_sel; + let Word1{28} = dst_rel; + let Word1{30-29} = dst_chan; + let Word1{31} = clamp; +} + +class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{ + + bits<1> src0_abs; + bits<1> src1_abs; + bits<1> update_exec_mask; + bits<1> update_pred; + bits<1> write; + bits<2> omod; + + let Word1{0} = src0_abs; + let Word1{1} = src1_abs; + let Word1{2} = update_exec_mask; + let Word1{3} = update_pred; + let Word1{4} = write; + let Word1{6-5} = omod; + let Word1{17-7} = alu_inst; +} + +class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{ + + bits<11> src2; + bits<1> src2_rel; + bits<1> src2_neg; + + bits<9> src2_sel = src2{8-0}; + bits<2> src2_chan = src2{10-9}; + + let Word1{8-0} = src2_sel; + let Word1{9} = src2_rel; + let Word1{11-10} = src2_chan; + let Word1{12} = src2_neg; + let Word1{17-13} = alu_inst; +} + +class VTX_WORD0 { + field bits<32> Word0; + bits<7> SRC_GPR; + bits<5> VC_INST; + bits<2> FETCH_TYPE; + bits<1> FETCH_WHOLE_QUAD; + bits<8> BUFFER_ID; + bits<1> SRC_REL; + bits<2> SRC_SEL_X; + bits<6> MEGA_FETCH_COUNT; + + let Word0{4-0} = VC_INST; + let Word0{6-5} = FETCH_TYPE; + let Word0{7} = FETCH_WHOLE_QUAD; + let Word0{15-8} = BUFFER_ID; + let Word0{22-16} = SRC_GPR; + let Word0{23} = SRC_REL; + let Word0{25-24} = SRC_SEL_X; + let Word0{31-26} = MEGA_FETCH_COUNT; +} + +class VTX_WORD1_GPR { + field bits<32> Word1; + bits<7> DST_GPR; + bits<1> DST_REL; + bits<3> DST_SEL_X; + bits<3> DST_SEL_Y; + bits<3> DST_SEL_Z; + bits<3> DST_SEL_W; + bits<1> 
USE_CONST_FIELDS; + bits<6> DATA_FORMAT; + bits<2> NUM_FORMAT_ALL; + bits<1> FORMAT_COMP_ALL; + bits<1> SRF_MODE_ALL; + + let Word1{6-0} = DST_GPR; + let Word1{7} = DST_REL; + let Word1{8} = 0; // Reserved + let Word1{11-9} = DST_SEL_X; + let Word1{14-12} = DST_SEL_Y; + let Word1{17-15} = DST_SEL_Z; + let Word1{20-18} = DST_SEL_W; + let Word1{21} = USE_CONST_FIELDS; + let Word1{27-22} = DATA_FORMAT; + let Word1{29-28} = NUM_FORMAT_ALL; + let Word1{30} = FORMAT_COMP_ALL; + let Word1{31} = SRF_MODE_ALL; +} + +/* +XXX: R600 subtarget uses a slightly different encoding than the other +subtargets. We currently handle this in R600MCCodeEmitter, but we may +want to use these instruction classes in the future. + +class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 { + + bits<1> fog_merge; + bits<10> alu_inst; + + let Inst{37} = fog_merge; + let Inst{39-38} = omod; + let Inst{49-40} = alu_inst; +} + +class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 { + + bits<11> alu_inst; + + let Inst{38-37} = omod; + let Inst{49-39} = alu_inst; +} +*/ + +def R600_Pred : PredicateOperand<i32, (ops R600_Predicate), + (ops PRED_SEL_OFF)>; + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + +// Class for instructions with only one source register. +// If you add new ins to this instruction, make sure they are listed before +// $literal, because the backend currently assumes that the last operand is +// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in +// R600Defines.h, R600InstrInfo::buildDefaultInstruction(), +// and R600InstrInfo::getOperandIdx(). 
+class R600_1OP <bits<11> inst, string opName, list<dag> pattern, + InstrItinClass itin = AnyALU> : + InstR600 <0, + (outs R600_Reg32:$dst), + (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), + !strconcat(opName, + "$clamp $dst$write$dst_rel$omod, " + "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, " + "$literal $pred_sel$last"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP2 <inst> { + + let src1 = 0; + let src1_rel = 0; + let src1_neg = 0; + let src1_abs = 0; + let update_exec_mask = 0; + let update_pred = 0; + let HasNativeOperands = 1; + let Op1 = 1; + let DisableEncoding = "$literal"; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +// R600_1OP whose selection pattern is (set $dst, (node $src0)). The itinerary +// argument is forwarded to R600_1OP so that a non-default value takes effect. +class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node, + InstrItinClass itin = AnyALU> : + R600_1OP <inst, opName, + [(set R600_Reg32:$dst, (node R600_Reg32:$src0))], itin +>; + +// If you add or change the operands for R600_2OP instructions, you must +// also update the R600Op2OperandIndex::ROI enum in R600Defines.h, +// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx(). 
+class R600_2OP <bits<11> inst, string opName, list<dag> pattern, + InstrItinClass itin = AnyALU> : + InstR600 <inst, + (outs R600_Reg32:$dst), + (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write, + OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), + !strconcat(opName, + "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, " + "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, " + "$src1_neg$src1_abs$src1$src1_sel$src1_abs$src1_rel, " + "$literal $pred_sel$last"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP2 <inst> { + + let HasNativeOperands = 1; + let Op2 = 1; + let DisableEncoding = "$literal"; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +// R600_2OP whose selection pattern is (set $dst, (node $src0, $src1)). The +// itinerary argument is forwarded to R600_2OP so that a non-default value +// takes effect. +class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node, + InstrItinClass itim = AnyALU> : + R600_2OP <inst, opName, + [(set R600_Reg32:$dst, (node R600_Reg32:$src0, + R600_Reg32:$src1))], itim +>; + +// If you add or change the operands for R600_3OP instructions, you must +// also update the R600Op3OperandIndex::ROI enum in R600Defines.h, +// R600InstrInfo::buildDefaultInstruction(), and +// R600InstrInfo::getOperandIdx(). 
+class R600_3OP <bits<5> inst, string opName, list<dag> pattern, + InstrItinClass itin = AnyALU> : + InstR600 <0, + (outs R600_Reg32:$dst), + (ins REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, + R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), + !strconcat(opName, "$clamp $dst$dst_rel, " + "$src0_neg$src0$src0_sel$src0_rel, " + "$src1_neg$src1$src1_sel$src1_rel, " + "$src2_neg$src2$src2_sel$src2_rel, " + "$literal $pred_sel$last"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP3<inst>{ + + let HasNativeOperands = 1; + let DisableEncoding = "$literal"; + let Op3 = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern, + InstrItinClass itin = VecALU> : + InstR600 <inst, + (outs R600_Reg32:$dst), + ins, + asm, + pattern, + itin>; + +class R600_TEX <bits<11> inst, string opName, list<dag> pattern, + InstrItinClass itin = AnyALU> : + InstR600 <inst, + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + !strconcat(opName, "$dst, $src0, $resourceId, $samplerId, $textureTarget"), + pattern, + itin>{ + let Inst {10-0} = inst; + } + +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 + +def TEX_SHADOW : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return (TType >= 6 && TType <= 8) || (TType >= 11 && TType <= 13); + }] +>; + +def TEX_RECT : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 5; + }] +>; + +def TEX_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 9 || TType == 10 || TType == 15 || TType == 16; + }] +>; + +def TEX_SHADOW_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 11 || 
TType == 12 || TType == 17; + }] +>; + +class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, bits<4> rat_id, dag outs, + dag ins, string asm, list<dag> pattern> : + InstR600ISA <outs, ins, asm, pattern> { + bits<7> RW_GPR; + bits<7> INDEX_GPR; + + bits<2> RIM; + bits<2> TYPE; + bits<1> RW_REL; + bits<2> ELEM_SIZE; + + bits<12> ARRAY_SIZE; + bits<4> COMP_MASK; + bits<4> BURST_COUNT; + bits<1> VPM; + bits<1> eop; + bits<1> MARK; + bits<1> BARRIER; + + // CF_ALLOC_EXPORT_WORD0_RAT + let Inst{3-0} = rat_id; + let Inst{9-4} = rat_inst; + let Inst{10} = 0; // Reserved + let Inst{12-11} = RIM; + let Inst{14-13} = TYPE; + let Inst{21-15} = RW_GPR; + let Inst{22} = RW_REL; + let Inst{29-23} = INDEX_GPR; + let Inst{31-30} = ELEM_SIZE; + + // CF_ALLOC_EXPORT_WORD1_BUF + let Inst{43-32} = ARRAY_SIZE; + let Inst{47-44} = COMP_MASK; + let Inst{51-48} = BURST_COUNT; + let Inst{52} = VPM; + let Inst{53} = eop; + let Inst{61-54} = cf_inst; + let Inst{62} = MARK; + let Inst{63} = BARRIER; +} + +class LoadParamFrag <PatFrag load_type> : PatFrag < + (ops node:$ptr), (load_type node:$ptr), + [{ return isParamLoad(dyn_cast<LoadSDNode>(N)); }] +>; + +def load_param : LoadParamFrag<load>; +def load_param_zexti8 : LoadParamFrag<zextloadi8>; +def load_param_zexti16 : LoadParamFrag<zextloadi16>; + +def isR600 : Predicate<"Subtarget.device()" + "->getGeneration() == AMDGPUDeviceInfo::HD4XXX">; +def isR700 : Predicate<"Subtarget.device()" + "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&" + "Subtarget.device()->getDeviceFlag()" + ">= OCL_DEVICE_RV710">; +def isEG : Predicate< + "Subtarget.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX && " + "Subtarget.device()->getGeneration() < AMDGPUDeviceInfo::HD7XXX && " + "Subtarget.device()->getDeviceFlag() != OCL_DEVICE_CAYMAN">; + +def isCayman : Predicate<"Subtarget.device()" + "->getDeviceFlag() == OCL_DEVICE_CAYMAN">; +def isEGorCayman : Predicate<"Subtarget.device()" + "->getGeneration() == AMDGPUDeviceInfo::HD5XXX" + "|| 
Subtarget.device()->getGeneration() ==" + "AMDGPUDeviceInfo::HD6XXX">; + +def isR600toCayman : Predicate< + "Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX">; + +//===----------------------------------------------------------------------===// +// R600 SDNodes +//===----------------------------------------------------------------------===// + +def INTERP_PAIR_XY : AMDGPUShaderInst < + (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1), + (ins i32imm:$src0, R600_Reg32:$src1, R600_Reg32:$src2), + "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 $dst1", + []>; + +def INTERP_PAIR_ZW : AMDGPUShaderInst < + (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1), + (ins i32imm:$src0, R600_Reg32:$src1, R600_Reg32:$src2), + "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 $dst1", + []>; + +def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", + SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPVariadic] +>; + +//===----------------------------------------------------------------------===// +// Interpolation Instructions +//===----------------------------------------------------------------------===// + +def INTERP_VEC_LOAD : AMDGPUShaderInst < + (outs R600_Reg128:$dst), + (ins i32imm:$src0), + "INTERP_LOAD $src0 : $dst", + []>; + +def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { + let bank_swizzle = 5; +} + +def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> { + let bank_swizzle = 5; +} + +def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>; + +//===----------------------------------------------------------------------===// +// Export Instructions +//===----------------------------------------------------------------------===// + +def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; + +def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType, + [SDNPHasChain, SDNPSideEffect]>; + +class ExportWord0 { + field bits<32> Word0; + + bits<13> arraybase; + bits<2> type; + bits<7> gpr; + bits<2> elem_size; + + let Word0{12-0} = arraybase; + let Word0{14-13} = type; + 
let Word0{21-15} = gpr; + let Word0{22} = 0; // RW_REL + let Word0{29-23} = 0; // INDEX_GPR + let Word0{31-30} = elem_size; +} + +class ExportSwzWord1 { + field bits<32> Word1; + + bits<3> sw_x; + bits<3> sw_y; + bits<3> sw_z; + bits<3> sw_w; + bits<1> eop; + bits<8> inst; + + let Word1{2-0} = sw_x; + let Word1{5-3} = sw_y; + let Word1{8-6} = sw_z; + let Word1{11-9} = sw_w; +} + +class ExportBufWord1 { + field bits<32> Word1; + + bits<12> arraySize; + bits<4> compMask; + bits<1> eop; + bits<8> inst; + + let Word1{11-0} = arraySize; + let Word1{15-12} = compMask; +} + +multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> { + def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), + (ExportInst + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0), + 0, 61, 0, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), + (ExportInst + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0), + 0, 61, 7, 0, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_dummy (i32 imm:$type)), + (ExportInst + (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_dummy 1), + (ExportInst + (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), + (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), + (ExportInst R600_Reg128:$src, imm:$type, imm:$base, + imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0) + >; + +} + +multiclass SteamOutputExportPattern<Instruction ExportInst, + bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> { +// Stream0 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + 4095, imm:$mask, buf0inst, 0)>; +// Stream1 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 
1), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + 4095, imm:$mask, buf1inst, 0)>; +// Stream2 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + 4095, imm:$mask, buf2inst, 0)>; +// Stream3 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + 4095, imm:$mask, buf3inst, 0)>; +} + +let usesCustomInserter = 1 in { + +class ExportSwzInst : InstR600ISA<( + outs), + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, + i32imm:$sw_x, i32imm:$sw_y, i32imm:$sw_z, i32imm:$sw_w, i32imm:$inst, + i32imm:$eop), + !strconcat("EXPORT", " $gpr"), + []>, ExportWord0, ExportSwzWord1 { + let elem_size = 3; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +} // End usesCustomInserter = 1 + +class ExportBufInst : InstR600ISA<( + outs), + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, + i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop), + !strconcat("EXPORT", " $gpr"), + []>, ExportWord0, ExportBufWord1 { + let elem_size = 0; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +let Predicates = [isR600toCayman] in { + +//===----------------------------------------------------------------------===// +// Common Instructions R600, R700, Evergreen, Cayman +//===----------------------------------------------------------------------===// + +def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; +// Non-IEEE MUL: 0 * anything = 0 +def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; +def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; +def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>; +def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>; + +// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, +// so some of the instruction names don't match the asm string. 
+// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. +def SETE : R600_2OP < + 0x08, "SETE", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, + COND_EQ))] +>; + +def SGT : R600_2OP < + 0x09, "SETGT", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, + COND_GT))] +>; + +def SGE : R600_2OP < + 0xA, "SETGE", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, + COND_GE))] +>; + +def SNE : R600_2OP < + 0xB, "SETNE", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, + COND_NE))] +>; + +def SETE_DX10 : R600_2OP < + 0xC, "SETE_DX10", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), + COND_EQ))] +>; + +def SETGT_DX10 : R600_2OP < + 0xD, "SETGT_DX10", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), + COND_GT))] +>; + +def SETGE_DX10 : R600_2OP < + 0xE, "SETGE_DX10", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), + COND_GE))] +>; + +def SETNE_DX10 : R600_2OP < + 0xF, "SETNE_DX10", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), + COND_NE))] +>; + +def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; +def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>; +def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; +def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>; +def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; + +def MOV : R600_1OP <0x19, "MOV", []>; + +let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { + +class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst < + (outs R600_Reg32:$dst), + (ins immType:$imm), + "", + [] +>; + +} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 + +def MOV_IMM_I32 : MOV_IMM<i32, i32imm>; +def : Pat < + 
(imm:$val), + (MOV_IMM_I32 imm:$val) +>; + +def MOV_IMM_F32 : MOV_IMM<f32, f32imm>; +def : Pat < + (fpimm:$val), + (MOV_IMM_F32 fpimm:$val) +>; + +def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>; +def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>; +def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>; +def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>; + +let hasSideEffects = 1 in { + +def KILLGT : R600_2OP <0x2D, "KILLGT", []>; + +} // end hasSideEffects + +def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>; +def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>; +def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>; +def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>; +def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>; +def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>; +def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", AMDGPUsmax>; +def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", AMDGPUsmin>; +def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", AMDGPUumax>; +def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>; + +def SETE_INT : R600_2OP < + 0x3A, "SETE_INT", + [(set (i32 R600_Reg32:$dst), + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETEQ))] +>; + +def SETGT_INT : R600_2OP < + 0x3B, "SETGT_INT", + [(set (i32 R600_Reg32:$dst), + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGT))] +>; + +def SETGE_INT : R600_2OP < + 0x3C, "SETGE_INT", + [(set (i32 R600_Reg32:$dst), + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGE))] +>; + +def SETNE_INT : R600_2OP < + 0x3D, "SETNE_INT", + [(set (i32 R600_Reg32:$dst), + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETNE))] +>; + +def SETGT_UINT : R600_2OP < + 0x3E, "SETGT_UINT", + [(set (i32 R600_Reg32:$dst), + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGT))] +>; + +def SETGE_UINT : R600_2OP < + 0x3F, "SETGE_UINT", + [(set (i32 R600_Reg32:$dst), + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGE))] +>; + +def 
PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>; +// Each PRED_SET*_INT mnemonic string mirrors its def name (0x43 = SETGT, 0x44 = SETGE). +def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGT_INT", []>; +def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>; +def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>; + +// Integer conditional moves: pick $src1 when ($src0 <cond> 0) holds, else $src2. +def CNDE_INT : R600_3OP < + 0x1C, "CNDE_INT", + [(set (i32 R600_Reg32:$dst), + (selectcc (i32 R600_Reg32:$src0), 0, + (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), + COND_EQ))] +>; + +def CNDGE_INT : R600_3OP < + 0x1E, "CNDGE_INT", + [(set (i32 R600_Reg32:$dst), + (selectcc (i32 R600_Reg32:$src0), 0, + (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), + COND_GE))] +>; + +def CNDGT_INT : R600_3OP < + 0x1D, "CNDGT_INT", + [(set (i32 R600_Reg32:$dst), + (selectcc (i32 R600_Reg32:$src0), 0, + (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), + COND_GT))] +>; + +//===----------------------------------------------------------------------===// +// Texture instructions +//===----------------------------------------------------------------------===// + +def TEX_LD : R600_TEX < + 0x03, "TEX_LD", + [(set R600_Reg128:$dst, (int_AMDGPU_txf R600_Reg128:$src0, imm:$src1, imm:$src2, imm:$src3, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +> { +let AsmString = "TEX_LD $dst, $src0, $src1, $src2, $src3, $resourceId, $samplerId, $textureTarget"; +let InOperandList = (ins R600_Reg128:$src0, i32imm:$src1, i32imm:$src2, i32imm:$src3, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget); +} + +def TEX_GET_TEXTURE_RESINFO : R600_TEX < + 0x04, "TEX_GET_TEXTURE_RESINFO", + [(set R600_Reg128:$dst, (int_AMDGPU_txq R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +>; + +def TEX_GET_GRADIENTS_H : R600_TEX < + 0x07, "TEX_GET_GRADIENTS_H", + [(set R600_Reg128:$dst, (int_AMDGPU_ddx R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +>; + +def TEX_GET_GRADIENTS_V : R600_TEX < + 0x08, "TEX_GET_GRADIENTS_V", + [(set R600_Reg128:$dst, (int_AMDGPU_ddy R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, 
imm:$textureTarget))] +>; + +def TEX_SET_GRADIENTS_H : R600_TEX < + 0x0B, "TEX_SET_GRADIENTS_H", + [] +>; + +def TEX_SET_GRADIENTS_V : R600_TEX < + 0x0C, "TEX_SET_GRADIENTS_V", + [] +>; + +def TEX_SAMPLE : R600_TEX < + 0x10, "TEX_SAMPLE", + [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +>; + +def TEX_SAMPLE_C : R600_TEX < + 0x18, "TEX_SAMPLE_C", + [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] +>; + +def TEX_SAMPLE_L : R600_TEX < + 0x11, "TEX_SAMPLE_L", + [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +>; + +def TEX_SAMPLE_C_L : R600_TEX < + 0x19, "TEX_SAMPLE_C_L", + [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] +>; + +def TEX_SAMPLE_LB : R600_TEX < + 0x12, "TEX_SAMPLE_LB", + [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0,imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +>; + +def TEX_SAMPLE_C_LB : R600_TEX < + 0x1A, "TEX_SAMPLE_C_LB", + [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] +>; + +def TEX_SAMPLE_G : R600_TEX < + 0x14, "TEX_SAMPLE_G", + [] +>; + +def TEX_SAMPLE_C_G : R600_TEX < + 0x1C, "TEX_SAMPLE_C_G", + [] +>; + +//===----------------------------------------------------------------------===// +// Helper classes for common instructions +//===----------------------------------------------------------------------===// + +class MUL_LIT_Common <bits<5> inst> : R600_3OP < + inst, "MUL_LIT", + [] +>; + +class MULADD_Common <bits<5> inst> : R600_3OP < + inst, "MULADD", + [] +>; + +class MULADD_IEEE_Common <bits<5> inst> : R600_3OP < + inst, "MULADD_IEEE", + [(set (f32 R600_Reg32:$dst), + (fadd (fmul R600_Reg32:$src0, R600_Reg32:$src1), R600_Reg32:$src2))] +>; + +class CNDE_Common <bits<5> inst> 
: R600_3OP < + inst, "CNDE", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), FP_ZERO, + (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), + COND_EQ))] +>; + +class CNDGT_Common <bits<5> inst> : R600_3OP < + inst, "CNDGT", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), FP_ZERO, + (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), + COND_GT))] +>; + +class CNDGE_Common <bits<5> inst> : R600_3OP < + inst, "CNDGE", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), FP_ZERO, + (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), + COND_GE))] +>; + +multiclass DOT4_Common <bits<11> inst> { + + def _pseudo : R600_REDUCTION <inst, + (ins R600_Reg128:$src0, R600_Reg128:$src1), + "DOT4 $dst $src0, $src1", + [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))] + >; + + def _real : R600_2OP <inst, "DOT4", []>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { +multiclass CUBE_Common <bits<11> inst> { + + def _pseudo : InstR600 < + inst, + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src), + "CUBE $dst $src", + [(set R600_Reg128:$dst, (int_AMDGPU_cube R600_Reg128:$src))], + VecALU + > { + let isPseudo = 1; + } + + def _real : R600_2OP <inst, "CUBE", []>; +} +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 + +class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper < + inst, "EXP_IEEE", fexp2 +>; + +class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "FLT_TO_INT", fp_to_sint +>; + +class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "INT_TO_FLT", sint_to_fp +>; + +class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "FLT_TO_UINT", fp_to_uint +>; + +class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "UINT_TO_FLT", uint_to_fp +>; + +class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP < + inst, "LOG_CLAMPED", [] +>; + +class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper < + inst, "LOG_IEEE", flog2 +>; + +class LSHL_Common <bits<11> inst> : 
R600_2OP_Helper <inst, "LSHL", shl>; +class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>; +class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>; +class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULHI_INT", mulhs +>; +class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULHI", mulhu +>; +class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULLO_INT", mul +>; +class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []>; + +class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP < + inst, "RECIP_CLAMPED", [] +>; + +class RECIP_IEEE_Common <bits<11> inst> : R600_1OP < + inst, "RECIP_IEEE", [(set R600_Reg32:$dst, (fdiv FP_ONE, R600_Reg32:$src0))] +>; + +class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "RECIP_UINT", AMDGPUurecip +>; + +class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper < + inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq +>; + +class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP < + inst, "RECIPSQRT_IEEE", [] +>; + +class SIN_Common <bits<11> inst> : R600_1OP < + inst, "SIN", []>{ + let Trig = 1; +} + +class COS_Common <bits<11> inst> : R600_1OP < + inst, "COS", []> { + let Trig = 1; +} + +//===----------------------------------------------------------------------===// +// Helper patterns for complex intrinsics +//===----------------------------------------------------------------------===// + +multiclass DIV_Common <InstR600 recip_ieee> { +def : Pat< + (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1), + (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) +>; + +def : Pat< + (fdiv R600_Reg32:$src0, R600_Reg32:$src1), + (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) +>; +} + +class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> : Pat < + (int_TGSI_lit_z R600_Reg32:$src_x, R600_Reg32:$src_y, R600_Reg32:$src_w), + (exp_ieee (mul_lit (log_clamped (MAX R600_Reg32:$src_y, 
(f32 ZERO))), R600_Reg32:$src_w, R600_Reg32:$src_x)) +>; + +//===----------------------------------------------------------------------===// +// R600 / R700 Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isR600] in { + + def MUL_LIT_r600 : MUL_LIT_Common<0x0C>; + def MULADD_r600 : MULADD_Common<0x10>; + def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>; + def CNDE_r600 : CNDE_Common<0x18>; + def CNDGT_r600 : CNDGT_Common<0x19>; + def CNDGE_r600 : CNDGE_Common<0x1A>; + defm DOT4_r600 : DOT4_Common<0x50>; + defm CUBE_r600 : CUBE_Common<0x52>; + def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>; + def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>; + def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>; + def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>; + def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>; + def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>; + def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>; + def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>; + def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>; + def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>; + def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>; + def SIN_r600 : SIN_Common<0x6E>; + def COS_r600 : COS_Common<0x6F>; + def ASHR_r600 : ASHR_Common<0x70>; + def LSHR_r600 : LSHR_Common<0x71>; + def LSHL_r600 : LSHL_Common<0x72>; + def MULLO_INT_r600 : MULLO_INT_Common<0x73>; + def MULHI_INT_r600 : MULHI_INT_Common<0x74>; + def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>; + def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>; + def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>; + + defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>; + def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>; + + def : Pat<(fsqrt R600_Reg32:$src), + (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_r600 R600_Reg32:$src))>; + + def R600_ExportSwz : ExportSwzInst { + let Word1{20-17} = 1; // BURST_COUNT + let Word1{21} = eop; + let Word1{22} = 1; // VALID_PIXEL_MODE + let 
Word1{30-23} = inst; + let Word1{31} = 1; // BARRIER + } + defm : ExportPattern<R600_ExportSwz, 39>; + + def R600_ExportBuf : ExportBufInst { + let Word1{20-17} = 1; // BURST_COUNT + let Word1{21} = eop; + let Word1{22} = 1; // VALID_PIXEL_MODE + let Word1{30-23} = inst; + let Word1{31} = 1; // BARRIER + } + defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>; +} // End Predicates = [isR600] + +// Helper patterns for normalizing inputs to trigonometric instructions on +// R700+ cards: the operand is multiplied by CONST.TWO_PI_INV (via MUL_IEEE) +// before being handed to the given trig instruction. +class COS_PAT <InstR600 trig> : Pat< + (fcos R600_Reg32:$src), + (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) +>; + +class SIN_PAT <InstR600 trig> : Pat< + (fsin R600_Reg32:$src), + (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) +>; + +//===----------------------------------------------------------------------===// +// R700 Only instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isR700] in { + def SIN_r700 : SIN_Common<0x6E>; + def COS_r700 : COS_Common<0x6F>; + + // R700 normalizes inputs to SIN/COS the same as EG + def : SIN_PAT <SIN_r700>; + def : COS_PAT <COS_r700>; +} // End Predicates = [isR700] + +//===----------------------------------------------------------------------===// +// Evergreen Only instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEG] in { + +def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; +defm DIV_eg : DIV_Common<RECIP_IEEE_eg>; + +def MULLO_INT_eg : MULLO_INT_Common<0x8F>; +def MULHI_INT_eg : MULHI_INT_Common<0x90>; +def MULLO_UINT_eg : MULLO_UINT_Common<0x91>; +def MULHI_UINT_eg : MULHI_UINT_Common<0x92>; +def RECIP_UINT_eg : RECIP_UINT_Common<0x94>; +def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>; +def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; +def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; +def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; +def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; +def SIN_eg : SIN_Common<0x8D>; +def 
COS_eg : COS_Common<0x8E>; + +def : SIN_PAT <SIN_eg>; +def : COS_PAT <COS_eg>; +def : Pat<(fsqrt R600_Reg32:$src), + (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_eg R600_Reg32:$src))>; +} // End Predicates = [isEG] + +//===----------------------------------------------------------------------===// +// Evergreen / Cayman Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEGorCayman] in { + + // BFE_UINT - bit_extract, an optimization for mask and shift + // Src0 = Input + // Src1 = Offset + // Src2 = Width + // + // bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width) + // + // Example Usage: + // (Offset, Width) + // + // (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0 + // (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8 + // (16,8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16 + // (24,8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24 + def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", + [(set R600_Reg32:$dst, (int_AMDIL_bit_extract_u32 R600_Reg32:$src0, + R600_Reg32:$src1, + R600_Reg32:$src2))], + VecALU + >; + + def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", + [(set R600_Reg32:$dst, (AMDGPUbitalign R600_Reg32:$src0, R600_Reg32:$src1, + R600_Reg32:$src2))], + VecALU + >; + + def MULADD_eg : MULADD_Common<0x14>; + def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; + def ASHR_eg : ASHR_Common<0x15>; + def LSHR_eg : LSHR_Common<0x16>; + def LSHL_eg : LSHL_Common<0x17>; + def CNDE_eg : CNDE_Common<0x19>; + def CNDGT_eg : CNDGT_Common<0x1A>; + def CNDGE_eg : CNDGE_Common<0x1B>; + def MUL_LIT_eg : MUL_LIT_Common<0x1F>; + def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; + defm DOT4_eg : DOT4_Common<0xBE>; + defm CUBE_eg : CUBE_Common<0xC0>; + +let hasSideEffects = 1 in { + def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", []>; +} + + def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>; + + def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { + let Pattern = []; + } + + def 
INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>; + + def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { + let Pattern = []; + } + + def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; + + // TRUNC is used for the FLT_TO_INT instructions to work around a + // perceived problem where the rounding modes are applied differently + // depending on the instruction and the slot they are in. + // See: + // https://bugs.freedesktop.org/show_bug.cgi?id=50232 + // Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c + // + // XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, + // which do not need to be truncated since the fp values are 0.0f or 1.0f. + // We should look into handling these cases separately. + def : Pat<(fp_to_sint R600_Reg32:$src0), + (FLT_TO_INT_eg (TRUNC R600_Reg32:$src0))>; + + def : Pat<(fp_to_uint R600_Reg32:$src0), + (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>; + + def EG_ExportSwz : ExportSwzInst { + let Word1{19-16} = 1; // BURST_COUNT + let Word1{20} = 1; // VALID_PIXEL_MODE + let Word1{21} = eop; + let Word1{29-22} = inst; + let Word1{30} = 0; // MARK + let Word1{31} = 1; // BARRIER + } + defm : ExportPattern<EG_ExportSwz, 83>; + + def EG_ExportBuf : ExportBufInst { + let Word1{19-16} = 1; // BURST_COUNT + let Word1{20} = 1; // VALID_PIXEL_MODE + let Word1{21} = eop; + let Word1{29-22} = inst; + let Word1{30} = 0; // MARK + let Word1{31} = 1; // BARRIER + } + defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>; + +//===----------------------------------------------------------------------===// +// Memory read/write instructions +//===----------------------------------------------------------------------===// +let usesCustomInserter = 1 in { + +class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> comp_mask, string name, + list<dag> pattern> + : EG_CF_RAT <0x57, 0x2, 0, (outs), ins, + !strconcat(name, " $rw_gpr, $index_gpr, $eop"), pattern> { + let RIM = 0; + // XXX: Have a separate instruction for non-indexed writes. 
+ let TYPE = 1; + let RW_REL = 0; + let ELEM_SIZE = 0; + + let ARRAY_SIZE = 0; + let COMP_MASK = comp_mask; + let BURST_COUNT = 0; + let VPM = 0; + let MARK = 0; + let BARRIER = 1; +} + +} // End usesCustomInserter = 1 + +// 32-bit store +def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg < + (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + 0x1, "RAT_WRITE_CACHELESS_32_eg", + [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)] +>; + +//128-bit store +def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg < + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + 0xf, "RAT_WRITE_CACHELESS_128", + [(global_store (v4i32 R600_Reg128:$rw_gpr), R600_TReg32_X:$index_gpr)] +>; + +class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> + : InstR600ISA <outs, (ins MEMxi:$ptr), name#" $dst, $ptr", pattern>, + VTX_WORD1_GPR, VTX_WORD0 { + + // Static fields + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let BUFFER_ID = buffer_id; + let SRC_REL = 0; + // XXX: We can infer this field based on the SRC_GPR. This would allow us + // to store vertex addresses in any channel, not just X. + let SRC_SEL_X = 0; + let DST_REL = 0; + // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL, + // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored, + // however, based on my testing if USE_CONST_FIELDS is set, then all + // these fields need to be set to 0. 
+ let USE_CONST_FIELDS = 0; + let NUM_FORMAT_ALL = 1; + let FORMAT_COMP_ALL = 0; + let SRF_MODE_ALL = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + // LLVM can only encode 64-bit instructions, so these fields are manually + // encoded in R600CodeEmitter + // + // bits<16> OFFSET; + // bits<2> ENDIAN_SWAP = 0; + // bits<1> CONST_BUF_NO_STRIDE = 0; + // bits<1> MEGA_FETCH = 0; + // bits<1> ALT_CONST = 0; + // bits<2> BUFFER_INDEX_MODE = 0; + + + + // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding + // is done in R600CodeEmitter) + // + // Inst{79-64} = OFFSET; + // Inst{81-80} = ENDIAN_SWAP; + // Inst{82} = CONST_BUF_NO_STRIDE; + // Inst{83} = MEGA_FETCH; + // Inst{84} = ALT_CONST; + // Inst{86-85} = BUFFER_INDEX_MODE; + // Inst{95-87} = 0; Reserved (bit 86 belongs to BUFFER_INDEX_MODE) + + // VTX_WORD3 (Padding) + // + // Inst{127-96} = 0; +} + +// The 8/16/32-bit fetch classes below enable only DST_SEL_X; the Y/Z/W +// selects are set to 7, which masks those channels. +class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_8", buffer_id, (outs R600_TReg32_X:$dst), + pattern> { + + let MEGA_FETCH_COUNT = 1; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 1; // FMT_8 +} + +class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_16", buffer_id, (outs R600_TReg32_X:$dst), + pattern> { + let MEGA_FETCH_COUNT = 2; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 5; // FMT_16 + +} + +class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_32", buffer_id, (outs R600_TReg32_X:$dst), + pattern> { + + let MEGA_FETCH_COUNT = 4; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 0xD; // COLOR_32 + + // This is not really necessary, but there were some GPU hangs that appeared + // to be caused by ALU instructions in the next instruction group that wrote + // to 
the $ptr registers of the VTX_READ. + // e.g. + // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24 + // %T2_X<def> = MOV %ZERO + //Adding this constraint prevents this from happening. + let Constraints = "$ptr.ptr = $dst"; +} + +class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst), + pattern> { + + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 + + // XXX: Need to force VTX_READ_128 instructions to write to the same register + // that holds its buffer address to avoid potential hangs. We can't use + // the same constraint as VTX_READ_32_eg, because the $ptr.ptr and $dst + // registers are different sizes. +} + +//===----------------------------------------------------------------------===// +// VTX Read from parameter memory space +//===----------------------------------------------------------------------===// + +def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, + [(set (i32 R600_TReg32_X:$dst), (load_param_zexti8 ADDRVTX_READ:$ptr))] +>; + +def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, + [(set (i32 R600_TReg32_X:$dst), (load_param_zexti16 ADDRVTX_READ:$ptr))] +>; + +def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, + [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))] +>; + +def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, + [(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))] +>; + +//===----------------------------------------------------------------------===// +// VTX Read from global memory space +//===----------------------------------------------------------------------===// + +// 8-bit reads +def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, + [(set (i32 R600_TReg32_X:$dst), (zextloadi8_global ADDRVTX_READ:$ptr))] +>; + +// 32-bit reads +def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, + [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))] +>; + +// 
128-bit reads +def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, + [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))] +>; + +//===----------------------------------------------------------------------===// +// Constant Loads +// XXX: We are currently storing all constants in the global address space. +//===----------------------------------------------------------------------===// + +def CONSTANT_LOAD_eg : VTX_READ_32_eg <1, + [(set (i32 R600_TReg32_X:$dst), (constant_load ADDRVTX_READ:$ptr))] +>; + +} // End Predicates = [isEGorCayman] + +//===----------------------------------------------------------------------===// +// Register loads and stores - for indirect addressing +//===----------------------------------------------------------------------===// + +defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>; + +let Predicates = [isCayman] in { + +let isVector = 1 in { + +def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; + +def MULLO_INT_cm : MULLO_INT_Common<0x8F>; +def MULHI_INT_cm : MULHI_INT_Common<0x90>; +def MULLO_UINT_cm : MULLO_UINT_Common<0x91>; +def MULHI_UINT_cm : MULHI_UINT_Common<0x92>; +def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>; +def EXP_IEEE_cm : EXP_IEEE_Common<0x81>; +// NOTE(review): every sibling def in this group carries a _cm suffix; LOG_IEEE_ +// looks like a truncated LOG_IEEE_cm -- confirm against users before renaming. +def LOG_IEEE_ : LOG_IEEE_Common<0x83>; +def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>; +def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>; +def SIN_cm : SIN_Common<0x8D>; +def COS_cm : COS_Common<0x8E>; +} // End isVector = 1 + +def : SIN_PAT <SIN_cm>; +def : COS_PAT <COS_cm>; + +defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; + +// RECIP_UINT emulation for Cayman: convert the operand to float, take its +// reciprocal, scale by 0x4f800000 (the f32 bit pattern of 2^32) and convert +// the product back to an unsigned integer. +def : Pat < + (AMDGPUurecip R600_Reg32:$src0), + (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg R600_Reg32:$src0)), + (MOV_IMM_I32 0x4f800000))) +>; + + +def : Pat<(fsqrt R600_Reg32:$src), + (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm R600_Reg32:$src))>; + +} // End isCayman + +//===----------------------------------------------------------------------===// +// Branch Instructions 
+//===----------------------------------------------------------------------===// + + +def IF_PREDICATE_SET : ILFormat<(outs), (ins GPRI32:$src), + "IF_PREDICATE_SET $src", []>; + +def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src), + "PREDICATED_BREAK $src", []>; + +//===----------------------------------------------------------------------===// +// Pseudo instructions +//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +def PRED_X : InstR600 < + 0, (outs R600_Predicate_Bit:$dst), + (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), + "", [], NullALU> { + let FlagOperandIdx = 3; +} + +let isTerminator = 1, isBranch = 1 in { +def JUMP_COND : InstR600 <0x10, + (outs), + (ins brtarget:$target, R600_Predicate_Bit:$p), + "JUMP $target ($p)", + [], AnyALU + >; + +def JUMP : InstR600 <0x10, + (outs), + (ins brtarget:$target), + "JUMP $target", + [], AnyALU + > +{ + let isPredicable = 1; + let isBarrier = 1; +} + +} // End isTerminator = 1, isBranch = 1 + +let usesCustomInserter = 1 in { + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { + +def MASK_WRITE : AMDGPUShaderInst < + (outs), + (ins R600_Reg32:$src), + "MASK_WRITE $src", + [] +>; + +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 + + +def TXD: AMDGPUShaderInst < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", + [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +>; + +def TXD_SHADOW: AMDGPUShaderInst < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", + [(set 
R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] +>; + +} // End usesCustomInserter = 1 +} // End isPseudo = 1 + +def CLAMP_R600 : CLAMP <R600_Reg32>; +def FABS_R600 : FABS<R600_Reg32>; +def FNEG_R600 : FNEG<R600_Reg32>; + +//===---------------------------------------------------------------------===// +// Return instruction +//===---------------------------------------------------------------------===// +let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, + usesCustomInserter = 1 in { + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(IL_retflag)]>; +} // End isTerminator = 1, isReturn = 1, hasCtrlDep = 1, usesCustomInserter = 1 + + +//===----------------------------------------------------------------------===// +// Constant Buffer Addressing Support +//===----------------------------------------------------------------------===// + +let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +// Pseudo matched from CONST_ADDRESS with an ADDRGA_CONST_OFFSET operand; it is +// marked usesCustomInserter, so it is expanded by the custom inserter. +def CONST_COPY : Instruction { + let OutOperandList = (outs R600_Reg32:$dst); + let InOperandList = (ins i32imm:$src); + let Pattern = + [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; + let AsmString = "CONST_COPY"; + let neverHasSideEffects = 1; + let isAsCheapAsAMove = 1; + let Itinerary = NullALU; +} +} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" + +def TEX_VTX_CONSTBUF : + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr", + [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, + VTX_WORD1_GPR, VTX_WORD0 { + + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let SRC_REL = 0; + let SRC_SEL_X = 0; + let DST_REL = 0; + let USE_CONST_FIELDS = 0; + let NUM_FORMAT_ALL = 2; + let FORMAT_COMP_ALL = 1; + let SRF_MODE_ALL = 1; + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let 
DATA_FORMAT = 35; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter) +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; +// Inst{95-87} = 0; Reserved (bit 86 belongs to BUFFER_INDEX_MODE) + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; +} + +def TEX_VTX_TEXBUF: + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", + [(set R600_Reg128:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, +VTX_WORD1_GPR, VTX_WORD0 { + +let VC_INST = 0; +let FETCH_TYPE = 2; +let FETCH_WHOLE_QUAD = 0; +let SRC_REL = 0; +let SRC_SEL_X = 0; +let DST_REL = 0; +let USE_CONST_FIELDS = 1; +let NUM_FORMAT_ALL = 0; +let FORMAT_COMP_ALL = 0; +// NOTE(review): the comment on VTX_READ_eg says the format fields, including +// SRF_MODE_ALL, had to be 0 when USE_CONST_FIELDS is set; SRF_MODE_ALL is 1 +// here -- confirm this is intended. +let SRF_MODE_ALL = 1; +let MEGA_FETCH_COUNT = 16; +let DST_SEL_X = 0; +let DST_SEL_Y = 1; +let DST_SEL_Z = 2; +let DST_SEL_W = 3; +let DATA_FORMAT = 0; + +let Inst{31-0} = Word0; +let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter) +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; 
+// Inst{95-87} = 0; Reserved (bit 86 belongs to BUFFER_INDEX_MODE) + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; +} + + + +//===--------------------------------------------------------------------===// +// Instructions support +//===--------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// +// Custom Inserter for branches and returns; this will eventually become a +// separate pass +//===---------------------------------------------------------------------===// +let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { + def BRANCH : ILFormat<(outs), (ins brtarget:$target), + "; Pseudo unconditional branch instruction", + [(br bb:$target)]>; + defm BRANCH_COND : BranchConditional<IL_brcond>; +} // End isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 + +//===---------------------------------------------------------------------===// +// Flow and Program control Instructions +//===---------------------------------------------------------------------===// +// Every def in this scope has an empty selection pattern, so none of them is +// chosen by instruction selection directly. +let isTerminator=1 in { + def SWITCH : ILFormat< (outs), (ins GPRI32:$src), + !strconcat("SWITCH", " $src"), []>; + def CASE : ILFormat< (outs), (ins GPRI32:$src), + !strconcat("CASE", " $src"), []>; + def BREAK : ILFormat< (outs), (ins), + "BREAK", []>; + def CONTINUE : ILFormat< (outs), (ins), + "CONTINUE", []>; + def DEFAULT : ILFormat< (outs), (ins), + "DEFAULT", []>; + def ELSE : ILFormat< (outs), (ins), + "ELSE", []>; + def ENDSWITCH : ILFormat< (outs), (ins), + "ENDSWITCH", []>; + def ENDMAIN : ILFormat< (outs), (ins), + "ENDMAIN", []>; + def END : ILFormat< (outs), (ins), + "END", []>; + def ENDFUNC : ILFormat< (outs), (ins), + "ENDFUNC", []>; + def ENDIF : ILFormat< (outs), (ins), + "ENDIF", []>; + def WHILELOOP : ILFormat< (outs), (ins), + "WHILE", []>; + def ENDLOOP : ILFormat< (outs), (ins), + "ENDLOOP", []>; + def FUNC : ILFormat< (outs), (ins), + "FUNC", []>; + def RETDYN : ILFormat< (outs), (ins), + "RET_DYN", []>; + // This opcode has custom swizzle pattern encoded in 
Swizzle Encoder + defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">; + defm IFC : BranchInstr2<"IFC">; + defm BREAKC : BranchInstr2<"BREAKC">; + defm CONTINUEC : BranchInstr2<"CONTINUEC">; +} // End isTerminator=1 + +//===----------------------------------------------------------------------===// +// ISel Patterns +//===----------------------------------------------------------------------===// + +// CND*_INT Patterns for f32 True / False values: the condition operand is an +// i32 compared against 0, while the two selected values are f32. + +class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat < + (selectcc (i32 R600_Reg32:$src0), 0, (f32 R600_Reg32:$src1), + R600_Reg32:$src2, cc), + (cnd R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2) +>; + +def : CND_INT_f32 <CNDE_INT, SETEQ>; +def : CND_INT_f32 <CNDGT_INT, SETGT>; +def : CND_INT_f32 <CNDGE_INT, SETGE>; + +// CNDGE_INT extra pattern: for integers, $src0 > -1 is the same test as +// $src0 >= 0, so a greater-than comparison against -1 also maps to CNDGE_INT. +def : Pat < + (selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1), + (i32 R600_Reg32:$src2), COND_GT), + (CNDGE_INT R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2) +>; + +// KIL Patterns: both intrinsics expand to MASK_WRITE of a KILLGT result. +def KILP : Pat < + (int_AMDGPU_kilp), + (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO))) +>; + +def KIL : Pat < + (int_AMDGPU_kill R600_Reg32:$src0), + (MASK_WRITE (KILLGT (f32 ZERO), (f32 R600_Reg32:$src0))) +>; + +// SGT Reverse args: a less-than select is emitted as SGT with the two +// operands swapped. +def : Pat < + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LT), + (SGT R600_Reg32:$src1, R600_Reg32:$src0) +>; 
+ +// SGE Reverse args +def : Pat < + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LE), + (SGE R600_Reg32:$src1, R600_Reg32:$src0) +>; + +// SETGT_DX10 reverse args +def : Pat < + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LT), + (SETGT_DX10 R600_Reg32:$src1, R600_Reg32:$src0) +>; + +// SETGE_DX10 reverse args +def : Pat < + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LE), + (SETGE_DX10 R600_Reg32:$src1, R600_Reg32:$src0) +>; + +// SETGT_INT reverse args +def : Pat < + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLT), + (SETGT_INT R600_Reg32:$src1, R600_Reg32:$src0) +>; + +// SETGE_INT reverse args +def : Pat < + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLE), + (SETGE_INT R600_Reg32:$src1, R600_Reg32:$src0) +>; + +// SETGT_UINT reverse args +def : Pat < + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULT), + (SETGT_UINT R600_Reg32:$src1, R600_Reg32:$src0) +>; + +// SETGE_UINT reverse args +def : Pat < + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULE), + (SETGE_UINT R600_Reg32:$src1, R600_Reg32:$src0) +>; + +// The next two patterns are special cases for handling 'true if ordered' and +// 'true if unordered' conditionals. 
The assumption here is that the behavior of +// SETE and SNE conforms to the Direct3D 10 rules for floating point values +// described here: +// http://msdn.microsoft.com/en-us/library/windows/desktop/cc308050.aspx#alpha_32_bit +// We assume that SETE returns false when one of the operands is NAN and +// SNE returns true when one of the operands is NAN + +//SETE - 'true if ordered' +def : Pat < + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETO), + (SETE R600_Reg32:$src0, R600_Reg32:$src1) +>; + +//SETE_DX10 - 'true if ordered' +def : Pat < + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETO), + (SETE_DX10 R600_Reg32:$src0, R600_Reg32:$src1) +>; + +//SNE - 'true if unordered' +def : Pat < + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETUO), + (SNE R600_Reg32:$src0, R600_Reg32:$src1) +>; + +//SETNE_DX10 - 'true if unordered' +def : Pat < + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUO), + (SETNE_DX10 R600_Reg32:$src0, R600_Reg32:$src1) +>; + +def : Extract_Element <f32, v4f32, R600_Reg128, 0, sub0>; +def : Extract_Element <f32, v4f32, R600_Reg128, 1, sub1>; +def : Extract_Element <f32, v4f32, R600_Reg128, 2, sub2>; +def : Extract_Element <f32, v4f32, R600_Reg128, 3, sub3>; + +def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 0, sub0>; +def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 1, sub1>; +def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 2, sub2>; +def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 3, sub3>; + +def : Extract_Element <i32, v4i32, R600_Reg128, 0, sub0>; +def : Extract_Element <i32, v4i32, R600_Reg128, 1, sub1>; +def : Extract_Element <i32, v4i32, R600_Reg128, 2, sub2>; +def : Extract_Element <i32, v4i32, R600_Reg128, 3, sub3>; + +def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 0, sub0>; +def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sub1>; +def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, 
sub2>; +def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>; + +def : Vector_Build <v4f32, R600_Reg128, f32, R600_Reg32>; +def : Vector_Build <v4i32, R600_Reg128, i32, R600_Reg32>; + +// bitconvert patterns + +def : BitConvert <i32, f32, R600_Reg32>; +def : BitConvert <f32, i32, R600_Reg32>; +def : BitConvert <v4f32, v4i32, R600_Reg128>; +def : BitConvert <v4i32, v4f32, R600_Reg128>; + +// DWORDADDR pattern +def : DwordAddrPat <i32, R600_Reg32>; + +} // End isR600toCayman Predicate diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td new file mode 100644 index 0000000000..dc8980aef1 --- /dev/null +++ b/lib/Target/R600/R600Intrinsics.td @@ -0,0 +1,31 @@ +//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// R600 Intrinsic Definitions +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "R600", isTarget = 1 in { + def int_R600_load_input : + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_R600_interp_input : + Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_R600_load_texbuf : + Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_R600_store_swizzle : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_R600_store_stream_output : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_R600_store_pixel_depth : + Intrinsic<[], [llvm_float_ty], []>; + def int_R600_store_pixel_stencil : + Intrinsic<[], [llvm_float_ty], []>; + def int_R600_store_dummy : + Intrinsic<[], [llvm_i32_ty], []>; +} diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp 
b/lib/Target/R600/R600MachineFunctionInfo.cpp new file mode 100644 index 0000000000..40aec833ea --- /dev/null +++ b/lib/Target/R600/R600MachineFunctionInfo.cpp @@ -0,0 +1,18 @@ +//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "R600MachineFunctionInfo.h" + +using namespace llvm; + +R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) + : MachineFunctionInfo() { + memset(Outputs, 0, sizeof(Outputs)); + } diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h new file mode 100644 index 0000000000..4b901f4bbc --- /dev/null +++ b/lib/Target/R600/R600MachineFunctionInfo.h @@ -0,0 +1,34 @@ +//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef R600MACHINEFUNCTIONINFO_H +#define R600MACHINEFUNCTIONINFO_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include <vector> + +namespace llvm { + +class R600MachineFunctionInfo : public MachineFunctionInfo { + +public: + R600MachineFunctionInfo(const MachineFunction &MF); + SmallVector<unsigned, 4> LiveOuts; + std::vector<unsigned> IndirectRegs; + SDNode *Outputs[16]; +}; + +} // End llvm namespace + +#endif //R600MACHINEFUNCTIONINFO_H diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp new file mode 100644 index 0000000000..19baef94c7 --- /dev/null +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -0,0 +1,488 @@ +//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "misched" + +#include "R600MachineScheduler.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/Pass.h" +#include "llvm/PassManager.h" +#include "llvm/Support/raw_ostream.h" +#include <set> + +using namespace llvm; + +void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { + + DAG = dag; + TII = static_cast<const R600InstrInfo*>(DAG->TII); + TRI = static_cast<const R600RegisterInfo*>(DAG->TRI); + MRI = &DAG->MRI; + Available[IDAlu]->clear(); + Available[IDFetch]->clear(); + Available[IDOther]->clear(); + CurInstKind = IDOther; + CurEmitted = 0; + OccupedSlotsMask = 15; + memset(InstructionsGroupCandidate, 0, sizeof(InstructionsGroupCandidate)); + InstKindLimit[IDAlu] = 120; // 120 minus 8 for security + + + const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD5XXX) { + InstKindLimit[IDFetch] = 7; // 8 minus 1 for security + } else { + InstKindLimit[IDFetch] = 15; // 16 minus 1 for security + } +} + +void R600SchedStrategy::MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst) +{ + if (QSrc->empty()) + return; + for (ReadyQueue::iterator I = QSrc->begin(), + E = QSrc->end(); I != E; ++I) { + (*I)->NodeQueueId &= ~QSrc->getID(); + QDst->push(*I); + } + QSrc->clear(); +} + +SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { + SUnit *SU = 0; + IsTopNode = true; + NextInstKind = IDOther; + + // check if we might want to switch current clause type + bool AllowSwitchToAlu = (CurInstKind == IDOther) || + (CurEmitted > InstKindLimit[CurInstKind]) || + (Available[CurInstKind]->empty()); + bool AllowSwitchFromAlu = 
(CurEmitted > InstKindLimit[CurInstKind]) && + (!Available[IDFetch]->empty() || !Available[IDOther]->empty()); + + if ((AllowSwitchToAlu && CurInstKind != IDAlu) || + (!AllowSwitchFromAlu && CurInstKind == IDAlu)) { + // try to pick ALU + SU = pickAlu(); + if (SU) { + if (CurEmitted > InstKindLimit[IDAlu]) + CurEmitted = 0; + NextInstKind = IDAlu; + } + } + + if (!SU) { + // try to pick FETCH + SU = pickOther(IDFetch); + if (SU) + NextInstKind = IDFetch; + } + + // try to pick other + if (!SU) { + SU = pickOther(IDOther); + if (SU) + NextInstKind = IDOther; + } + + DEBUG( + if (SU) { + dbgs() << "picked node: "; + SU->dump(DAG); + } else { + dbgs() << "NO NODE "; + for (int i = 0; i < IDLast; ++i) { + Available[i]->dump(); + Pending[i]->dump(); + } + for (unsigned i = 0; i < DAG->SUnits.size(); i++) { + const SUnit &S = DAG->SUnits[i]; + if (!S.isScheduled) + S.dump(DAG); + } + } + ); + + return SU; +} + +void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { + + DEBUG(dbgs() << "scheduled: "); + DEBUG(SU->dump(DAG)); + + if (NextInstKind != CurInstKind) { + DEBUG(dbgs() << "Instruction Type Switch\n"); + if (NextInstKind != IDAlu) + OccupedSlotsMask = 15; + CurEmitted = 0; + CurInstKind = NextInstKind; + } + + if (CurInstKind == IDAlu) { + switch (getAluKind(SU)) { + case AluT_XYZW: + CurEmitted += 4; + break; + case AluDiscarded: + break; + default: { + ++CurEmitted; + for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), + E = SU->getInstr()->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + ++CurEmitted; + } + } + } + } else { + ++CurEmitted; + } + + + DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n"); + + if (CurInstKind != IDFetch) { + MoveUnits(Pending[IDFetch], Available[IDFetch]); + } + MoveUnits(Pending[IDOther], Available[IDOther]); +} + +void R600SchedStrategy::releaseTopNode(SUnit *SU) { + int IK = getInstKind(SU); + + DEBUG(dbgs() << 
IK << " <= "); + DEBUG(SU->dump(DAG)); + + Pending[IK]->push(SU); +} + +void R600SchedStrategy::releaseBottomNode(SUnit *SU) { +} + +bool R600SchedStrategy::regBelongsToClass(unsigned Reg, + const TargetRegisterClass *RC) const { + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + return RC->contains(Reg); + } else { + return MRI->getRegClass(Reg) == RC; + } +} + +R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { + MachineInstr *MI = SU->getInstr(); + + switch (MI->getOpcode()) { + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + return AluT_XYZW; + case AMDGPU::COPY: + if (TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) { + // %vregX = COPY Tn_X is likely to be discarded in favor of an + // assignement of Tn_X to %vregX, don't considers it in scheduling + return AluDiscarded; + } + else if (MI->getOperand(1).isUndef()) { + // MI will become a KILL, don't considers it in scheduling + return AluDiscarded; + } + default: + break; + } + + // Does the instruction take a whole IG ? + if(TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode())) + return AluT_XYZW; + + // Is the result already assigned to a channel ? + unsigned DestSubReg = MI->getOperand(0).getSubReg(); + switch (DestSubReg) { + case AMDGPU::sub0: + return AluT_X; + case AMDGPU::sub1: + return AluT_Y; + case AMDGPU::sub2: + return AluT_Z; + case AMDGPU::sub3: + return AluT_W; + default: + break; + } + + // Is the result already member of a X/Y/Z/W class ? 
+ unsigned DestReg = MI->getOperand(0).getReg(); + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || + regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) + return AluT_X; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) + return AluT_Y; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) + return AluT_Z; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) + return AluT_W; + if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) + return AluT_XYZW; + + return AluAny; + +} + +int R600SchedStrategy::getInstKind(SUnit* SU) { + int Opcode = SU->getInstr()->getOpcode(); + + if (TII->isALUInstr(Opcode)) { + return IDAlu; + } + + switch (Opcode) { + case AMDGPU::COPY: + case AMDGPU::CONST_COPY: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT4_eg_pseudo: + case AMDGPU::DOT4_r600_pseudo: + return IDAlu; + case AMDGPU::TEX_VTX_CONSTBUF: + case AMDGPU::TEX_VTX_TEXBUF: + case AMDGPU::TEX_LD: + case AMDGPU::TEX_GET_TEXTURE_RESINFO: + case AMDGPU::TEX_GET_GRADIENTS_H: + case AMDGPU::TEX_GET_GRADIENTS_V: + case AMDGPU::TEX_SET_GRADIENTS_H: + case AMDGPU::TEX_SET_GRADIENTS_V: + case AMDGPU::TEX_SAMPLE: + case AMDGPU::TEX_SAMPLE_C: + case AMDGPU::TEX_SAMPLE_L: + case AMDGPU::TEX_SAMPLE_C_L: + case AMDGPU::TEX_SAMPLE_LB: + case AMDGPU::TEX_SAMPLE_C_LB: + case AMDGPU::TEX_SAMPLE_G: + case AMDGPU::TEX_SAMPLE_C_G: + case AMDGPU::TXD: + case AMDGPU::TXD_SHADOW: + return IDFetch; + default: + DEBUG( + dbgs() << "other inst: "; + SU->dump(DAG); + ); + return IDOther; + } +} + +class ConstPairs { +private: + unsigned XYPair; + unsigned ZWPair; +public: + ConstPairs(unsigned ReadConst[3]) : XYPair(0), ZWPair(0) { + for (unsigned i = 0; i < 3; i++) { + unsigned ReadConstChan = ReadConst[i] & 3; + unsigned ReadConstIndex = ReadConst[i] & (~3); + if (ReadConstChan < 2) { + if (!XYPair) { + XYPair = ReadConstIndex; + } + } else { + if (!ZWPair) { + ZWPair = 
ReadConstIndex; + } + } + } + } + + bool isCompatibleWith(const ConstPairs& CP) const { + return (!XYPair || !CP.XYPair || CP.XYPair == XYPair) && + (!ZWPair || !CP.ZWPair || CP.ZWPair == ZWPair); + } +}; + +static +const ConstPairs getPairs(const R600InstrInfo *TII, const MachineInstr& MI) { + unsigned ReadConsts[3] = {0, 0, 0}; + R600Operands::Ops OpTable[3][2] = { + {R600Operands::SRC0, R600Operands::SRC0_SEL}, + {R600Operands::SRC1, R600Operands::SRC1_SEL}, + {R600Operands::SRC2, R600Operands::SRC2_SEL}, + }; + + if (!TII->isALUInstr(MI.getOpcode())) + return ConstPairs(ReadConsts); + + for (unsigned i = 0; i < 3; i++) { + int SrcIdx = TII->getOperandIdx(MI.getOpcode(), OpTable[i][0]); + if (SrcIdx < 0) + break; + if (MI.getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) + ReadConsts[i] =MI.getOperand( + TII->getOperandIdx(MI.getOpcode(), OpTable[i][1])).getImm(); + } + return ConstPairs(ReadConsts); +} + +bool +R600SchedStrategy::isBundleable(const MachineInstr& MI) { + const ConstPairs &MIPair = getPairs(TII, MI); + for (unsigned i = 0; i < 4; i++) { + if (!InstructionsGroupCandidate[i]) + continue; + const ConstPairs &IGPair = getPairs(TII, + *InstructionsGroupCandidate[i]->getInstr()); + if (!IGPair.isCompatibleWith(MIPair)) + return false; + } + return true; +} + +SUnit *R600SchedStrategy::PopInst(std::multiset<SUnit *, CompareSUnit> &Q) { + if (Q.empty()) + return NULL; + for (std::multiset<SUnit *, CompareSUnit>::iterator It = Q.begin(), + E = Q.end(); It != E; ++It) { // iterator type must match the multiset being walked + SUnit *SU = *It; + if (isBundleable(*SU->getInstr())) { + Q.erase(It); + return SU; + } + } + return NULL; +} + +void R600SchedStrategy::LoadAlu() { + ReadyQueue *QSrc = Pending[IDAlu]; + for (ReadyQueue::iterator I = QSrc->begin(), + E = QSrc->end(); I != E; ++I) { + (*I)->NodeQueueId &= ~QSrc->getID(); + AluKind AK = getAluKind(*I); + AvailableAlus[AK].insert(*I); + } + QSrc->clear(); +} + +void R600SchedStrategy::PrepareNextSlot() { + DEBUG(dbgs() << "New Slot\n"); + assert 
(OccupedSlotsMask && "Slot wasn't filled"); + OccupedSlotsMask = 0; + memset(InstructionsGroupCandidate, 0, sizeof(InstructionsGroupCandidate)); + LoadAlu(); +} + +void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { + unsigned DestReg = MI->getOperand(0).getReg(); + // PressureRegister crashes if an operand is def and used in the same inst + // and we try to constraint its regclass + for (MachineInstr::mop_iterator It = MI->operands_begin(), + E = MI->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && !MO.isDef() && + MO.getReg() == MI->getOperand(0).getReg()) + return; + } + // Constrains the regclass of DestReg to assign it to Slot + switch (Slot) { + case 0: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass); + break; + case 1: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass); + break; + case 2: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass); + break; + case 3: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass); + break; + } +} + +SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) { + static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W}; + SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]); + SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]); + if (!UnslotedSU) { + return SlotedSU; + } else if (!SlotedSU) { + AssignSlot(UnslotedSU->getInstr(), Slot); + return UnslotedSU; + } else { + //Determine which one to pick (the lesser one) + if (CompareSUnit()(SlotedSU, UnslotedSU)) { + AvailableAlus[AluAny].insert(UnslotedSU); + return SlotedSU; + } else { + AvailableAlus[IndexToID[Slot]].insert(SlotedSU); + AssignSlot(UnslotedSU->getInstr(), Slot); + return UnslotedSU; + } + } +} + +bool R600SchedStrategy::isAvailablesAluEmpty() const { + return Pending[IDAlu]->empty() && AvailableAlus[AluAny].empty() && + AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() && + AvailableAlus[AluT_Y].empty() && 
AvailableAlus[AluT_Z].empty() && + AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty(); +} + +SUnit* R600SchedStrategy::pickAlu() { + while (!isAvailablesAluEmpty()) { + if (!OccupedSlotsMask) { + // Flush physical reg copies (RA will discard them) + if (!AvailableAlus[AluDiscarded].empty()) { + OccupedSlotsMask = 15; + return PopInst(AvailableAlus[AluDiscarded]); + } + // If there is a T_XYZW alu available, use it + if (!AvailableAlus[AluT_XYZW].empty()) { + OccupedSlotsMask = 15; + return PopInst(AvailableAlus[AluT_XYZW]); + } + } + for (unsigned Chan = 0; Chan < 4; ++Chan) { + bool isOccupied = OccupedSlotsMask & (1 << Chan); + if (!isOccupied) { + SUnit *SU = AttemptFillSlot(Chan); + if (SU) { + OccupedSlotsMask |= (1 << Chan); + InstructionsGroupCandidate[Chan] = SU; + return SU; + } + } + } + PrepareNextSlot(); + } + return NULL; +} + +SUnit* R600SchedStrategy::pickOther(int QID) { + SUnit *SU = 0; + ReadyQueue *AQ = Available[QID]; + + if (AQ->empty()) { + MoveUnits(Pending[QID], AQ); + } + if (!AQ->empty()) { + SU = *AQ->begin(); + AQ->remove(AQ->begin()); + } + return SU; +} + diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h new file mode 100644 index 0000000000..d74ff1e076 --- /dev/null +++ b/lib/Target/R600/R600MachineScheduler.h @@ -0,0 +1,121 @@ +//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#ifndef R600MACHINESCHEDULER_H_ +#define R600MACHINESCHEDULER_H_ + +#include "R600InstrInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/PriorityQueue.h" + +using namespace llvm; + +namespace llvm { + +class CompareSUnit { // multiset comparator: SUnits with greater depth sort first +public: + bool operator()(const SUnit *S1, const SUnit *S2) const { + return S1->getDepth() > S2->getDepth(); + } +}; + +class R600SchedStrategy : public MachineSchedStrategy { + + const ScheduleDAGMI *DAG; + const R600InstrInfo *TII; + const R600RegisterInfo *TRI; + MachineRegisterInfo *MRI; + + enum InstQueue { + QAlu = 1, + QFetch = 2, + QOther = 4 + }; + + enum InstKind { + IDAlu, + IDFetch, + IDOther, + IDLast + }; + + enum AluKind { + AluAny, + AluT_X, + AluT_Y, + AluT_Z, + AluT_W, + AluT_XYZW, + AluDiscarded, // LLVM Instructions that are going to be eliminated + AluLast + }; + + ReadyQueue *Available[IDLast], *Pending[IDLast]; + std::multiset<SUnit *, CompareSUnit> AvailableAlus[AluLast]; + + InstKind CurInstKind; + int CurEmitted; + InstKind NextInstKind; + + int InstKindLimit[IDLast]; + + int OccupedSlotsMask; + +public: + R600SchedStrategy() : + DAG(0), TII(0), TRI(0), MRI(0) { + Available[IDAlu] = new ReadyQueue(QAlu, "AAlu"); + Available[IDFetch] = new ReadyQueue(QFetch, "AFetch"); + Available[IDOther] = new ReadyQueue(QOther, "AOther"); + Pending[IDAlu] = new ReadyQueue(QAlu<<4, "PAlu"); + Pending[IDFetch] = new ReadyQueue(QFetch<<4, "PFetch"); + Pending[IDOther] = new ReadyQueue(QOther<<4, "POther"); + } + + virtual ~R600SchedStrategy() { + for (unsigned I = 0; I < IDLast; ++I) { + delete Available[I]; + delete Pending[I]; + } + } + + virtual void initialize(ScheduleDAGMI *dag); + virtual SUnit *pickNode(bool &IsTopNode); + virtual void schedNode(SUnit 
*SU, bool IsTopNode); + virtual void releaseTopNode(SUnit *SU); + virtual void releaseBottomNode(SUnit *SU); + +private: + SUnit *InstructionsGroupCandidate[4]; + + int getInstKind(SUnit *SU); + bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const; + AluKind getAluKind(SUnit *SU) const; + void LoadAlu(); + bool isAvailablesAluEmpty() const; + SUnit *AttemptFillSlot (unsigned Slot); + void PrepareNextSlot(); + SUnit *PopInst(std::multiset<SUnit *, CompareSUnit> &Q); + + void AssignSlot(MachineInstr *MI, unsigned Slot); + SUnit* pickAlu(); + SUnit* pickOther(int QID); + bool isBundleable(const MachineInstr& MI); + void MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst); +}; + +} // namespace llvm + +#endif /* R600MACHINESCHEDULER_H_ */ diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp new file mode 100644 index 0000000000..bbd7995d7d --- /dev/null +++ b/lib/Target/R600/R600RegisterInfo.cpp @@ -0,0 +1,99 @@ +//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 implementation of the TargetRegisterInfo class. 
+// +//===----------------------------------------------------------------------===// + +#include "R600RegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" + +using namespace llvm; + +R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm, + const TargetInstrInfo &tii) +: AMDGPURegisterInfo(tm, tii), + TM(tm), + TII(tii) + { } + +BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + + Reserved.set(AMDGPU::ZERO); + Reserved.set(AMDGPU::HALF); + Reserved.set(AMDGPU::ONE); + Reserved.set(AMDGPU::ONE_INT); + Reserved.set(AMDGPU::NEG_HALF); + Reserved.set(AMDGPU::NEG_ONE); + Reserved.set(AMDGPU::PV_X); + Reserved.set(AMDGPU::ALU_LITERAL_X); + Reserved.set(AMDGPU::ALU_CONST); + Reserved.set(AMDGPU::PREDICATE_BIT); + Reserved.set(AMDGPU::PRED_SEL_OFF); + Reserved.set(AMDGPU::PRED_SEL_ZERO); + Reserved.set(AMDGPU::PRED_SEL_ONE); + + for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), + E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { + Reserved.set(*I); + } + + for (TargetRegisterClass::iterator I = AMDGPU::TRegMemRegClass.begin(), + E = AMDGPU::TRegMemRegClass.end(); + I != E; ++I) { + Reserved.set(*I); + } + + const R600InstrInfo *RII = static_cast<const R600InstrInfo*>(&TII); + std::vector<unsigned> IndirectRegs = RII->getIndirectReservedRegs(MF); + for (std::vector<unsigned>::iterator I = IndirectRegs.begin(), + E = IndirectRegs.end(); + I != E; ++I) { + Reserved.set(*I); + } + return Reserved; +} + +const TargetRegisterClass * +R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const { + switch (rc->getID()) { + case AMDGPU::GPRF32RegClassID: + case AMDGPU::GPRI32RegClassID: + return &AMDGPU::R600_Reg32RegClass; + default: return rc; + } +} + +unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { + return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; +} + +const 
TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( + MVT VT) const { + switch(VT.SimpleTy) { + default: + case MVT::i32: return &AMDGPU::R600_TReg32RegClass; + } +} + +unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const { + switch (Channel) { + default: assert(!"Invalid channel index"); return 0; + case 0: return AMDGPU::sub0; + case 1: return AMDGPU::sub1; + case 2: return AMDGPU::sub2; + case 3: return AMDGPU::sub3; + } +} + diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h new file mode 100644 index 0000000000..f9ca918f24 --- /dev/null +++ b/lib/Target/R600/R600RegisterInfo.h @@ -0,0 +1,55 @@ +//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for R600RegisterInfo +// +//===----------------------------------------------------------------------===// + +#ifndef R600REGISTERINFO_H_ +#define R600REGISTERINFO_H_ + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" + +namespace llvm { + +class R600TargetMachine; +class TargetInstrInfo; + +struct R600RegisterInfo : public AMDGPURegisterInfo { + AMDGPUTargetMachine &TM; + const TargetInstrInfo &TII; + + R600RegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii); + + virtual BitVector getReservedRegs(const MachineFunction &MF) const; + + /// \param RC is an AMDIL reg class. + /// + /// \returns the R600 reg class that is equivalent to \p RC. + virtual const TargetRegisterClass *getISARegClass( + const TargetRegisterClass *RC) const; + + /// \brief get the HW encoding for a register's channel. 
+ unsigned getHWRegChan(unsigned reg) const; + + /// \brief get the register class of the specified type to use in the + /// CFGStructurizer + virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const; + + /// \returns the sub reg enum value for the given \p Channel + /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0) + unsigned getSubRegFromChannel(unsigned Channel) const; + +}; + +} // End namespace llvm + +#endif // R600REGISTERINFO_H_ diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td new file mode 100644 index 0000000000..ce5994ca36 --- /dev/null +++ b/lib/Target/R600/R600RegisterInfo.td @@ -0,0 +1,146 @@ + +class R600Reg <string name, bits<16> encoding> : Register<name> { + let Namespace = "AMDGPU"; + let HWEncoding = encoding; +} + +class R600RegWithChan <string name, bits<9> sel, string chan> : + Register <name> { + + field bits<2> chan_encoding = !if(!eq(chan, "X"), 0, + !if(!eq(chan, "Y"), 1, + !if(!eq(chan, "Z"), 2, + !if(!eq(chan, "W"), 3, 0)))); + let HWEncoding{8-0} = sel; + let HWEncoding{10-9} = chan_encoding; + let Namespace = "AMDGPU"; +} + +class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> : + RegisterWithSubRegs<n, subregs> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1, sub2, sub3]; + let HWEncoding = encoding; +} + +foreach Index = 0-127 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>; + + // Indirect addressing offset registers + def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan, + Index, Chan>; + def TRegMem#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, + Chan>; + } + // 128-bit Temporary Registers + def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW", + [!cast<Register>("T"#Index#"_X"), + !cast<Register>("T"#Index#"_Y"), + !cast<Register>("T"#Index#"_Z"), + !cast<Register>("T"#Index#"_W")], + Index>; +} + +// 
Array Base Register holding input in FS +foreach Index = 448-480 in { + def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>; +} + + +// Special Registers + +def ZERO : R600Reg<"0.0", 248>; +def ONE : R600Reg<"1.0", 249>; +def NEG_ONE : R600Reg<"-1.0", 249>; +def ONE_INT : R600Reg<"1", 250>; +def HALF : R600Reg<"0.5", 252>; +def NEG_HALF : R600Reg<"-0.5", 252>; +def ALU_LITERAL_X : R600Reg<"literal.x", 253>; +def PV_X : R600Reg<"pv.x", 254>; +def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; +def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; +def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; +def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; +def AR_X : R600Reg<"AR.x", 0>; + +def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "ArrayBase%u", 448, 480))>; +// special registers for ALU src operands +// const buffer reference, SRCx_SEL contains index +def ALU_CONST : R600Reg<"CBuf", 0>; +// interpolation param reference, SRCx_SEL contains index +def ALU_PARAM : R600Reg<"Param", 0>; + +let isAllocatable = 0 in { + +// XXX: Only use the X channel, until we support wider stack widths +def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>; + +} // End isAllocatable = 0 + +def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_X", 0, 127), AR_X)>; + +def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_Y", 0, 127))>; + +def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_Z", 0, 127))>; + +def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_W", 0, 127))>; + +def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_TReg32_X, R600_TReg32_Y, + R600_TReg32_Z, R600_TReg32_W)>; + +def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add + R600_TReg32, + R600_ArrayBase, + R600_Addr, + ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, + ALU_CONST, ALU_PARAM + )>; + +def 
R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add + PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>; + +def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add + PREDICATE_BIT)>; + +def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add (sequence "T%u_XYZW", 0, 127))> { + let CopyCost = -1; +} + +//===----------------------------------------------------------------------===// +// Register classes for indirect addressing +//===----------------------------------------------------------------------===// + +// Super register for all the Indirect Registers. This register class is used +// by the REG_SEQUENCE instruction to specify the registers to use for direct +// reads / writes which may be written / read by an indirect address. +class IndirectSuper<string n, list<Register> subregs> : + RegisterWithSubRegs<n, subregs> { + let Namespace = "AMDGPU"; + let SubRegIndices = + [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15]; +} + +def IndirectSuperReg : IndirectSuper<"Indirect", + [TRegMem0_X, TRegMem1_X, TRegMem2_X, TRegMem3_X, TRegMem4_X, TRegMem5_X, + TRegMem6_X, TRegMem7_X, TRegMem8_X, TRegMem9_X, TRegMem10_X, TRegMem11_X, + TRegMem12_X, TRegMem13_X, TRegMem14_X, TRegMem15_X] +>; + +def IndirectReg : RegisterClass<"AMDGPU", [f32, i32], 32, (add IndirectSuperReg)>; + +// This register class defines the registers that are the storage units for +// the "Indirect Addressing" pseudo memory space. 
+// XXX: Only use the X channel, until we support wider stack widths +def TRegMem : RegisterClass<"AMDGPU", [f32, i32], 32, + (add (sequence "TRegMem%u_X", 0, 16)) +>; diff --git a/lib/Target/R600/R600Schedule.td b/lib/Target/R600/R600Schedule.td new file mode 100644 index 0000000000..7ede181c51 --- /dev/null +++ b/lib/Target/R600/R600Schedule.td @@ -0,0 +1,36 @@ +//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// R600 has a VLIW architecture. On pre-cayman cards there are 5 instruction +// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS +// slot has been removed. +// +//===----------------------------------------------------------------------===// + + +def ALU_X : FuncUnit; +def ALU_Y : FuncUnit; +def ALU_Z : FuncUnit; +def ALU_W : FuncUnit; +def TRANS : FuncUnit; + +def AnyALU : InstrItinClass; +def VecALU : InstrItinClass; +def TransALU : InstrItinClass; + +def R600_EG_Itin : ProcessorItineraries < + [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], + [], + [ + InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>, + InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>, + InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>, + InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]> + ] +>; diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp new file mode 100644 index 0000000000..2477e2a9dc --- /dev/null +++ b/lib/Target/R600/SIAnnotateControlFlow.cpp @@ -0,0 +1,329 @@ +//===-- SIAnnotateControlFlow.cpp - ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Annotates the control flow with hardware specific intrinsics. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" + +using namespace llvm; + +namespace { + +// Complex types used in this pass +typedef std::pair<BasicBlock *, Value *> StackEntry; +typedef SmallVector<StackEntry, 16> StackVector; + +// Intrinsic names the control flow is annotated with +static const char *IfIntrinsic = "llvm.SI.if"; +static const char *ElseIntrinsic = "llvm.SI.else"; +static const char *BreakIntrinsic = "llvm.SI.break"; +static const char *IfBreakIntrinsic = "llvm.SI.if.break"; +static const char *ElseBreakIntrinsic = "llvm.SI.else.break"; +static const char *LoopIntrinsic = "llvm.SI.loop"; +static const char *EndCfIntrinsic = "llvm.SI.end.cf"; + +class SIAnnotateControlFlow : public FunctionPass { + + static char ID; + + Type *Boolean; + Type *Void; + Type *Int64; + Type *ReturnStruct; + + ConstantInt *BoolTrue; + ConstantInt *BoolFalse; + UndefValue *BoolUndef; + Constant *Int64Zero; + + Constant *If; + Constant *Else; + Constant *Break; + Constant *IfBreak; + Constant *ElseBreak; + Constant *Loop; + Constant *EndCf; + + DominatorTree *DT; + StackVector Stack; + SSAUpdater PhiInserter; + + bool isTopOfStack(BasicBlock *BB); + + Value *popSaved(); + + void push(BasicBlock *BB, Value *Saved); + + bool isElse(PHINode *Phi); + + void eraseIfUnused(PHINode *Phi); + + void openIf(BranchInst *Term); + + void insertElse(BranchInst *Term); + + void handleLoopCondition(Value *Cond); + + void handleLoop(BranchInst *Term); + + void closeControlFlow(BasicBlock *BB); + +public: 
+ SIAnnotateControlFlow(): + FunctionPass(ID) { } + + virtual bool doInitialization(Module &M); + + virtual bool runOnFunction(Function &F); + + virtual const char *getPassName() const { + return "SI annotate control flow"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + FunctionPass::getAnalysisUsage(AU); + } + +}; + +} // end anonymous namespace + +char SIAnnotateControlFlow::ID = 0; + +/// \brief Initialize all the types and constants used in the pass +bool SIAnnotateControlFlow::doInitialization(Module &M) { + LLVMContext &Context = M.getContext(); + + Void = Type::getVoidTy(Context); + Boolean = Type::getInt1Ty(Context); + Int64 = Type::getInt64Ty(Context); + ReturnStruct = StructType::get(Boolean, Int64, (Type *)0); + + BoolTrue = ConstantInt::getTrue(Context); + BoolFalse = ConstantInt::getFalse(Context); + BoolUndef = UndefValue::get(Boolean); + Int64Zero = ConstantInt::get(Int64, 0); + + If = M.getOrInsertFunction( + IfIntrinsic, ReturnStruct, Boolean, (Type *)0); + + Else = M.getOrInsertFunction( + ElseIntrinsic, ReturnStruct, Int64, (Type *)0); + + Break = M.getOrInsertFunction( + BreakIntrinsic, Int64, Int64, (Type *)0); + + IfBreak = M.getOrInsertFunction( + IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0); + + ElseBreak = M.getOrInsertFunction( + ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0); + + Loop = M.getOrInsertFunction( + LoopIntrinsic, Boolean, Int64, (Type *)0); + + EndCf = M.getOrInsertFunction( + EndCfIntrinsic, Void, Int64, (Type *)0); + + return false; +} + +/// \brief Is BB the last block saved on the stack ? 
+bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { + return !Stack.empty() && Stack.back().first == BB; +} + +/// \brief Pop the last saved value from the control flow stack +Value *SIAnnotateControlFlow::popSaved() { + return Stack.pop_back_val().second; +} + +/// \brief Push a BB and saved value to the control flow stack +void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) { + Stack.push_back(std::make_pair(BB, Saved)); +} + +/// \brief Can the condition represented by this PHI node be treated like +/// an "Else" block? +/// +/// This is the case when the value coming from the immediate dominator is +/// "true" and the value from every other incoming block is "false". +bool SIAnnotateControlFlow::isElse(PHINode *Phi) { + BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock(); + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + if (Phi->getIncomingBlock(i) == IDom) { + + if (Phi->getIncomingValue(i) != BoolTrue) + return false; + + } else { + if (Phi->getIncomingValue(i) != BoolFalse) + return false; + + } + } + return true; +} + +/// \brief Erase "Phi" if it is not used any more +void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { + if (!Phi->hasNUsesOrMore(1)) + Phi->eraseFromParent(); +} + +/// \brief Open a new "If" block +void SIAnnotateControlFlow::openIf(BranchInst *Term) { + Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); +} + +/// \brief Close the last "If" block and open a new "Else" block +void SIAnnotateControlFlow::insertElse(BranchInst *Term) { + Value *Ret = CallInst::Create(Else, popSaved(), "", Term); + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); +} + +/// \brief Recursively handle the condition leading to a loop +void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) { + if (PHINode *Phi = dyn_cast<PHINode>(Cond)) { + + // Handle all non-constant incoming values first + for 
(unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = Phi->getIncomingValue(i); + if (isa<ConstantInt>(Incoming)) + continue; + + Phi->setIncomingValue(i, BoolFalse); + handleLoopCondition(Incoming); + } + + BasicBlock *Parent = Phi->getParent(); + BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); + + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + + Value *Incoming = Phi->getIncomingValue(i); + if (Incoming != BoolTrue) + continue; + + BasicBlock *From = Phi->getIncomingBlock(i); + if (From == IDom) { + CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt()); + if (OldEnd && OldEnd->getCalledFunction() == EndCf) { + Value *Args[] = { + OldEnd->getArgOperand(0), + PhiInserter.GetValueAtEndOfBlock(Parent) + }; + Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); + PhiInserter.AddAvailableValue(Parent, Ret); + continue; + } + } + + TerminatorInst *Insert = From->getTerminator(); + Value *Arg = PhiInserter.GetValueAtEndOfBlock(From); + Value *Ret = CallInst::Create(Break, Arg, "", Insert); + PhiInserter.AddAvailableValue(From, Ret); + } + eraseIfUnused(Phi); + + } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { + BasicBlock *Parent = Inst->getParent(); + TerminatorInst *Insert = Parent->getTerminator(); + Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) }; + Value *Ret = CallInst::Create(IfBreak, Args, "", Insert); + PhiInserter.AddAvailableValue(Parent, Ret); + + } else { + assert(0 && "Unhandled loop condition!"); + } +} + +/// \brief Handle a back edge (loop) +void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { + BasicBlock *Target = Term->getSuccessor(1); + PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); + + PhiInserter.Initialize(Int64, ""); + PhiInserter.AddAvailableValue(Target, Broken); + + Value *Cond = Term->getCondition(); + Term->setCondition(BoolTrue); + handleLoopCondition(Cond); + + BasicBlock *BB = 
Term->getParent(); + Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB); + for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); + PI != PE; ++PI) { + + Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI); + } + + Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); + push(Term->getSuccessor(0), Arg); +} + +/// \brief Close the last opened control flow +void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { + CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); +} + +/// \brief Annotate the control flow with intrinsics so the backend can +/// recognize if/then/else and loops. +bool SIAnnotateControlFlow::runOnFunction(Function &F) { + DT = &getAnalysis<DominatorTree>(); + + for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), + E = df_end(&F.getEntryBlock()); I != E; ++I) { + + BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator()); + + if (!Term || Term->isUnconditional()) { + if (isTopOfStack(*I)) + closeControlFlow(*I); + continue; + } + + if (I.nodeVisited(Term->getSuccessor(1))) { + if (isTopOfStack(*I)) + closeControlFlow(*I); + handleLoop(Term); + continue; + } + + if (isTopOfStack(*I)) { + PHINode *Phi = dyn_cast<PHINode>(Term->getCondition()); + if (Phi && Phi->getParent() == *I && isElse(Phi)) { + insertElse(Term); + eraseIfUnused(Phi); + continue; + } + closeControlFlow(*I); + } + openIf(Term); + } + + assert(Stack.empty()); + return true; +} + +/// \brief Create the annotation pass +FunctionPass *llvm::createSIAnnotateControlFlowPass() { + return new SIAnnotateControlFlow(); +} diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp new file mode 100644 index 0000000000..063f5faa63 --- /dev/null +++ b/lib/Target/R600/SIISelLowering.cpp @@ -0,0 +1,656 @@ +//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Custom DAG lowering for SI +// +//===----------------------------------------------------------------------===// + +#include "SIISelLowering.h" +#include "AMDIL.h" +#include "AMDGPU.h" +#include "AMDILIntrinsicInfo.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" + +using namespace llvm; + +SITargetLowering::SITargetLowering(TargetMachine &TM) : + AMDGPUTargetLowering(TM), + TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())), + TRI(TM.getRegisterInfo()) { + + addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); + + addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); + + addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); + + addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass); + + addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + + addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + + addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + + addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + + computeRegisterProperties(); + + setOperationAction(ISD::ADD, MVT::i64, Legal); + setOperationAction(ISD::ADD, MVT::i32, Legal); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + 
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setTargetDAGCombine(ISD::SELECT_CC); + + setTargetDAGCombine(ISD::SETCC); + + setSchedulingPreference(Sched::Source); +} + +SDValue SITargetLowering::LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + + MachineFunction &MF = DAG.getMachineFunction(); + FunctionType *FType = MF.getFunction()->getFunctionType(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + assert(CallConv == CallingConv::C); + + SmallVector<ISD::InputArg, 16> Splits; + uint32_t Skipped = 0; + + for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { + const ISD::InputArg &Arg = Ins[i]; + + // First check if it's a PS input addr + if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) { + + assert((PSInputNum <= 15) && "Too many PS inputs!"); + + if (!Arg.Used) { + // We can safely skip unused PS inputs; remember the argument index + // in the Skipped bit mask. + Skipped |= 1 << i; + ++PSInputNum; + continue; + } + + Info->PSInputAddr |= 1 << PSInputNum++; + } + + // Second split vertices into their elements + if (Arg.VT.isVector()) { + ISD::InputArg NewArg = Arg; + NewArg.Flags.setSplit(); + NewArg.VT = Arg.VT.getVectorElementType(); + + // We REALLY want the ORIGINAL number of vertex elements here, e.g. a + // three or five element vertex only needs three or five registers, + // NOT four or eight. 
+ Type *ParamType = FType->getParamType(Arg.OrigArgIndex); + unsigned NumElements = ParamType->getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + Splits.push_back(NewArg); + NewArg.PartOffset += NewArg.VT.getStoreSize(); + } + + } else { + Splits.push_back(Arg); + } + } + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + // At least one interpolation mode must be enabled or else the GPU will hang. + if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) { + Info->PSInputAddr |= 1; + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + } + + AnalyzeFormalArguments(CCInfo, Splits); + + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + + if (Skipped & (1 << i)) { + InVals.push_back(SDValue()); + continue; + } + + CCValAssign &VA = ArgLocs[ArgIdx++]; + assert(VA.isRegLoc() && "Parameter must be in a register!"); + + unsigned Reg = VA.getLocReg(); + MVT VT = VA.getLocVT(); + + if (VT == MVT::i64) { + // For now assume it is a pointer + Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, + &AMDGPU::SReg_64RegClass); + Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); + InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); + continue; + } + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + + Reg = MF.addLiveIn(Reg, RC); + SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + + const ISD::InputArg &Arg = Ins[i]; + if (Arg.VT.isVector()) { + + // Build a vector from the registers + Type *ParamType = FType->getParamType(Arg.OrigArgIndex); + unsigned NumElements = ParamType->getVectorNumElements(); + + SmallVector<SDValue, 4> Regs; + Regs.push_back(Val); + for (unsigned j = 1; j != NumElements; ++j) { + Reg = ArgLocs[ArgIdx++].getLocReg(); + Reg = MF.addLiveIn(Reg, RC); + Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); + } + + // Fill up the missing 
vector elements + NumElements = Arg.VT.getVectorNumElements() - NumElements; + for (unsigned j = 0; j != NumElements; ++j) + Regs.push_back(DAG.getUNDEF(VT)); + + InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, + Regs.data(), Regs.size())); + continue; + } + + InVals.push_back(Val); + } + return Chain; +} + +MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( + MachineInstr * MI, MachineBasicBlock * BB) const { + MachineRegisterInfo & MRI = BB->getParent()->getRegInfo(); + MachineBasicBlock::iterator I = MI; + + switch (MI->getOpcode()) { + default: + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + case AMDGPU::BRANCH: return BB; + case AMDGPU::SI_WQM: + LowerSI_WQM(MI, *BB, I, MRI); + break; + } + return BB; +} + +void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + + MI->eraseFromParent(); +} + +EVT SITargetLowering::getSetCCResultType(EVT VT) const { + return MVT::i1; +} + +//===----------------------------------------------------------------------===// +// Custom DAG Lowering Operations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + } + return SDValue(); +} + +/// \brief Helper function for LowerBRCOND +static SDNode *findUser(SDValue Value, unsigned Opcode) { + + SDNode *Parent = Value.getNode(); + for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); + I != E; ++I) { + + if (I.getUse().get() != Value) + continue; + + if (I->getOpcode() == Opcode) + return *I; + } + return 0; +} + +/// 
This transforms the control flow intrinsics to get the branch destination as +/// last parameter, also switches branch target with BR if the need arises +SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, + SelectionDAG &DAG) const { + + DebugLoc DL = BRCOND.getDebugLoc(); + + SDNode *Intr = BRCOND.getOperand(1).getNode(); + SDValue Target = BRCOND.getOperand(2); + SDNode *BR = 0; + + if (Intr->getOpcode() == ISD::SETCC) { + // As long as we negate the condition everything is fine + SDNode *SetCC = Intr; + assert(SetCC->getConstantOperandVal(1) == 1); + assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == + ISD::SETNE); + Intr = SetCC->getOperand(0).getNode(); + + } else { + // Get the target from BR if we don't negate the condition + BR = findUser(BRCOND, ISD::BR); + // findUser() returns 0 when no matching user exists + assert(BR && "BRCOND without a BR user!"); + Target = BR->getOperand(1); + } + + assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); + + // Build the result and + SmallVector<EVT, 4> Res; + for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i) + Res.push_back(Intr->getValueType(i)); + + // operands of the new intrinsic call + SmallVector<SDValue, 4> Ops; + Ops.push_back(BRCOND.getOperand(0)); + for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i) + Ops.push_back(Intr->getOperand(i)); + Ops.push_back(Target); + + // build the new intrinsic call + SDNode *Result = DAG.getNode( + Res.size() > 1 ? 
ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, + DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode(); + + if (BR) { + // Give the branch instruction our target + SDValue Ops[] = { + BR->getOperand(0), + BRCOND.getOperand(2) + }; + DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2); + } + + SDValue Chain = SDValue(Result, Result->getNumValues() - 1); + + // Copy the intrinsic results to registers + for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { + SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); + if (!CopyToReg) + continue; + + Chain = DAG.getCopyToReg( + Chain, DL, + CopyToReg->getOperand(1), + SDValue(Result, i - 1), + SDValue()); + + DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); + } + + // Remove the old intrinsic from the chain + DAG.ReplaceAllUsesOfValueWith( + SDValue(Intr, Intr->getNumValues() - 1), + Intr->getOperand(0)); + + return Chain; +} + +/// \brief Custom lowering for SELECT_CC: use a min/max node when LowerMinMax +/// finds one, otherwise split it into a SETCC followed by a SELECT +SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue True = Op.getOperand(2); + SDValue False = Op.getOperand(3); + SDValue CC = Op.getOperand(4); + EVT VT = Op.getValueType(); + DebugLoc DL = Op.getDebugLoc(); + + // Possible Min/Max pattern + SDValue MinMax = LowerMinMax(Op, DAG); + if (MinMax.getNode()) { + return MinMax; + } + + SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); + return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); +} + +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +/// \brief Run the SI specific DAG combines on node \p N +SDValue SITargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + switch (N->getOpcode()) { + default: break; + case ISD::SELECT_CC: { + 
ConstantSDNode *True, *False; + // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) + if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2))) + && (False = dyn_cast<ConstantSDNode>(N->getOperand(3))) + && True->isAllOnesValue() + && False->isNullValue() + && VT == MVT::i1) { + return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0), + N->getOperand(1), N->getOperand(4)); + + } + break; + } + case ISD::SETCC: { + SDValue Arg0 = N->getOperand(0); + SDValue Arg1 = N->getOperand(1); + SDValue CC = N->getOperand(2); + ConstantSDNode * C = NULL; + // The result is dereferenced unconditionally, so use the asserting cast<> + // instead of dyn_cast<> + ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get(); + + // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) + if (VT == MVT::i1 + && Arg0.getOpcode() == ISD::SIGN_EXTEND + && Arg0.getOperand(0).getValueType() == MVT::i1 + && (C = dyn_cast<ConstantSDNode>(Arg1)) + && C->isNullValue() + && CCOp == ISD::SETNE) { + return SimplifySetCC(VT, Arg0.getOperand(0), + DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL); + } + break; + } + } + return SDValue(); +} + +/// \brief Test if RegClass is one of the VSrc classes +static bool isVSrc(unsigned RegClass) { + return AMDGPU::VSrc_32RegClassID == RegClass || + AMDGPU::VSrc_64RegClassID == RegClass; +} + +/// \brief Test if RegClass is one of the SSrc classes +static bool isSSrc(unsigned RegClass) { + return AMDGPU::SSrc_32RegClassID == RegClass || + AMDGPU::SSrc_64RegClassID == RegClass; +} + +/// \brief Analyze the possible immediate value \p N +/// +/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate +/// and the immediate value if it's a literal immediate +int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { + + union { + int32_t I; + float F; + } Imm; + + if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) + Imm.I = Node->getSExtValue(); + else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) + Imm.F = Node->getValueAPF().convertToFloat(); + else + return -1; // It isn't an immediate + + if ((Imm.I >= -16 && 
Imm.I <= 64) || + Imm.F == 0.5f || Imm.F == -0.5f || + Imm.F == 1.0f || Imm.F == -1.0f || + Imm.F == 2.0f || Imm.F == -2.0f || + Imm.F == 4.0f || Imm.F == -4.0f) + return 0; // It's an inline immediate + + return Imm.I; // It's a literal immediate +} + +/// \brief Try to fold an immediate directly into an instruction +/// +/// On success \p Operand is replaced by the immediate operand of the move it +/// came from. \p Immediate and \p ScalarSlotUsed are updated when a new +/// literal immediate gets folded. +bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, + bool &ScalarSlotUsed) const { + + MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); + if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode())) + return false; + + const SDValue &Op = Mov->getOperand(0); + int32_t Value = analyzeImmediate(Op.getNode()); + if (Value == -1) { + // Not an immediate at all + return false; + + } else if (Value == 0) { + // Inline immediates can always be folded + Operand = Op; + return true; + + } else if (Value == Immediate) { + // Already folded literal immediate + Operand = Op; + return true; + + } else if (!ScalarSlotUsed && !Immediate) { + // Fold this literal immediate + ScalarSlotUsed = true; + Immediate = Value; + Operand = Op; + return true; + + } + + return false; +} + +/// \brief Does "Op" fit into register class "RegClass" ? 
+bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, SDValue &Op, + unsigned RegClass) const { + + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + SDNode *Node = Op.getNode(); + + int OpClass; + if (MachineSDNode *MN = dyn_cast<MachineSDNode>(Node)) { + const MCInstrDesc &Desc = TII->get(MN->getMachineOpcode()); + OpClass = Desc.OpInfo[Op.getResNo()].RegClass; + + } else if (Node->getOpcode() == ISD::CopyFromReg) { + RegisterSDNode *Reg = cast<RegisterSDNode>(Node->getOperand(1).getNode()); + OpClass = MRI.getRegClass(Reg->getReg())->getID(); + + } else + return false; + + if (OpClass == -1) + return false; + + return TRI->getRegClass(RegClass)->hasSubClassEq(TRI->getRegClass(OpClass)); +} + +/// \brief Make sure that we don't exceed the number of allowed scalars +void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, + unsigned RegClass, + bool &ScalarSlotUsed) const { + + // First map the operand's register class to a destination class + if (RegClass == AMDGPU::VSrc_32RegClassID) + RegClass = AMDGPU::VReg_32RegClassID; + else if (RegClass == AMDGPU::VSrc_64RegClassID) + RegClass = AMDGPU::VReg_64RegClassID; + else + return; + + // Nothing to do if they fit naturally + if (fitsRegClass(DAG, Operand, RegClass)) + return; + + // If the scalar slot isn't used yet use it now + if (!ScalarSlotUsed) { + ScalarSlotUsed = true; + return; + } + + // This is a conservative approach, it is possible that we can't determine + // the correct register class and copy too often, but better safe than sorry. 
+ SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32); + SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DebugLoc(), + Operand.getValueType(), Operand, RC); + Operand = SDValue(Node, 0); +} + +SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, + SelectionDAG &DAG) const { + + // Original encoding (either e32 or e64) + int Opcode = Node->getMachineOpcode(); + const MCInstrDesc *Desc = &TII->get(Opcode); + + unsigned NumDefs = Desc->getNumDefs(); + unsigned NumOps = Desc->getNumOperands(); + + // e64 version if available, -1 otherwise + int OpcodeE64 = AMDGPU::getVOPe64(Opcode); + const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64); + + assert(!DescE64 || DescE64->getNumDefs() == NumDefs); + assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4)); + + int32_t Immediate = Desc->getSize() == 4 ? 0 : -1; + bool HaveVSrc = false, HaveSSrc = false; + + // First figure out what we alread have in this instruction + for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; + i != e && Op < NumOps; ++i, ++Op) { + + unsigned RegClass = Desc->OpInfo[Op].RegClass; + if (isVSrc(RegClass)) + HaveVSrc = true; + else if (isSSrc(RegClass)) + HaveSSrc = true; + else + continue; + + int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode()); + if (Imm != -1 && Imm != 0) { + // Literal immediate + Immediate = Imm; + } + } + + // If we neither have VSrc nor SSrc it makes no sense to continue + if (!HaveVSrc && !HaveSSrc) + return Node; + + // No scalar allowed when we have both VSrc and SSrc + bool ScalarSlotUsed = HaveVSrc && HaveSSrc; + + // Second go over the operands and try to fold them + std::vector<SDValue> Ops; + bool Promote2e64 = false; + for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; + i != e && Op < NumOps; ++i, ++Op) { + + const SDValue &Operand = Node->getOperand(i); + Ops.push_back(Operand); + + // Already folded immediate ? 
+ if (isa<ConstantSDNode>(Operand.getNode()) || + isa<ConstantFPSDNode>(Operand.getNode())) + continue; + + // Is this a VSrc or SSrc operand ? + unsigned RegClass = Desc->OpInfo[Op].RegClass; + if (!isVSrc(RegClass) && !isSSrc(RegClass)) { + + if (i == 1 && Desc->isCommutable() && + fitsRegClass(DAG, Ops[0], RegClass) && + foldImm(Ops[1], Immediate, ScalarSlotUsed)) { + + assert(isVSrc(Desc->OpInfo[NumDefs].RegClass) || + isSSrc(Desc->OpInfo[NumDefs].RegClass)); + + // Swap commutable operands + SDValue Tmp = Ops[1]; + Ops[1] = Ops[0]; + Ops[0] = Tmp; + + } else if (DescE64 && !Immediate) { + // Test if it makes sense to switch to e64 encoding + + RegClass = DescE64->OpInfo[Op].RegClass; + int32_t TmpImm = -1; + if ((isVSrc(RegClass) || isSSrc(RegClass)) && + foldImm(Ops[i], TmpImm, ScalarSlotUsed)) { + + Immediate = -1; + Promote2e64 = true; + Desc = DescE64; + DescE64 = 0; + } + } + continue; + } + + // Try to fold the immediates + if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) { + // Folding didn't worked, make sure we don't hit the SReg limit + ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); + } + } + + if (Promote2e64) { + // Add the modifier flags while promoting + for (unsigned i = 0; i < 4; ++i) + Ops.push_back(DAG.getTargetConstant(0, MVT::i32)); + } + + // Add optional chain and glue + for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i) + Ops.push_back(Node->getOperand(i)); + + // Either create a complete new or update the current instruction + if (Promote2e64) + return DAG.getMachineNode(OpcodeE64, Node->getDebugLoc(), + Node->getVTList(), Ops.data(), Ops.size()); + else + return DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); +} diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h new file mode 100644 index 0000000000..0411565ee3 --- /dev/null +++ b/lib/Target/R600/SIISelLowering.h @@ -0,0 +1,60 @@ +//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// +// 
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI DAG Lowering interface definition
//
//===----------------------------------------------------------------------===//

#ifndef SIISELLOWERING_H
#define SIISELLOWERING_H

#include "AMDGPUISelLowering.h"
#include "SIInstrInfo.h"

namespace llvm {

/// \brief DAG lowering interface for the SI target, layered on top of
/// AMDGPUTargetLowering.
class SITargetLowering : public AMDGPUTargetLowering {
  /// \brief SI instruction info; PostISelFolding() uses it to look up
  /// instruction descriptors.
  const SIInstrInfo * TII;
  /// \brief Target register info.
  const TargetRegisterInfo * TRI;

  // Custom handling for individual instructions / DAG nodes. The definitions
  // live in SIISelLowering.cpp.
  void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;

  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;

  // Helpers used by PostISelFolding().
  // NOTE(review): descriptions below are inferred from the call sites in
  // PostISelFolding() -- confirm against the definitions.

  /// \brief Try to fold \p Operand as an immediate; returns true on success.
  /// \p Immediate and \p ScalarSlotUsed are in/out state shared between the
  /// operands of one instruction.
  bool foldImm(SDValue &Operand, int32_t &Immediate,
               bool &ScalarSlotUsed) const;
  /// \brief Presumably tests whether \p Op can be used where \p RegClass is
  /// required.
  bool fitsRegClass(SelectionDAG &DAG, SDValue &Op, unsigned RegClass) const;
  /// \brief Called when folding failed, to make sure the instruction does not
  /// exceed the SReg limit.
  void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
                       unsigned RegClass, bool &ScalarSlotUsed) const;

public:
  SITargetLowering(TargetMachine &tm);

  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::InputArg> &Ins,
                               DebugLoc DL, SelectionDAG &DAG,
                               SmallVectorImpl<SDValue> &InVals) const;

  virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
                                              MachineBasicBlock * BB) const;
  virtual EVT getSetCCResultType(EVT VT) const;
  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  /// \brief Post-selection hook that folds immediates into the operands of
  /// \p N and may switch the instruction to its e64 encoding.
  virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;

  /// \brief Classify \p N as an immediate. The InlineImm pattern leaf in
  /// SIInstrInfo.td accepts a node when this returns 0; PostISelFolding()
  /// treats any result other than 0 and -1 as a literal immediate.
  int32_t analyzeImmediate(const SDNode *N) const;
};

} // End namespace llvm

#endif //SIISELLOWERING_H
diff --git a/lib/Target/R600/SIInsertWaits.cpp
b/lib/Target/R600/SIInsertWaits.cpp new file mode 100644 index 0000000000..67fbdf7be1 --- /dev/null +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -0,0 +1,372 @@ +//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Insert wait instructions for memory reads and writes. +/// +/// Memory reads and writes are issued asynchronously, so we need to insert +/// S_WAITCNT instructions when we want to access any of their results or +/// overwrite any register that's used asynchronously. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +/// \brief One variable for each of the hardware counters +typedef union { + struct { + unsigned VM; + unsigned EXP; + unsigned LGKM; + } Named; + unsigned Array[3]; + +} Counters; + +typedef Counters RegCounters[512]; +typedef std::pair<unsigned, unsigned> RegInterval; + +class SIInsertWaits : public MachineFunctionPass { + +private: + static char ID; + const SIInstrInfo *TII; + const SIRegisterInfo &TRI; + const MachineRegisterInfo *MRI; + + /// \brief Constant hardware limits + static const Counters WaitCounts; + + /// \brief Constant zero value + static const Counters ZeroCounts; + + /// \brief Counter values we have already waited on. + Counters WaitedOn; + + /// \brief Counter values for last instruction issued. + Counters LastIssued; + + /// \brief Registers used by async instructions. 
+ RegCounters UsedRegs; + + /// \brief Registers defined by async instructions. + RegCounters DefinedRegs; + + /// \brief Different export instruction types seen since last wait. + unsigned ExpInstrTypesSeen; + + /// \brief Get increment/decrement amount for this instruction. + Counters getHwCounts(MachineInstr &MI); + + /// \brief Is operand relevant for async execution? + bool isOpRelevant(MachineOperand &Op); + + /// \brief Get register interval an operand affects. + RegInterval getRegInterval(MachineOperand &Op); + + /// \brief Handle instructions async components + void pushInstruction(MachineInstr &MI); + + /// \brief Insert the actual wait instruction + bool insertWait(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const Counters &Counts); + + /// \brief Do we need def2def checks? + bool unorderedDefines(MachineInstr &MI); + + /// \brief Resolve all operand dependencies to counter requirements + Counters handleOperands(MachineInstr &MI); + +public: + SIInsertWaits(TargetMachine &tm) : + MachineFunctionPass(ID), + TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())), + TRI(TII->getRegisterInfo()) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { + return "SI insert wait instructions"; + } + +}; + +} // End anonymous namespace + +char SIInsertWaits::ID = 0; + +const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; +const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; + +FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { + return new SIInsertWaits(tm); +} + +Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { + + uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; + Counters Result; + + Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); + + // Only consider stores or EXP for EXP_CNT + Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && + (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); + + // LGKM may uses larger values + if (TSFlags & 
SIInstrFlags::LGKM_CNT) { + + MachineOperand &Op = MI.getOperand(0); + assert(Op.isReg() && "First LGKM operand must be a register!"); + + unsigned Reg = Op.getReg(); + unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize(); + Result.Named.LGKM = Size > 4 ? 2 : 1; + + } else { + Result.Named.LGKM = 0; + } + + return Result; +} + +bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { + + // Constants are always irrelevant + if (!Op.isReg()) + return false; + + // Defines are always relevant + if (Op.isDef()) + return true; + + // For exports all registers are relevant + MachineInstr &MI = *Op.getParent(); + if (MI.getOpcode() == AMDGPU::EXP) + return true; + + // For stores the stored value is also relevant + if (!MI.getDesc().mayStore()) + return false; + + for (MachineInstr::mop_iterator I = MI.operands_begin(), + E = MI.operands_end(); I != E; ++I) { + + if (I->isReg() && I->isUse()) + return Op.isIdenticalTo(*I); + } + + return false; +} + +RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { + + if (!Op.isReg()) + return std::make_pair(0, 0); + + unsigned Reg = Op.getReg(); + unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize(); + + assert(Size >= 4); + + RegInterval Result; + Result.first = TRI.getEncodingValue(Reg); + Result.second = Result.first + Size / 4; + + return Result; +} + +void SIInsertWaits::pushInstruction(MachineInstr &MI) { + + // Get the hardware counter increments and sum them up + Counters Increment = getHwCounts(MI); + unsigned Sum = 0; + + for (unsigned i = 0; i < 3; ++i) { + LastIssued.Array[i] += Increment.Array[i]; + Sum += Increment.Array[i]; + } + + // If we don't increase anything then that's it + if (Sum == 0) + return; + + // Remember which export instructions we have seen + if (Increment.Named.EXP) { + ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 
1 : 2; + } + + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + + MachineOperand &Op = MI.getOperand(i); + if (!isOpRelevant(Op)) + continue; + + RegInterval Interval = getRegInterval(Op); + for (unsigned j = Interval.first; j < Interval.second; ++j) { + + // Remember which registers we define + if (Op.isDef()) + DefinedRegs[j] = LastIssued; + + // and which one we are using + if (Op.isUse()) + UsedRegs[j] = LastIssued; + } + } +} + +bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const Counters &Required) { + + // End of program? No need to wait on anything + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + return false; + + // Figure out if the async instructions execute in order + bool Ordered[3]; + + // VM_CNT is always ordered + Ordered[0] = true; + + // EXP_CNT is unordered if we have both EXP & VM-writes + Ordered[1] = ExpInstrTypesSeen == 3; + + // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS + Ordered[2] = false; + + // The values we are going to put into the S_WAITCNT instruction + Counters Counts = WaitCounts; + + // Do we really need to wait? 
+ bool NeedWait = false; + + for (unsigned i = 0; i < 3; ++i) { + + if (Required.Array[i] <= WaitedOn.Array[i]) + continue; + + NeedWait = true; + + if (Ordered[i]) { + unsigned Value = LastIssued.Array[i] - Required.Array[i]; + + // adjust the value to the real hardware posibilities + Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); + + } else + Counts.Array[i] = 0; + + // Remember on what we have waited on + WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; + } + + if (!NeedWait) + return false; + + // Reset EXP_CNT instruction types + if (Counts.Named.EXP == 0) + ExpInstrTypesSeen = 0; + + // Build the wait instruction + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm((Counts.Named.VM & 0xF) | + ((Counts.Named.EXP & 0x7) << 4) | + ((Counts.Named.LGKM & 0x7) << 8)); + + return true; +} + +/// \brief helper function for handleOperands +static void increaseCounters(Counters &Dst, const Counters &Src) { + + for (unsigned i = 0; i < 3; ++i) + Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); +} + +bool SIInsertWaits::unorderedDefines(MachineInstr &MI) { + + uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; + if (TSFlags & SIInstrFlags::LGKM_CNT) + return true; + + if (TSFlags & SIInstrFlags::EXP_CNT) + return ExpInstrTypesSeen == 3; + + return false; +} + +Counters SIInsertWaits::handleOperands(MachineInstr &MI) { + + bool UnorderedDefines = unorderedDefines(MI); + Counters Result = ZeroCounts; + + // For each register affected by this + // instruction increase the result sequence + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + + MachineOperand &Op = MI.getOperand(i); + RegInterval Interval = getRegInterval(Op); + for (unsigned j = Interval.first; j < Interval.second; ++j) { + + if (Op.isDef()) { + increaseCounters(Result, UsedRegs[j]); + if (UnorderedDefines) + increaseCounters(Result, DefinedRegs[j]); + } + + if (Op.isUse()) + increaseCounters(Result, DefinedRegs[j]); + } + } + + return Result; +} + +bool 
SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { + + bool Changes = false; + + MRI = &MF.getRegInfo(); + + WaitedOn = ZeroCounts; + LastIssued = ZeroCounts; + + memset(&UsedRegs, 0, sizeof(UsedRegs)); + memset(&DefinedRegs, 0, sizeof(DefinedRegs)); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + + Changes |= insertWait(MBB, I, handleOperands(*I)); + pushInstruction(*I); + } + + // Wait for everything at the end of the MBB + Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + } + + return Changes; +} diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td new file mode 100644 index 0000000000..3891ddb2db --- /dev/null +++ b/lib/Target/R600/SIInstrFormats.td @@ -0,0 +1,426 @@ +//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// SI Instruction format definitions. 
//
//===----------------------------------------------------------------------===//

// Common base of all SI instruction formats. The three one-bit fields select
// which wait counters an instruction takes part in; they are exported through
// TSFlags bits 0-2 and match the SIInstrFlags enum in SIInstrInfo.h.
class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
    AMDGPUInst<outs, ins, asm, pattern> {

  field bits<1> VM_CNT = 0;
  field bits<1> EXP_CNT = 0;
  field bits<1> LGKM_CNT = 0;

  let TSFlags{0} = VM_CNT;
  let TSFlags{1} = EXP_CNT;
  let TSFlags{2} = LGKM_CNT;
}

// Base of all formats with a 32-bit encoding (Size = 4 bytes).
class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
    InstSI <outs, ins, asm, pattern> {

  field bits<32> Inst;
  let Size = 4;
}

// Base of all formats with a 64-bit encoding (Size = 8 bytes).
class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> :
    InstSI <outs, ins, asm, pattern> {

  field bits<64> Inst;
  let Size = 8;
}

//===----------------------------------------------------------------------===//
// Scalar operations
//===----------------------------------------------------------------------===//

// Scalar format with one source: SSRC0 [7:0], op [15:8], SDST [22:16],
// fixed value 0x17d in [31:23].
class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
    Enc32<outs, ins, asm, pattern> {

  bits<7> SDST;
  bits<8> SSRC0;

  let Inst{7-0} = SSRC0;
  let Inst{15-8} = op;
  let Inst{22-16} = SDST;
  let Inst{31-23} = 0x17d; //encoding;

  let mayLoad = 0;
  let mayStore = 0;
  let hasSideEffects = 0;
}

// Scalar format with two sources: SSRC0 [7:0], SSRC1 [15:8], SDST [22:16],
// op [29:23], fixed value 0x2 in [31:30].
class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
    Enc32 <outs, ins, asm, pattern> {

  bits<7> SDST;
  bits<8> SSRC0;
  bits<8> SSRC1;

  let Inst{7-0} = SSRC0;
  let Inst{15-8} = SSRC1;
  let Inst{22-16} = SDST;
  let Inst{29-23} = op;
  let Inst{31-30} = 0x2; // encoding

  let mayLoad = 0;
  let mayStore = 0;
  let hasSideEffects = 0;
}

// Scalar format with two sources and no encoded destination ($dst is
// excluded from the encoding): SSRC0 [7:0], SSRC1 [15:8], op [22:16],
// fixed value 0x17e in [31:23].
class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
    Enc32<outs, ins, asm, pattern> {

  bits<8> SSRC0;
  bits<8> SSRC1;

  let Inst{7-0} = SSRC0;
  let Inst{15-8} = SSRC1;
  let Inst{22-16} = op;
  let Inst{31-23} = 0x17e;

  let DisableEncoding = "$dst";
  let mayLoad = 0;
  let mayStore = 0;
  let hasSideEffects = 0;
}

// Scalar format with a 16-bit immediate: SIMM16 [15:0], SDST [22:16],
// op [27:23], fixed value 0xb in [31:28].
class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
   Enc32 <outs, ins , asm, pattern> {

  bits <7> SDST;
  bits <16> SIMM16;

  let Inst{15-0} = SIMM16;
  let Inst{22-16} = SDST;
  let Inst{27-23} = op;
  let Inst{31-28} = 0xb; //encoding

  let mayLoad = 0;
  let mayStore = 0;
  let hasSideEffects = 0;
}

// Scalar format without outputs: SIMM16 [15:0], op [22:16], fixed value
// 0x17f in [31:23].
class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 <
  (outs),
  ins,
  asm,
  pattern > {

  bits <16> SIMM16;

  let Inst{15-0} = SIMM16;
  let Inst{22-16} = op;
  let Inst{31-23} = 0x17f; // encoding

  let mayLoad = 0;
  let mayStore = 0;
  let hasSideEffects = 0;
}

// SMRD format: OFFSET [7:0], imm flag [8], SBASE [14:9] (bit 0 of SBASE is
// not encoded), SDST [21:15], op [26:22], fixed value 0x18 in [31:27].
// Instructions of this format take part in LGKM_CNT.
class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
            list<dag> pattern> : Enc32<outs, ins, asm, pattern> {

  bits<7> SDST;
  bits<7> SBASE;
  bits<8> OFFSET;

  let Inst{7-0} = OFFSET;
  let Inst{8} = imm;
  let Inst{14-9} = SBASE{6-1};
  let Inst{21-15} = SDST;
  let Inst{26-22} = op;
  let Inst{31-27} = 0x18; //encoding

  let LGKM_CNT = 1;
}

//===----------------------------------------------------------------------===//
// Vector ALU operations
//===----------------------------------------------------------------------===//

// Every format in this group implicitly reads EXEC.
let Uses = [EXEC] in {

// 32-bit vector format with one source: SRC0 [8:0], op [16:9], VDST [24:17],
// fixed value 0x3f in [31:25].
class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
    Enc32 <outs, ins, asm, pattern> {

  bits<8> VDST;
  bits<9> SRC0;

  let Inst{8-0} = SRC0;
  let Inst{16-9} = op;
  let Inst{24-17} = VDST;
  let Inst{31-25} = 0x3f; //encoding

  let mayLoad = 0;
  let mayStore = 0;
  let hasSideEffects = 0;
}

// 32-bit vector format with two sources: SRC0 [8:0], VSRC1 [16:9],
// VDST [24:17], op [30:25], bit 31 fixed to 0.
class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
    Enc32 <outs, ins, asm, pattern> {

  bits<8> VDST;
  bits<9> SRC0;
  bits<8> VSRC1;

  let Inst{8-0} = SRC0;
  let Inst{16-9} = VSRC1;
  let Inst{24-17} = VDST;
  let Inst{30-25} = op;
  let Inst{31} = 0x0; //encoding

  let mayLoad = 0;
  let mayStore = 0;
  let hasSideEffects = 0;
}

// 64-bit vector format with up to three sources plus the ABS, CLAMP, OMOD
// and NEG modifier fields. Fixed value 0x34 in [31:26]; bits [16:12] are not
// assigned here.
class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
    Enc64 <outs, ins, asm, pattern> {

  bits<8> VDST;
  bits<9> SRC0;
  bits<9> SRC1;
  bits<9> SRC2;
  bits<3> ABS;
  bits<1> CLAMP;
  bits<2> OMOD;
  bits<3> NEG;

  let Inst{7-0} = VDST;
  let Inst{10-8} = ABS;
  let Inst{11} = CLAMP;
  let Inst{25-17} = op;
  let Inst{31-26} = 0x34; //encoding
  let Inst{40-32} = SRC0;
  let Inst{49-41} = SRC1;
  let Inst{58-50} = SRC2;
  let Inst{60-59} = OMOD;
  let Inst{63-61} = NEG;

  let mayLoad = 0;
  let mayStore = 0;
  let hasSideEffects = 0;
}

// Variant of VOP3 that encodes a scalar destination SDST in [14:8] in place
// of the ABS and CLAMP fields.
class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
    Enc64 <outs, ins, asm, pattern> {

  bits<8> VDST;
  bits<9> SRC0;
  bits<9> SRC1;
  bits<9> SRC2;
  bits<7> SDST;
  bits<2> OMOD;
  bits<3> NEG;

  let Inst{7-0} = VDST;
  let Inst{14-8} = SDST;
  let Inst{25-17} = op;
  let Inst{31-26} = 0x34; //encoding
  let Inst{40-32} = SRC0;
  let Inst{49-41} = SRC1;
  let Inst{58-50} = SRC2;
  let Inst{60-59} = OMOD;
  let Inst{63-61} = NEG;

  let mayLoad = 0;
  let mayStore = 0;
  let hasSideEffects = 0;
}

// 32-bit vector format whose only output is VCCReg:$dst, which is excluded
// from the encoding: SRC0 [8:0], VSRC1 [16:9], op [24:17], fixed value 0x3e
// in [31:25].
class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
    Enc32 <(outs VCCReg:$dst), ins, asm, pattern> {

  bits<9> SRC0;
  bits<8> VSRC1;

  let Inst{8-0} = SRC0;
  let Inst{16-9} = VSRC1;
  let Inst{24-17} = op;
  let Inst{31-25} = 0x3e;

  let DisableEncoding = "$dst";
  let mayLoad = 0;
  let mayStore = 0;
  let hasSideEffects = 0;
}

// VINTRP format: VSRC [7:0], ATTRCHAN [9:8], ATTR [15:10], op [17:16],
// VDST [25:18], fixed value 0x32 in [31:26]. Marked as a load without side
// effects.
class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
    Enc32 <outs, ins, asm, pattern> {

  bits<8> VDST;
  bits<8> VSRC;
  bits<2> ATTRCHAN;
  bits<6> ATTR;

  let Inst{7-0} = VSRC;
  let Inst{9-8} = ATTRCHAN;
  let Inst{15-10} = ATTR;
  let Inst{17-16} = op;
  let Inst{25-18} = VDST;
  let Inst{31-26} = 0x32; // encoding

  let neverHasSideEffects = 1;
  let mayLoad = 1;
  let mayStore = 0;
}

} // End Uses = [EXEC]

//===----------------------------------------------------------------------===//
// Vector I/O operations
//===----------------------------------------------------------------------===//

// Every format in this group implicitly reads EXEC.
let Uses = [EXEC] in {

// MUBUF format (64-bit). Fixed value 0x38 in [31:26]; only bits 6-2 of SRSRC
// are encoded. Instructions of this format take part in both VM_CNT and
// EXP_CNT.
class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
    Enc64<outs, ins, asm, pattern> {

  bits<8> VDATA;
  bits<12> OFFSET;
  bits<1> OFFEN;
  bits<1> IDXEN;
  bits<1> GLC;
  bits<1> ADDR64;
  bits<1> LDS;
  bits<8> VADDR;
  bits<7> SRSRC;
  bits<1> SLC;
  bits<1> TFE;
  bits<8> SOFFSET;

  let Inst{11-0} = OFFSET;
  let Inst{12} = OFFEN;
  let Inst{13} = IDXEN;
  let Inst{14} = GLC;
  let Inst{15} = ADDR64;
  let Inst{16} = LDS;
  let Inst{24-18} = op;
  let Inst{31-26} = 0x38; //encoding
  let Inst{39-32} = VADDR;
  let Inst{47-40} = VDATA;
  let Inst{52-48} = SRSRC{6-2};
  let Inst{54} = SLC;
  let Inst{55} = TFE;
  let Inst{63-56} = SOFFSET;

  let VM_CNT = 1;
  let EXP_CNT = 1;

  let neverHasSideEffects = 1;
}

// MTBUF format (64-bit). Same layout as MUBUF except that op sits in [18:16]
// and the DFMT / NFMT fields occupy [22:19] / [25:23]. Fixed value 0x3a in
// [31:26].
class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
    Enc64<outs, ins, asm, pattern> {

  bits<8> VDATA;
  bits<12> OFFSET;
  bits<1> OFFEN;
  bits<1> IDXEN;
  bits<1> GLC;
  bits<1> ADDR64;
  bits<4> DFMT;
  bits<3> NFMT;
  bits<8> VADDR;
  bits<7> SRSRC;
  bits<1> SLC;
  bits<1> TFE;
  bits<8> SOFFSET;

  let Inst{11-0} = OFFSET;
  let Inst{12} = OFFEN;
  let Inst{13} = IDXEN;
  let Inst{14} = GLC;
  let Inst{15} = ADDR64;
  let Inst{18-16} = op;
  let Inst{22-19} = DFMT;
  let Inst{25-23} = NFMT;
  let Inst{31-26} = 0x3a; //encoding
  let Inst{39-32} = VADDR;
  let Inst{47-40} = VDATA;
  let Inst{52-48} = SRSRC{6-2};
  let Inst{54} = SLC;
  let Inst{55} = TFE;
  let Inst{63-56} = SOFFSET;

  let VM_CNT = 1;
  let EXP_CNT = 1;

  let neverHasSideEffects = 1;
}

// MIMG format (64-bit). Fixed value 0x3c in [31:26]; only bits 6-2 of SRSRC
// and SSAMP are encoded. Takes part in both VM_CNT and EXP_CNT.
class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
    Enc64 <outs, ins, asm, pattern> {

  bits<8> VDATA;
  bits<4> DMASK;
  bits<1> UNORM;
  bits<1> GLC;
  bits<1> DA;
  bits<1> R128;
  bits<1> TFE;
  bits<1> LWE;
  bits<1> SLC;
  bits<8> VADDR;
  bits<7> SRSRC;
  bits<7> SSAMP;

  let Inst{11-8} = DMASK;
  let Inst{12} = UNORM;
  let Inst{13} = GLC;
  let Inst{14} = DA;
  let Inst{15} = R128;
  let Inst{16} = TFE;
  let Inst{17} = LWE;
  let Inst{24-18} = op;
  let Inst{25} = SLC;
  let Inst{31-26} = 0x3c;
  let Inst{39-32} = VADDR;
  let Inst{47-40} = VDATA;
  let Inst{52-48} = SRSRC{6-2};
  let Inst{57-53} = SSAMP{6-2};

  let VM_CNT = 1;
  let EXP_CNT = 1;
}

// The export instruction: five immediate control operands followed by four
// VReg_32 sources. Fixed value 0x3e in [31:26]. Takes part in EXP_CNT only.
def EXP : Enc64<
  (outs),
  (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
  "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
  [] > {

  bits<4> EN;
  bits<6> TGT;
  bits<1> COMPR;
  bits<1> DONE;
  bits<1> VM;
  bits<8> VSRC0;
  bits<8> VSRC1;
  bits<8> VSRC2;
  bits<8> VSRC3;

  let Inst{3-0} = EN;
  let Inst{9-4} = TGT;
  let Inst{10} = COMPR;
  let Inst{11} = DONE;
  let Inst{12} = VM;
  let Inst{31-26} = 0x3e;
  let Inst{39-32} = VSRC0;
  let Inst{47-40} = VSRC1;
  let Inst{55-48} = VSRC2;
  let Inst{63-56} = VSRC3;

  let EXP_CNT = 1;
}

} // End Uses = [EXEC]
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp new file mode 100644 index 0000000000..de2373b11a --- /dev/null +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -0,0 +1,224 @@ +//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//


#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include <stdio.h>

using namespace llvm;

SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm)
  : AMDGPUInstrInfo(tm),
    RI(tm, *this)
    { }

const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const {
  return RI;
}

/// Emit the move instruction(s) that copy \p SrcReg into \p DestReg before
/// \p MI.
///
/// 32 and 64 bit scalar registers and 32 bit vector registers are copied with
/// a single move. All wider classes are copied one 32 bit sub-register at a
/// time, using S_MOV_B32 for scalar and V_MOV_B32_e32 for vector
/// destinations.
void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MI, DebugLoc DL,
                         unsigned DestReg, unsigned SrcReg,
                         bool KillSrc) const {

  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend.  While it may be theoretically possible to do this, it should
  // never be necessary.
  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);

  // Sub-register index lists, one per register width. Each list is
  // terminated by 0, which ends the copy loop at the bottom.
  const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
  };

  const int16_t Sub0_7[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
  };

  const int16_t Sub0_3[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
  };

  const int16_t Sub0_1[] = {
    AMDGPU::sub0, AMDGPU::sub1, 0
  };

  unsigned Opcode;
  const int16_t *SubIndices;

  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_15;

  } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
    // Vector destinations accept a vector or a scalar source of equal width.
    assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
           AMDGPU::SReg_64RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_1;

  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
           AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
           AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
           AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_15;

  } else {
    llvm_unreachable("Can't copy register!");
  }

  // One move per sub-register. Every move except the last one additionally
  // carries an implicit def of the full destination register.
  while (unsigned SubIdx = *SubIndices++) {
    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));

    if (*SubIndices)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
  }
}

/// Commute \p MI through the generic implementation, but refuse (return 0)
/// unless the instruction has at least three operands and operands 1 and 2
/// are both registers.
MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
                                              bool NewMI) const {

  if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg() ||
      !MI->getOperand(2).isReg())
    return 0;

  return TargetInstrInfo::commuteInstruction(MI, NewMI);
}

/// Create (without inserting it into a block) a V_MOV_B32_e32 that defines
/// \p DstReg with the immediate \p Imm.
MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg,
                                           int64_t Imm) const {
  MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_B32_e32), DebugLoc());
  MachineInstrBuilder MIB(*MF, MI);
  MIB.addReg(DstReg, RegState::Define);
  MIB.addImm(Imm);

  return MI;

}

/// Returns true for the scalar and vector move opcodes listed below.
bool SIInstrInfo::isMov(unsigned Opcode) const {
  switch(Opcode) {
  default: return false;
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
    return true;
  }
}

/// Definitions of every register class except the EXEC class may be moved.
bool
SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
  return RC != &AMDGPU::EXECRegRegClass;
}

//===----------------------------------------------------------------------===//
// Indirect addressing callbacks
//===----------------------------------------------------------------------===//

// Only calculateIndirectAddress() has an implementation; every other
// callback in this section is an llvm_unreachable("Unimplemented") stub.

unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
                                                 unsigned Channel) const {
  assert(Channel == 0);
  return RegIndex;
}


int SIInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
  llvm_unreachable("Unimplemented");
}

int SIInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
  llvm_unreachable("Unimplemented");
}

const TargetRegisterClass *SIInstrInfo::getIndirectAddrStoreRegClass(
                                                     unsigned SourceReg) const {
  llvm_unreachable("Unimplemented");
}

const TargetRegisterClass *SIInstrInfo::getIndirectAddrLoadRegClass() const {
  llvm_unreachable("Unimplemented");
}

MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
                                   MachineBasicBlock *MBB,
                                   MachineBasicBlock::iterator I,
                                   unsigned ValueReg,
                                   unsigned Address, unsigned OffsetReg) const {
  llvm_unreachable("Unimplemented");
}

MachineInstrBuilder SIInstrInfo::buildIndirectRead(
                                   MachineBasicBlock *MBB,
                                   MachineBasicBlock::iterator I,
                                   unsigned ValueReg,
                                   unsigned Address, unsigned OffsetReg) const {
  llvm_unreachable("Unimplemented");
}

const TargetRegisterClass *SIInstrInfo::getSuperIndirectRegClass() const {
  llvm_unreachable("Unimplemented");
}
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h new file mode 100644 index 0000000000..5789af5d21 --- /dev/null +++ b/lib/Target/R600/SIInstrInfo.h @@ -0,0 +1,93 @@
//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Interface definition for SIInstrInfo.
//
//===----------------------------------------------------------------------===//


#ifndef SIINSTRINFO_H
#define SIINSTRINFO_H

#include "AMDGPUInstrInfo.h"
#include "SIRegisterInfo.h"

namespace llvm {

/// \brief Instruction info for the SI target; owns the SIRegisterInfo
/// instance handed out by getRegisterInfo().
class SIInstrInfo : public AMDGPUInstrInfo {
private:
  const SIRegisterInfo RI;

public:
  explicit SIInstrInfo(AMDGPUTargetMachine &tm);

  const SIRegisterInfo &getRegisterInfo() const;

  virtual void copyPhysReg(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MI, DebugLoc DL,
                           unsigned DestReg, unsigned SrcReg,
                           bool KillSrc) const;

  virtual MachineInstr *commuteInstruction(MachineInstr *MI,
                                           bool NewMI=false) const;

  virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
                                        int64_t Imm) const;

  // Not implemented: asserts in builds with assertions enabled and returns 0
  // otherwise.
  virtual unsigned getIEQOpcode() const { assert(!"Implement"); return 0;}
  virtual bool isMov(unsigned Opcode) const;

  virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;

  virtual int getIndirectIndexBegin(const MachineFunction &MF) const;

  virtual int getIndirectIndexEnd(const MachineFunction &MF) const;

  virtual unsigned calculateIndirectAddress(unsigned RegIndex,
                                            unsigned Channel) const;

  virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
                                                      unsigned SourceReg) const;

  virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const;

  virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
                                                 MachineBasicBlock::iterator I,
                                                 unsigned ValueReg,
                                                 unsigned Address,
                                                 unsigned OffsetReg) const;

  virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
                                                MachineBasicBlock::iterator I,
                                                unsigned ValueReg,
                                                unsigned Address,
                                                unsigned OffsetReg) const;

  virtual const TargetRegisterClass *getSuperIndirectRegClass() const;
  };

namespace AMDGPU {

  /// \brief Returns the e64 opcode for \p Opcode; callers treat -1 as
  /// "no e64 version available".
  int getVOPe64(uint16_t Opcode);

} // End namespace AMDGPU

} // End namespace llvm

namespace SIInstrFlags {
  enum Flags {
    // Wait counter flags. The bit positions match the TSFlags assignments
    // made by the InstSI class in SIInstrFormats.td.
    VM_CNT = 1 << 0,
    EXP_CNT = 1 << 1,
    LGKM_CNT = 1 << 2
  };
}

#endif //SIINSTRINFO_H
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td new file mode 100644 index 0000000000..260c651dd4 --- /dev/null +++ b/lib/Target/R600/SIInstrInfo.td @@ -0,0 +1,329 @@ +//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SI DAG Nodes
//===----------------------------------------------------------------------===//

// SMRD takes a 64bit memory address and can only add an 32bit offset
// (ISD::ADD with an i64 result, an i64 first operand and an i32 second
// operand)
def SIadd64bit32bit : SDNode<"ISD::ADD",
  SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]>
>;

// Transformation function, extract the lower 32bit of a 64bit immediate
def LO32 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
}]>;

// Transformation function, extract the upper 32bit of a 64bit immediate
def HI32 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32);
}]>;

// Matches i32 immediates that only have bits 2-9 set, i.e. multiples of 4 in
// the range [0, 1020]. The attached transform emits the value divided by 4.
def IMM8bitDWORD : ImmLeaf <
  i32, [{
    return (Imm & ~0x3FC) == 0;
  }], SDNodeXForm<imm, [{
    return CurDAG->getTargetConstant(
      N->getZExtValue() >> 2, MVT::i32);
  }]>
>;

// Matches i16 immediates that fit into 12 unsigned bits.
def IMM12bit : ImmLeaf <
  i16,
  [{return isUInt<12>(Imm);}]
>;

// Matches immediates of type vt for which
// SITargetLowering::analyzeImmediate() returns 0.
class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
  return ((const SITargetLowering &)TLI).analyzeImmediate(N) == 0;
}]>;

//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//

// Fixed operand encodings that instruction definitions can plug into source
// fields.
def SIOperand {
  int ZERO = 0x80;
  int VCC = 0x6A;
}

include "SIInstrFormats.td"

//===----------------------------------------------------------------------===//
//
// SI Instruction multiclass helpers.
//
// Instructions with _32 take 32-bit operands.
// Instructions with _64 take 64-bit operands.
//
// VOP_* instructions can use either a 32-bit or 64-bit encoding.  The 32-bit
// encoding is the standard encoding, but instructions that make use of
// any of the instruction modifiers must use the 64-bit encoding.
+// +// Instructions with _e32 use the 32-bit encoding. +// Instructions with _e64 use the 64-bit encoding. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Scalar classes +//===----------------------------------------------------------------------===// + +// SOP1 format: one scalar source and one scalar destination, in 32-bit and +// 64-bit register variants. +class SOP1_32 <bits<8> op, string opName, list<dag> pattern> : SOP1 < + op, (outs SReg_32:$dst), (ins SSrc_32:$src0), + opName#" $dst, $src0", pattern +>; + +class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 < + op, (outs SReg_64:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; + +// SOP2 format: two scalar sources and one scalar destination. +class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 < + op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 < + op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +// SOPC format: two scalar sources, destination in the SCCReg class. +class SOPC_32 <bits<7> op, string opName, list<dag> pattern> : SOPC < + op, (outs SCCReg:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +class SOPC_64 <bits<7> op, string opName, list<dag> pattern> : SOPC < + op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +// SOPK format: a single 16-bit immediate source. +class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK < + op, (outs SReg_32:$dst), (ins i16imm:$src0), + opName#" $dst, $src0", pattern +>; + +class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK < + op, (outs SReg_64:$dst), (ins i16imm:$src0), + opName#" $dst, $src0", pattern +>; + +// Defines two records per SMRD opcode: _IMM takes an i32 immediate offset, +// _SGPR takes the offset in an SReg_32 register. Both use an SReg_64 base +// and carry no selection pattern. +multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass dstClass> { + def _IMM : SMRD < + op, 1, (outs dstClass:$dst), + (ins SReg_64:$sbase, i32imm:$offset), + asm#" $dst, $sbase, $offset", [] + >; + + def _SGPR : SMRD < + op, 0, (outs dstClass:$dst), + (ins 
SReg_64:$sbase, SReg_32:$soff), + asm#" $dst, $sbase, $soff", [] + >; +} + +//===----------------------------------------------------------------------===// +// Vector ALU classes +//===----------------------------------------------------------------------===// + +// Tags an instruction with its base opcode name. The getVOPe64 instruction +// mapping uses this class as its FilterClass and OpName as its row key. +class VOP <string opName> { + string OpName = opName; +} + +// One-source vector instruction in two encodings: _e32 (VOP1 format, carries +// the selection pattern) and _e64 (VOP3 format, no pattern). The VOP3 opcode +// is the low seven bits of op prefixed with the bits 1, 1; the SRC1 and SRC2 +// fields are set to SIOperand.ZERO. +multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src, + string opName, list<dag> pattern> { + + def _e32 : VOP1 < + op, (outs drc:$dst), (ins src:$src0), + opName#"_e32 $dst, $src0", pattern + >, VOP <opName>; + + def _e64 : VOP3 < + {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + (outs drc:$dst), + (ins src:$src0, + i32imm:$abs, i32imm:$clamp, + i32imm:$omod, i32imm:$neg), + opName#"_e64 $dst, $src0, $abs, $clamp, $omod, $neg", [] + >, VOP <opName> { + let SRC1 = SIOperand.ZERO; + let SRC2 = SIOperand.ZERO; + } +} + +multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern> + : VOP1_Helper <op, VReg_32, VSrc_32, opName, pattern>; + +multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern> + : VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>; + +// Two-source vector instruction in two encodings: _e32 (VOP2 format, carries +// the selection pattern) and _e64 (VOP3 format, no pattern). The VOP3 opcode +// is op prefixed with the bits 1, 0, 0; the SRC2 field is set to +// SIOperand.ZERO. +multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc, + string opName, list<dag> pattern> { + def _e32 : VOP2 < + op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), + opName#"_e32 $dst, $src0, $src1", pattern + >, VOP <opName>; + + def _e64 : VOP3 < + {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + (outs vrc:$dst), + (ins arc:$src0, arc:$src1, + i32imm:$abs, i32imm:$clamp, + i32imm:$omod, i32imm:$neg), + opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", [] + >, VOP <opName> { + let SRC2 = SIOperand.ZERO; + } +} + +multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern> + : VOP2_Helper <op, VReg_32, VSrc_32, opName, pattern>; + +multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern> + : VOP2_Helper <op, VReg_64, VSrc_64, opName, pattern>; + +// Same shape as VOP2_Helper on 32-bit registers, except that the _e64 form +// uses the VOP3b format and fixes its SDST field to SIOperand.VCC. +multiclass VOP2b_32 <bits<6> op, string opName, list<dag> 
pattern> { + + def _e32 : VOP2 < + op, (outs VReg_32:$dst), (ins VSrc_32:$src0, VReg_32:$src1), + opName#"_e32 $dst, $src0, $src1", pattern + >, VOP <opName>; + + def _e64 : VOP3b < + {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + (outs VReg_32:$dst), + (ins VSrc_32:$src0, VSrc_32:$src1, + i32imm:$abs, i32imm:$clamp, + i32imm:$omod, i32imm:$neg), + opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", [] + >, VOP <opName> { + let SRC2 = SIOperand.ZERO; + /* the VOP2 variant puts the carry out into VCC, the VOP3 variant + can write it into any SGPR. We currently don't use the carry out, + so for now hardcode it to VCC as well */ + let SDST = SIOperand.VCC; + } +} + +// Vector compare in two encodings. The _e32 form (VOPC format) lists no +// output operand and has no pattern. The _e64 form (VOP3 format) writes an +// SReg_64 and gets a setcc selection pattern unless cond is COND_NULL; its +// opcode is op prefixed with a 0 bit. +multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, + string opName, ValueType vt, PatLeaf cond> { + + def _e32 : VOPC < + op, (ins arc:$src0, vrc:$src1), + opName#"_e32 $dst, $src0, $src1", [] + >, VOP <opName>; + + def _e64 : VOP3 < + {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + (outs SReg_64:$dst), + (ins arc:$src0, arc:$src1, + InstFlag:$abs, InstFlag:$clamp, + InstFlag:$omod, InstFlag:$neg), + opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", + !if(!eq(!cast<string>(cond), "COND_NULL"), []<dag>, + [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))] + ) + >, VOP <opName> { + let SRC2 = SIOperand.ZERO; + } +} + +// vt and cond default to untyped and COND_NULL, which yields no selection +// pattern. +multiclass VOPC_32 <bits<8> op, string opName, + ValueType vt = untyped, PatLeaf cond = COND_NULL> + : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond>; + +multiclass VOPC_64 <bits<8> op, string opName, + ValueType vt = untyped, PatLeaf cond = COND_NULL> + : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>; + +// Three-source vector instruction defined directly on the VOP3 format +// (32-bit registers). +class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 < + op, (outs VReg_32:$dst), + (ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2, + i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg), + opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern +>, VOP <opName>; + 
+// Three-source vector instruction defined directly on the VOP3 format +// (64-bit registers). +class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 < + op, (outs VReg_64:$dst), + (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2, + i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg), + opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern +>, VOP <opName>; + +//===----------------------------------------------------------------------===// +// Vector I/O classes +//===----------------------------------------------------------------------===// + +// MTBUF-format store: no output operands, no selection pattern, marked +// mayStore and not mayLoad. +class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < + op, + (outs), + (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, + i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, + SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), + asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", + []> { + let mayStore = 1; + let mayLoad = 0; +} + +// MUBUF-format load into regClass: no selection pattern, marked mayLoad and +// not mayStore. +class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF < + op, + (outs regClass:$dst), + (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc, + i1imm:$tfe, SSrc_32:$soffset), + asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, " + #"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset", + []> { + let mayLoad = 1; + let mayStore = 0; +} + +// MTBUF-format load into regClass; compared with MUBUF_Load_Helper it takes +// $dfmt/$nfmt operands instead of $lds. +class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < + op, + (outs regClass:$dst), + (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, + i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), + asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", + []> { + let mayLoad = 1; + let mayStore = 0; +} + +// MIMG-format load that always produces a VReg_128 result: no selection +// pattern, marked mayLoad and not mayStore. +class MIMG_Load_Helper <bits<7> op, string asm> : MIMG < + op, + (outs VReg_128:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, 
i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_32:$vaddr, + SReg_256:$srsrc, SReg_128:$ssamp), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", + []> { + let mayLoad = 1; + let mayStore = 0; +} + +//===----------------------------------------------------------------------===// +// Vector instruction mappings +//===----------------------------------------------------------------------===// + +// Maps an opcode in e32 form to its e64 equivalent. Only records deriving +// from VOP take part; records sharing an OpName form one row, the key column +// is the record whose Size field is "4" and the value column the one whose +// Size field is "8". +def getVOPe64 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["Size"]; + let KeyCol = ["4"]; + let ValueCols = [["8"]]; +} + +include "SIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td new file mode 100644 index 0000000000..0ab9e4ec0c --- /dev/null +++ b/lib/Target/R600/SIInstructions.td @@ -0,0 +1,1429 @@ +//===-- SIInstructions.td - SI Instruction Definitions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This file was originally auto-generated from a GPU register header file and +// all the instruction definitions were originally commented out. Instructions +// that are not yet supported remain commented out. 
+//===----------------------------------------------------------------------===// + +class InterpSlots { +int P0 = 2; +int P10 = 0; +int P20 = 1; +} +def INTERP : InterpSlots; + +def InterpSlot : Operand<i32> { + let PrintMethod = "printInterpSlot"; +} + +def isSI : Predicate<"Subtarget.device()" + "->getGeneration() == AMDGPUDeviceInfo::HD7XXX">; + +let Predicates = [isSI] in { + +let neverHasSideEffects = 1 in { + +let isMoveImm = 1 in { +def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>; +def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>; +def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>; +def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>; +} // End isMoveImm = 1 + +def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>; +def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>; +def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>; +def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>; +def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>; +def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>; +} // End neverHasSideEffects = 1 + +////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>; +////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>; +////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>; +////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>; +////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>; +////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>; +////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>; +////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>; +//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>; +//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>; +def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>; +//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>; +//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>; +//def 
S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>; +////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>; +////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>; +////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>; +////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>; +def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>; +def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>; +def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>; +def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>; + +let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in { + +def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>; +def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>; +def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>; +def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>; +def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>; +def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>; +def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>; +def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>; + +} // End hasSideEffects = 1 + +def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>; +def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>; +def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>; +def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>; +def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>; +def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>; +//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>; +def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>; +def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>; +def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>; +def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>; +def 
S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>; + +/* +This instruction is disabled for now until we can figure out how to teach +the instruction selector to correctly use the S_CMP* vs V_CMP* +instructions. + +When this instruction is enabled the code generator sometimes produces this +invalid sequence: + +SCC = S_CMPK_EQ_I32 SGPR0, imm +VCC = COPY SCC +VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 + +def S_CMPK_EQ_I32 : SOPK < + 0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1), + "S_CMPK_EQ_I32", + [(set SCCReg:$dst, (setcc SReg_32:$src0, imm:$src1, SETEQ))] +>; +*/ + +let isCompare = 1 in { +def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>; +def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>; +def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>; +def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>; +def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>; +def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>; +def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>; +def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>; +def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>; +def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>; +def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>; +} // End isCompare = 1 + +def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>; +def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>; +//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>; +def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>; +def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>; +def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>; +//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>; +//def EXP : EXP_ <0x00000000, "EXP", []>; + +let isCompare = 1 in { + +defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">; +defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, 
COND_LT>; +defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_EQ>; +defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_LE>; +defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_GT>; +defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", f32, COND_NE>; +defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_GE>; +defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32">; +defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32">; +defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32">; +defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32">; +defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32">; +defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32">; +defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_NE>; +defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">; +defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32">; +defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32">; +defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32">; +defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32">; +defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32">; +defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32">; +defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32">; +defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32">; +defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32">; +defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32">; +defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32">; +defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32">; +defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32">; +defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32">; +defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32">; +defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32">; + +} // End hasSideEffects = 1, Defs 
= [EXEC] + +defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">; +defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64">; +defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64">; +defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64">; +defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64">; +defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64">; +defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64">; +defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64">; +defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64">; +defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64">; +defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64">; +defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64">; +defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64">; +defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64">; +defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">; +defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64">; +defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64">; +defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64">; +defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64">; +defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64">; +defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64">; +defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64">; +defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64">; +defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64">; +defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64">; +defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64">; +defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64">; +defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64">; +defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64">; +defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64">; +defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64">; + +} 
// End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">; +defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">; +defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32">; +defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32">; +defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32">; +defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32">; +defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32">; +defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32">; +defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32">; +defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32">; +defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32">; +defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32">; +defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32">; +defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">; +defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">; +defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32">; +defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32">; +defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32">; +defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32">; +defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32">; +defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32">; +defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32">; +defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32">; +defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32">; +defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32">; +defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32">; +defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32">; +defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32">; +defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32">; +defm V_CMPSX_NLT_F32 : VOPC_32 
<0x0000005e, "V_CMPSX_NLT_F32">; +defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">; +defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">; +defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64">; +defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64">; +defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64">; +defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64">; +defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64">; +defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64">; +defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64">; +defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64">; +defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64">; +defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64">; +defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64">; +defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64">; +defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64">; +defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64">; +defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64">; +defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64">; +defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64">; +defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64">; +defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64">; +defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64">; +defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64">; +defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64">; +defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64">; +defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64">; +defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64">; +defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64">; 
+defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64">; +defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64">; +defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32">; +defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_LT>; +defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", i32, COND_EQ>; +defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_LE>; +defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_GT>; +defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>; +defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_GE>; +defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32">; +defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32">; +defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32">; +defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32">; +defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32">; +defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32">; +defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32">; +defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">; +defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64">; +defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64">; +defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64">; +defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64">; +defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64">; +defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64">; +defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64">; +defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, 
"V_CMPX_LT_I64">; +defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64">; +defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64">; +defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64">; +defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64">; +defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64">; +defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">; +defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32">; +defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32">; +defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32">; +defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32">; +defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32">; +defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32">; +defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32">; +defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32">; +defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32">; +defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32">; +defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32">; +defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32">; +defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32">; +defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">; +defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64">; +defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64">; +defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64">; +defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64">; +defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64">; +defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64">; +defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm 
V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64">; +defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64">; +defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64">; +defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64">; +defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64">; +defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64">; +defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64">; +defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">; + +let hasSideEffects = 1, Defs = [EXEC] in { +defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32">; +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">; + +let hasSideEffects = 1, Defs = [EXEC] in { +defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">; +} // End hasSideEffects = 1, Defs = [EXEC] + +} // End isCompare = 1 + +//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>; +//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>; +//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>; +def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>; +//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>; +//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>; +//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>; +//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>; +//def BUFFER_LOAD_UBYTE : MUBUF_ <0x00000008, "BUFFER_LOAD_UBYTE", []>; +//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>; +//def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>; +//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>; +//def BUFFER_LOAD_DWORD 
: MUBUF_ <0x0000000c, "BUFFER_LOAD_DWORD", []>; +//def BUFFER_LOAD_DWORDX2 : MUBUF_DWORDX2 <0x0000000d, "BUFFER_LOAD_DWORDX2", []>; +//def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>; +//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>; +//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>; +//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>; +//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>; +//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>; +//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>; +//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>; +//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>; +//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>; +//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>; +//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>; +//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>; +//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>; +//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>; +//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>; +//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>; +//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>; +//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>; +//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>; +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>; +//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>; +//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>; +//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>; +//def BUFFER_ATOMIC_CMPSWAP_X2 
: MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>; +//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>; +//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>; +//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>; +//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>; +//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>; +//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>; +//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>; +//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>; +//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>; +//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>; +//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>; +//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>; +//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>; +//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>; +//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>; +//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>; +//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>; +//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>; +//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>; +//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>; +def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>; +//def TBUFFER_STORE_FORMAT_X : MTBUF_ <0x00000004, "TBUFFER_STORE_FORMAT_X", []>; +//def TBUFFER_STORE_FORMAT_XY : MTBUF_ <0x00000005, "TBUFFER_STORE_FORMAT_XY", []>; +//def TBUFFER_STORE_FORMAT_XYZ 
: MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>; +//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>; + +let mayLoad = 1 in { + +defm S_LOAD_DWORD : SMRD_Helper <0x00000000, "S_LOAD_DWORD", SReg_32>; + +//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>; +defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256>; +//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>; +//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>; +//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>; +//def S_BUFFER_LOAD_DWORDX4 : SMRD_DWORDX4 <0x0000000a, "S_BUFFER_LOAD_DWORDX4", []>; +//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>; +//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>; + +} // mayLoad = 1 + +//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; +//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; +//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>; +//def IMAGE_LOAD_MIP : MIMG_NoPattern_ <"IMAGE_LOAD_MIP", 0x00000001>; +//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>; +//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>; +//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>; +//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>; +//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>; +//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>; +//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>; +//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>; +//def IMAGE_GET_RESINFO : MIMG_NoPattern_ <"IMAGE_GET_RESINFO", 0x0000000e>; +//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ 
<"IMAGE_ATOMIC_SWAP", 0x0000000f>; +//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>; +//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>; +//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>; +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>; +//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>; +//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>; +//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>; +//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>; +//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>; +//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>; +//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>; +//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>; +//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>; +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>; +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>; +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>; +def IMAGE_SAMPLE : MIMG_Load_Helper <0x00000020, "IMAGE_SAMPLE">; +//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>; +def IMAGE_SAMPLE_D : MIMG_Load_Helper <0x00000022, "IMAGE_SAMPLE_D">; +//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>; +def IMAGE_SAMPLE_L : MIMG_Load_Helper <0x00000024, "IMAGE_SAMPLE_L">; +def IMAGE_SAMPLE_B : MIMG_Load_Helper <0x00000025, "IMAGE_SAMPLE_B">; +//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>; +//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>; +def IMAGE_SAMPLE_C : MIMG_Load_Helper <0x00000028, "IMAGE_SAMPLE_C">; +//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ 
<"IMAGE_SAMPLE_C_CL", 0x00000029>; +//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>; +//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>; +def IMAGE_SAMPLE_C_L : MIMG_Load_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">; +def IMAGE_SAMPLE_C_B : MIMG_Load_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">; +//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>; +//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>; +//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>; +//def IMAGE_SAMPLE_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL_O", 0x00000031>; +//def IMAGE_SAMPLE_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_O", 0x00000032>; +//def IMAGE_SAMPLE_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL_O", 0x00000033>; +//def IMAGE_SAMPLE_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_L_O", 0x00000034>; +//def IMAGE_SAMPLE_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_O", 0x00000035>; +//def IMAGE_SAMPLE_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL_O", 0x00000036>; +//def IMAGE_SAMPLE_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ_O", 0x00000037>; +//def IMAGE_SAMPLE_C_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_O", 0x00000038>; +//def IMAGE_SAMPLE_C_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL_O", 0x00000039>; +//def IMAGE_SAMPLE_C_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_O", 0x0000003a>; +//def IMAGE_SAMPLE_C_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL_O", 0x0000003b>; +//def IMAGE_SAMPLE_C_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L_O", 0x0000003c>; +//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>; +//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>; +//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>; +//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>; +//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>; +//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 
<"IMAGE_GATHER4_L", 0x00000044>; +//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>; +//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>; +//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>; +//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>; +//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>; +//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>; +//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>; +//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>; +//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>; +//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>; +//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>; +//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>; +//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>; +//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>; +//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>; +//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>; +//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>; +//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>; +//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>; +//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>; +//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>; +//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>; +//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>; 
+//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>; +//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>; +//def IMAGE_SAMPLE_C_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL", 0x0000006b>; +//def IMAGE_SAMPLE_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_O", 0x0000006c>; +//def IMAGE_SAMPLE_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL_O", 0x0000006d>; +//def IMAGE_SAMPLE_C_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_O", 0x0000006e>; +//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>; +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>; +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>; +//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; + + +let neverHasSideEffects = 1, isMoveImm = 1 in { +defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>; +} // End neverHasSideEffects = 1, isMoveImm = 1 + +defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>; +//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>; +//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>; +defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32", + [(set VReg_32:$dst, (sint_to_fp VSrc_32:$src0))] +>; +//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>; +//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>; +defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", + [(set (i32 VReg_32:$dst), (fp_to_sint VSrc_32:$src0))] +>; +defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; +////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>; +//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>; +//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>; +//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>; +//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>; +//defm V_CVT_F32_F64 : VOP1_32 <0x0000000f, "V_CVT_F32_F64", 
[]>; +//defm V_CVT_F64_F32 : VOP1_64 <0x00000010, "V_CVT_F64_F32", []>; +//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>; +//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>; +//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>; +//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>; +//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; +//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; +defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", + [(set VReg_32:$dst, (AMDGPUfract VSrc_32:$src0))] +>; +defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>; +defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", + [(set VReg_32:$dst, (fceil VSrc_32:$src0))] +>; +defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32", + [(set VReg_32:$dst, (frint VSrc_32:$src0))] +>; +defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32", + [(set VReg_32:$dst, (ffloor VSrc_32:$src0))] +>; +defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32", + [(set VReg_32:$dst, (fexp2 VSrc_32:$src0))] +>; +defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; +defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", + [(set VReg_32:$dst, (flog2 VSrc_32:$src0))] +>; +defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; +defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; +defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", + [(set VReg_32:$dst, (fdiv FP_ONE, VSrc_32:$src0))] +>; +defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; +defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; +defm V_RSQ_LEGACY_F32 : VOP1_32 < + 0x0000002d, "V_RSQ_LEGACY_F32", + [(set VReg_32:$dst, (int_AMDGPU_rsq VSrc_32:$src0))] +>; +defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>; +defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>; +defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>; +defm V_RSQ_F64 : VOP1_64 <0x00000031, 
"V_RSQ_F64", []>; +defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>; +defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", []>; +defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", []>; +defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>; +defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>; +defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>; +defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>; +defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>; +defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>; +defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>; +//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>; +defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>; +defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>; +//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>; +defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>; +//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>; +defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>; +defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>; +defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>; + +def V_INTERP_P1_F32 : VINTRP < + 0x00000000, + (outs VReg_32:$dst), + (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), + "V_INTERP_P1_F32 $dst, $i, $attr_chan, $attr, [$m0]", + []> { + let DisableEncoding = "$m0"; +} + +def V_INTERP_P2_F32 : VINTRP < + 0x00000001, + (outs VReg_32:$dst), + (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), + "V_INTERP_P2_F32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]", + []> { + + let Constraints = "$src0 = $dst"; + let DisableEncoding = "$src0,$m0"; + +} + +def V_INTERP_MOV_F32 : VINTRP < + 0x00000002, + (outs VReg_32:$dst), + (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), + "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr, [$m0]", + []> { + let DisableEncoding = 
"$m0"; +} + +//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>; + +let isTerminator = 1 in { + +def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", + [(IL_retflag)]> { + let SIMM16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; +} + +let isBranch = 1 in { +def S_BRANCH : SOPP < + 0x00000002, (ins brtarget:$target), "S_BRANCH $target", + [(br bb:$target)]> { + let isBarrier = 1; +} + +let DisableEncoding = "$scc" in { +def S_CBRANCH_SCC0 : SOPP < + 0x00000004, (ins brtarget:$target, SCCReg:$scc), + "S_CBRANCH_SCC0 $target", [] +>; +def S_CBRANCH_SCC1 : SOPP < + 0x00000005, (ins brtarget:$target, SCCReg:$scc), + "S_CBRANCH_SCC1 $target", + [] +>; +} // End DisableEncoding = "$scc" + +def S_CBRANCH_VCCZ : SOPP < + 0x00000006, (ins brtarget:$target, VCCReg:$vcc), + "S_CBRANCH_VCCZ $target", + [] +>; +def S_CBRANCH_VCCNZ : SOPP < + 0x00000007, (ins brtarget:$target, VCCReg:$vcc), + "S_CBRANCH_VCCNZ $target", + [] +>; + +let DisableEncoding = "$exec" in { +def S_CBRANCH_EXECZ : SOPP < + 0x00000008, (ins brtarget:$target, EXECReg:$exec), + "S_CBRANCH_EXECZ $target", + [] +>; +def S_CBRANCH_EXECNZ : SOPP < + 0x00000009, (ins brtarget:$target, EXECReg:$exec), + "S_CBRANCH_EXECNZ $target", + [] +>; +} // End DisableEncoding = "$exec" + + +} // End isBranch = 1 +} // End isTerminator = 1 + +//def S_BARRIER : SOPP_ <0x0000000a, "S_BARRIER", []>; +let hasSideEffects = 1 in { +def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16", + [] +>; +} // End hasSideEffects +//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>; +//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>; +//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>; +//def S_SENDMSG : SOPP_ <0x00000010, "S_SENDMSG", []>; +//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>; +//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>; +//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>; +//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>; +//def S_DECPERFLEVEL : SOPP_ <0x00000015, 
"S_DECPERFLEVEL", []>; +//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; + +def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), + (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), + "V_CNDMASK_B32_e32 $dst, $src0, $src1, [$vcc]", + [] +>{ + let DisableEncoding = "$vcc"; +} + +def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), + (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2, + InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), + "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", + [(set (i32 VReg_32:$dst), (select (i1 SSrc_64:$src2), + VSrc_32:$src1, VSrc_32:$src0))] +>; + +//f32 pattern for V_CNDMASK_B32_e64 +def : Pat < + (f32 (select (i1 SSrc_64:$src2), VSrc_32:$src1, VSrc_32:$src0)), + (V_CNDMASK_B32_e64 VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2) +>; + +defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>; +defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>; + +let isCommutable = 1 in { +defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", + [(set VReg_32:$dst, (fadd VSrc_32:$src0, VReg_32:$src1))] +>; +} // End isCommutable = 1 + +defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", + [(set VReg_32:$dst, (fsub VSrc_32:$src0, VReg_32:$src1))] +>; + +defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", []>; +defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>; + +let isCommutable = 1 in { + +defm V_MUL_LEGACY_F32 : VOP2_32 < + 0x00000007, "V_MUL_LEGACY_F32", + [(set VReg_32:$dst, (int_AMDGPU_mul VSrc_32:$src0, VReg_32:$src1))] +>; + +defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", + [(set VReg_32:$dst, (fmul VSrc_32:$src0, VReg_32:$src1))] +>; + +} // End isCommutable = 1 + +//defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>; +//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>; +//defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>; +//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, 
"V_MUL_HI_U32_U24", []>; + +let isCommutable = 1 in { + +defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32", + [(set VReg_32:$dst, (AMDGPUfmin VSrc_32:$src0, VReg_32:$src1))] +>; + +defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", + [(set VReg_32:$dst, (AMDGPUfmax VSrc_32:$src0, VReg_32:$src1))] +>; + +defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>; +defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>; +defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>; +defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>; +defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>; +defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>; + +} // End isCommutable = 1 + +defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>; +defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>; +defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>; +defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>; +defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>; +defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>; + +let isCommutable = 1 in { + +defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", + [(set VReg_32:$dst, (and VSrc_32:$src0, VReg_32:$src1))] +>; +defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", + [(set VReg_32:$dst, (or VSrc_32:$src0, VReg_32:$src1))] +>; +defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", + [(set VReg_32:$dst, (xor VSrc_32:$src0, VReg_32:$src1))] +>; + +} // End isCommutable = 1 + +defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>; +defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>; +defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>; +defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>; +//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>; +//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>; +//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>; +let Defs = [VCC] in { // Carry-out goes to VCC + +let 
isCommutable = 1 in { +defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", + [(set VReg_32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))] +>; +} // End isCommutable = 1 + +defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", + [(set VReg_32:$dst, (sub (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))] +>; + +defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", []>; +let Uses = [VCC] in { // Carry-in comes from VCC +defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", []>; +defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", []>; +defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", []>; +} // End Uses = [VCC] +} // End Defs = [VCC] +defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>; +////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>; +////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>; +////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", + [(set VReg_32:$dst, (int_SI_packf16 VSrc_32:$src0, VReg_32:$src1))] +>; +////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>; +////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>; +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>; +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", 
[]>; +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>; +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>; +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>; +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>; +//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>; + +let neverHasSideEffects = 1 in { + +def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; +def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>; +//def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", []>; +//def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", []>; + +} // End neverHasSideEffects +def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; +def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; +def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>; +def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; +def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>; +def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>; +def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>; +def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>; +def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>; +//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>; +def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>; +def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>; +def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; +////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>; +////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>; +////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>; +////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>; +////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>; +////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>; +////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>; +////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>; +////def V_MED3_U32 : 
VOP3_MED3 <0x00000159, "V_MED3_U32", []>; +//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>; +//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>; +//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; +def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; +////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; +def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; +def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; +def V_LSHL_B64 : VOP3_64 <0x00000161, "V_LSHL_B64", []>; +def V_LSHR_B64 : VOP3_64 <0x00000162, "V_LSHR_B64", []>; +def V_ASHR_I64 : VOP3_64 <0x00000163, "V_ASHR_I64", []>; +def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>; +def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>; +def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>; +def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>; +def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>; +def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; +def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; +def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; +def : Pat < + (mul VSrc_32:$src0, VReg_32:$src1), + (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0) +>; +def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; +def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; +def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; +def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; +def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>; +//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; +//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; +//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; +def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>; +def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>; +def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>; +def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", []>; +def 
S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", []>; +def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", []>; +def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", []>; +def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>; +def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>; +def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>; +def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>; + +def S_CSELECT_B32 : SOP2 < + 0x0000000a, (outs SReg_32:$dst), + (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", + [(set (i32 SReg_32:$dst), (select (i1 SCCReg:$scc), + SReg_32:$src0, SReg_32:$src1))] +>; + +def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; + +// f32 pattern for S_CSELECT_B32 +def : Pat < + (f32 (select (i1 SCCReg:$scc), SReg_32:$src0, SReg_32:$src1)), + (S_CSELECT_B32 SReg_32:$src0, SReg_32:$src1, SCCReg:$scc) +>; + +def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>; + +def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", + [(set SReg_64:$dst, (i64 (and SSrc_64:$src0, SSrc_64:$src1)))] +>; + +def : Pat < + (i1 (and SSrc_64:$src0, SSrc_64:$src1)), + (S_AND_B64 SSrc_64:$src0, SSrc_64:$src1) +>; + +def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>; +def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>; +def : Pat < + (i1 (or SSrc_64:$src0, SSrc_64:$src1)), + (S_OR_B64 SSrc_64:$src0, SSrc_64:$src1) +>; +def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>; +def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>; +def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; +def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>; +def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>; +def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>; +def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>; +def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>; +def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>; +def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>; +def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>; +def 
S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>; +def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", []>; +def S_LSHL_B64 : SOP2_64 <0x0000001f, "S_LSHL_B64", []>; +def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", []>; +def S_LSHR_B64 : SOP2_64 <0x00000021, "S_LSHR_B64", []>; +def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", []>; +def S_ASHR_I64 : SOP2_64 <0x00000023, "S_ASHR_I64", []>; +def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>; +def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>; +def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>; +def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>; +def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>; +def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>; +def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>; +//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; +def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; + +let isCodeGenOnly = 1, isPseudo = 1 in { + +def LOAD_CONST : AMDGPUShaderInst < + (outs GPRF32:$dst), + (ins i32imm:$src), + "LOAD_CONST $dst, $src", + [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))] +>; + +let usesCustomInserter = 1 in { + +def SI_WQM : InstSI < + (outs), + (ins), + "SI_WQM", + [(int_SI_wqm)] +>; + +} // end usesCustomInserter + +// SI Pseudo instructions. These are used by the CFG structurizer pass +// and should be lowered to ISA instructions prior to codegen. 
+ +let mayLoad = 1, mayStore = 1, hasSideEffects = 1, + Uses = [EXEC], Defs = [EXEC] in { + +let isBranch = 1, isTerminator = 1 in { + +def SI_IF : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$vcc, brtarget:$target), + "SI_IF $dst, $vcc, $target", + [(set SReg_64:$dst, (int_SI_if SReg_64:$vcc, bb:$target))] +>; + +def SI_ELSE : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src, brtarget:$target), + "SI_ELSE $dst, $src, $target", + [(set SReg_64:$dst, (int_SI_else SReg_64:$src, bb:$target))]> { + + let Constraints = "$src = $dst"; +} + +def SI_LOOP : InstSI < + (outs), + (ins SReg_64:$saved, brtarget:$target), + "SI_LOOP $saved, $target", + [(int_SI_loop SReg_64:$saved, bb:$target)] +>; + +} // end isBranch = 1, isTerminator = 1 + +def SI_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src), + "SI_BREAK $dst, $src", + [(set SReg_64:$dst, (int_SI_break SReg_64:$src))] +>; + +def SI_IF_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$vcc, SReg_64:$src), + "SI_IF_BREAK $dst, $vcc, $src", + [(set SReg_64:$dst, (int_SI_if_break SReg_64:$vcc, SReg_64:$src))] +>; + +def SI_ELSE_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src0, SReg_64:$src1), + "SI_ELSE_BREAK $dst, $src0, $src1", + [(set SReg_64:$dst, (int_SI_else_break SReg_64:$src0, SReg_64:$src1))] +>; + +def SI_END_CF : InstSI < + (outs), + (ins SReg_64:$saved), + "SI_END_CF $saved", + [(int_SI_end_cf SReg_64:$saved)] +>; + +def SI_KILL : InstSI < + (outs), + (ins VReg_32:$src), + "SI_KILL $src", + [(int_AMDGPU_kill VReg_32:$src)] +>; + +} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 + // Uses = [EXEC], Defs = [EXEC] + +} // end isCodeGenOnly, isPseudo + +def : Pat< + (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2), + (V_CNDMASK_B32_e64 VReg_32:$src2, VReg_32:$src1, (V_CMP_GT_F32_e64 0, VReg_32:$src0)) +>; + +def : Pat < + (int_AMDGPU_kilp), + (SI_KILL (V_MOV_B32_e32 0xbf800000)) +>; + +/* int_SI_vs_load_input */ +def : Pat< + (int_SI_vs_load_input SReg_128:$tlst, 
IMM12bit:$attr_offset, + VReg_32:$buf_idx_vgpr), + (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0, + VReg_32:$buf_idx_vgpr, SReg_128:$tlst, + 0, 0, 0) +>; + +/* int_SI_export */ +def : Pat < + (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, + VReg_32:$src0,VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), + (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, + VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3) +>; + + +/* int_SI_sample for simple 1D texture lookup */ +def : Pat < + (int_SI_sample imm:$writemask, (v1i32 VReg_32:$addr), + SReg_256:$rsrc, SReg_128:$sampler, imm), + (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0, + (i32 (COPY_TO_REGCLASS VReg_32:$addr, VReg_32)), + SReg_256:$rsrc, SReg_128:$sampler) +>; + +class SamplePattern<Intrinsic name, MIMG opcode, RegisterClass addr_class, + ValueType addr_type> : Pat < + (name imm:$writemask, (addr_type addr_class:$addr), + SReg_256:$rsrc, SReg_128:$sampler, imm), + (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0, + (EXTRACT_SUBREG addr_class:$addr, sub0), + SReg_256:$rsrc, SReg_128:$sampler) +>; + +class SampleRectPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class, + ValueType addr_type> : Pat < + (name imm:$writemask, (addr_type addr_class:$addr), + SReg_256:$rsrc, SReg_128:$sampler, TEX_RECT), + (opcode imm:$writemask, 1, 0, 0, 0, 0, 0, 0, + (EXTRACT_SUBREG addr_class:$addr, sub0), + SReg_256:$rsrc, SReg_128:$sampler) +>; + +class SampleArrayPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class, + ValueType addr_type> : Pat < + (name imm:$writemask, (addr_type addr_class:$addr), + SReg_256:$rsrc, SReg_128:$sampler, TEX_ARRAY), + (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0, + (EXTRACT_SUBREG addr_class:$addr, sub0), + SReg_256:$rsrc, SReg_128:$sampler) +>; + +class SampleShadowPattern<Intrinsic name, MIMG opcode, + RegisterClass addr_class, ValueType addr_type> : Pat < + (name imm:$writemask, (addr_type addr_class:$addr), + SReg_256:$rsrc, 
SReg_128:$sampler, TEX_SHADOW), + (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0, + (EXTRACT_SUBREG addr_class:$addr, sub0), + SReg_256:$rsrc, SReg_128:$sampler) +>; + +class SampleShadowArrayPattern<Intrinsic name, MIMG opcode, + RegisterClass addr_class, ValueType addr_type> : Pat < + (name imm:$writemask, (addr_type addr_class:$addr), + SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW_ARRAY), + (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0, + (EXTRACT_SUBREG addr_class:$addr, sub0), + SReg_256:$rsrc, SReg_128:$sampler) +>; + +/* int_SI_sample* for texture lookups consuming more address parameters */ +multiclass SamplePatterns<RegisterClass addr_class, ValueType addr_type> { + def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>; + def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>; + def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>; + def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>; + def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>; + + def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>; + def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>; + def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>; + def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>; + + def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>; + def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>; + def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>; + def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>; +} + +defm : SamplePatterns<VReg_64, v2i32>; +defm : SamplePatterns<VReg_128, v4i32>; +defm : SamplePatterns<VReg_256, v8i32>; +defm : SamplePatterns<VReg_512, v16i32>; + +def : Extract_Element <f32, v4f32, VReg_128, 0, 
sub0>; +def : Extract_Element <f32, v4f32, VReg_128, 1, sub1>; +def : Extract_Element <f32, v4f32, VReg_128, 2, sub2>; +def : Extract_Element <f32, v4f32, VReg_128, 3, sub3>; + +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sub0>; +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sub1>; +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sub2>; +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sub3>; + +def : Vector1_Build <v1i32, VReg_32, i32, VReg_32>; +def : Vector2_Build <v2i32, VReg_64, i32, VReg_32>; +def : Vector_Build <v4f32, VReg_128, f32, VReg_32>; +def : Vector_Build <v4i32, VReg_128, i32, VReg_32>; +def : Vector8_Build <v8i32, VReg_256, i32, VReg_32>; +def : Vector16_Build <v16i32, VReg_512, i32, VReg_32>; + +def : BitConvert <i32, f32, SReg_32>; +def : BitConvert <i32, f32, VReg_32>; + +def : BitConvert <f32, i32, SReg_32>; +def : BitConvert <f32, i32, VReg_32>; + +/********** =================== **********/ +/********** Src & Dst modifiers **********/ +/********** =================== **********/ + +def : Pat < + (int_AMDIL_clamp VReg_32:$src, (f32 FP_ZERO), (f32 FP_ONE)), + (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */), + 0 /* ABS */, 1 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */) +>; + +def : Pat < + (fabs VReg_32:$src), + (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */), + 1 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */) +>; + +def : Pat < + (fneg VReg_32:$src), + (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */), + 0 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 1 /* NEG */) +>; + +/********** ================== **********/ +/********** Immediate Patterns **********/ +/********** ================== **********/ + +def : Pat < + (i32 imm:$imm), + (V_MOV_B32_e32 imm:$imm) +>; + +def : Pat < + (f32 fpimm:$imm), + (V_MOV_B32_e32 fpimm:$imm) +>; + +def : Pat < + (i1 imm:$imm), + (S_MOV_B64 imm:$imm) +>; + +def : Pat < + (i64 InlineImm<i64>:$imm), + (S_MOV_B64 InlineImm<i64>:$imm) +>; + +// i64 immediates aren't supported in 
hardware, split it into two 32bit values +def : Pat < + (i64 imm:$imm), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (S_MOV_B32 (i32 (LO32 imm:$imm))), sub0), + (S_MOV_B32 (i32 (HI32 imm:$imm))), sub1) +>; + +/********** ===================== **********/ +/********** Interpolation Paterns **********/ +/********** ===================== **********/ + +def : Pat < + (int_SI_fs_constant imm:$attr_chan, imm:$attr, M0Reg:$params), + (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, M0Reg:$params) +>; + +def : Pat < + (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, VReg_64:$ij), + (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG VReg_64:$ij, sub0), + imm:$attr_chan, imm:$attr, M0Reg:$params), + (EXTRACT_SUBREG VReg_64:$ij, sub1), + imm:$attr_chan, imm:$attr, M0Reg:$params) +>; + +/********** ================== **********/ +/********** Intrinsic Patterns **********/ +/********** ================== **********/ + +/* llvm.AMDGPU.pow */ +/* XXX: We are using IEEE MUL, not the 0 * anything = 0 MUL, is this correct? 
*/ +def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_F32_e32, VReg_32>; + +def : Pat < + (int_AMDGPU_div VSrc_32:$src0, VSrc_32:$src1), + (V_MUL_LEGACY_F32_e32 VSrc_32:$src0, (V_RCP_LEGACY_F32_e32 VSrc_32:$src1)) +>; + +def : Pat< + (fdiv VSrc_32:$src0, VSrc_32:$src1), + (V_MUL_F32_e32 VSrc_32:$src0, (V_RCP_F32_e32 VSrc_32:$src1)) +>; + +def : Pat < + (fcos VSrc_32:$src0), + (V_COS_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) +>; + +def : Pat < + (fsin VSrc_32:$src0), + (V_SIN_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) +>; + +def : Pat < + (int_AMDGPU_cube VReg_128:$src), + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), + (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0), + (EXTRACT_SUBREG VReg_128:$src, sub1), + (EXTRACT_SUBREG VReg_128:$src, sub2), + 0, 0, 0, 0), sub0), + (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0), + (EXTRACT_SUBREG VReg_128:$src, sub1), + (EXTRACT_SUBREG VReg_128:$src, sub2), + 0, 0, 0, 0), sub1), + (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sub0), + (EXTRACT_SUBREG VReg_128:$src, sub1), + (EXTRACT_SUBREG VReg_128:$src, sub2), + 0, 0, 0, 0), sub2), + (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sub0), + (EXTRACT_SUBREG VReg_128:$src, sub1), + (EXTRACT_SUBREG VReg_128:$src, sub2), + 0, 0, 0, 0), sub3) +>; + +def : Pat < + (i32 (sext (i1 SReg_64:$src0))), + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0) +>; + +/********** ================== **********/ +/********** VOP3 Patterns **********/ +/********** ================== **********/ + +def : Pat <(f32 (fadd (fmul VSrc_32:$src0, VSrc_32:$src1), VSrc_32:$src2)), + (V_MAD_F32 VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2, + 0, 0, 0, 0)>; + +/********** ================== **********/ +/********** SMRD Patterns **********/ +/********** ================== **********/ + +multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { + // 1. 
Offset as 8bit DWORD immediate + def : Pat < + (constant_load (SIadd64bit32bit SReg_64:$sbase, IMM8bitDWORD:$offset)), + (vt (Instr_IMM SReg_64:$sbase, IMM8bitDWORD:$offset)) + >; + + // 2. Offset loaded in an 32bit SGPR + def : Pat < + (constant_load (SIadd64bit32bit SReg_64:$sbase, imm:$offset)), + (vt (Instr_SGPR SReg_64:$sbase, (S_MOV_B32 imm:$offset))) + >; + + // 3. No offset at all + def : Pat < + (constant_load SReg_64:$sbase), + (vt (Instr_IMM SReg_64:$sbase, 0)) + >; +} + +defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; +defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>; +defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; + +} // End isSI predicate diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td new file mode 100644 index 0000000000..7c23d1706b --- /dev/null +++ b/lib/Target/R600/SIIntrinsics.td @@ -0,0 +1,44 @@ +//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// SI Intrinsic Definitions +// +//===----------------------------------------------------------------------===// + + +let TargetPrefix = "SI", isTarget = 1 in { + + def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; + /* XXX: We may need a seperate intrinsic here for loading integer values */ + def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], []>; + def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ; + def int_SI_wqm : Intrinsic <[], [], []>; + + class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrReadMem]>; + + def int_SI_sample : Sample; + def int_SI_sampleb : Sample; + def int_SI_samplel : Sample; + + /* Interpolation Intrinsics */ + + def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>; + def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrReadMem]>; + + /* Control flow Intrinsics */ + + def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; + def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>; + def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>; + def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>; + def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>; + def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>; + def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>; +} diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp 
new file mode 100644 index 0000000000..b215aa22db --- /dev/null +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -0,0 +1,371 @@ +//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass lowers the pseudo control flow instructions to real +/// machine instructions. +/// +/// All control flow is handled using predicated instructions and +/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector +/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs +/// by writing to the 64-bit EXEC register (each bit corresponds to a +/// single vector ALU). Typically, for predicates, a vector ALU will write +/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each +/// Vector ALU) and then the Scalar ALU will AND the VCC register with the +/// EXEC to update the predicates. +/// +/// For example: +/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 +/// %SGPR0 = SI_IF %VCC +/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 +/// %SGPR0 = SI_ELSE %SGPR0 +/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 +/// SI_END_CF %SGPR0 +/// +/// becomes: +/// +/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask +/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask +/// S_CBRANCH_EXECZ label0 // This instruction is an optional +/// // optimization which allows us to +/// // branch if all the bits of +/// // EXEC are zero. 
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch +/// +/// label0: +/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Else block +/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask +/// S_CBRANCH_EXECZ label1 // Use our branch optimization +/// // instruction again. +/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block +/// label1: +/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +class SILowerControlFlowPass : public MachineFunctionPass { + +private: + static const unsigned SkipThreshold = 12; + + static char ID; + const TargetInstrInfo *TII; + + bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); + + void Skip(MachineInstr &From, MachineOperand &To); + void SkipIfDead(MachineInstr &MI); + + void If(MachineInstr &MI); + void Else(MachineInstr &MI); + void Break(MachineInstr &MI); + void IfBreak(MachineInstr &MI); + void ElseBreak(MachineInstr &MI); + void Loop(MachineInstr &MI); + void EndCf(MachineInstr &MI); + + void Kill(MachineInstr &MI); + void Branch(MachineInstr &MI); + +public: + SILowerControlFlowPass(TargetMachine &tm) : + MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { + return "SI Lower control flow instructions"; + } + +}; + +} // End anonymous namespace + +char SILowerControlFlowPass::ID = 0; + +FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { + return new SILowerControlFlowPass(tm); +} + +bool 
SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From, + MachineBasicBlock *To) { + + unsigned NumInstr = 0; + + for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty(); + MBB = *MBB->succ_begin()) { + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + NumInstr < SkipThreshold && I != E; ++I) { + + if (I->isBundle() || !I->isBundled()) + if (++NumInstr >= SkipThreshold) + return true; + } + } + + return false; +} + +void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { + + if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB())) + return; + + DebugLoc DL = From.getDebugLoc(); + BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) + .addOperand(To) + .addReg(AMDGPU::EXEC); +} + +void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + if (!shouldSkip(&MBB, &MBB.getParent()->back())) + return; + + MachineBasicBlock::iterator Insert = &MI; + ++Insert; + + // If the exec mask is non-zero, skip the next two instructions + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(3) + .addReg(AMDGPU::EXEC); + + // Exec mask is zero: Export to NULL target... + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0); + + // ... 
and terminate wavefront + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); +} + +void SILowerControlFlowPass::If(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg) + .addReg(Vcc); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + Skip(MI, MI.getOperand(2)); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Else(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) + .addReg(Src); // Saved EXEC + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Dst); + + Skip(MI, MI.getOperand(2)); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Break(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Vcc) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Saved = 
MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Saved) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Loop(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Src = MI.getOperand(0).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addOperand(MI.getOperand(1)) + .addReg(AMDGPU::EXEC); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::EndCf(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + + BuildMI(MBB, MBB.getFirstNonPHI(), DL, + TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Branch(MachineInstr &MI) { + MachineBasicBlock *Next = MI.getParent()->getNextNode(); + MachineBasicBlock *Target = MI.getOperand(0).getMBB(); + if (Target == Next) + MI.eraseFromParent(); + else + assert(0); +} + +void SILowerControlFlowPass::Kill(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + // Kill is only allowed in pixel shaders + assert(MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType == + ShaderType::PIXEL); + + // Clear this pixel from the exec mask if the operand is negative + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) + .addImm(0) + .addOperand(MI.getOperand(0)); + + MI.eraseFromParent(); +} + +bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { + + bool HaveKill = false; + unsigned Depth = 0; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = 
MBB.begin(), Next = llvm::next(I); + I != MBB.end(); I = Next) { + + Next = llvm::next(I); + MachineInstr &MI = *I; + switch (MI.getOpcode()) { + default: break; + case AMDGPU::SI_IF: + ++Depth; + If(MI); + break; + + case AMDGPU::SI_ELSE: + Else(MI); + break; + + case AMDGPU::SI_BREAK: + Break(MI); + break; + + case AMDGPU::SI_IF_BREAK: + IfBreak(MI); + break; + + case AMDGPU::SI_ELSE_BREAK: + ElseBreak(MI); + break; + + case AMDGPU::SI_LOOP: + ++Depth; + Loop(MI); + break; + + case AMDGPU::SI_END_CF: + if (--Depth == 0 && HaveKill) { + SkipIfDead(MI); + HaveKill = false; + } + EndCf(MI); + break; + + case AMDGPU::SI_KILL: + if (Depth == 0) + SkipIfDead(MI); + else + HaveKill = true; + Kill(MI); + break; + + case AMDGPU::S_BRANCH: + Branch(MI); + break; + } + } + } + + return true; +} diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp new file mode 100644 index 0000000000..1a4e4cbbbb --- /dev/null +++ b/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -0,0 +1,34 @@ +//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// + + +#include "SIMachineFunctionInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" + +using namespace llvm; + +const char *SIMachineFunctionInfo::ShaderTypeAttribute = "ShaderType"; + +SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) + : MachineFunctionInfo(), + ShaderType(0), + PSInputAddr(0) { + + AttributeSet Set = MF.getFunction()->getAttributes(); + Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, + ShaderTypeAttribute); + + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + if (Str.getAsInteger(0, ShaderType)) + llvm_unreachable("Can't parse shader type!"); + } +} diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h new file mode 100644 index 0000000000..91a809b124 --- /dev/null +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -0,0 +1,36 @@ +//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + + +#ifndef SIMACHINEFUNCTIONINFO_H_ +#define SIMACHINEFUNCTIONINFO_H_ + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +/// This class keeps track of the SPI_PS_INPUT_ADDR config register, which +/// tells the hardware which interpolation parameters to load. 
+class SIMachineFunctionInfo : public MachineFunctionInfo { +public: + static const char *ShaderTypeAttribute; + + SIMachineFunctionInfo(const MachineFunction &MF); + unsigned ShaderType; + unsigned PSInputAddr; +}; + +} // End namespace llvm + + +#endif // SIMACHINEFUNCTIONINFO_H_ diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp new file mode 100644 index 0000000000..88275c523f --- /dev/null +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -0,0 +1,48 @@ +//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + + +#include "SIRegisterInfo.h" +#include "AMDGPUTargetMachine.h" + +using namespace llvm; + +SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm, + const TargetInstrInfo &tii) +: AMDGPURegisterInfo(tm, tii), + TM(tm), + TII(tii) + { } + +BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + return Reserved; +} + +const TargetRegisterClass * +SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const { + switch (rc->getID()) { + case AMDGPU::GPRF32RegClassID: + return &AMDGPU::VReg_32RegClass; + default: return rc; + } +} + +const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( + MVT VT) const { + switch(VT.SimpleTy) { + default: + case MVT::i32: return &AMDGPU::VReg_32RegClass; + } +} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h new file mode 100644 index 0000000000..40171e4450 --- /dev/null +++ b/lib/Target/R600/SIRegisterInfo.h @@ -0,0 +1,47 @@ +//===-- SIRegisterInfo.h - 
SI Register Info Interface ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for SIRegisterInfo +// +//===----------------------------------------------------------------------===// + + +#ifndef SIREGISTERINFO_H_ +#define SIREGISTERINFO_H_ + +#include "AMDGPURegisterInfo.h" + +namespace llvm { + +class AMDGPUTargetMachine; +class TargetInstrInfo; + +struct SIRegisterInfo : public AMDGPURegisterInfo { + AMDGPUTargetMachine &TM; + const TargetInstrInfo &TII; + + SIRegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii); + + virtual BitVector getReservedRegs(const MachineFunction &MF) const; + + /// \param RC is an AMDIL reg class. + /// + /// \returns the SI register class that is equivalent to \p RC. + virtual const TargetRegisterClass * + getISARegClass(const TargetRegisterClass *RC) const; + + /// \brief get the register class of the specified type to use in the + /// CFGStructurizer + virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const; +}; + +} // End namespace llvm + +#endif // SIREGISTERINFO_H_ diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td new file mode 100644 index 0000000000..3dcad506d2 --- /dev/null +++ b/lib/Target/R600/SIRegisterInfo.td @@ -0,0 +1,182 @@ +//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the SI registers +//===----------------------------------------------------------------------===// + +class SIReg <string n, bits<16> encoding = 0> : Register<n> { + let Namespace = "AMDGPU"; + let HWEncoding = encoding; +} + +// Special Registers +def VCC : SIReg<"VCC", 106>; +def EXEC : SIReg<"EXEC", 126>; +def SCC : SIReg<"SCC", 253>; +def M0 : SIReg <"M0", 124>; + +// SGPR registers +foreach Index = 0-101 in { + def SGPR#Index : SIReg <"SGPR"#Index, Index>; +} + +// VGPR registers +foreach Index = 0-255 in { + def VGPR#Index : SIReg <"VGPR"#Index, Index> { + let HWEncoding{8} = 1; + } +} + +//===----------------------------------------------------------------------===// +// Groupings using register classes and tuples +//===----------------------------------------------------------------------===// + +// SGPR 32-bit registers +def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32, + (add (sequence "SGPR%u", 0, 101))>; + +// SGPR 64-bit registers +def SGPR_64 : RegisterTuples<[sub0, sub1], + [(add (decimate (trunc SGPR_32, 101), 2)), + (add (decimate (shl SGPR_32, 1), 2))]>; + +// SGPR 128-bit registers +def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (decimate (trunc SGPR_32, 99), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4))]>; + +// SGPR 256-bit registers +def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], + [(add (decimate (trunc SGPR_32, 95), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4)), + (add (decimate (shl SGPR_32, 5), 4)), + (add (decimate (shl SGPR_32, 6), 4)), + (add (decimate (shl SGPR_32, 7), 4))]>; + +// SGPR 
512-bit registers +def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], + [(add (decimate (trunc SGPR_32, 87), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4)), + (add (decimate (shl SGPR_32, 5), 4)), + (add (decimate (shl SGPR_32, 6), 4)), + (add (decimate (shl SGPR_32, 7), 4)), + (add (decimate (shl SGPR_32, 8), 4)), + (add (decimate (shl SGPR_32, 9), 4)), + (add (decimate (shl SGPR_32, 10), 4)), + (add (decimate (shl SGPR_32, 11), 4)), + (add (decimate (shl SGPR_32, 12), 4)), + (add (decimate (shl SGPR_32, 13), 4)), + (add (decimate (shl SGPR_32, 14), 4)), + (add (decimate (shl SGPR_32, 15), 4))]>; + +// VGPR 32-bit registers +def VGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32, + (add (sequence "VGPR%u", 0, 255))>; + +// VGPR 64-bit registers +def VGPR_64 : RegisterTuples<[sub0, sub1], + [(add (trunc VGPR_32, 255)), + (add (shl VGPR_32, 1))]>; + +// VGPR 128-bit registers +def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (trunc VGPR_32, 253)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3))]>; + +// VGPR 256-bit registers +def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], + [(add (trunc VGPR_32, 249)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4)), + (add (shl VGPR_32, 5)), + (add (shl VGPR_32, 6)), + (add (shl VGPR_32, 7))]>; + +// VGPR 512-bit registers +def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], + [(add (trunc VGPR_32, 241)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4)), + (add (shl VGPR_32, 5)), + (add (shl VGPR_32, 6)), + (add (shl VGPR_32, 7)), + (add (shl VGPR_32, 8)), + (add (shl 
VGPR_32, 9)), + (add (shl VGPR_32, 10)), + (add (shl VGPR_32, 11)), + (add (shl VGPR_32, 12)), + (add (shl VGPR_32, 13)), + (add (shl VGPR_32, 14)), + (add (shl VGPR_32, 15))]>; + +//===----------------------------------------------------------------------===// +// Register classes used as source and destination +//===----------------------------------------------------------------------===// + +// Special register classes for predicates and the M0 register +def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)>; +def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>; +def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>; +def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>; + +// Register class for all scalar registers (SGPRs + Special Registers) +def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, + (add SGPR_32, M0Reg) +>; + +def SReg_64 : RegisterClass<"AMDGPU", [i64, i1], 64, + (add SGPR_64, VCCReg, EXECReg) +>; + +def SReg_128 : RegisterClass<"AMDGPU", [v16i8], 128, (add SGPR_128)>; + +def SReg_256 : RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>; + +def SReg_512 : RegisterClass<"AMDGPU", [v64i8], 512, (add SGPR_512)>; + +// Register class for all vector registers (VGPRs + Interploation Registers) +def VReg_32 : RegisterClass<"AMDGPU", [f32, i32, v1i32], 32, (add VGPR_32)>; + +def VReg_64 : RegisterClass<"AMDGPU", [i64, v2i32], 64, (add VGPR_64)>; + +def VReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add VGPR_128)>; + +def VReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add VGPR_256)>; + +def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>; + +//===----------------------------------------------------------------------===// +// [SV]Src_* register classes, can have either an immediate or an register +//===----------------------------------------------------------------------===// + +def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>; + +def SSrc_64 : 
RegisterClass<"AMDGPU", [i64, i1], 64, (add SReg_64)>; + +def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>; + +def VSrc_64 : RegisterClass<"AMDGPU", [i64], 64, (add VReg_64, SReg_64)>; + diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td new file mode 100644 index 0000000000..28b65b8258 --- /dev/null +++ b/lib/Target/R600/SISchedule.td @@ -0,0 +1,15 @@ +//===-- SISchedule.td - SI Scheduling definitions -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: This is just a placeholder for now. +// +//===----------------------------------------------------------------------===// + + +def SI_Itin : ProcessorItineraries <[], [], []>; diff --git a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp new file mode 100644 index 0000000000..46b1f18c62 --- /dev/null +++ b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp @@ -0,0 +1,26 @@ +//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +/// \brief The target for the AMDGPU backend +Target llvm::TheAMDGPUTarget; + +/// \brief Extern function to initialize the targets for the AMDGPU backend +extern "C" void LLVMInitializeR600TargetInfo() { + RegisterTarget<Triple::r600, false> + R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX"); +} diff --git a/lib/Target/R600/TargetInfo/CMakeLists.txt b/lib/Target/R600/TargetInfo/CMakeLists.txt new file mode 100644 index 0000000000..3d1584eba3 --- /dev/null +++ b/lib/Target/R600/TargetInfo/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMR600Info + AMDGPUTargetInfo.cpp + ) + +add_dependencies(LLVMR600Info AMDGPUCommonTableGen intrinsics_gen) diff --git a/lib/Target/R600/TargetInfo/LLVMBuild.txt b/lib/Target/R600/TargetInfo/LLVMBuild.txt new file mode 100644 index 0000000000..4c6fea4aa0 --- /dev/null +++ b/lib/Target/R600/TargetInfo/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/R600/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = R600Info +parent = R600 +required_libraries = MC Support +add_to_library_groups = R600 diff --git a/lib/Target/R600/TargetInfo/Makefile b/lib/Target/R600/TargetInfo/Makefile new file mode 100644 index 0000000000..b8ac4e7823 --- /dev/null +++ b/lib/Target/R600/TargetInfo/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/R600/TargetInfo/Makefile ------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600Info + +# Hack: we need to include 'main' target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common |