diff options
-rw-r--r-- | include/llvm/Target/TargetInstrInfo.h | 68 | ||||
-rw-r--r-- | lib/CodeGen/ExecutionDepsFix.cpp | 27 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrInfo.cpp | 52 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrInfo.h | 5 | ||||
-rw-r--r-- | test/CodeGen/X86/sse-domains.ll | 41 |
5 files changed, 193 insertions, 0 deletions
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h index 07f614d61d..590fc1e8f7 100644 --- a/include/llvm/Target/TargetInstrInfo.h +++ b/include/llvm/Target/TargetInstrInfo.h @@ -718,6 +718,74 @@ public: /// virtual void setExecutionDomain(MachineInstr *MI, unsigned Domain) const {} + + /// getPartialRegUpdateClearance - Returns the preferred minimum clearance + /// before an instruction with an unwanted partial register update. + /// + /// Some instructions only write part of a register, and implicitly need to + /// read the other parts of the register. This may cause unwanted stalls + /// preventing otherwise unrelated instructions from executing in parallel in + /// an out-of-order CPU. + /// + /// For example, the x86 instruction cvtsi2ss writes its result to bits + /// [31:0] of the destination xmm register. Bits [127:32] are unaffected, so + /// the instruction needs to wait for the old value of the register to become + /// available: + /// + /// addps %xmm1, %xmm0 + /// movaps %xmm0, (%rax) + /// cvtsi2ss %rbx, %xmm0 + /// + /// In the code above, the cvtsi2ss instruction needs to wait for the addps + /// instruction before it can issue, even though the high bits of %xmm0 + /// probably aren't needed. + /// + /// This hook returns the preferred clearance before MI, measured in + /// instructions. Other defs of MI's operand OpNum are avoided in the last N + /// instructions before MI. It should only return a positive value for + /// unwanted dependencies. If the old bits of the defined register have + /// useful values, or if MI is determined to otherwise read the dependency, + /// the hook should return 0. + /// + /// The unwanted dependency may be handled by: + /// + /// 1. Allocating the same register for an MI def and use. That makes the + /// unwanted dependency identical to a required dependency. + /// + /// 2. Allocating a register for the def that has no defs in the previous N + /// instructions. + /// + /// 3. 
Calling breakPartialRegDependency() with the same arguments. This + /// allows the target to insert a dependency breaking instruction. + /// + virtual unsigned + getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + // The default implementation returns 0 for no partial register dependency. + return 0; + } + + /// breakPartialRegDependency - Insert a dependency-breaking instruction + /// before MI to eliminate an unwanted dependency on OpNum. + /// + /// If it wasn't possible to avoid a def in the last N instructions before MI + /// (see getPartialRegUpdateClearance), this hook will be called to break the + /// unwanted dependency. + /// + /// On x86, an xorps instruction can be used as a dependency breaker: + /// + /// addps %xmm1, %xmm0 + /// movaps %xmm0, (%rax) + /// xorps %xmm0, %xmm0 + /// cvtsi2ss %rbx, %xmm0 + /// + /// An <imp-kill> operand should be added to MI if an instruction was + /// inserted. This ties the instructions together in the post-ra scheduler. + /// + virtual void + breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const {} + private: int CallFrameSetupOpcode, CallFrameDestroyOpcode; }; diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp index d094411116..050edce2ec 100644 --- a/lib/CodeGen/ExecutionDepsFix.cpp +++ b/lib/CodeGen/ExecutionDepsFix.cpp @@ -471,11 +471,34 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr << '\t' << *MI); + // How many instructions since rx was last written? + unsigned Clearance = CurInstr - LiveRegs[rx].Def; LiveRegs[rx].Def = CurInstr; // Kill off domains redefined by generic instructions. if (Kill) kill(rx); + + // Verify clearance before partial register updates. 
+ unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI); + if (!Pref) + continue; + DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref); + if (Pref > Clearance) { + DEBUG(dbgs() << ": Break dependency.\n"); + TII->breakPartialRegDependency(MI, i, TRI); + continue; + } + + // The current clearance seems OK, but we may be ignoring a def from a + // back-edge. + if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) { + DEBUG(dbgs() << ": OK.\n"); + continue; + } + + // A def from an unprocessed back-edge may make us break this dependency. + DEBUG(dbgs() << ": Wait for back-edge to resolve.\n"); } ++CurInstr; @@ -663,6 +686,10 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { for (unsigned i = 0, e = Loops.size(); i != e; ++i) { MachineBasicBlock *MBB = Loops[i]; enterBasicBlock(MBB); + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + ++I) + if (!I->isDebugValue()) + processDefs(I, false); leaveBasicBlock(MBB); } diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index d9ffd8161f..9428fffae8 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -2761,6 +2761,10 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, /// static bool hasPartialRegUpdate(unsigned Opcode) { switch (Opcode) { + case X86::CVTSI2SSrr: + case X86::CVTSI2SS64rr: + case X86::CVTSI2SDrr: + case X86::CVTSI2SD64rr: case X86::CVTSD2SSrr: case X86::Int_CVTSD2SSrr: case X86::CVTSS2SDrr: @@ -2789,6 +2793,54 @@ static bool hasPartialRegUpdate(unsigned Opcode) { return false; } +/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle +/// instructions we would like before a partial register update. 
+unsigned X86InstrInfo:: +getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode())) + return 0; + + // If MI is marked as reading Reg, the partial register update is wanted. + const MachineOperand &MO = MI->getOperand(0); + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (MO.readsReg() || MI->readsVirtualRegister(Reg)) + return 0; + } else { + if (MI->readsRegister(Reg, TRI)) + return 0; + } + + // If any of the preceding 16 instructions are writing Reg, insert a + // dependency breaking instruction. The magic number is based on a few + // Nehalem experiments. + return 16; +} + +void X86InstrInfo:: +breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + unsigned Reg = MI->getOperand(OpNum).getReg(); + if (X86::VR128RegClass.contains(Reg)) { + // These instructions are all floating point domain, so xorps is the best + // choice. + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) + .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + } else if (X86::VR256RegClass.contains(Reg)) { + // Use vxorps to clear the full ymm register. + // It wants to read and write the xmm sub-register. 
+ unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg) + .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef) + .addReg(Reg, RegState::ImplicitDefine); + } else + return; + MI->addRegisterKilled(Reg, TRI, true); +} + MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops, diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 97009dbdbe..ee488d8f01 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -345,6 +345,11 @@ public: void setExecutionDomain(MachineInstr *MI, unsigned Domain) const; + unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const; + void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const; + MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI, unsigned OpNum, diff --git a/test/CodeGen/X86/sse-domains.ll b/test/CodeGen/X86/sse-domains.ll index d26d32287e..3b66f4fd5c 100644 --- a/test/CodeGen/X86/sse-domains.ll +++ b/test/CodeGen/X86/sse-domains.ll @@ -43,3 +43,44 @@ while.body: while.end: ret void } + +; CHECK: f2 +; +; This loop contains two cvtsi2ss instructions that update the same xmm +; register. Verify that the execution dependency fix pass breaks those +; dependencies by inserting xorps instructions. +; +; If the register allocator chooses different registers for the two cvtsi2ss +; instructions, they are still dependent on themselves. 
+; CHECK: xorps [[XMM1:%xmm[0-9]+]] +; CHECK: , [[XMM1]] +; CHECK: cvtsi2ss %{{.*}}, [[XMM1]] +; CHECK: xorps [[XMM2:%xmm[0-9]+]] +; CHECK: , [[XMM2]] +; CHECK: cvtsi2ss %{{.*}}, [[XMM2]] +; +define float @f2(i32 %m) nounwind uwtable readnone ssp { +entry: + %tobool3 = icmp eq i32 %m, 0 + br i1 %tobool3, label %for.end, label %for.body + +for.body: ; preds = %entry, %for.body + %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ] + %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ] + %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ] + %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ] + %conv = sitofp i32 %n.04 to float + %add = fadd float %s1.06, %conv + %conv1 = sitofp i32 %m.addr.07 to float + %add2 = fadd float %s2.05, %conv1 + %inc = add nsw i32 %n.04, 1 + %dec = add nsw i32 %m.addr.07, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] + %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ] + %sub = fsub float %s1.0.lcssa, %s2.0.lcssa + ret float %sub +} |