diff options
author | Andrew Trick <atrick@apple.com> | 2013-01-09 03:36:49 +0000 |
---|---|---|
committer | Andrew Trick <atrick@apple.com> | 2013-01-09 03:36:49 +0000 |
commit | 47579cf390c42e0577519e0a2b6044baece9df00 (patch) | |
tree | 2744c5de5c5c825a168a20f90b9e099d8feaaf88 | |
parent | 2af949ddddfaf2feb4a446c754e09d2d8c207ce4 (diff) |
MIsched: add an ILP window property to machine model.
This was an experimental option, but needs to be defined
per-target. e.g. PPC A2 needs to aggressively hide latency.
I converted some in-order scheduling tests to A2. Hal is working on
more test cases.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171946 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | include/llvm/CodeGen/TargetSchedule.h | 3 | ||||
-rw-r--r-- | include/llvm/MC/MCSchedule.h | 19 | ||||
-rw-r--r-- | include/llvm/Target/TargetSchedule.td | 1 | ||||
-rw-r--r-- | lib/CodeGen/MachineScheduler.cpp | 12 | ||||
-rw-r--r-- | lib/Target/ARM/ARMScheduleA9.td | 3 | ||||
-rw-r--r-- | lib/Target/X86/X86Schedule.td | 5 | ||||
-rw-r--r-- | lib/Target/X86/X86ScheduleAtom.td | 1 | ||||
-rw-r--r-- | test/CodeGen/PowerPC/misched-inorder-latency.ll (renamed from test/CodeGen/ARM/misched-inorder-latency.ll) | 33 | ||||
-rw-r--r-- | utils/TableGen/SubtargetEmitter.cpp | 1 |
9 files changed, 51 insertions, 27 deletions
diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h index 4c4a2a8b95..484d7e200a 100644 --- a/include/llvm/CodeGen/TargetSchedule.h +++ b/include/llvm/CodeGen/TargetSchedule.h @@ -84,6 +84,9 @@ public: /// \brief Maximum number of micro-ops that may be scheduled per cycle. unsigned getIssueWidth() const { return SchedModel.IssueWidth; } + /// \brief Number of cycles the OOO processor is expected to hide. + unsigned getILPWindow() const { return SchedModel.ILPWindow; } + /// \brief Return the number of issue slots required for this MI. unsigned getNumMicroOps(const MachineInstr *MI, const MCSchedClassDesc *SC = 0) const; diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h index 0c71ee5135..9e9474952a 100644 --- a/include/llvm/MC/MCSchedule.h +++ b/include/llvm/MC/MCSchedule.h @@ -155,7 +155,7 @@ public: // Optional InstrItinerary OperandCycles provides expected latency. // TODO: can't yet specify both min and expected latency per operand. int MinLatency; - static const unsigned DefaultMinLatency = -1; + static const int DefaultMinLatency = -1; // LoadLatency is the expected latency of load instructions. // @@ -172,6 +172,16 @@ public: unsigned HighLatency; static const unsigned DefaultHighLatency = 10; + // ILPWindow is the number of cycles that the scheduler effectively ignores + // before attempting to hide latency. This should be zero for in-order cpus to + // always hide expected latency. For out-of-order cpus, it may be tweaked as + // desired to roughly approximate instruction buffers. The actual threshold is + // not very important for an OOO processor, as long as it isn't too high. A + // nonzero value helps avoid rescheduling to hide latency when its is fairly + // obviously useless and makes register pressure heuristics more effective. + unsigned ILPWindow; + static const unsigned DefaultILPWindow = 0; + // MispredictPenalty is the typical number of extra cycles the processor // takes to recover from a branch misprediction. unsigned MispredictPenalty; @@ -196,6 +206,7 @@ public: MinLatency(DefaultMinLatency), LoadLatency(DefaultLoadLatency), HighLatency(DefaultHighLatency), + ILPWindow(DefaultILPWindow), MispredictPenalty(DefaultMispredictPenalty), ProcID(0), ProcResourceTable(0), SchedClassTable(0), NumProcResourceKinds(0), NumSchedClasses(0), @@ -205,12 +216,12 @@ public: } // Table-gen driven ctor. - MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned mp, - unsigned pi, const MCProcResourceDesc *pr, + MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned ilp, + unsigned mp, unsigned pi, const MCProcResourceDesc *pr, const MCSchedClassDesc *sc, unsigned npr, unsigned nsc, const InstrItinerary *ii): IssueWidth(iw), MinLatency(ml), LoadLatency(ll), HighLatency(hl), - MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr), + ILPWindow(ilp), MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr), SchedClassTable(sc), NumProcResourceKinds(npr), NumSchedClasses(nsc), InstrItineraries(ii) {} diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td index 0da82fdd89..b7920bae8a 100644 --- a/include/llvm/Target/TargetSchedule.td +++ b/include/llvm/Target/TargetSchedule.td @@ -76,6 +76,7 @@ class SchedMachineModel { int IssueWidth = -1; // Max micro-ops that may be scheduled per cycle. int MinLatency = -1; // Determines which instrucions are allowed in a group. // (-1) inorder (0) ooo, (1): inorder +var latencies. + int ILPWindow = -1; // Cycles of latency likely hidden by hardware buffers. int LoadLatency = -1; // Cycles for loads to access the cache. int HighLatency = -1; // Approximation of cycles for "high latency" ops. int MispredictPenalty = -1; // Extra cycles for a mispredicted branch. diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 117b2bdccf..a32df7805b 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -48,15 +48,6 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden, static bool ViewMISchedDAGs = false; #endif // NDEBUG -// Threshold to very roughly model an out-of-order processor's instruction -// buffers. If the actual value of this threshold matters much in practice, then -// it can be specified by the machine model. For now, it's an experimental -// tuning knob to determine when and if it matters. -static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden, - cl::desc("Allow expected latency to exceed the critical path by N cycles " - "before attempting to balance ILP"), - cl::init(10U)); - // Experimental heuristics static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden, cl::desc("Enable load clustering."), cl::init(true)); @@ -1297,7 +1288,8 @@ void ConvergingScheduler::SchedBoundary::setLatencyPolicy(CandPolicy &Policy) { if (L > RemLatency) RemLatency = L; } - if (RemLatency + ExpectedLatency >= Rem->CriticalPath + ILPWindow + unsigned CriticalPathLimit = Rem->CriticalPath + SchedModel->getILPWindow(); + if (RemLatency + ExpectedLatency >= CriticalPathLimit && RemLatency > Rem->getMaxRemainingCount(SchedModel)) { Policy.ReduceLatency = true; DEBUG(dbgs() << "Increase ILP: " << Available.getName() << '\n'); diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 404634fee9..4191931a5a 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -1887,6 +1887,9 @@ def CortexA9Model : SchedMachineModel { let LoadLatency = 2; // Optimistic load latency assuming bypass. // This is overriden by OperandCycles if the // Itineraries are queried instead. + let ILPWindow = 10; // Don't reschedule small blocks to hide + // latency. Minimum latency requirements are already + // modeled strictly by reserving resources. let MispredictPenalty = 8; // Based on estimate of pipeline depth. let Itineraries = CortexA9Itineraries; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index c14407f9ac..d99d085298 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -470,12 +470,17 @@ def IIC_NOP : InstrItinClass; // latencies. Since these latencies are not used for pipeline hazards, // they do not need to be exact. // +// ILPWindow=10 is an arbitrary threshold that approximates cycles of +// latency hidden by instruction buffers. The actual value is not very +// important but should be zero for inorder and nonzero for OOO processors. +// // The GenericModel contains no instruciton itineraries. def GenericModel : SchedMachineModel { let IssueWidth = 4; let MinLatency = 0; let LoadLatency = 4; let HighLatency = 10; + let ILPWindow = 10; } include "X86ScheduleAtom.td" diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index 87102614cc..1e5f2d6c9a 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -525,6 +525,7 @@ def AtomModel : SchedMachineModel { // OperandCycles may be used for expected latency. let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles. let HighLatency = 30;// Expected, may be overriden by OperandCycles. + let ILPWindow = 0; // Always try to hide expected latency. let Itineraries = AtomItineraries; } diff --git a/test/CodeGen/ARM/misched-inorder-latency.ll b/test/CodeGen/PowerPC/misched-inorder-latency.ll index 8c06b4ce6e..8fae7ad4d1 100644 --- a/test/CodeGen/ARM/misched-inorder-latency.ll +++ b/test/CodeGen/PowerPC/misched-inorder-latency.ll @@ -1,15 +1,15 @@ -; RUN: llc < %s -enable-misched -march=thumb -mcpu=swift \ -; RUN: -pre-RA-sched=source -scheditins=false -ilp-window=0 \ +; RUN: llc < %s -enable-misched -pre-RA-sched=source -scheditins=false \ ; RUN: -disable-ifcvt-triangle-false -disable-post-ra | FileCheck %s ; -; For these tests, we set -ilp-window=0 to simulate in order processor. +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" -; %val1 is a 3-cycle load live out of %entry. It should be hoisted +; %val1 is a load live out of %entry. It should be hoisted ; above the add. -; CHECK: @testload +; CHECK: testload: ; CHECK: %entry -; CHECK: ldr -; CHECK: adds +; CHECK: lwz +; CHECK: addi ; CHECK: bne ; CHECK: %true define i32 @testload(i32 *%ptr, i32 %sumin) { @@ -34,15 +34,22 @@ end: ; The prefetch gets a default latency of 3 cycles and should be hoisted ; above the add. ; -; CHECK: @testprefetch +; CHECK: testprefetch: ; CHECK: %entry -; CHECK: pld -; CHECK: adds -; CHECK: bx +; CHECK: dcbt +; CHECK: addi +; CHECK: blr define i32 @testprefetch(i8 *%ptr, i32 %i) { entry: - %tmp = add i32 %i, 1 + %val1 = add i32 %i, 1 tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 ) - ret i32 %tmp + %p = icmp eq i32 %i, 0 + br i1 %p, label %true, label %end +true: + %val2 = add i32 %val1, 1 + br label %end +end: + %valmerge = phi i32 [ %val1, %entry], [ %val2, %true ] + ret i32 %valmerge } declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp index 3b7d006fd1..fc8d00dd83 100644 --- a/utils/TableGen/SubtargetEmitter.cpp +++ b/utils/TableGen/SubtargetEmitter.cpp @@ -1108,6 +1108,7 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) { EmitProcessorProp(OS, PI->ModelDef, "MinLatency", ','); EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ','); EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ','); + EmitProcessorProp(OS, PI->ModelDef, "ILPWindow", ','); EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ','); OS << " " << PI->Index << ", // Processor ID\n"; if (PI->hasInstrSchedModel()) |