aboutsummaryrefslogtreecommitdiff
path: root/lib/CodeGen/MachineScheduler.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/CodeGen/MachineScheduler.cpp')
-rw-r--r--lib/CodeGen/MachineScheduler.cpp269
1 files changed, 249 insertions, 20 deletions
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index a4817d09c0..8d43360e67 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -58,6 +58,14 @@ static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
"before attempting to balance ILP"),
cl::init(10U));
+// Experimental heuristic: hidden option, on by default, that gates registration of the load clustering DAG mutation.
+static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
+ cl::desc("Enable load clustering."), cl::init(true));
+
+// Experimental heuristic: hidden option, on by default, that gates registration of the macro fusion DAG mutation.
+static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
+ cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
//===----------------------------------------------------------------------===//
// Machine Instruction Scheduling Pass and Registry
//===----------------------------------------------------------------------===//
@@ -303,6 +311,19 @@ void ReadyQueue::dump() {
// preservation.
//===----------------------------------------------------------------------===//
+bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) { // Adds PredDep as a predecessor of SuccSU; returns false, adding nothing, if that would create a cycle.
+ if (SuccSU != &ExitSU) { // Edges into ExitSU skip both the cycle check and the Topo update.
+ // Do not use WillCreateCycle, it assumes SD scheduling.
+ // If Pred is reachable from Succ, then the edge creates a cycle.
+ if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
+ return false; // Rejected before any DAG or Topo state was modified.
+ Topo.AddPred(SuccSU, PredDep.getSUnit());
+ }
+ SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial()); // Artificial deps are passed as not required.
+ // Return true regardless of whether a new edge needed to be inserted.
+ return true;
+}
+
/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
/// NumPredsLeft reaches zero, release the successor node.
///
@@ -310,6 +331,12 @@ void ReadyQueue::dump() {
void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
SUnit *SuccSU = SuccEdge->getSUnit();
+ if (SuccEdge->isWeak()) {
+ --SuccSU->WeakPredsLeft;
+ if (SuccEdge->isCluster())
+ NextClusterSucc = SuccSU;
+ return;
+ }
#ifndef NDEBUG
if (SuccSU->NumPredsLeft == 0) {
dbgs() << "*** Scheduling failed! ***\n";
@@ -338,6 +365,12 @@ void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
SUnit *PredSU = PredEdge->getSUnit();
+ if (PredEdge->isWeak()) {
+ --PredSU->WeakSuccsLeft;
+ if (PredEdge->isCluster())
+ NextClusterPred = PredSU;
+ return;
+ }
#ifndef NDEBUG
if (PredSU->NumSuccsLeft == 0) {
dbgs() << "*** Scheduling failed! ***\n";
@@ -474,6 +507,8 @@ updateScheduledPressure(std::vector<unsigned> NewMaxPressure) {
void ScheduleDAGMI::schedule() {
buildDAGWithRegPressure();
+ Topo.InitDAGTopologicalSorting();
+
postprocessDAG();
DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
@@ -530,17 +565,20 @@ void ScheduleDAGMI::postprocessDAG() {
}
// Release all DAG roots for scheduling.
+//
+// Nodes with unreleased weak edges can still be roots.
void ScheduleDAGMI::releaseRoots() {
SmallVector<SUnit*, 16> BotRoots;
for (std::vector<SUnit>::iterator
I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
+ SUnit *SU = &(*I);
// A SUnit is ready to top schedule if it has no predecessors.
- if (I->Preds.empty())
- SchedImpl->releaseTopNode(&(*I));
+ if (!I->NumPredsLeft && SU != &EntrySU)
+ SchedImpl->releaseTopNode(SU);
// A SUnit is ready to bottom schedule if it has no successors.
- if (I->Succs.empty())
- BotRoots.push_back(&(*I));
+ if (!I->NumSuccsLeft && SU != &ExitSU)
+ BotRoots.push_back(SU);
}
// Release bottom roots in reverse order so the higher priority nodes appear
// first. This is more natural and slightly more efficient.
@@ -551,17 +589,18 @@ void ScheduleDAGMI::releaseRoots() {
/// Identify DAG roots and setup scheduler queues.
void ScheduleDAGMI::initQueues() {
+ NextClusterSucc = NULL;
+ NextClusterPred = NULL;
// Initialize the strategy before modifying the DAG.
SchedImpl->initialize(this);
- // Release edges from the special Entry node or to the special Exit node.
+ // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
+ releaseRoots();
+
releaseSuccessors(&EntrySU);
releasePredecessors(&ExitSU);
- // Release all DAG roots for scheduling.
- releaseRoots();
-
SchedImpl->registerRoots();
CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
@@ -655,6 +694,166 @@ void ScheduleDAGMI::dumpSchedule() const {
#endif
//===----------------------------------------------------------------------===//
+// LoadClusterMutation - DAG post-processing to cluster loads.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Post-process the DAG to create cluster edges between neighboring
+/// loads: loads that share a base register and are adjacent when sorted by offset.
+class LoadClusterMutation : public ScheduleDAGMutation {
+ struct LoadInfo { // One record per load for which getLdStBaseRegImmOfs succeeded.
+ SUnit *SU; // The load's scheduling unit.
+ unsigned BaseReg; // Base register as reported by getLdStBaseRegImmOfs.
+ unsigned Offset; // Offset as reported by getLdStBaseRegImmOfs.
+ LoadInfo(SUnit *su, unsigned reg, unsigned ofs)
+ : SU(su), BaseReg(reg), Offset(ofs) {}
+ };
+ static bool LoadInfoLess(const LoadClusterMutation::LoadInfo &LHS,
+ const LoadClusterMutation::LoadInfo &RHS); // Sort predicate: base register first, then offset.
+
+ const TargetInstrInfo *TII; // Queried for base/offset decomposition and for shouldClusterLoads.
+ const TargetRegisterInfo *TRI; // Forwarded to getLdStBaseRegImmOfs.
+public:
+ LoadClusterMutation(const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri)
+ : TII(tii), TRI(tri) {}
+
+ virtual void apply(ScheduleDAGMI *DAG); // Groups loads by chain predecessor, then clusters each group.
+protected:
+ void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG); // Clusters the loads of a single group.
+};
+} // anonymous namespace
+
+bool LoadClusterMutation::LoadInfoLess( // Ordering used to sort load records before pairing neighbors.
+ const LoadClusterMutation::LoadInfo &LHS,
+ const LoadClusterMutation::LoadInfo &RHS) {
+ if (LHS.BaseReg != RHS.BaseReg) // Primary key: base register.
+ return LHS.BaseReg < RHS.BaseReg;
+ return LHS.Offset < RHS.Offset; // Secondary key: offset (compared as unsigned).
+}
+
+void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
+ ScheduleDAGMI *DAG) { // For each adjacent same-base pair the target accepts, add a Cluster edge from the lower-offset load to the next one.
+ SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords;
+ for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
+ SUnit *SU = Loads[Idx];
+ unsigned BaseReg;
+ unsigned Offset;
+ if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI)) // Loads the target cannot decompose are left out.
+ LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
+ }
+ if (LoadRecords.size() < 2) // No pair possible; this also keeps the unsigned End - 1 below from wrapping.
+ return;
+ std::sort(LoadRecords.begin(), LoadRecords.end(), LoadInfoLess);
+ unsigned ClusterLength = 1; // Running cluster size handed to shouldClusterLoads; reset to 1 whenever the run breaks.
+ for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
+ if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) { // Different base register: start a new run.
+ ClusterLength = 1;
+ continue;
+ }
+
+ SUnit *SUa = LoadRecords[Idx].SU;
+ SUnit *SUb = LoadRecords[Idx+1].SU;
+ if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength)
+ && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) { // addEdge is only attempted when the target approves the pair.
+
+ DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
+ << SUb->NodeNum << ")\n");
+ // Copy successor edges from SUa to SUb. Interleaving computation
+ // dependent on SUa can prevent load combining due to register reuse.
+ // Predecessor edges do not need to be copied from SUb to SUa since nearby
+ // loads should have effectively the same inputs.
+ for (SUnit::const_succ_iterator
+ SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) {
+ if (SI->getSUnit() == SUb) // Skip the SUa -> SUb edge itself; SUb must not depend on SUb.
+ continue;
+ DEBUG(dbgs() << " Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
+ DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial)); // Return value ignored: a rejected edge is simply not added.
+ }
+ ++ClusterLength;
+ }
+ else
+ ClusterLength = 1; // Pair rejected by the target or by addEdge: the run is broken.
+ }
+}
+
+/// \brief Callback from DAG postProcessing to create cluster edges for loads.
+void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
+ // Map DAG NodeNum to store chain ID.
+ DenseMap<unsigned, unsigned> StoreChainIDs;
+ // Map each store chain to a set of dependent loads.
+ SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
+ for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+ SUnit *SU = &DAG->SUnits[Idx];
+ if (!SU->getInstr()->mayLoad()) // Only instructions that may load are grouped.
+ continue;
+ unsigned ChainPredID = DAG->SUnits.size(); // Sentinel: one past the largest SUnits index, used when no chain pred is found.
+ for (SUnit::const_pred_iterator
+ PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
+ if (PI->isCtrl()) { // The first pred edge for which isCtrl() holds becomes the chain key.
+ ChainPredID = PI->getSUnit()->NodeNum;
+ break;
+ }
+ }
+ // Check if this chain-like pred has been seen
+ // before. ChainPredID == DAG->SUnits.size() (the sentinel above) for loads at the top of the schedule.
+ unsigned NumChains = StoreChainDependents.size();
+ std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
+ StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
+ if (Result.second) // Newly inserted key: allocate the group for this chain.
+ StoreChainDependents.resize(NumChains + 1);
+ StoreChainDependents[Result.first->second].push_back(SU);
+ }
+ // Iterate over the store chains.
+ for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
+ clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Post-process the DAG to create cluster edges between instructions
+/// that may be fused by the processor into a single operation.
+class MacroFusion : public ScheduleDAGMutation {
+ const TargetInstrInfo *TII; // Queried through shouldScheduleAdjacent() in apply().
+public:
+ MacroFusion(const TargetInstrInfo *tii): TII(tii) {}
+
+ virtual void apply(ScheduleDAGMI *DAG); // Adds at most one Cluster edge, into ExitSU.
+};
+} // anonymous namespace
+
+/// \brief Callback from DAG postProcessing to create cluster edges to encourage
+/// fused operations.
+void MacroFusion::apply(ScheduleDAGMI *DAG) {
+ // For now, assume targets can only fuse with the branch.
+ MachineInstr *Branch = DAG->ExitSU.getInstr();
+ if (!Branch) // ExitSU carries no instruction, so there is nothing to fuse with.
+ return;
+
+ for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) { // Walk SUnits from the last index down to 0.
+ SUnit *SU = &DAG->SUnits[--Idx];
+ if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch))
+ continue;
+
+ // Create a single weak edge from SU to ExitSU. The only effect is to cause
+ // bottom-up scheduling to heavily prioritize the clustered SU. There is no
+ // need to copy predecessor edges from ExitSU to SU, since top-down
+ // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
+ // of SU, we could create an artificial edge from the deepest root, but it
+ // hasn't been needed yet.
+ bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster));
+ (void)Success; // Keeps Success referenced when the assert below is compiled out.
+ assert(Success && "No DAG nodes should be reachable from ExitSU");
+
+ DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n");
+ break; // Only the first match, i.e. the highest-index SU, gets a cluster edge.
+ }
+}
+
+//===----------------------------------------------------------------------===//
// ConvergingScheduler - Implementation of the standard MachineSchedStrategy.
//===----------------------------------------------------------------------===//
@@ -666,9 +865,10 @@ public:
/// Represent the type of SchedCandidate found within a single queue.
/// pickNodeBidirectional depends on these listed by decreasing priority.
enum CandReason {
- NoCand, SingleExcess, SingleCritical, ResourceReduce, ResourceDemand,
- BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce,
- SingleMax, MultiPressure, NextDefUse, NodeOrder};
+ NoCand, SingleExcess, SingleCritical, Cluster,
+ ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
+ TopDepthReduce, TopPathReduce, SingleMax, MultiPressure, NextDefUse,
+ NodeOrder};
#ifndef NDEBUG
static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
@@ -1019,6 +1219,8 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
I != E; ++I) {
+ if (I->isWeak())
+ continue;
unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
unsigned MinLatency = I->getMinLatency();
#ifndef NDEBUG
@@ -1414,6 +1616,7 @@ static bool tryLess(unsigned TryVal, unsigned CandVal,
}
return false;
}
+
static bool tryGreater(unsigned TryVal, unsigned CandVal,
ConvergingScheduler::SchedCandidate &TryCand,
ConvergingScheduler::SchedCandidate &Cand,
@@ -1430,6 +1633,10 @@ static bool tryGreater(unsigned TryVal, unsigned CandVal,
return false;
}
+static unsigned getWeakLeft(const SUnit *SU, bool isTop) { // Weak-edge counter for the given direction.
+ return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft; // isTop selects WeakPredsLeft; otherwise WeakSuccsLeft.
+}
+
/// Apply a set of heuristics to a new candidate. Heuristics are currently
/// hierarchical. This may be more efficient than a graduated cost model because
/// we don't need to evaluate all aspects of the model for each node in the
@@ -1472,6 +1679,26 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
if (Cand.Reason == SingleCritical)
Cand.Reason = MultiPressure;
+ // Keep clustered nodes together to encourage downstream peephole
+ // optimizations which may reduce resource requirements.
+ //
+ // This is a best effort to set things up for a post-RA pass. Optimizations
+ // like generating loads of multiple registers should ideally be done within
+ // the scheduler pass by combining the loads during DAG postprocessing.
+ const SUnit *NextClusterSU =
+ Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU,
+ TryCand, Cand, Cluster))
+ return;
+ // Currently, weak edges are for clustering, so we hard-code that reason.
+ // However, deferring the current TryCand will not change Cand's reason.
+ CandReason OrigReason = Cand.Reason;
+ if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
+ getWeakLeft(Cand.SU, Zone.isTop()),
+ TryCand, Cand, Cluster)) {
+ Cand.Reason = OrigReason;
+ return;
+ }
// Avoid critical resource consumption and balance the schedule.
TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
@@ -1518,15 +1745,10 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
// Prefer immediate defs/users of the last scheduled instruction. This is a
// nice pressure avoidance strategy that also conserves the processor's
// register renaming resources and keeps the machine code readable.
- if (Zone.NextSUs.count(TryCand.SU) && !Zone.NextSUs.count(Cand.SU)) {
- TryCand.Reason = NextDefUse;
+ if (tryGreater(Zone.NextSUs.count(TryCand.SU), Zone.NextSUs.count(Cand.SU),
+ TryCand, Cand, NextDefUse))
return;
- }
- if (!Zone.NextSUs.count(TryCand.SU) && Zone.NextSUs.count(Cand.SU)) {
- if (Cand.Reason > NextDefUse)
- Cand.Reason = NextDefUse;
- return;
- }
+
// Fall through to original instruction order.
if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
|| (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
@@ -1572,6 +1794,7 @@ const char *ConvergingScheduler::getReasonStr(
case NoCand: return "NOCAND ";
case SingleExcess: return "REG-EXCESS";
case SingleCritical: return "REG-CRIT ";
+ case Cluster: return "CLUSTER ";
case SingleMax: return "REG-MAX ";
case MultiPressure: return "REG-MULTI ";
case ResourceReduce: return "RES-REDUCE";
@@ -1812,7 +2035,13 @@ void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
assert((!ForceTopDown || !ForceBottomUp) &&
"-misched-topdown incompatible with -misched-bottomup");
- return new ScheduleDAGMI(C, new ConvergingScheduler());
+ ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
+ // Register DAG post-processors.
+ if (EnableLoadCluster)
+ DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
+ if (EnableMacroFusion)
+ DAG->addMutation(new MacroFusion(DAG->TII));
+ return DAG;
}
static MachineSchedRegistry
ConvergingSchedRegistry("converge", "Standard converging scheduler.",