From ae692f2baedf53504af2715993b166950e185a55 Mon Sep 17 00:00:00 2001
From: Andrew Trick <atrick@apple.com>
Date: Mon, 12 Nov 2012 19:28:57 +0000
Subject: misched: Infrastructure for weak DAG edges.

This adds support for weak DAG edges to the general scheduling
infrastructure in preparation for MachineScheduler support for
heuristics based on weak edges.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167738 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineScheduler.cpp | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'lib/CodeGen/MachineScheduler.cpp')

diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index a4817d09c0..71cc072d47 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -310,6 +310,10 @@ void ReadyQueue::dump() {
 void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
   SUnit *SuccSU = SuccEdge->getSUnit();
 
+  if (SuccEdge->isWeak()) {
+    --SuccSU->WeakPredsLeft;
+    return;
+  }
 #ifndef NDEBUG
   if (SuccSU->NumPredsLeft == 0) {
     dbgs() << "*** Scheduling failed! ***\n";
@@ -338,6 +342,10 @@ void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
 void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
   SUnit *PredSU = PredEdge->getSUnit();
 
+  if (PredEdge->isWeak()) {
+    --PredSU->WeakSuccsLeft;
+    return;
+  }
 #ifndef NDEBUG
   if (PredSU->NumSuccsLeft == 0) {
     dbgs() << "*** Scheduling failed! ***\n";
@@ -530,17 +538,20 @@ void ScheduleDAGMI::postprocessDAG() {
 }
 
 // Release all DAG roots for scheduling.
+//
+// Nodes with unreleased weak edges can still be roots.
 void ScheduleDAGMI::releaseRoots() {
   SmallVector<SUnit*, 8> BotRoots;
 
   for (std::vector<SUnit>::iterator
         I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
+    SUnit *SU = &(*I);
     // A SUnit is ready to top schedule if it has no predecessors.
-    if (I->Preds.empty())
-      SchedImpl->releaseTopNode(&(*I));
+    if (!I->NumPredsLeft && SU != &EntrySU)
+      SchedImpl->releaseTopNode(SU);
     // A SUnit is ready to bottom schedule if it has no successors.
-    if (I->Succs.empty())
-      BotRoots.push_back(&(*I));
+    if (!I->NumSuccsLeft && SU != &ExitSU)
+      BotRoots.push_back(SU);
   }
   // Release bottom roots in reverse order so the higher priority nodes appear
   // first. This is more natural and slightly more efficient.
@@ -555,13 +566,12 @@ void ScheduleDAGMI::initQueues() {
   // Initialize the strategy before modifying the DAG.
   SchedImpl->initialize(this);
 
-  // Release edges from the special Entry node or to the special Exit node.
+  // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
+  releaseRoots();
+
   releaseSuccessors(&EntrySU);
   releasePredecessors(&ExitSU);
 
-  // Release all DAG roots for scheduling.
-  releaseRoots();
-
   SchedImpl->registerRoots();
 
   CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
--
cgit v1.2.3-70-g09d2


From 9b5caaa9c452f262a52dd5ac7ebbc722da5a63de Mon Sep 17 00:00:00 2001
From: Andrew Trick <atrick@apple.com>
Date: Mon, 12 Nov 2012 19:40:10 +0000
Subject: misched: Target-independent support for load/store clustering.

This infrastructure is generally useful for any target that wants to
strongly prefer two instructions to be adjacent after scheduling.

A following checkin will add target-specific hooks with unit tests.
Then this feature will be enabled by default with misched.
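To illustrate the shape those hooks can take, here is a minimal sketch for a
hypothetical target (the class name, the [base + imm] operand layout, and the
cluster-length cap are assumptions for illustration only; the real hooks land
in the later checkin):

  // Hypothetical target hook: report a memory op's base register and offset,
  // assuming a simple [base + imm] addressing mode in operands 1 and 2.
  bool MyTargetInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt,
                                               unsigned &BaseReg,
                                               unsigned &Offset,
                                               const TargetRegisterInfo *TRI)
      const {
    if (LdSt->getNumOperands() < 3 || !LdSt->getOperand(1).isReg() ||
        !LdSt->getOperand(2).isImm())
      return false;
    BaseReg = LdSt->getOperand(1).getReg();
    Offset = LdSt->getOperand(2).getImm();
    return true;
  }

  // Hypothetical target hook: cap the cluster length so a long run of loads
  // does not over-constrain the DAG with weak edges.
  bool MyTargetInstrInfo::shouldScheduleLoadsNear(MachineInstr *FirstLdSt,
                                                  MachineInstr *SecondLdSt,
                                                  unsigned NumLoads) const {
    return NumLoads < 4;
  }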
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167742 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MachineScheduler.h |  27 ++++-
 include/llvm/Target/TargetInstrInfo.h   |  13 +++
 lib/CodeGen/MachineScheduler.cpp        | 188 ++++++++++++++++++++++++++++++--
 lib/Target/ARM/ARMBaseInstrInfo.cpp     |   6 +
 4 files changed, 220 insertions(+), 14 deletions(-)

(limited to 'lib/CodeGen/MachineScheduler.cpp')

diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index 31bd606f93..08f9182805 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -202,6 +202,10 @@ protected:
   RegisterClassInfo *RegClassInfo;
   MachineSchedStrategy *SchedImpl;
 
+  /// Topo - A topological ordering for SUnits which permits fast IsReachable
+  /// and similar queries.
+  ScheduleDAGTopologicalSort Topo;
+
   /// Ordered list of DAG postprocessing steps.
   std::vector<ScheduleDAGMutation*> Mutations;
 
@@ -226,6 +230,10 @@ protected:
   IntervalPressure BotPressure;
   RegPressureTracker BotRPTracker;
 
+  /// Record the next node in a scheduled cluster.
+  const SUnit *NextClusterPred;
+  const SUnit *NextClusterSucc;
+
 #ifndef NDEBUG
   /// The number of instructions scheduled so far. Used to cut off the
   /// scheduler at the point determined by misched-cutoff.
@@ -236,24 +244,35 @@ public:
   ScheduleDAGMI(MachineSchedContext *C, MachineSchedStrategy *S):
     ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS),
     AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S),
-    RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure),
-    CurrentBottom(), BotRPTracker(BotPressure) {
+    Topo(SUnits, &ExitSU), RPTracker(RegPressure), CurrentTop(),
+    TopRPTracker(TopPressure), CurrentBottom(), BotRPTracker(BotPressure),
+    NextClusterPred(NULL), NextClusterSucc(NULL) {
 #ifndef NDEBUG
     NumInstrsScheduled = 0;
 #endif
   }
 
   virtual ~ScheduleDAGMI() {
+    DeleteContainerPointers(Mutations);
     delete SchedImpl;
   }
 
   /// Add a postprocessing step to the DAG builder.
   /// Mutations are applied in the order that they are added after normal DAG
   /// building and before MachineSchedStrategy initialization.
+  ///
+  /// ScheduleDAGMI takes ownership of the Mutation object.
   void addMutation(ScheduleDAGMutation *Mutation) {
     Mutations.push_back(Mutation);
   }
 
+  /// \brief Add a DAG edge to the given SU with the given predecessor
+  /// dependence data.
+  ///
+  /// \returns true if the edge may be added without creating a cycle OR if an
+  /// equivalent edge already existed (false indicates failure).
+  bool addEdge(SUnit *SuccSU, const SDep &PredDep);
+
   MachineBasicBlock::iterator top() const { return CurrentTop; }
   MachineBasicBlock::iterator bottom() const { return CurrentBottom; }
 
@@ -285,6 +304,10 @@ public:
     return RegionCriticalPSets;
   }
 
+  const SUnit *getNextClusterPred() const { return NextClusterPred; }
+
+  const SUnit *getNextClusterSucc() const { return NextClusterSucc; }
+
 protected:
 
   // Top-Level entry points for the schedule() driver...
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 4570813ba6..4f8ae01291 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -621,6 +621,19 @@ public:
     return false;
   }
 
+  /// \brief Get the base register and byte offset of a load/store instr.
+  virtual bool getLdStBaseRegImmOfs(MachineInstr *LdSt,
+                                    unsigned &BaseReg, unsigned &Offset,
+                                    const TargetRegisterInfo *TRI) const {
+    return false;
+  }
+
+  virtual bool shouldScheduleLoadsNear(MachineInstr *FirstLdSt,
+                                       MachineInstr *SecondLdSt,
+                                       unsigned NumLoads) const {
+    return false;
+  }
+
   /// ReverseBranchCondition - Reverses the branch condition of the specified
   /// condition list, returning false on success and true if it cannot be
   /// reversed.
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 71cc072d47..dbab6bae28 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -58,6 +58,10 @@ static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
            "before attempting to balance ILP"),
   cl::init(10U));
 
+// Experimental heuristics
+static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
+  cl::desc("Enable load clustering."));
+
 //===----------------------------------------------------------------------===//
 // Machine Instruction Scheduling Pass and Registry
 //===----------------------------------------------------------------------===//
@@ -303,6 +307,17 @@ void ReadyQueue::dump() {
 // preservation.
 //===----------------------------------------------------------------------===//
 
+bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) {
+  // Do not use WillCreateCycle, it assumes SD scheduling.
+  // If Pred is reachable from Succ, then the edge creates a cycle.
+  if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
+    return false;
+  Topo.AddPred(SuccSU, PredDep.getSUnit());
+  SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
+  // Return true regardless of whether a new edge needed to be inserted.
+  return true;
+}
+
 /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
 /// NumPredsLeft reaches zero, release the successor node.
 ///
@@ -312,6 +327,8 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
 
   if (SuccEdge->isWeak()) {
     --SuccSU->WeakPredsLeft;
+    if (SuccEdge->isCluster())
+      NextClusterSucc = SuccSU;
     return;
   }
 #ifndef NDEBUG
@@ -344,6 +361,8 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
 
   if (PredEdge->isWeak()) {
     --PredSU->WeakSuccsLeft;
+    if (PredEdge->isCluster())
+      NextClusterPred = PredSU;
     return;
   }
 #ifndef NDEBUG
@@ -482,6 +501,8 @@ updateScheduledPressure(std::vector<unsigned> NewMaxPressure) {
 void ScheduleDAGMI::schedule() {
   buildDAGWithRegPressure();
 
+  Topo.InitDAGTopologicalSorting();
+
   postprocessDAG();
 
   DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
@@ -562,6 +583,8 @@ void ScheduleDAGMI::releaseRoots() {
 
 /// Identify DAG roots and setup scheduler queues.
 void ScheduleDAGMI::initQueues() {
+  NextClusterSucc = NULL;
+  NextClusterPred = NULL;
 
   // Initialize the strategy before modifying the DAG.
   SchedImpl->initialize(this);
@@ -664,6 +687,119 @@ void ScheduleDAGMI::dumpSchedule() const {
 }
 #endif
 
+namespace {
+/// \brief Post-process the DAG to create cluster edges between neighboring
+/// loads.
+class LoadClusterMutation : public ScheduleDAGMutation {
+  struct LoadInfo {
+    SUnit *SU;
+    unsigned BaseReg;
+    unsigned Offset;
+    LoadInfo(SUnit *su, unsigned reg, unsigned ofs)
+      : SU(su), BaseReg(reg), Offset(ofs) {}
+  };
+  static bool LoadInfoLess(const LoadClusterMutation::LoadInfo &LHS,
+                           const LoadClusterMutation::LoadInfo &RHS);
+
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+public:
+  LoadClusterMutation(const TargetInstrInfo *tii,
+                      const TargetRegisterInfo *tri)
+    : TII(tii), TRI(tri) {}
+
+  virtual void apply(ScheduleDAGMI *DAG);
+protected:
+  void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG);
+};
+} // anonymous
+
+bool LoadClusterMutation::LoadInfoLess(
+    const LoadClusterMutation::LoadInfo &LHS,
+    const LoadClusterMutation::LoadInfo &RHS) {
+  if (LHS.BaseReg != RHS.BaseReg)
+    return LHS.BaseReg < RHS.BaseReg;
+  return LHS.Offset < RHS.Offset;
+}
+
+void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
+                                                  ScheduleDAGMI *DAG) {
+  SmallVector<LoadInfo, 32> LoadRecords;
+  for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
+    SUnit *SU = Loads[Idx];
+    unsigned BaseReg;
+    unsigned Offset;
+    if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
+      LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
+  }
+  if (LoadRecords.size() < 2)
+    return;
+  std::sort(LoadRecords.begin(), LoadRecords.end(), LoadInfoLess);
+  unsigned ClusterLength = 1;
+  for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
+    if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
+      ClusterLength = 1;
+      continue;
+    }
+
+    SUnit *SUa = LoadRecords[Idx].SU;
+    SUnit *SUb = LoadRecords[Idx+1].SU;
+    if (TII->shouldScheduleLoadsNear(SUa->getInstr(), SUb->getInstr(),
+                                     ClusterLength)
+        && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
+
+      DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
+            << SUb->NodeNum << ")\n");
+      // Copy successor edges from SUa to SUb. Interleaving computation
+      // dependent on SUa can prevent load combining due to register reuse.
+      // Predecessor edges do not need to be copied from SUb to SUa since
+      // nearby loads should have effectively the same inputs.
+      for (SUnit::const_succ_iterator
+             SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) {
+        if (SI->getSUnit() == SUb)
+          continue;
+        DEBUG(dbgs() << "  Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
+        DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));
+      }
+      ++ClusterLength;
+    }
+    else
+      ClusterLength = 1;
+  }
+}
+
+/// \brief Callback from DAG postProcessing to create cluster edges for loads.
+void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
+  // Map DAG NodeNum to store chain ID.
+  DenseMap<unsigned, unsigned> StoreChainIDs;
+  // Map each store chain to a set of dependent loads.
+  SmallVector<SmallVector<SUnit*, 4>, 32> StoreChainDependents;
+  for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+    SUnit *SU = &DAG->SUnits[Idx];
+    if (!SU->getInstr()->mayLoad())
+      continue;
+    unsigned ChainPredID = DAG->SUnits.size();
+    for (SUnit::const_pred_iterator
+           PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
+      if (PI->isCtrl()) {
+        ChainPredID = PI->getSUnit()->NodeNum;
+        break;
+      }
+    }
+    // Check if this chain-like pred has been seen
+    // before. ChainPredID==MaxNodeID for loads at the top of the schedule.
+    unsigned NumChains = StoreChainDependents.size();
+    std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
+      StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
+    if (Result.second)
+      StoreChainDependents.resize(NumChains + 1);
+    StoreChainDependents[Result.first->second].push_back(SU);
+  }
+  // Iterate over the store chains.
+  for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
+    clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
+}
+
 //===----------------------------------------------------------------------===//
 // ConvergingScheduler - Implementation of the standard MachineSchedStrategy.
 //===----------------------------------------------------------------------===//
@@ -676,9 +812,10 @@ public:
   /// Represent the type of SchedCandidate found within a single queue.
   /// pickNodeBidirectional depends on these listed by decreasing priority.
   enum CandReason {
-    NoCand, SingleExcess, SingleCritical, ResourceReduce, ResourceDemand,
-    BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce,
-    SingleMax, MultiPressure, NextDefUse, NodeOrder};
+    NoCand, SingleExcess, SingleCritical, Cluster,
+    ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
+    TopDepthReduce, TopPathReduce, SingleMax, MultiPressure, NextDefUse,
+    NodeOrder};
 
 #ifndef NDEBUG
   static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
@@ -1029,6 +1166,8 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
 
   for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
        I != E; ++I) {
+    if (I->isWeak())
+      continue;
     unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
     unsigned MinLatency = I->getMinLatency();
 #ifndef NDEBUG
@@ -1424,6 +1563,7 @@ static bool tryLess(unsigned TryVal, unsigned CandVal,
   }
   return false;
 }
+
 static bool tryGreater(unsigned TryVal, unsigned CandVal,
                        ConvergingScheduler::SchedCandidate &TryCand,
                        ConvergingScheduler::SchedCandidate &Cand,
@@ -1440,6 +1580,10 @@ static bool tryGreater(unsigned TryVal, unsigned CandVal,
   return false;
 }
 
+static unsigned getWeakLeft(const SUnit *SU, bool isTop) {
+  return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
+}
+
 /// Apply a set of heuristics to a new candidate. Heuristics are currently
 /// hierarchical. This may be more efficient than a graduated cost model because
 /// we don't need to evaluate all aspects of the model for each node in the
@@ -1482,6 +1626,26 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
     if (Cand.Reason == SingleCritical)
       Cand.Reason = MultiPressure;
 
+  // Keep clustered nodes together to encourage downstream peephole
+  // optimizations which may reduce resource requirements.
+  //
+  // This is a best effort to set things up for a post-RA pass. Optimizations
+  // like generating loads of multiple registers should ideally be done within
+  // the scheduler pass by combining the loads during DAG postprocessing.
+  const SUnit *NextClusterSU =
+    Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU,
+                 TryCand, Cand, Cluster))
+    return;
+  // Currently, weak edges are for clustering, so we hard-code that reason.
+  // However, deferring the current TryCand will not change Cand's reason.
+  CandReason OrigReason = Cand.Reason;
+  if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
+              getWeakLeft(Cand.SU, Zone.isTop()),
+              TryCand, Cand, Cluster)) {
+    Cand.Reason = OrigReason;
+    return;
+  }
   // Avoid critical resource consumption and balance the schedule.
   TryCand.initResourceDelta(DAG, SchedModel);
   if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
@@ -1528,15 +1692,10 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
   // Prefer immediate defs/users of the last scheduled instruction. This is a
   // nice pressure avoidance strategy that also conserves the processor's
   // register renaming resources and keeps the machine code readable.
-  if (Zone.NextSUs.count(TryCand.SU) && !Zone.NextSUs.count(Cand.SU)) {
-    TryCand.Reason = NextDefUse;
-    return;
-  }
-  if (!Zone.NextSUs.count(TryCand.SU) && Zone.NextSUs.count(Cand.SU)) {
-    if (Cand.Reason > NextDefUse)
-      Cand.Reason = NextDefUse;
+  if (tryGreater(Zone.NextSUs.count(TryCand.SU), Zone.NextSUs.count(Cand.SU),
+                 TryCand, Cand, NextDefUse))
     return;
-  }
+
   // Fall through to original instruction order.
   if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
       || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
@@ -1582,6 +1741,7 @@ const char *ConvergingScheduler::getReasonStr(
   case NoCand:         return "NOCAND    ";
   case SingleExcess:   return "REG-EXCESS";
   case SingleCritical: return "REG-CRIT  ";
+  case Cluster:        return "CLUSTER   ";
   case SingleMax:      return "REG-MAX   ";
   case MultiPressure:  return "REG-MULTI ";
   case ResourceReduce: return "RES-REDUCE";
@@ -1822,7 +1982,11 @@ void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
 static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
   assert((!ForceTopDown || !ForceBottomUp) &&
          "-misched-topdown incompatible with -misched-bottomup");
-  return new ScheduleDAGMI(C, new ConvergingScheduler());
+  ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
+  // Register DAG post-processors.
+  if (EnableLoadCluster)
+    DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
+  return DAG;
 }
 static MachineSchedRegistry
 ConvergingSchedRegistry("converge", "Standard converging scheduler.",
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 3c7bb24f42..3288a71171 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1373,6 +1373,9 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
 /// only return true if the base pointers are the same and the only differences
 /// between the two addresses is the offset. It also returns the offsets by
 /// reference.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
 bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
                                                int64_t &Offset1,
                                                int64_t &Offset2) const {
@@ -1447,6 +1450,9 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
 /// from the common base address. It returns true if it decides it's desirable
 /// to schedule the two loads together. "NumLoads" is the number of loads that
 /// have already been scheduled after Load1.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
 bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                                int64_t Offset1, int64_t Offset2,
                                                unsigned NumLoads) const {
--
cgit v1.2.3-70-g09d2


From 6996fd0b543cf8bd4a0d4e09e80a168f0ae052c5 Mon Sep 17 00:00:00 2001
From: Andrew Trick <atrick@apple.com>
Date: Mon, 12 Nov 2012 19:52:20 +0000
Subject: misched: Target-independent support for MacroFusion.

Uses the infrastructure from r167742 to support clustering instructions
that the target processor can "fuse", e.g. cmp+jmp.
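For illustration, a target's side of this could look roughly like the
following sketch (the target class and opcode names are hypothetical; this
checkin adds only the target-independent support):

  bool MyTargetInstrInfo::shouldScheduleAdjacent(MachineInstr *First,
                                                 MachineInstr *Second) const {
    // Fuse a flag-setting compare with the conditional branch that reads
    // the flags, mirroring the hardware's cmp+jmp macro-op fusion.
    return First->getOpcode() == MyTarget::CMPri &&
           Second->getOpcode() == MyTarget::Bcc;
  }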
Next step: target hook implementations with test cases, and enable.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167744 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetInstrInfo.h |  7 ++++
 lib/CodeGen/MachineScheduler.cpp      | 66 ++++++++++++++++++++++++++++++++---
 2 files changed, 68 insertions(+), 5 deletions(-)

(limited to 'lib/CodeGen/MachineScheduler.cpp')

diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 4f8ae01291..97fddeeca1 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -634,6 +634,13 @@ public:
     return false;
   }
 
+  /// \brief Can this target fuse the given instructions if they are scheduled
+  /// adjacent.
+  virtual bool shouldScheduleAdjacent(MachineInstr* First,
+                                      MachineInstr *Second) const {
+    return false;
+  }
+
   /// ReverseBranchCondition - Reverses the branch condition of the specified
   /// condition list, returning false on success and true if it cannot be
   /// reversed.
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index dbab6bae28..b05d7263cd 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -62,6 +62,10 @@ static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
 static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
   cl::desc("Enable load clustering."));
 
+// Experimental heuristics
+static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
+  cl::desc("Enable scheduling for macro fusion."));
+
 //===----------------------------------------------------------------------===//
 // Machine Instruction Scheduling Pass and Registry
 //===----------------------------------------------------------------------===//
@@ -308,11 +312,13 @@ void ReadyQueue::dump() {
 //===----------------------------------------------------------------------===//
 
 bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) {
-  // Do not use WillCreateCycle, it assumes SD scheduling.
-  // If Pred is reachable from Succ, then the edge creates a cycle.
-  if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
-    return false;
-  Topo.AddPred(SuccSU, PredDep.getSUnit());
+  if (SuccSU != &ExitSU) {
+    // Do not use WillCreateCycle, it assumes SD scheduling.
+    // If Pred is reachable from Succ, then the edge creates a cycle.
+    if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
+      return false;
+    Topo.AddPred(SuccSU, PredDep.getSUnit());
+  }
   SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
   // Return true regardless of whether a new edge needed to be inserted.
   return true;
@@ -687,6 +693,10 @@ void ScheduleDAGMI::dumpSchedule() const {
 }
 #endif
 
+//===----------------------------------------------------------------------===//
+// LoadClusterMutation - DAG post-processing to cluster loads.
+//===----------------------------------------------------------------------===//
+
 namespace {
 /// \brief Post-process the DAG to create cluster edges between neighboring
 /// loads.
@@ -800,6 +810,50 @@ void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
     clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
 }
 
+//===----------------------------------------------------------------------===//
+// MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Post-process the DAG to create cluster edges between instructions
+/// that may be fused by the processor into a single operation.
+class MacroFusion : public ScheduleDAGMutation {
+  const TargetInstrInfo *TII;
+public:
+  MacroFusion(const TargetInstrInfo *tii): TII(tii) {}
+
+  virtual void apply(ScheduleDAGMI *DAG);
+};
+} // anonymous
+
+/// \brief Callback from DAG postProcessing to create cluster edges to encourage
+/// fused operations.
+void MacroFusion::apply(ScheduleDAGMI *DAG) {
+  // For now, assume targets can only fuse with the branch.
+  MachineInstr *Branch = DAG->ExitSU.getInstr();
+  if (!Branch)
+    return;
+
+  for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) {
+    SUnit *SU = &DAG->SUnits[--Idx];
+    if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch))
+      continue;
+
+    // Create a single weak edge from SU to ExitSU. The only effect is to cause
+    // bottom-up scheduling to heavily prioritize the clustered SU. There is no
+    // need to copy predecessor edges from ExitSU to SU, since top-down
+    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
+    // of SU, we could create an artificial edge from the deepest root, but it
+    // hasn't been needed yet.
+    bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster));
+    (void)Success;
+    assert(Success && "No DAG nodes should be reachable from ExitSU");
+
+    DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n");
+    break;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // ConvergingScheduler - Implementation of the standard MachineSchedStrategy.
 //===----------------------------------------------------------------------===//
@@ -1986,6 +2040,8 @@ static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
   // Register DAG post-processors.
   if (EnableLoadCluster)
     DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
+  if (EnableMacroFusion)
+    DAG->addMutation(new MacroFusion(DAG->TII));
   return DAG;
 }
 static MachineSchedRegistry
--
cgit v1.2.3-70-g09d2

From ad1cc1d1bfc0accd3f1af5c02ac367ff46a4bfdf Mon Sep 17 00:00:00 2001
From: Andrew Trick <atrick@apple.com>
Date: Mon, 12 Nov 2012 21:28:10 +0000
Subject: misched: rename interface to avoid gcc warnings

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167753 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetInstrInfo.h | 6 +++---
 lib/CodeGen/MachineScheduler.cpp      | 3 +--
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'lib/CodeGen/MachineScheduler.cpp')

diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 97fddeeca1..95d2b916aa 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -628,9 +628,9 @@ public:
     return false;
   }
 
-  virtual bool shouldScheduleLoadsNear(MachineInstr *FirstLdSt,
-                                       MachineInstr *SecondLdSt,
-                                       unsigned NumLoads) const {
+  virtual bool shouldClusterLoads(MachineInstr *FirstLdSt,
+                                  MachineInstr *SecondLdSt,
+                                  unsigned NumLoads) const {
     return false;
   }
 
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index b05d7263cd..ee8138c0c4 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -754,8 +754,7 @@ void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
 
     SUnit *SUa = LoadRecords[Idx].SU;
     SUnit *SUb = LoadRecords[Idx+1].SU;
-    if (TII->shouldScheduleLoadsNear(SUa->getInstr(), SUb->getInstr(),
-                                     ClusterLength)
+    if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength)
         && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
--
cgit v1.2.3-70-g09d2


From ad1cc1d1bfc0accd3f1af5c02ac367ff46a4bfdf Mon Sep 17 00:00:00 2001
From: Andrew Trick <atrick@apple.com>
Date: Tue, 13 Nov 2012 08:47:29 +0000
Subject: misched: Allow subtargets to enable misched and dependent options.

This allows me to begin enabling (or backing out) misched by default
for one subtarget at a time.

To run misched we typically want to:
- Disable SelectionDAG scheduling (use the source order scheduler)
- Enable more aggressive coalescing (until we decide to always run the
  coalescer this way)
- Enable the MachineScheduler pass itself.

Disabling PostRA sched may follow for some subtargets.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167826 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetSubtargetInfo.h     |  7 +++++
 lib/CodeGen/MachineScheduler.cpp              |  4 +--
 lib/CodeGen/Passes.cpp                        |  5 ++-
 lib/CodeGen/RegisterCoalescer.cpp             | 44 +++++++++++++++++++++------
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp |  4 ++-
 lib/Target/TargetSubtargetInfo.cpp            |  4 +++
 6 files changed, 54 insertions(+), 14 deletions(-)

(limited to 'lib/CodeGen/MachineScheduler.cpp')

diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h
index 6db96d980b..3f22f47a0d 100644
--- a/include/llvm/Target/TargetSubtargetInfo.h
+++ b/include/llvm/Target/TargetSubtargetInfo.h
@@ -54,6 +54,13 @@ public:
     return 0;
   }
 
+  /// \brief True if the subtarget should run MachineScheduler after aggressive
+  /// coalescing.
+  ///
+  /// This currently replaces the SelectionDAG scheduler with the "source" order
+  /// scheduler. It does not yet disable the postRA scheduler.
+  virtual bool enableMachineScheduler() const;
+
   // enablePostRAScheduler - If the target can benefit from post-regalloc
   // scheduling and the specified optimization level meets the requirement
   // return true to enable post-register-allocation scheduling. In
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index ee8138c0c4..8d43360e67 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -60,11 +60,11 @@ static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
 
 // Experimental heuristics
 static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
-  cl::desc("Enable load clustering."));
+  cl::desc("Enable load clustering."), cl::init(true));
 
 // Experimental heuristics
 static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
-  cl::desc("Enable scheduling for macro fusion."));
+  cl::desc("Enable scheduling for macro fusion."), cl::init(true));
 
 //===----------------------------------------------------------------------===//
 // Machine Instruction Scheduling Pass and Registry
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index 4ea21d4ff7..ee6e2838c2 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Assembly/PrintModulePass.h"
 #include "llvm/Support/CommandLine.h"
@@ -241,7 +242,9 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
     disablePass(&EarlyIfConverterID);
 
   // Temporarily disable experimental passes.
-  substitutePass(&MachineSchedulerID, 0);
+  const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
+  if (!ST.enableMachineScheduler())
+    disablePass(&MachineSchedulerID);
 }
 
 /// Insert InsertedPassID pass after TargetPassID.
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index e4c2044b25..8e6533c747 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -45,6 +45,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 #include <cmath>
 using namespace llvm;
@@ -64,16 +65,16 @@ EnableJoining("join-liveintervals",
               cl::init(true));
 
 // Temporary flag to test critical edge unsplitting.
-static cl::opt<bool>
+static cl::opt<cl::boolOrDefault>
 EnableJoinSplits("join-splitedges",
-  cl::desc("Coalesce copies on split edges (default=false)"),
-  cl::init(false), cl::Hidden);
+  cl::desc("Coalesce copies on split edges (default=subtarget)"),
+  cl::init(cl::BOU_UNSET), cl::Hidden);
 
 // Temporary flag to test global copy optimization.
-static cl::opt<bool>
+static cl::opt<cl::boolOrDefault>
 EnableGlobalCopies("join-globalcopies",
-  cl::desc("Coalesce copies that don't locally define an lrg"),
-  cl::init(false));
+  cl::desc("Coalesce copies that span blocks (default=subtarget)"),
+  cl::init(cl::BOU_UNSET), cl::Hidden);
 
 static cl::opt<bool>
 VerifyCoalescing("verify-coalescing",
@@ -94,6 +95,14 @@ namespace {
     AliasAnalysis *AA;
     RegisterClassInfo RegClassInfo;
 
+    /// \brief True if the coalescer should aggressively coalesce global copies
+    /// in favor of keeping local copies.
+    bool JoinGlobalCopies;
+
+    /// \brief True if the coalescer should aggressively coalesce fall-thru
+    /// blocks exclusively containing copies.
+    bool JoinSplitEdges;
+
     /// WorkList - Copy instructions yet to be coalesced.
     SmallVector<MachineInstr*, 8> WorkList;
     SmallVector<MachineInstr*, 8> LocalWorkList;
@@ -1943,6 +1952,10 @@ namespace {
   //
   // EnableGlobalCopies assumes that the primary sort key is loop depth.
   struct MBBPriorityCompare {
+    bool JoinSplitEdges;
+
+    MBBPriorityCompare(bool joinsplits): JoinSplitEdges(joinsplits) {}
+
     bool operator()(const MBBPriorityInfo &LHS,
                     const MBBPriorityInfo &RHS) const {
       // Deeper loops first
@@ -1950,7 +1963,7 @@ namespace {
         return LHS.Depth > RHS.Depth;
 
       // Try to unsplit critical edges next.
-      if (EnableJoinSplits && LHS.IsSplit != RHS.IsSplit)
+      if (JoinSplitEdges && LHS.IsSplit != RHS.IsSplit)
         return LHS.IsSplit;
 
       // Prefer blocks that are more connected in the CFG. This takes care of
@@ -2011,7 +2024,7 @@ RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
   // Collect all copy-like instructions in MBB. Don't start coalescing anything
   // yet, it might invalidate the iterator.
   const unsigned PrevSize = WorkList.size();
-  if (EnableGlobalCopies) {
+  if (JoinGlobalCopies) {
    // Coalesce copies bottom-up to coalesce local defs before local uses. They
    // are not inherently easier to resolve, but slightly preferable until we
    // have local live range splitting. In particular this is required by
@@ -2061,13 +2074,13 @@ void RegisterCoalescer::joinAllIntervals() {
     MBBs.push_back(MBBPriorityInfo(MBB, Loops->getLoopDepth(MBB),
                                    isSplitEdge(MBB)));
   }
-  std::sort(MBBs.begin(), MBBs.end(), MBBPriorityCompare());
+  std::sort(MBBs.begin(), MBBs.end(), MBBPriorityCompare(JoinSplitEdges));
 
   // Coalesce intervals in MBB priority order.
   unsigned CurrDepth = UINT_MAX;
   for (unsigned i = 0, e = MBBs.size(); i != e; ++i) {
     // Try coalescing the collected local copies for deeper loops.
- if (EnableGlobalCopies && MBBs[i].Depth < CurrDepth) + if (JoinGlobalCopies && MBBs[i].Depth < CurrDepth) coalesceLocals(); copyCoalesceInMBB(MBBs[i].MBB); } @@ -2097,6 +2110,17 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { AA = &getAnalysis(); Loops = &getAnalysis(); + const TargetSubtargetInfo &ST = TM->getSubtarget(); + if (EnableGlobalCopies == cl::BOU_UNSET) + JoinGlobalCopies = ST.enableMachineScheduler(); + else + JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE); + + if (EnableJoinSplits == cl::BOU_UNSET) + JoinSplitEdges = ST.enableMachineScheduler(); + else + JoinSplitEdges = (EnableJoinSplits == cl::BOU_TRUE); + DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n" << "********** Function: " << MF->getName() << '\n'); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index c314fa5b51..d63a5b7a3b 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -45,6 +45,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" @@ -216,8 +217,9 @@ namespace llvm { ScheduleDAGSDNodes* createDefaultScheduler(SelectionDAGISel *IS, CodeGenOpt::Level OptLevel) { const TargetLowering &TLI = IS->getTargetLowering(); + const TargetSubtargetInfo &ST = IS->TM.getSubtarget(); - if (OptLevel == CodeGenOpt::None || + if (OptLevel == CodeGenOpt::None || ST.enableMachineScheduler() || TLI.getSchedulingPreference() == Sched::Source) return createSourceListDAGScheduler(IS, OptLevel); if (TLI.getSchedulingPreference() == Sched::RegPressure) diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp index 59ffdea00e..af0cef62d5 100644 --- a/lib/Target/TargetSubtargetInfo.cpp +++ b/lib/Target/TargetSubtargetInfo.cpp @@ -22,6 +22,10 @@ TargetSubtargetInfo::TargetSubtargetInfo() {} TargetSubtargetInfo::~TargetSubtargetInfo() {} +bool TargetSubtargetInfo::enableMachineScheduler() const { + return false; +} + bool TargetSubtargetInfo::enablePostRAScheduler( CodeGenOpt::Level OptLevel, AntiDepBreakMode& Mode, -- cgit v1.2.3-70-g09d2