From fff2d3aed9d50780d9d07859b03120dcd712d96e Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Fri, 8 Mar 2013 05:40:34 +0000 Subject: Add -verify-misched option. This verifies live intervals both before and after scheduling. It's useful for anyone hacking on live interval update. Note that we don't yet pass verification all the time. We don't yet handle updating nonallocatable live intervals perfectly. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@176685 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineScheduler.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'lib/CodeGen/MachineScheduler.cpp') diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 589fa1fa02..7f4e36f236 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -57,6 +57,9 @@ static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden, static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, cl::desc("Enable scheduling for macro fusion."), cl::init(true)); +static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden, + cl::desc("Verify machine instrs before and after machine scheduling")); + // DAG subtrees must have at least this many nodes. static const unsigned MinSubtreeSize = 8; @@ -197,6 +200,10 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { LIS = &getAnalysis<LiveIntervals>(); const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + if (VerifyScheduling) { + DEBUG(LIS->print(dbgs())); + MF->verify(this, "Before machine scheduling."); + } RegClassInfo->runOnMachineFunction(*MF); // Select the scheduler, or set the default. @@ -285,6 +292,8 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { } Scheduler->finalizeSchedule(); DEBUG(LIS->print(dbgs())); + if (VerifyScheduling) + MF->verify(this, "After machine scheduling."); return true; } -- cgit v1.2.3-70-g09d2 From 760fa5dc8022dcf6982969c26ef566dfbeea979c Mon Sep 17 00:00:00 2001 From: Jakub Staszak Date: Sun, 10 Mar 2013 13:11:23 +0000 Subject: Cleanup #includes.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@176787 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/ScheduleDAGInstrs.h | 5 +---- lib/CodeGen/MachineScheduler.cpp | 2 ++ lib/Target/Hexagon/HexagonMachineScheduler.cpp | 3 ++- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'lib/CodeGen/MachineScheduler.cpp') diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h index 94abec2002..2219520ca1 100644 --- a/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -15,18 +15,15 @@ #ifndef LLVM_CODEGEN_SCHEDULEDAGINSTRS_H #define LLVM_CODEGEN_SCHEDULEDAGINSTRS_H -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SparseSet.h" #include "llvm/ADT/SparseMultiSet.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/Compiler.h" #include "llvm/Target/TargetRegisterInfo.h" -#include namespace llvm { + class MachineFrameInfo; class MachineLoopInfo; class MachineDominatorTree; class LiveIntervals; diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 7f4e36f236..103b058c13 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -19,6 +19,8 @@ #include "llvm/ADT/PriorityQueue.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/ScheduleDFS.h" diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp index 9d3c8b8c91..1388ad4f16 100644 --- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp +++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp @@ -15,7 +15,8 @@ #define DEBUG_TYPE "misched" #include "HexagonMachineScheduler.h" -#include +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/IR/Function.h" using namespace llvm; -- cgit v1.2.3-70-g09d2 From 26c417bb588a3e5f9957cf5ba2a034b92513ec15 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 21 Mar 2013 00:57:21 +0000 Subject: Fix missing std::. Not sure how this compiles for anyone else. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@177620 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineScheduler.cpp | 2 +- lib/DebugInfo/DWARFDebugAranges.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/CodeGen/MachineScheduler.cpp') diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 103b058c13..c872355e37 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -2182,7 +2182,7 @@ public: /// Callback to select the highest priority node from the ready Q. 
virtual SUnit *pickNode(bool &IsTopNode) { if (ReadyQ.empty()) return NULL; - pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); + std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); SUnit *SU = ReadyQ.back(); ReadyQ.pop_back(); IsTopNode = false; diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp index b077eb5e38..f79862d606 100644 --- a/lib/DebugInfo/DWARFDebugAranges.cpp +++ b/lib/DebugInfo/DWARFDebugAranges.cpp @@ -186,7 +186,7 @@ uint32_t DWARFDebugAranges::findAddress(uint64_t address) const { Range range(address); RangeCollIterator begin = Aranges.begin(); RangeCollIterator end = Aranges.end(); - RangeCollIterator pos = lower_bound(begin, end, range, RangeLessThan); + RangeCollIterator pos = std::lower_bound(begin, end, range, RangeLessThan); if (pos != end && pos->LoPC <= address && address < pos->HiPC()) { return pos->Offset; -- cgit v1.2.3-70-g09d2 From 11189f7a014c04a1fe4366ec4bf9a6187d98986a Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Fri, 5 Apr 2013 00:31:29 +0000 Subject: MachineScheduler: format DEBUG output. I'm getting more serious about tuning and enabling on x86/ARM. Start by making the trace readable. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178821 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineScheduler.cpp | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'lib/CodeGen/MachineScheduler.cpp') diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index c872355e37..c621ca96bf 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -305,7 +305,7 @@ void MachineScheduler::print(raw_ostream &O, const Module* m) const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void ReadyQueue::dump() { - dbgs() << Name << ": "; + dbgs() << " " << Name << ": "; for (unsigned i = 0, e = Queue.size(); i < e; ++i) dbgs() << Queue[i]->NodeNum << " "; dbgs() << "\n"; @@ -1192,7 +1192,7 @@ protected: SchedCandidate &Candidate); #ifndef NDEBUG - void traceCandidate(const SchedCandidate &Cand, const SchedBoundary &Zone); + void traceCandidate(const SchedCandidate &Cand); #endif }; } // namespace @@ -1403,8 +1403,8 @@ void ConvergingScheduler::SchedBoundary::bumpCycle() { CheckPending = true; IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle); - DEBUG(dbgs() << " *** " << Available.getName() << " cycle " - << CurrCycle << '\n'); + DEBUG(dbgs() << " " << Available.getName() + << " Cycle: " << CurrCycle << '\n'); } /// Add the given processor resource to this scheduled zone. 
@@ -1870,9 +1870,7 @@ const char *ConvergingScheduler::getReasonStr( llvm_unreachable("Unknown reason!"); } -void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand, - const SchedBoundary &Zone) { - const char *Label = getReasonStr(Cand.Reason); +void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) { PressureElement P; unsigned ResIdx = 0; unsigned Latency = 0; @@ -1907,21 +1905,21 @@ void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand, Latency = Cand.SU->getDepth(); break; } - dbgs() << Label << " " << Zone.Available.getName() << " "; + dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); if (P.isValid()) - dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease - << " "; + dbgs() << " " << TRI->getRegPressureSetName(P.PSetID) + << ":" << P.UnitIncrease << " "; else - dbgs() << " "; + dbgs() << " "; if (ResIdx) - dbgs() << SchedModel->getProcResource(ResIdx)->Name << " "; + dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " "; else - dbgs() << " "; + dbgs() << " "; if (Latency) - dbgs() << Latency << " cycles "; + dbgs() << " " << Latency << " cycles "; else - dbgs() << " "; - Cand.SU->dump(DAG); + dbgs() << " "; + dbgs() << '\n'; } #endif @@ -1950,14 +1948,14 @@ void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone, if (TryCand.ResDelta == SchedResourceDelta()) TryCand.initResourceDelta(DAG, SchedModel); Cand.setBest(TryCand); - DEBUG(traceCandidate(Cand, Zone)); + DEBUG(traceCandidate(Cand)); } } } static void tracePick(const ConvergingScheduler::SchedCandidate &Cand, bool IsTop) { - DEBUG(dbgs() << "Pick " << (IsTop ? "top" : "bot") + DEBUG(dbgs() << "Pick " << (IsTop ? "Top" : "Bot") << " SU(" << Cand.SU->NodeNum << ") " << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n'); } @@ -2069,10 +2067,7 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) { if (SU->isBottomReady()) Bot.removeReady(SU); - DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom") - << " Scheduling Instruction in cycle " - << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n'; - SU->dump(DAG)); + DEBUG(dbgs() << "Scheduling " << *SU->getInstr()); return SU; } -- cgit v1.2.3-70-g09d2 From 2182977ff2cc8ff1259e3a2c3528651496d818d4 Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Fri, 5 Apr 2013 00:31:31 +0000 Subject: Disable DFSResult for ConvergingScheduler. For now, just save the compile time since the ConvergingScheduler heuristics don't use this analysis. We'll probably enable it later after compile-time investigation. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178822 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineScheduler.cpp | 2 -- 1 file changed, 2 deletions(-) (limited to 'lib/CodeGen/MachineScheduler.cpp') diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index c621ca96bf..ed6b9861dc 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -1243,8 +1243,6 @@ void ConvergingScheduler::initialize(ScheduleDAGMI *dag) { Top.init(DAG, SchedModel, &Rem); Bot.init(DAG, SchedModel, &Rem); - DAG->computeDFSResult(); - // Initialize resource counts. // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or -- cgit v1.2.3-70-g09d2 From 614dacc9102ed3bf3fe4c985b5bc12857a26bae2 Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Fri, 5 Apr 2013 00:31:34 +0000 Subject: RegisterPressure heuristics currently require signed comparisons. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178823 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineScheduler.cpp | 4 +- test/CodeGen/X86/misched-matmul.ll | 228 +++++++++++++++++++++++++++++++++++++ 2 files changed, 230 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/X86/misched-matmul.ll (limited to 'lib/CodeGen/MachineScheduler.cpp') diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index ed6b9861dc..5bd2349b50 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -1660,7 +1660,7 @@ initResourceDelta(const ScheduleDAGMI *DAG, } /// Return true if this heuristic determines order. -static bool tryLess(unsigned TryVal, unsigned CandVal, +static bool tryLess(int TryVal, int CandVal, ConvergingScheduler::SchedCandidate &TryCand, ConvergingScheduler::SchedCandidate &Cand, ConvergingScheduler::CandReason Reason) { @@ -1676,7 +1676,7 @@ static bool tryLess(unsigned TryVal, unsigned CandVal, return false; } -static bool tryGreater(unsigned TryVal, unsigned CandVal, +static bool tryGreater(int TryVal, int CandVal, ConvergingScheduler::SchedCandidate &TryCand, ConvergingScheduler::SchedCandidate &Cand, ConvergingScheduler::CandReason Reason) { diff --git a/test/CodeGen/X86/misched-matmul.ll b/test/CodeGen/X86/misched-matmul.ll new file mode 100644 index 0000000000..0f6e442b1a --- /dev/null +++ b/test/CodeGen/X86/misched-matmul.ll @@ -0,0 +1,228 @@ +; REQUIRES: asserts +; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched -stats 2>&1 | FileCheck %s +; +; Verify that register pressure heuristics are working in MachineScheduler. +; +; When we enable subtree scheduling heuristics on X86, we may need a +; flag to disable it for this test case. 
+; +; CHECK: @wrap_mul4 +; CHECK: 30 regalloc - Number of spills inserted + +define void @wrap_mul4(double* nocapture %Out, [4 x double]* nocapture %A, [4 x double]* nocapture %B) #0 { +entry: + %arrayidx1.i = getelementptr inbounds [4 x double]* %A, i64 0, i64 0 + %0 = load double* %arrayidx1.i, align 8, !tbaa !0 + %arrayidx3.i = getelementptr inbounds [4 x double]* %B, i64 0, i64 0 + %1 = load double* %arrayidx3.i, align 8, !tbaa !0 + %mul.i = fmul double %0, %1 + %arrayidx5.i = getelementptr inbounds [4 x double]* %A, i64 0, i64 1 + %2 = load double* %arrayidx5.i, align 8, !tbaa !0 + %arrayidx7.i = getelementptr inbounds [4 x double]* %B, i64 1, i64 0 + %3 = load double* %arrayidx7.i, align 8, !tbaa !0 + %mul8.i = fmul double %2, %3 + %add.i = fadd double %mul.i, %mul8.i + %arrayidx10.i = getelementptr inbounds [4 x double]* %A, i64 0, i64 2 + %4 = load double* %arrayidx10.i, align 8, !tbaa !0 + %arrayidx12.i = getelementptr inbounds [4 x double]* %B, i64 2, i64 0 + %5 = load double* %arrayidx12.i, align 8, !tbaa !0 + %mul13.i = fmul double %4, %5 + %add14.i = fadd double %add.i, %mul13.i + %arrayidx16.i = getelementptr inbounds [4 x double]* %A, i64 0, i64 3 + %6 = load double* %arrayidx16.i, align 8, !tbaa !0 + %arrayidx18.i = getelementptr inbounds [4 x double]* %B, i64 3, i64 0 + %7 = load double* %arrayidx18.i, align 8, !tbaa !0 + %mul19.i = fmul double %6, %7 + %add20.i = fadd double %add14.i, %mul19.i + %arrayidx25.i = getelementptr inbounds [4 x double]* %B, i64 0, i64 1 + %8 = load double* %arrayidx25.i, align 8, !tbaa !0 + %mul26.i = fmul double %0, %8 + %arrayidx30.i = getelementptr inbounds [4 x double]* %B, i64 1, i64 1 + %9 = load double* %arrayidx30.i, align 8, !tbaa !0 + %mul31.i = fmul double %2, %9 + %add32.i = fadd double %mul26.i, %mul31.i + %arrayidx36.i = getelementptr inbounds [4 x double]* %B, i64 2, i64 1 + %10 = load double* %arrayidx36.i, align 8, !tbaa !0 + %mul37.i = fmul double %4, %10 + %add38.i = fadd double %add32.i, %mul37.i + %arrayidx42.i = getelementptr inbounds [4 x double]* %B, i64 3, i64 1 + %11 = load double* %arrayidx42.i, align 8, !tbaa !0 + %mul43.i = fmul double %6, %11 + %add44.i = fadd double %add38.i, %mul43.i + %arrayidx49.i = getelementptr inbounds [4 x double]* %B, i64 0, i64 2 + %12 = load double* %arrayidx49.i, align 8, !tbaa !0 + %mul50.i = fmul double %0, %12 + %arrayidx54.i = getelementptr inbounds [4 x double]* %B, i64 1, i64 2 + %13 = load double* %arrayidx54.i, align 8, !tbaa !0 + %mul55.i = fmul double %2, %13 + %add56.i = fadd double %mul50.i, %mul55.i + %arrayidx60.i = getelementptr inbounds [4 x double]* %B, i64 2, i64 2 + %14 = load double* %arrayidx60.i, align 8, !tbaa !0 + %mul61.i = fmul double %4, %14 + %add62.i = fadd double %add56.i, %mul61.i + %arrayidx66.i = getelementptr inbounds [4 x double]* %B, i64 3, i64 2 + %15 = load double* %arrayidx66.i, align 8, !tbaa !0 + %mul67.i = fmul double %6, %15 + %add68.i = fadd double %add62.i, %mul67.i + %arrayidx73.i = getelementptr inbounds [4 x double]* %B, i64 0, i64 3 + %16 = load double* %arrayidx73.i, align 8, !tbaa !0 + %mul74.i = fmul double %0, %16 + %arrayidx78.i = getelementptr inbounds [4 x double]* %B, i64 1, i64 3 + %17 = load double* %arrayidx78.i, align 8, !tbaa !0 + %mul79.i = fmul double %2, %17 + %add80.i = fadd double %mul74.i, %mul79.i + %arrayidx84.i = getelementptr inbounds [4 x double]* %B, i64 2, i64 3 + %18 = load double* %arrayidx84.i, align 8, !tbaa !0 + %mul85.i = fmul double %4, %18 + %add86.i = fadd double %add80.i, %mul85.i + %arrayidx90.i = 
getelementptr inbounds [4 x double]* %B, i64 3, i64 3 + %19 = load double* %arrayidx90.i, align 8, !tbaa !0 + %mul91.i = fmul double %6, %19 + %add92.i = fadd double %add86.i, %mul91.i + %arrayidx95.i = getelementptr inbounds [4 x double]* %A, i64 1, i64 0 + %20 = load double* %arrayidx95.i, align 8, !tbaa !0 + %mul98.i = fmul double %1, %20 + %arrayidx100.i = getelementptr inbounds [4 x double]* %A, i64 1, i64 1 + %21 = load double* %arrayidx100.i, align 8, !tbaa !0 + %mul103.i = fmul double %3, %21 + %add104.i = fadd double %mul98.i, %mul103.i + %arrayidx106.i = getelementptr inbounds [4 x double]* %A, i64 1, i64 2 + %22 = load double* %arrayidx106.i, align 8, !tbaa !0 + %mul109.i = fmul double %5, %22 + %add110.i = fadd double %add104.i, %mul109.i + %arrayidx112.i = getelementptr inbounds [4 x double]* %A, i64 1, i64 3 + %23 = load double* %arrayidx112.i, align 8, !tbaa !0 + %mul115.i = fmul double %7, %23 + %add116.i = fadd double %add110.i, %mul115.i + %mul122.i = fmul double %8, %20 + %mul127.i = fmul double %9, %21 + %add128.i = fadd double %mul122.i, %mul127.i + %mul133.i = fmul double %10, %22 + %add134.i = fadd double %add128.i, %mul133.i + %mul139.i = fmul double %11, %23 + %add140.i = fadd double %add134.i, %mul139.i + %mul146.i = fmul double %12, %20 + %mul151.i = fmul double %13, %21 + %add152.i = fadd double %mul146.i, %mul151.i + %mul157.i = fmul double %14, %22 + %add158.i = fadd double %add152.i, %mul157.i + %mul163.i = fmul double %15, %23 + %add164.i = fadd double %add158.i, %mul163.i + %mul170.i = fmul double %16, %20 + %mul175.i = fmul double %17, %21 + %add176.i = fadd double %mul170.i, %mul175.i + %mul181.i = fmul double %18, %22 + %add182.i = fadd double %add176.i, %mul181.i + %mul187.i = fmul double %19, %23 + %add188.i = fadd double %add182.i, %mul187.i + %arrayidx191.i = getelementptr inbounds [4 x double]* %A, i64 2, i64 0 + %24 = load double* %arrayidx191.i, align 8, !tbaa !0 + %mul194.i = fmul double %1, %24 + %arrayidx196.i = getelementptr inbounds [4 x double]* %A, i64 2, i64 1 + %25 = load double* %arrayidx196.i, align 8, !tbaa !0 + %mul199.i = fmul double %3, %25 + %add200.i = fadd double %mul194.i, %mul199.i + %arrayidx202.i = getelementptr inbounds [4 x double]* %A, i64 2, i64 2 + %26 = load double* %arrayidx202.i, align 8, !tbaa !0 + %mul205.i = fmul double %5, %26 + %add206.i = fadd double %add200.i, %mul205.i + %arrayidx208.i = getelementptr inbounds [4 x double]* %A, i64 2, i64 3 + %27 = load double* %arrayidx208.i, align 8, !tbaa !0 + %mul211.i = fmul double %7, %27 + %add212.i = fadd double %add206.i, %mul211.i + %mul218.i = fmul double %8, %24 + %mul223.i = fmul double %9, %25 + %add224.i = fadd double %mul218.i, %mul223.i + %mul229.i = fmul double %10, %26 + %add230.i = fadd double %add224.i, %mul229.i + %mul235.i = fmul double %11, %27 + %add236.i = fadd double %add230.i, %mul235.i + %mul242.i = fmul double %12, %24 + %mul247.i = fmul double %13, %25 + %add248.i = fadd double %mul242.i, %mul247.i + %mul253.i = fmul double %14, %26 + %add254.i = fadd double %add248.i, %mul253.i + %mul259.i = fmul double %15, %27 + %add260.i = fadd double %add254.i, %mul259.i + %mul266.i = fmul double %16, %24 + %mul271.i = fmul double %17, %25 + %add272.i = fadd double %mul266.i, %mul271.i + %mul277.i = fmul double %18, %26 + %add278.i = fadd double %add272.i, %mul277.i + %mul283.i = fmul double %19, %27 + %add284.i = fadd double %add278.i, %mul283.i + %arrayidx287.i = getelementptr inbounds [4 x double]* %A, i64 3, i64 0 + %28 = load double* %arrayidx287.i, 
align 8, !tbaa !0 + %mul290.i = fmul double %1, %28 + %arrayidx292.i = getelementptr inbounds [4 x double]* %A, i64 3, i64 1 + %29 = load double* %arrayidx292.i, align 8, !tbaa !0 + %mul295.i = fmul double %3, %29 + %add296.i = fadd double %mul290.i, %mul295.i + %arrayidx298.i = getelementptr inbounds [4 x double]* %A, i64 3, i64 2 + %30 = load double* %arrayidx298.i, align 8, !tbaa !0 + %mul301.i = fmul double %5, %30 + %add302.i = fadd double %add296.i, %mul301.i + %arrayidx304.i = getelementptr inbounds [4 x double]* %A, i64 3, i64 3 + %31 = load double* %arrayidx304.i, align 8, !tbaa !0 + %mul307.i = fmul double %7, %31 + %add308.i = fadd double %add302.i, %mul307.i + %mul314.i = fmul double %8, %28 + %mul319.i = fmul double %9, %29 + %add320.i = fadd double %mul314.i, %mul319.i + %mul325.i = fmul double %10, %30 + %add326.i = fadd double %add320.i, %mul325.i + %mul331.i = fmul double %11, %31 + %add332.i = fadd double %add326.i, %mul331.i + %mul338.i = fmul double %12, %28 + %mul343.i = fmul double %13, %29 + %add344.i = fadd double %mul338.i, %mul343.i + %mul349.i = fmul double %14, %30 + %add350.i = fadd double %add344.i, %mul349.i + %mul355.i = fmul double %15, %31 + %add356.i = fadd double %add350.i, %mul355.i + %mul362.i = fmul double %16, %28 + %mul367.i = fmul double %17, %29 + %add368.i = fadd double %mul362.i, %mul367.i + %mul373.i = fmul double %18, %30 + %add374.i = fadd double %add368.i, %mul373.i + %mul379.i = fmul double %19, %31 + %add380.i = fadd double %add374.i, %mul379.i + store double %add20.i, double* %Out, align 8 + %Res.i.sroa.1.8.idx2 = getelementptr inbounds double* %Out, i64 1 + store double %add44.i, double* %Res.i.sroa.1.8.idx2, align 8 + %Res.i.sroa.2.16.idx4 = getelementptr inbounds double* %Out, i64 2 + store double %add68.i, double* %Res.i.sroa.2.16.idx4, align 8 + %Res.i.sroa.3.24.idx6 = getelementptr inbounds double* %Out, i64 3 + store double %add92.i, double* %Res.i.sroa.3.24.idx6, align 8 + %Res.i.sroa.4.32.idx8 = getelementptr inbounds double* %Out, i64 4 + store double %add116.i, double* %Res.i.sroa.4.32.idx8, align 8 + %Res.i.sroa.5.40.idx10 = getelementptr inbounds double* %Out, i64 5 + store double %add140.i, double* %Res.i.sroa.5.40.idx10, align 8 + %Res.i.sroa.6.48.idx12 = getelementptr inbounds double* %Out, i64 6 + store double %add164.i, double* %Res.i.sroa.6.48.idx12, align 8 + %Res.i.sroa.7.56.idx14 = getelementptr inbounds double* %Out, i64 7 + store double %add188.i, double* %Res.i.sroa.7.56.idx14, align 8 + %Res.i.sroa.8.64.idx16 = getelementptr inbounds double* %Out, i64 8 + store double %add212.i, double* %Res.i.sroa.8.64.idx16, align 8 + %Res.i.sroa.9.72.idx18 = getelementptr inbounds double* %Out, i64 9 + store double %add236.i, double* %Res.i.sroa.9.72.idx18, align 8 + %Res.i.sroa.10.80.idx20 = getelementptr inbounds double* %Out, i64 10 + store double %add260.i, double* %Res.i.sroa.10.80.idx20, align 8 + %Res.i.sroa.11.88.idx22 = getelementptr inbounds double* %Out, i64 11 + store double %add284.i, double* %Res.i.sroa.11.88.idx22, align 8 + %Res.i.sroa.12.96.idx24 = getelementptr inbounds double* %Out, i64 12 + store double %add308.i, double* %Res.i.sroa.12.96.idx24, align 8 + %Res.i.sroa.13.104.idx26 = getelementptr inbounds double* %Out, i64 13 + store double %add332.i, double* %Res.i.sroa.13.104.idx26, align 8 + %Res.i.sroa.14.112.idx28 = getelementptr inbounds double* %Out, i64 14 + store double %add356.i, double* %Res.i.sroa.14.112.idx28, align 8 + %Res.i.sroa.15.120.idx30 = getelementptr inbounds double* %Out, i64 15 + 
store double %add380.i, double* %Res.i.sroa.15.120.idx30, align 8 + ret void +} + +attributes #0 = { noinline nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = metadata !{metadata !"double", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} -- cgit v1.2.3-70-g09d2 From 4392f0f407fe4e2a9ec53b2560a1cbf86357c190 Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Sat, 13 Apr 2013 06:07:40 +0000 Subject: MI-Sched: schedule physreg copies. The register allocator expects minimal physreg live ranges. Schedule physreg copies accordingly. This is slightly tricky when they occur in the middle of the scheduling region. For now, this is handled by rescheduling the copy when its associated instruction is scheduled. Eventually we may instead bundle them, but only if we can preserve the bundles as parallel copies during regalloc. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179449 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/MachineScheduler.h | 5 ++- include/llvm/CodeGen/ScheduleDAG.h | 25 +++++------ lib/CodeGen/MachineScheduler.cpp | 73 ++++++++++++++++++++++++++++++++- lib/CodeGen/ScheduleDAGInstrs.cpp | 4 ++ test/CodeGen/X86/misched-copy.ll | 48 ++++++++++++++++++++++ 5 files changed, 141 insertions(+), 14 deletions(-) create mode 100644 test/CodeGen/X86/misched-copy.ll (limited to 'lib/CodeGen/MachineScheduler.cpp') diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h index 57febe7746..99cbd870ec 100644 --- a/include/llvm/CodeGen/MachineScheduler.h +++ b/include/llvm/CodeGen/MachineScheduler.h @@ -297,6 +297,10 @@ public: /// reorderable instructions. virtual void schedule(); + /// Change the position of an instruction within the basic block and update + /// live ranges and region boundary iterators. + void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos); + /// Get current register pressure for the top scheduled instructions. const IntervalPressure &getTopPressure() const { return TopPressure; } const RegPressureTracker &getTopRPTracker() const { return TopRPTracker; } @@ -362,7 +366,6 @@ protected: void updateScheduledPressure(const std::vector<unsigned> &NewMaxPressure); - void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos); bool checkSchedLimit(); void findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots, diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index 8c959da696..e13636c250 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -302,6 +302,7 @@ namespace llvm { bool isCallOp : 1; // Is a function call operand. bool isTwoAddress : 1; // Is a two-address instruction. bool isCommutable : 1; // Is a commutable instruction. + bool hasPhysRegUses : 1; // Has physreg uses. bool hasPhysRegDefs : 1; // Has physreg defs that are being used. bool hasPhysRegClobbers : 1; // Has any physreg defs, used or not. bool isPending : 1; // True once pending.
@@ -331,10 +332,10 @@ namespace llvm { NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false), isCallOp(false), - isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false), - hasPhysRegClobbers(false), isPending(false), isAvailable(false), - isScheduled(false), isScheduleHigh(false), isScheduleLow(false), - isCloned(false), SchedulingPref(Sched::None), + isTwoAddress(false), isCommutable(false), hasPhysRegUses(false), + hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), + isAvailable(false), isScheduled(false), isScheduleHigh(false), + isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None), isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} @@ -345,10 +346,10 @@ namespace llvm { NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false), isCallOp(false), - isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false), - hasPhysRegClobbers(false), isPending(false), isAvailable(false), - isScheduled(false), isScheduleHigh(false), isScheduleLow(false), - isCloned(false), SchedulingPref(Sched::None), + isTwoAddress(false), isCommutable(false), hasPhysRegUses(false), + hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), + isAvailable(false), isScheduled(false), isScheduleHigh(false), + isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None), isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} @@ -358,10 +359,10 @@ namespace llvm { NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false), isCallOp(false), - isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false), - hasPhysRegClobbers(false), isPending(false), isAvailable(false), - isScheduled(false), isScheduleHigh(false), isScheduleLow(false), - isCloned(false), SchedulingPref(Sched::None), + isTwoAddress(false), isCommutable(false), hasPhysRegUses(false), + hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), + isAvailable(false), isScheduled(false), isScheduleHigh(false), + isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None), isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 5bd2349b50..736f6deae9 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -404,6 +404,8 @@ void ScheduleDAGMI::releasePredecessors(SUnit *SU) { } } +/// This is normally called from the main scheduler loop but may also be invoked +/// by the scheduling strategy to perform additional code motion. void ScheduleDAGMI::moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos) { // Advance RegionBegin if the first instruction moves down. @@ -916,7 +918,7 @@ public: /// Represent the type of SchedCandidate found within a single queue. /// pickNodeBidirectional depends on these listed by decreasing priority. 
enum CandReason { - NoCand, SingleExcess, SingleCritical, Cluster, + NoCand, PhysRegCopy, SingleExcess, SingleCritical, Cluster, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce, SingleMax, MultiPressure, NextDefUse, NodeOrder}; @@ -1191,6 +1193,8 @@ protected: const RegPressureTracker &RPTracker, SchedCandidate &Candidate); + void reschedulePhysRegCopies(SUnit *SU, bool isTop); + #ifndef NDEBUG void traceCandidate(const SchedCandidate &Cand); #endif }; } // namespace @@ -1696,6 +1700,34 @@ static unsigned getWeakLeft(const SUnit *SU, bool isTop) { return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft; } +/// Minimize physical register live ranges. Regalloc wants them adjacent to +/// their physreg def/use. +/// +/// FIXME: This is an unnecessary check on the critical path. Most are root/leaf +/// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled +/// with the operation that produces or consumes the physreg. We'll do this when +/// regalloc has support for parallel copies. +static int biasPhysRegCopy(const SUnit *SU, bool isTop) { + const MachineInstr *MI = SU->getInstr(); + if (!MI->isCopy()) + return 0; + + unsigned ScheduledOper = isTop ? 1 : 0; + unsigned UnscheduledOper = isTop ? 0 : 1; + // If we have already scheduled the physreg produce/consumer, immediately + // schedule the copy. + if (TargetRegisterInfo::isPhysicalRegister( + MI->getOperand(ScheduledOper).getReg())) + return 1; + // If the physreg is at the boundary, defer it. Otherwise schedule it + // immediately to free the dependent. We can hoist the copy later. + bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft; + if (TargetRegisterInfo::isPhysicalRegister( + MI->getOperand(UnscheduledOper).getReg())) + return AtBoundary ? -1 : 1; + return 0; +} + /// Apply a set of heursitics to a new candidate. Heuristics are currently /// hierarchical. This may be more efficient than a graduated cost model because /// we don't need to evaluate all aspects of the model for each node in the @@ -1723,6 +1755,12 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand, TryCand.Reason = NodeOrder; return; } + + if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()), + biasPhysRegCopy(Cand.SU, Zone.isTop()), + TryCand, Cand, PhysRegCopy)) + return; + // Avoid exceeding the target's limit. if (tryLess(TryCand.RPDelta.Excess.UnitIncrease, Cand.RPDelta.Excess.UnitIncrease, TryCand, Cand, SingleExcess)) @@ -1851,6 +1889,7 @@ const char *ConvergingScheduler::getReasonStr( ConvergingScheduler::CandReason Reason) { switch (Reason) { case NoCand: return "NOCAND "; + case PhysRegCopy: return "PREG-COPY"; case SingleExcess: return "REG-EXCESS"; case SingleCritical: return "REG-CRIT "; case Cluster: return "CLUSTER "; @@ -2069,17 +2108,49 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) { return SU; } +void ConvergingScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { + + MachineBasicBlock::iterator InsertPos = SU->getInstr(); + if (!isTop) + ++InsertPos; + SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs; + + // Find already scheduled copies with a single physreg dependence and move + // them just above the scheduled instruction. + for (SmallVectorImpl<SDep>::iterator I = Deps.begin(), E = Deps.end(); + I != E; ++I) { + if (I->getKind() != SDep::Data || !TRI->isPhysicalRegister(I->getReg())) + continue; + SUnit *DepSU = I->getSUnit(); + if (isTop ?
DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1) + continue; + MachineInstr *Copy = DepSU->getInstr(); + if (!Copy->isCopy()) + continue; + DEBUG(dbgs() << " Rescheduling physreg copy "; + I->getSUnit()->dump(DAG)); + DAG->moveInstruction(Copy, InsertPos); + } +} + /// Update the scheduler's state after scheduling a node. This is the same node /// that was just returned by pickNode(). However, ScheduleDAGMI needs to update /// it's state based on the current cycle before MachineSchedStrategy does. +/// +/// FIXME: Eventually, we may bundle physreg copies rather than rescheduling +/// them here. See comments in biasPhysRegCopy. void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) { if (IsTopNode) { SU->TopReadyCycle = Top.CurrCycle; Top.bumpNode(SU); + if (SU->hasPhysRegUses) + reschedulePhysRegCopies(SU, true); } else { SU->BotReadyCycle = Bot.CurrCycle; Bot.bumpNode(SU); + if (SU->hasPhysRegDefs) + reschedulePhysRegCopies(SU, false); } } diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 71e7a21ef2..e4da6a41ee 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -262,6 +262,9 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { if (UseOp < 0) Dep = SDep(SU, SDep::Artificial); else { + // Set the hasPhysRegDefs only for physreg defs that have a use within + // the scheduling region. + SU->hasPhysRegDefs = true; Dep = SDep(SU, SDep::Data, *Alias); RegUse = UseSU->getInstr(); Dep.setMinLatency( @@ -318,6 +321,7 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { } if (!MO.isDef()) { + SU->hasPhysRegUses = true; // Either insert a new Reg2SUnits entry with an empty SUnits list, or // retrieve the existing SUnits list for this register's uses. // Push this SUnit on the use list. diff --git a/test/CodeGen/X86/misched-copy.ll b/test/CodeGen/X86/misched-copy.ll new file mode 100644 index 0000000000..d04df999b9 --- /dev/null +++ b/test/CodeGen/X86/misched-copy.ll @@ -0,0 +1,48 @@ +; RUN: llc %s -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched 2>&1 | FileCheck %s +; +; Test scheduling of copy instructions. +; +; Argument copies should be hoisted to the top of the block. +; Return copies should be sunk to the end. +; MUL_HiLo PhysReg use copies should be just above the mul. +; MUL_HiLo PhysReg def copies should be just below the mul. +; +; CHECK: *** Final schedule for BB#1 *** +; CHECK-NEXT: %EAX = COPY +; CHECK: MUL32r %vreg{{[0-6]+}}, %EAX, %EDX, %EFLAGS, %EAX; +; CHECK-NEXT: COPY %EAX; +; CHECK-NEXT: COPY %EDX; +; CHECK: DIVSSrm +define i64 @mulhoist(i32 %a, i32 %b) #0 { +entry: + br label %body + +body: + %convb = sitofp i32 %b to float + ; Generates an iMUL64r to legalize types. + %aa = zext i32 %a to i64 + %mul = mul i64 %aa, 74383 + ; Do some dependent long latency stuff. + %trunc = trunc i64 %mul to i32 + %convm = sitofp i32 %trunc to float + %divm = fdiv float %convm, 0.75 + ;%addmb = fadd float %divm, %convb + ;%divmb = fdiv float %addmb, 0.125 + ; Do some independent long latency stuff. 
+ %conva = sitofp i32 %a to float + %diva = fdiv float %conva, 0.75 + %addab = fadd float %diva, %convb + %divab = fdiv float %addab, 0.125 + br label %end + +end: + %val = fptosi float %divab to i64 + %add = add i64 %mul, %val + ret i64 %add +} + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = metadata !{metadata !"float", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} -- cgit v1.2.3-70-g09d2 From baedcd79773919f2a7c7c4dcdc37f49a807a73f3 Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Sat, 13 Apr 2013 06:07:49 +0000 Subject: MI-Sched: DEBUG formatting. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179452 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineScheduler.cpp | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'lib/CodeGen/MachineScheduler.cpp') diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 736f6deae9..0acd980114 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -1343,6 +1343,8 @@ void ConvergingScheduler::SchedBoundary::setLatencyPolicy(CandPolicy &Policy) { for (ReadyQueue::iterator I = Available.begin(), E = Available.end(); I != E; ++I) { unsigned L = getUnscheduledLatency(*I); + DEBUG(dbgs() << " " << Available.getName() + << " RemLatency SU(" << (*I)->NodeNum << ") " << L << '\n'); if (L > RemLatency) RemLatency = L; } @@ -1353,10 +1355,13 @@ void ConvergingScheduler::SchedBoundary::setLatencyPolicy(CandPolicy &Policy) { RemLatency = L; } unsigned CriticalPathLimit = Rem->CriticalPath + SchedModel->getILPWindow(); + DEBUG(dbgs() << " " << Available.getName() + << " ExpectedLatency " << ExpectedLatency + << " CP Limit " << CriticalPathLimit << '\n'); if (RemLatency + ExpectedLatency >= CriticalPathLimit && RemLatency > Rem->getMaxRemainingCount(SchedModel)) { Policy.ReduceLatency = true; - DEBUG(dbgs() << "Increase ILP: " << Available.getName() << '\n'); + DEBUG(dbgs() << " Increase ILP: " << Available.getName() << '\n'); } } @@ -1573,7 +1578,8 @@ void ConvergingScheduler::balanceZones( if ((int)(Rem->getMaxRemainingCount(SchedModel) - RemainingCritCount) > (int)SchedModel->getLatencyFactor()) { CriticalCand.Policy.ReduceResIdx = CriticalZone.CritResIdx; - DEBUG(dbgs() << "Balance " << CriticalZone.Available.getName() << " reduce " + DEBUG(dbgs() << " Balance " << CriticalZone.Available.getName() + << " reduce " << SchedModel->getProcResource(CriticalZone.CritResIdx)->Name << '\n'); } @@ -1584,7 +1590,8 @@ void ConvergingScheduler::balanceZones( if ((int)(OppositeZone.ExpectedCount - OppositeCount) > (int)SchedModel->getLatencyFactor()) { OppositeCand.Policy.DemandResIdx = CriticalZone.CritResIdx; - DEBUG(dbgs() << "Balance " << OppositeZone.Available.getName() << " demand " + DEBUG(dbgs() << " Balance " << OppositeZone.Available.getName() + << " demand " << SchedModel->getProcResource(OppositeZone.CritResIdx)->Name << '\n'); } @@ -1608,7 +1615,7 @@ void ConvergingScheduler::checkResourceLimits( if (Top.CritResIdx != Rem.CritResIdx) { TopCand.Policy.ReduceResIdx = Top.CritResIdx; BotCand.Policy.ReduceResIdx = Bot.CritResIdx; - DEBUG(dbgs() << "Reduce scheduled " + DEBUG(dbgs() << " Reduce scheduled " << 
SchedModel->getProcResource(Top.CritResIdx)->Name << '\n'); } return; @@ -1625,7 +1632,7 @@ void ConvergingScheduler::checkResourceLimits( && (Rem.CriticalPath > Top.CurrCycle + Bot.CurrCycle)) { TopCand.Policy.ReduceLatency = true; BotCand.Policy.ReduceLatency = true; - DEBUG(dbgs() << "Reduce scheduled latency " << Top.ExpectedLatency + DEBUG(dbgs() << " Reduce scheduled latency " << Top.ExpectedLatency << " + " << Bot.ExpectedLatency << '\n'); } return; @@ -1863,20 +1870,20 @@ static bool compareRPDelta(const RegPressureDelta &LHS, // Avoid increasing the max critical pressure in the scheduled region. if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease) { - DEBUG(dbgs() << "RP excess top - bot: " + DEBUG(dbgs() << " RP excess top - bot: " << (LHS.Excess.UnitIncrease - RHS.Excess.UnitIncrease) << '\n'); return LHS.Excess.UnitIncrease < RHS.Excess.UnitIncrease; } // Avoid increasing the max critical pressure in the scheduled region. if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease) { - DEBUG(dbgs() << "RP critical top - bot: " + DEBUG(dbgs() << " RP critical top - bot: " << (LHS.CriticalMax.UnitIncrease - RHS.CriticalMax.UnitIncrease) << '\n'); return LHS.CriticalMax.UnitIncrease < RHS.CriticalMax.UnitIncrease; } // Avoid increasing the max pressure of the entire region. if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease) { - DEBUG(dbgs() << "RP current top - bot: " + DEBUG(dbgs() << " RP current top - bot: " << (LHS.CurrentMax.UnitIncrease - RHS.CurrentMax.UnitIncrease) << '\n'); return LHS.CurrentMax.UnitIncrease < RHS.CurrentMax.UnitIncrease; @@ -1992,8 +1999,7 @@ void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone, static void tracePick(const ConvergingScheduler::SchedCandidate &Cand, bool IsTop) { - DEBUG(dbgs() << "Pick " << (IsTop ? "Top" : "Bot") - << " SU(" << Cand.SU->NodeNum << ") " + DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n'); } @@ -2003,10 +2009,12 @@ SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) { // efficient, but also provides the best heuristics for CriticalPSets. if (SUnit *SU = Bot.pickOnlyChoice()) { IsTopNode = false; + DEBUG(dbgs() << "Pick Top NOCAND\n"); return SU; } if (SUnit *SU = Top.pickOnlyChoice()) { IsTopNode = true; + DEBUG(dbgs() << "Pick Bot NOCAND\n"); return SU; } CandPolicy NoPolicy; @@ -2104,7 +2112,7 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) { if (SU->isBottomReady()) Bot.removeReady(SU); - DEBUG(dbgs() << "Scheduling " << *SU->getInstr()); + DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); return SU; } @@ -2250,12 +2258,12 @@ public: SUnit *SU = ReadyQ.back(); ReadyQ.pop_back(); IsTopNode = false; - DEBUG(dbgs() << "*** Scheduling " << "SU(" << SU->NodeNum << "): " - << *SU->getInstr() + DEBUG(dbgs() << "Pick node " << "SU(" << SU->NodeNum << ") " << " ILP: " << DAG->getDFSResult()->getILP(SU) << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU) << " @" << DAG->getDFSResult()->getSubtreeLevel( - DAG->getDFSResult()->getSubtreeID(SU)) << '\n'); + DAG->getDFSResult()->getSubtreeID(SU)) << '\n' + << "Scheduling " << *SU->getInstr()); return SU; } -- cgit v1.2.3-70-g09d2 From 811a372d4fbf505e3d5db3b1d18f1109de47e0a7 Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Wed, 24 Apr 2013 15:54:36 +0000 Subject: MI Sched: regpressure tracing. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180191 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineScheduler.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'lib/CodeGen/MachineScheduler.cpp') diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 0acd980114..e800361061 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -507,6 +507,14 @@ updateScheduledPressure(const std::vector<unsigned> &NewMaxPressure) { if ((int)NewMaxPressure[ID] > MaxUnits) MaxUnits = NewMaxPressure[ID]; } + DEBUG( + for (unsigned i = 0, e = NewMaxPressure.size(); i < e; ++i) { + unsigned Limit = TRI->getRegPressureSetLimit(i); + if (NewMaxPressure[i] > Limit ) { + dbgs() << " " << TRI->getRegPressureSetName(i) << ": " + << NewMaxPressure[i] << " > " << Limit << "\n"; + } + }); } /// schedule - Called back from MachineScheduler::runOnMachineFunction -- cgit v1.2.3-70-g09d2 From e38afe1e335084134f7830ba6f2208e2ddde59b4 Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Wed, 24 Apr 2013 15:54:43 +0000 Subject: MI Sched: eliminate local vreg copies. For now, we just reschedule instructions that use the copied vregs and let regalloc eliminate them. I would really like to eliminate the copies on-the-fly during scheduling, but we need a complete implementation of repairIntervalsInRange() first. The general strategy is for the register coalescer to eliminate as many global copies as possible and shrink live ranges to be extended-basic-block local. The coalescer should not have to worry about resolving local copies (e.g. it shouldn't attempt to reorder instructions). The scheduler is a much better place to deal with local interference. The coalescer side of this equation needs work. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180193 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/LiveInterval.h | 9 ++ include/llvm/CodeGen/MachineScheduler.h | 4 + include/llvm/CodeGen/ScheduleDAG.h | 5 +- include/llvm/CodeGen/ScheduleDAGInstrs.h | 3 + lib/CodeGen/MachineScheduler.cpp | 201 +++++++++++++++++++++++++++++-- test/CodeGen/ARM/misched-copy-arm.ll | 30 +++++ 6 files changed, 242 insertions(+), 10 deletions(-) create mode 100644 test/CodeGen/ARM/misched-copy-arm.ll (limited to 'lib/CodeGen/MachineScheduler.cpp') diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h index 244be9c501..cb09a49666 100644 --- a/include/llvm/CodeGen/LiveInterval.h +++ b/include/llvm/CodeGen/LiveInterval.h @@ -399,6 +399,15 @@ namespace llvm { return r != end() && r->containsRange(Start, End); } + /// True iff this live range is a single segment that lies between the + /// specified boundaries, exclusively. Vregs live across a backedge are not + /// considered local. The boundaries are expected to lie within an extended + /// basic block, so vregs that are not live out should contain no holes. + bool isLocal(SlotIndex Start, SlotIndex End) const { + return beginIndex() > Start.getBaseIndex() && + endIndex() < End.getBoundaryIndex(); + } + /// removeRange - Remove the specified range from this interval. Note that /// the range must be a single LiveRange in its entirety.
void removeRange(SlotIndex Start, SlotIndex End, diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h index 99cbd870ec..769e4b42a5 100644 --- a/include/llvm/CodeGen/MachineScheduler.h +++ b/include/llvm/CodeGen/MachineScheduler.h @@ -274,6 +274,10 @@ public: Mutations.push_back(Mutation); } + /// \brief True if an edge can be added from PredSU to SuccSU without creating + /// a cycle. + bool canAddEdge(SUnit *SuccSU, SUnit *PredSU); + /// \brief Add a DAG edge to the given SU with the given predecessor /// dependence data. /// diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index e13636c250..7cff27e172 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -727,9 +727,8 @@ namespace llvm { /// IsReachable - Checks if SU is reachable from TargetSU. bool IsReachable(const SUnit *SU, const SUnit *TargetSU); - /// WillCreateCycle - Returns true if adding an edge from SU to TargetSU - /// will create a cycle. - bool WillCreateCycle(SUnit *SU, SUnit *TargetSU); + /// WillCreateCycle - Return true if addPred(TargetSU, SU) creates a cycle. + bool WillCreateCycle(SUnit *TargetSU, SUnit *SU); /// AddPred - Updates the topological ordering to accommodate an edge /// to be added from SUnit X to SUnit Y. diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h index b71ece4305..990cac6348 100644 --- a/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -150,6 +150,9 @@ namespace llvm { virtual ~ScheduleDAGInstrs() {} + /// \brief Expose LiveIntervals for use in DAG mutators and such. + LiveIntervals *getLIS() const { return LIS; } + /// \brief Get the machine model for instruction scheduling. const TargetSchedModel *getSchedModel() const { return &SchedModel; } diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index e800361061..c4937a2ecf 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -51,7 +51,11 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden, static bool ViewMISchedDAGs = false; #endif // NDEBUG -// Experimental heuristics +// FIXME: remove this flag after initial testing. It should always be a good +// thing. +static cl::opt<bool> EnableCopyConstrain("misched-vcopy", cl::Hidden, + cl::desc("Constrain vreg copies."), cl::init(true)); + static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden, cl::desc("Enable load clustering."), cl::init(true)); @@ -323,6 +327,10 @@ ScheduleDAGMI::~ScheduleDAGMI() { delete SchedImpl; } +bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) { + return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU); +} + bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) { if (SuccSU != &ExitSU) { // Do not use WillCreateCycle, it assumes SD scheduling. @@ -914,6 +922,180 @@ void MacroFusion::apply(ScheduleDAGMI *DAG) { } } +//===----------------------------------------------------------------------===// +// CopyConstrain - DAG post-processing to encourage copy elimination. +//===----------------------------------------------------------------------===// + +namespace { +/// \brief Post-process the DAG to create weak edges from all uses of a copy to +/// the one use that defines the copy's source vreg, most likely an induction +/// variable increment. +class CopyConstrain : public ScheduleDAGMutation { + // Transient state.
+ SlotIndex RegionBeginIdx; + SlotIndex RegionEndIdx; +public: + CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {} + + virtual void apply(ScheduleDAGMI *DAG); + +protected: + void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG); +}; +} // anonymous + +/// constrainLocalCopy handles two possibilities: +/// 1) Local src: +/// I0: = dst +/// I1: src = ... +/// I2: = dst +/// I3: dst = src (copy) +/// (create pred->succ edges I0->I1, I2->I1) +/// +/// 2) Local copy: +/// I0: dst = src (copy) +/// I1: = dst +/// I2: src = ... +/// I3: = dst +/// (create pred->succ edges I1->I2, I3->I2) +/// +/// Although the MachineScheduler is currently constrained to single blocks, +/// this algorithm should handle extended blocks. An EBB is a set of +/// contiguously numbered blocks such that the previous block in the EBB is +/// always the single predecessor. +void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG) { + LiveIntervals *LIS = DAG->getLIS(); + MachineInstr *Copy = CopySU->getInstr(); + + // Check for pure vreg copies. + unsigned SrcReg = Copy->getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + return; + + unsigned DstReg = Copy->getOperand(0).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(DstReg)) + return; + + // Check if either the dest or source is local. If it's live across a back + // edge, it's not local. Note that if both vregs are live across the back + // edge, we cannot successfully contrain the copy without cyclic scheduling. + unsigned LocalReg = DstReg; + unsigned GlobalReg = SrcReg; + LiveInterval *LocalLI = &LIS->getInterval(LocalReg); + if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) { + LocalReg = SrcReg; + GlobalReg = DstReg; + LocalLI = &LIS->getInterval(LocalReg); + if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) + return; + } + LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg); + + // Find the global segment after the start of the local LI. + LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex()); + // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a + // local live range. We could create edges from other global uses to the local + // start, but the coalescer should have already eliminated these cases, so + // don't bother dealing with it. + if (GlobalSegment == GlobalLI->end()) + return; + + // If GlobalSegment is killed at the LocalLI->start, the call to find() + // returned the next global segment. But if GlobalSegment overlaps with + // LocalLI->start, then advance to the next segement. If a hole in GlobalLI + // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole. + if (GlobalSegment->contains(LocalLI->beginIndex())) + ++GlobalSegment; + + if (GlobalSegment == GlobalLI->end()) + return; + + // Check if GlobalLI contains a hole in the vicinity of LocalLI. + if (GlobalSegment != GlobalLI->begin()) { + // Two address defs have no hole. + if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->end, + GlobalSegment->start)) { + return; + } + // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise + // it would be a disconnected component in the live range. 
+ assert(llvm::prior(GlobalSegment)->start < LocalLI->beginIndex() && + "Disconnected LRG within the scheduling region."); + } + MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start); + if (!GlobalDef) + return; + + SUnit *GlobalSU = DAG->getSUnit(GlobalDef); + if (!GlobalSU) + return; + + // GlobalDef is the bottom of the GlobalLI hole. Open the hole by + // constraining the uses of the last local def to precede GlobalDef. + SmallVector<SUnit*,8> LocalUses; + const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex()); + MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def); + SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef); + for (SUnit::const_succ_iterator + I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end(); + I != E; ++I) { + if (I->getKind() != SDep::Data || I->getReg() != LocalReg) + continue; + if (I->getSUnit() == GlobalSU) + continue; + if (!DAG->canAddEdge(GlobalSU, I->getSUnit())) + return; + LocalUses.push_back(I->getSUnit()); + } + // Open the top of the GlobalLI hole by constraining any earlier global uses + // to precede the start of LocalLI. + SmallVector<SUnit*,8> GlobalUses; + MachineInstr *FirstLocalDef = + LIS->getInstructionFromIndex(LocalLI->beginIndex()); + SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef); + for (SUnit::const_pred_iterator + I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) { + if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg) + continue; + if (I->getSUnit() == FirstLocalSU) + continue; + if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit())) + return; + GlobalUses.push_back(I->getSUnit()); + } + DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n"); + // Add the weak edges. + for (SmallVectorImpl<SUnit*>::const_iterator + I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) { + DEBUG(dbgs() << " Local use SU(" << (*I)->NodeNum << ") -> SU(" + << GlobalSU->NodeNum << ")\n"); + DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak)); + } + for (SmallVectorImpl<SUnit*>::const_iterator + I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) { + DEBUG(dbgs() << " Global use SU(" << (*I)->NodeNum << ") -> SU(" + << FirstLocalSU->NodeNum << ")\n"); + DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak)); + } +} + +/// \brief Callback from DAG postProcessing to create weak edges to encourage +/// copy elimination. +void CopyConstrain::apply(ScheduleDAGMI *DAG) { + RegionBeginIdx = DAG->getLIS()->getInstructionIndex( + &*nextIfDebug(DAG->begin(), DAG->end())); + RegionEndIdx = DAG->getLIS()->getInstructionIndex( + &*priorNonDebug(DAG->end(), DAG->begin())); + + for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) { + SUnit *SU = &DAG->SUnits[Idx]; + if (!SU->getInstr()->isCopy()) + continue; + + constrainLocalCopy(SU, DAG); + } +} + //===----------------------------------------------------------------------===// // ConvergingScheduler - Implementation of the standard MachineSchedStrategy. //===----------------------------------------------------------------------===// @@ -926,7 +1108,7 @@ public: /// Represent the type of SchedCandidate found within a single queue. /// pickNodeBidirectional depends on these listed by decreasing priority.
enum CandReason { - NoCand, PhysRegCopy, SingleExcess, SingleCritical, Cluster, + NoCand, PhysRegCopy, SingleExcess, SingleCritical, Cluster, Weak, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce, SingleMax, MultiPressure, NextDefUse, NodeOrder}; @@ -1802,13 +1984,11 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand, if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU, TryCand, Cand, Cluster)) return; - // Currently, weak edges are for clustering, so we hard-code that reason. - // However, deferring the current TryCand will not change Cand's reason. - CandReason OrigReason = Cand.Reason; + + // Weak edges are for clustering and other constraints. if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()), getWeakLeft(Cand.SU, Zone.isTop()), - TryCand, Cand, Cluster)) { - Cand.Reason = OrigReason; + TryCand, Cand, Weak)) { return; } // Avoid critical resource consumption and balance the schedule. @@ -1908,6 +2088,7 @@ const char *ConvergingScheduler::getReasonStr( case SingleExcess: return "REG-EXCESS"; case SingleCritical: return "REG-CRIT "; case Cluster: return "CLUSTER "; + case Weak: return "WEAK "; case SingleMax: return "REG-MAX "; case MultiPressure: return "REG-MULTI "; case ResourceReduce: return "RES-REDUCE"; @@ -2177,6 +2358,12 @@ static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) { "-misched-topdown incompatible with -misched-bottomup"); ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler()); // Register DAG post-processors. + // + // FIXME: extend the mutation API to allow earlier mutations to instantiate + // data and pass it to later mutations. Have a single mutation that gathers + // the interesting nodes in one pass. + if (EnableCopyConstrain) + DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI)); if (EnableLoadCluster) DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI)); if (EnableMacroFusion) diff --git a/test/CodeGen/ARM/misched-copy-arm.ll b/test/CodeGen/ARM/misched-copy-arm.ll new file mode 100644 index 0000000000..4b15326008 --- /dev/null +++ b/test/CodeGen/ARM/misched-copy-arm.ll @@ -0,0 +1,30 @@ +; REQUIRES: asserts +; RUN: llc < %s -march=thumb -mcpu=swift -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; +; Loop counter copies should be eliminated. +; There is also a MUL here, but we don't care where it is scheduled. +; CHECK: postinc +; CHECK: *** Final schedule for BB#2 *** +; CHECK: t2LDRs +; CHECK: t2ADDrr +; CHECK: t2CMPrr +; CHECK: COPY +define i32 @postinc(i32 %a, i32* nocapture %d, i32 %s) nounwind { +entry: + %cmp4 = icmp eq i32 %a, 0 + br i1 %cmp4, label %for.end, label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %s.05 = phi i32 [ %mul, %for.body ], [ 0, %entry ] + %indvars.iv.next = add i32 %indvars.iv, %s + %arrayidx = getelementptr inbounds i32* %d, i32 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %s.05 + %exitcond = icmp eq i32 %indvars.iv.next, %a + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %s.0.lcssa = phi i32 [ 0, %entry ], [ %mul, %for.body ] + ret i32 %s.0.lcssa +} -- cgit v1.2.3-70-g09d2 From a264a20277a4699cdc44092767228b76234016d9 Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Wed, 24 Apr 2013 23:19:56 +0000 Subject: Fix for r180193 - MI Sched: eliminate local vreg. Fixes PR15838. 
Need to check for blocks with nothing but dbg.value. I'm not sure how to force this situation with a unit test. I tried to reduce the test case in PR15838 (1k lines of metadata) but gave up. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180227 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineScheduler.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'lib/CodeGen/MachineScheduler.cpp') diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index c4937a2ecf..32aedbee94 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -933,6 +933,8 @@ namespace { class CopyConstrain : public ScheduleDAGMutation { // Transient state. SlotIndex RegionBeginIdx; + // RegionEndIdx is the slot index of the last non-debug instruction in the + // scheduling region. So we may have RegionBeginIdx == RegionEndIdx. SlotIndex RegionEndIdx; public: CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {} @@ -1082,8 +1084,10 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG) { /// \brief Callback from DAG postProcessing to create weak edges to encourage /// copy elimination. void CopyConstrain::apply(ScheduleDAGMI *DAG) { - RegionBeginIdx = DAG->getLIS()->getInstructionIndex( - &*nextIfDebug(DAG->begin(), DAG->end())); + MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end()); + if (FirstPos == DAG->end()) + return; + RegionBeginIdx = DAG->getLIS()->getInstructionIndex(&*FirstPos); RegionEndIdx = DAG->getLIS()->getInstructionIndex( &*priorNonDebug(DAG->end(), DAG->begin())); -- cgit v1.2.3-70-g09d2 From f13fc1b23ad40407d0ee4fd0ee807a40261d639e Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Tue, 30 Apr 2013 22:10:59 +0000 Subject: MI Sched: revert a minor heuristic that snuck in with -misched-vcopy. I'll fix the heuristic in a general way in a follow-up commit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180815 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineScheduler.cpp | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'lib/CodeGen/MachineScheduler.cpp') diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 32aedbee94..fff6b2b4c0 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -1990,9 +1990,15 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand, return; // Weak edges are for clustering and other constraints. + // + // Deferring TryCand here does not change Cand's reason. This is good in the + // sense that a bad candidate shouldn't affect a previous candidate's + // goodness, but bad in that it is assymetric and depends on queue order. + CandReason OrigReason = Cand.Reason; if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()), getWeakLeft(Cand.SU, Zone.isTop()), TryCand, Cand, Weak)) { + Cand.Reason = OrigReason; return; } // Avoid critical resource consumption and balance the schedule. -- cgit v1.2.3-70-g09d2
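A note on the extension point the patches above share: CopyConstrain, LoadClusterMutation, and MacroFusion are all ScheduleDAGMutations, registered with ScheduleDAGMI::addMutation() in createConvergingSched() and run over the completed dependence graph before scheduling begins, where they may add hard or weak edges. Below is a minimal sketch of a custom mutation in the same style, using only hooks that appear in the diffs above (apply(), canAddEdge(), addEdge(), SDep::Weak, and SUnits). The class name and its copy-clustering heuristic are hypothetical illustrations, not code from any of these commits, and it assumes the includes and namespace context already present in MachineScheduler.cpp:

namespace {
/// Hypothetical example mutation: chain COPY instructions together with weak
/// edges so the scheduler prefers to issue them back to back. Weak edges are
/// soft hints: a candidate that violates one is only penalized (via the Weak
/// reason in tryCandidate), never made unschedulable.
class ClusterCopiesMutation : public ScheduleDAGMutation {
public:
  virtual void apply(ScheduleDAGMI *DAG) {
    SUnit *PrevCopySU = NULL;
    for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
      SUnit *SU = &DAG->SUnits[Idx];
      // Only copies participate in this (hypothetical) clustering.
      if (!SU->getInstr()->isCopy())
        continue;
      // Guard against creating a cycle, the same way constrainLocalCopy
      // checks canAddEdge before every addEdge.
      if (PrevCopySU && DAG->canAddEdge(SU, PrevCopySU))
        DAG->addEdge(SU, SDep(PrevCopySU, SDep::Weak));
      PrevCopySU = SU;
    }
  }
};
} // anonymous

Such a mutation would be registered next to the in-tree ones, e.g. DAG->addMutation(new ClusterCopiesMutation()), typically guarded by a cl::opt in the same pattern as the EnableCopyConstrain/-misched-vcopy flag; mutations run in registration order, after the dependence graph is built, which is why CopyConstrain can rely on the DAG's data and anti edges already being present.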