Skip to content

Commit 6afceba

Browse files
authored
[AMDGPU][IGLP] SingleWaveOpt: Cache DSW Counters from PreRA (#67759)
Save the DSW counters from PreRA scheduling. While this avoids recalculation in the post-RA pass, that isn't the main purpose: it is required because of physical-register dependencies in post-RA scheduling — they alter the DAG such that our counters may become incorrect, which in turn alters the layout of the pipeline. By preserving the values from PreRA, we can be sure that we accurately construct the pipeline. Additionally, remove a bad assert in SharesPredWithPrevNthGroup — it is possible that we will have an empty cache if OtherGroup has no elements with a V_PERM pred (possible if the V_PERM SchedGroup is empty).
1 parent a9e9727 commit 6afceba

File tree

6 files changed

+186
-166
lines changed

6 files changed

+186
-166
lines changed

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

Lines changed: 76 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -850,7 +850,8 @@ class IGLPStrategy {
850850
// Add SchedGroups to \p Pipeline to implement this Strategy.
851851
virtual void applyIGLPStrategy(
852852
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
853-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
853+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
854+
bool IsPostRA) = 0;
854855

855856
// Returns true if this strategy should be applied to a ScheduleDAG.
856857
virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
@@ -868,7 +869,8 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
868869
public:
869870
void applyIGLPStrategy(
870871
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
871-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
872+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
873+
bool IsPostRA) override;
872874

873875
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
874876

@@ -880,7 +882,8 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
880882

881883
void MFMASmallGemmOpt::applyIGLPStrategy(
882884
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
883-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
885+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
886+
bool IsPostRA) {
884887
// Count the number of MFMA instructions.
885888
unsigned MFMACount = 0;
886889
for (const MachineInstr &I : *DAG)
@@ -1076,9 +1079,12 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
10761079
Cache->push_back(Pred.getSUnit());
10771080
}
10781081
}
1082+
1083+
// If the other group has no PERM preds, then this group won't share any
1084+
if (!Cache->size())
1085+
return false;
10791086
}
10801087

1081-
assert(Cache->size());
10821088
auto DAG = SyncPipe[0].DAG;
10831089
// Does the previous DS_WRITE share a V_PERM predecessor with this
10841090
// VMEM_READ
@@ -1095,7 +1101,8 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
10951101
public:
10961102
void applyIGLPStrategy(
10971103
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1098-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
1104+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1105+
bool IsPostRA) override;
10991106

11001107
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
11011108

@@ -1105,14 +1112,20 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
11051112
}
11061113
};
11071114

1115+
static unsigned DSWCount = 0;
1116+
static unsigned DSWWithPermCount = 0;
1117+
static unsigned DSWWithSharedVMEMCount = 0;
1118+
11081119
void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
11091120
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1110-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
1121+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1122+
bool IsPostRA) {
11111123
unsigned MFMACount = 0;
1112-
unsigned DSWCount = 0;
1113-
unsigned DSWWithPermCount = 0;
1114-
unsigned DSWWithSharedVMEMCount = 0;
11151124
unsigned DSRCount = 0;
1125+
1126+
assert((IsPostRA ||
1127+
DSWCount == DSWWithPermCount == DSWWithSharedVMEMCount == 0) &&
1128+
"DSWCounters should be zero in pre-RA scheduling!");
11161129
SmallVector<SUnit *, 6> DSWithPerms;
11171130
for (auto &SU : DAG->SUnits) {
11181131
auto I = SU.getInstr();
@@ -1121,7 +1134,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
11211134
else if (TII->isDS(*I)) {
11221135
if (I->mayLoad())
11231136
++DSRCount;
1124-
else if (I->mayStore()) {
1137+
else if (I->mayStore() && !IsPostRA) {
11251138
++DSWCount;
11261139
for (auto Pred : SU.Preds) {
11271140
if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -1133,56 +1146,59 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
11331146
}
11341147
}
11351148
}
1136-
DSWWithPermCount = DSWithPerms.size();
1137-
auto I = DSWithPerms.begin();
1138-
auto E = DSWithPerms.end();
1139-
1140-
// Get the count of DS_WRITES with V_PERM predecessors which
1141-
// have loop carried dependencies (WAR) on the same VMEM_READs.
1142-
// We consider partial overlap as a miss -- in other words,
1143-
// for a given DS_W, we only consider another DS_W as matching
1144-
// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
1145-
// for every V_PERM pred of this DS_W.
1146-
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
1147-
SmallVector<SUnit *, 6> Counted;
1148-
for (; I != E; I++) {
1149-
SUnit *Cand = nullptr;
1150-
bool MissedAny = false;
1151-
for (auto &Pred : (*I)->Preds) {
1152-
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
1153-
continue;
11541149

1155-
if (Cand && llvm::is_contained(Counted, Cand))
1156-
break;
1157-
1158-
for (auto &Succ : Pred.getSUnit()->Succs) {
1159-
auto MI = Succ.getSUnit()->getInstr();
1160-
if (!TII->isVMEM(*MI) || !MI->mayLoad())
1150+
if (!IsPostRA) {
1151+
DSWWithPermCount = DSWithPerms.size();
1152+
auto I = DSWithPerms.begin();
1153+
auto E = DSWithPerms.end();
1154+
1155+
// Get the count of DS_WRITES with V_PERM predecessors which
1156+
// have loop carried dependencies (WAR) on the same VMEM_READs.
1157+
// We consider partial overlap as a miss -- in other words,
1158+
// for a given DS_W, we only consider another DS_W as matching
1159+
// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
1160+
// for every V_PERM pred of this DS_W.
1161+
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
1162+
SmallVector<SUnit *, 6> Counted;
1163+
for (; I != E; I++) {
1164+
SUnit *Cand = nullptr;
1165+
bool MissedAny = false;
1166+
for (auto &Pred : (*I)->Preds) {
1167+
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
11611168
continue;
11621169

1163-
if (MissedAny || !VMEMLookup.size()) {
1164-
MissedAny = true;
1165-
VMEMLookup[MI] = *I;
1166-
continue;
1167-
}
1170+
if (Cand && llvm::is_contained(Counted, Cand))
1171+
break;
11681172

1169-
if (!VMEMLookup.contains(MI)) {
1170-
MissedAny = true;
1171-
VMEMLookup[MI] = *I;
1172-
continue;
1173-
}
1173+
for (auto &Succ : Pred.getSUnit()->Succs) {
1174+
auto MI = Succ.getSUnit()->getInstr();
1175+
if (!TII->isVMEM(*MI) || !MI->mayLoad())
1176+
continue;
11741177

1175-
Cand = VMEMLookup[MI];
1176-
if (llvm::is_contained(Counted, Cand)) {
1177-
MissedAny = true;
1178-
break;
1178+
if (MissedAny || !VMEMLookup.size()) {
1179+
MissedAny = true;
1180+
VMEMLookup[MI] = *I;
1181+
continue;
1182+
}
1183+
1184+
if (!VMEMLookup.contains(MI)) {
1185+
MissedAny = true;
1186+
VMEMLookup[MI] = *I;
1187+
continue;
1188+
}
1189+
1190+
Cand = VMEMLookup[MI];
1191+
if (llvm::is_contained(Counted, Cand)) {
1192+
MissedAny = true;
1193+
break;
1194+
}
11791195
}
11801196
}
1181-
}
1182-
if (!MissedAny && Cand) {
1183-
DSWWithSharedVMEMCount += 2;
1184-
Counted.push_back(Cand);
1185-
Counted.push_back(*I);
1197+
if (!MissedAny && Cand) {
1198+
DSWWithSharedVMEMCount += 2;
1199+
Counted.push_back(Cand);
1200+
Counted.push_back(*I);
1201+
}
11861202
}
11871203
}
11881204

@@ -1398,7 +1414,11 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
13981414
// first created SchedGroup first.
13991415
bool IsBottomUp = 1;
14001416

1417+
// Whether the mutation is being applied to post RA scheduling
1418+
bool IsPostRA = false;
1419+
14011420
IGroupLPDAGMutation() = default;
1421+
IGroupLPDAGMutation(bool IsPostRA) : IsPostRA(IsPostRA) {}
14021422
};
14031423

14041424
unsigned SchedGroup::NumSchedGroups = 0;
@@ -1686,16 +1706,16 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
16861706
auto S = createIGLPStrategy(StrategyID, DAG, TII);
16871707
if (S->shouldApplyStrategy(DAG)) {
16881708
IsBottomUp = S->IsBottomUp;
1689-
S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
1709+
S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsPostRA);
16901710
}
16911711
}
16921712

16931713
} // namespace
16941714

16951715
namespace llvm {
16961716

1697-
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
1698-
return std::make_unique<IGroupLPDAGMutation>();
1717+
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsPostRA) {
1718+
return std::make_unique<IGroupLPDAGMutation>(IsPostRA);
16991719
}
17001720

17011721
} // end namespace llvm

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414

1515
namespace llvm {
1616

17-
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation();
17+
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsPostRA);
1818

1919
} // namespace llvm
2020

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -440,7 +440,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
440440
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
441441
if (ST.shouldClusterStores())
442442
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
443-
DAG->addMutation(createIGroupLPDAGMutation());
443+
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
444444
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
445445
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
446446
return DAG;
@@ -450,7 +450,7 @@ static ScheduleDAGInstrs *
450450
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
451451
ScheduleDAGMILive *DAG =
452452
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
453-
DAG->addMutation(createIGroupLPDAGMutation());
453+
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
454454
return DAG;
455455
}
456456

@@ -905,7 +905,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
905905
if (ST.shouldClusterStores())
906906
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
907907
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
908-
DAG->addMutation(createIGroupLPDAGMutation());
908+
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
909909
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
910910
DAG->addMutation(createVOPDPairingMutation());
911911
return DAG;

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -706,7 +706,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
706706
return false;
707707

708708
SavedMutations.swap(DAG.Mutations);
709-
DAG.addMutation(createIGroupLPDAGMutation());
709+
DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
710710

711711
InitialOccupancy = DAG.MinOccupancy;
712712
// Aggressivly try to reduce register pressure in the unclustered high RP
@@ -843,7 +843,7 @@ bool GCNSchedStage::initGCNRegion() {
843843
StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
844844
SavedMutations.clear();
845845
SavedMutations.swap(DAG.Mutations);
846-
DAG.addMutation(createIGroupLPDAGMutation());
846+
DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
847847
}
848848

849849
return true;
@@ -1557,7 +1557,7 @@ void GCNPostScheduleDAGMILive::schedule() {
15571557
if (HasIGLPInstrs) {
15581558
SavedMutations.clear();
15591559
SavedMutations.swap(Mutations);
1560-
addMutation(createIGroupLPDAGMutation());
1560+
addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
15611561
}
15621562

15631563
ScheduleDAGMI::schedule();

0 commit comments

Comments (0)