Skip to content

Commit 6afceba

Browse files
authored
[AMDGPU][IGLP] SingleWaveOpt: Cache DSW Counters from PreRA (#67759)
Save the DSW counters from PreRA scheduling. While this avoids recalculation in the post-RA pass, that isn't the main purpose: it is required because of physical-register dependencies in post-RA scheduling — they alter the DAG such that our counters may become incorrect, which in turn alters the layout of the pipeline. By preserving the values from PreRA, we can be sure that we accurately construct the pipeline. Additionally, remove a bad assert in SharesPredWithPrevNthGroup — it is possible that we will have an empty cache if OtherGroup has no elements with a V_PERM pred (possible if the V_PERM SchedGroup is empty).
1 parent a9e9727 commit 6afceba

File tree

6 files changed

+186
-166
lines changed

6 files changed

+186
-166
lines changed

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

Lines changed: 76 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -850,7 +850,8 @@ class IGLPStrategy {
850850
// Add SchedGroups to \p Pipeline to implement this Strategy.
851851
virtual void applyIGLPStrategy(
852852
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
853-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
853+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
854+
bool IsPostRA) = 0;
854855

855856
// Returns true if this strategy should be applied to a ScheduleDAG.
856857
virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
@@ -868,7 +869,8 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
868869
public:
869870
void applyIGLPStrategy(
870871
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
871-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
872+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
873+
bool IsPostRA) override;
872874

873875
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
874876

@@ -880,7 +882,8 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
880882

881883
void MFMASmallGemmOpt::applyIGLPStrategy(
882884
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
883-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
885+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
886+
bool IsPostRA) {
884887
// Count the number of MFMA instructions.
885888
unsigned MFMACount = 0;
886889
for (const MachineInstr &I : *DAG)
@@ -1076,9 +1079,12 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
10761079
Cache->push_back(Pred.getSUnit());
10771080
}
10781081
}
1082+
1083+
// If the other group has no PERM preds, then this group won't share any
1084+
if (!Cache->size())
1085+
return false;
10791086
}
10801087

1081-
assert(Cache->size());
10821088
auto DAG = SyncPipe[0].DAG;
10831089
// Does the previous DS_WRITE share a V_PERM predecessor with this
10841090
// VMEM_READ
@@ -1095,7 +1101,8 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
10951101
public:
10961102
void applyIGLPStrategy(
10971103
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1098-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
1104+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1105+
bool IsPostRA) override;
10991106

11001107
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
11011108

@@ -1105,14 +1112,20 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
11051112
}
11061113
};
11071114

1115+
static unsigned DSWCount = 0;
1116+
static unsigned DSWWithPermCount = 0;
1117+
static unsigned DSWWithSharedVMEMCount = 0;
1118+
11081119
void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
11091120
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1110-
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
1121+
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1122+
bool IsPostRA) {
11111123
unsigned MFMACount = 0;
1112-
unsigned DSWCount = 0;
1113-
unsigned DSWWithPermCount = 0;
1114-
unsigned DSWWithSharedVMEMCount = 0;
11151124
unsigned DSRCount = 0;
1125+
1126+
assert((IsPostRA ||
1127+
DSWCount == DSWWithPermCount == DSWWithSharedVMEMCount == 0) &&
1128+
"DSWCounters should be zero in pre-RA scheduling!");
11161129
SmallVector<SUnit *, 6> DSWithPerms;
11171130
for (auto &SU : DAG->SUnits) {
11181131
auto I = SU.getInstr();
@@ -1121,7 +1134,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
11211134
else if (TII->isDS(*I)) {
11221135
if (I->mayLoad())
11231136
++DSRCount;
1124-
else if (I->mayStore()) {
1137+
else if (I->mayStore() && !IsPostRA) {
11251138
++DSWCount;
11261139
for (auto Pred : SU.Preds) {
11271140
if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -1133,56 +1146,59 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
11331146
}
11341147
}
11351148
}
1136-
DSWWithPermCount = DSWithPerms.size();
1137-
auto I = DSWithPerms.begin();
1138-
auto E = DSWithPerms.end();
1139-
1140-
// Get the count of DS_WRITES with V_PERM predecessors which
1141-
// have loop carried dependencies (WAR) on the same VMEM_READs.
1142-
// We consider partial overlap as a miss -- in other words,
1143-
// for a given DS_W, we only consider another DS_W as matching
1144-
// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
1145-
// for every V_PERM pred of this DS_W.
1146-
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
1147-
SmallVector<SUnit *, 6> Counted;
1148-
for (; I != E; I++) {
1149-
SUnit *Cand = nullptr;
1150-
bool MissedAny = false;
1151-
for (auto &Pred : (*I)->Preds) {
1152-
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
1153-
continue;
11541149

1155-
if (Cand && llvm::is_contained(Counted, Cand))
1156-
break;
1157-
1158-
for (auto &Succ : Pred.getSUnit()->Succs) {
1159-
auto MI = Succ.getSUnit()->getInstr();
1160-
if (!TII->isVMEM(*MI) || !MI->mayLoad())
1150+
if (!IsPostRA) {
1151+
DSWWithPermCount = DSWithPerms.size();
1152+
auto I = DSWithPerms.begin();
1153+
auto E = DSWithPerms.end();
1154+
1155+
// Get the count of DS_WRITES with V_PERM predecessors which
1156+
// have loop carried dependencies (WAR) on the same VMEM_READs.
1157+
// We consider partial overlap as a miss -- in other words,
1158+
// for a given DS_W, we only consider another DS_W as matching
1159+
// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
1160+
// for every V_PERM pred of this DS_W.
1161+
DenseMap<MachineInstr *, SUnit *> VMEMLookup;
1162+
SmallVector<SUnit *, 6> Counted;
1163+
for (; I != E; I++) {
1164+
SUnit *Cand = nullptr;
1165+
bool MissedAny = false;
1166+
for (auto &Pred : (*I)->Preds) {
1167+
if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
11611168
continue;
11621169

1163-
if (MissedAny || !VMEMLookup.size()) {
1164-
MissedAny = true;
1165-
VMEMLookup[MI] = *I;
1166-
continue;
1167-
}
1170+
if (Cand && llvm::is_contained(Counted, Cand))
1171+
break;
11681172

1169-
if (!VMEMLookup.contains(MI)) {
1170-
MissedAny = true;
1171-
VMEMLookup[MI] = *I;
1172-
continue;
1173-
}
1173+
for (auto &Succ : Pred.getSUnit()->Succs) {
1174+
auto MI = Succ.getSUnit()->getInstr();
1175+
if (!TII->isVMEM(*MI) || !MI->mayLoad())
1176+
continue;
11741177

1175-
Cand = VMEMLookup[MI];
1176-
if (llvm::is_contained(Counted, Cand)) {
1177-
MissedAny = true;
1178-
break;
1178+
if (MissedAny || !VMEMLookup.size()) {
1179+
MissedAny = true;
1180+
VMEMLookup[MI] = *I;
1181+
continue;
1182+
}
1183+
1184+
if (!VMEMLookup.contains(MI)) {
1185+
MissedAny = true;
1186+
VMEMLookup[MI] = *I;
1187+
continue;
1188+
}
1189+
1190+
Cand = VMEMLookup[MI];
1191+
if (llvm::is_contained(Counted, Cand)) {
1192+
MissedAny = true;
1193+
break;
1194+
}
11791195
}
11801196
}
1181-
}
1182-
if (!MissedAny && Cand) {
1183-
DSWWithSharedVMEMCount += 2;
1184-
Counted.push_back(Cand);
1185-
Counted.push_back(*I);
1197+
if (!MissedAny && Cand) {
1198+
DSWWithSharedVMEMCount += 2;
1199+
Counted.push_back(Cand);
1200+
Counted.push_back(*I);
1201+
}
11861202
}
11871203
}
11881204

@@ -1398,7 +1414,11 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
13981414
// first created SchedGroup first.
13991415
bool IsBottomUp = 1;
14001416

1417+
// Whether the mutation is being applied to post RA scheduling
1418+
bool IsPostRA = false;
1419+
14011420
IGroupLPDAGMutation() = default;
1421+
IGroupLPDAGMutation(bool IsPostRA) : IsPostRA(IsPostRA) {}
14021422
};
14031423

14041424
unsigned SchedGroup::NumSchedGroups = 0;
@@ -1686,16 +1706,16 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
16861706
auto S = createIGLPStrategy(StrategyID, DAG, TII);
16871707
if (S->shouldApplyStrategy(DAG)) {
16881708
IsBottomUp = S->IsBottomUp;
1689-
S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
1709+
S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsPostRA);
16901710
}
16911711
}
16921712

16931713
} // namespace
16941714

16951715
namespace llvm {
16961716

1697-
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
1698-
return std::make_unique<IGroupLPDAGMutation>();
1717+
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsPostRA) {
1718+
return std::make_unique<IGroupLPDAGMutation>(IsPostRA);
16991719
}
17001720

17011721
} // end namespace llvm

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414

1515
namespace llvm {
1616

17-
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation();
17+
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsPostRA);
1818

1919
} // namespace llvm
2020

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -440,7 +440,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
440440
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
441441
if (ST.shouldClusterStores())
442442
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
443-
DAG->addMutation(createIGroupLPDAGMutation());
443+
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
444444
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
445445
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
446446
return DAG;
@@ -450,7 +450,7 @@ static ScheduleDAGInstrs *
450450
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
451451
ScheduleDAGMILive *DAG =
452452
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
453-
DAG->addMutation(createIGroupLPDAGMutation());
453+
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
454454
return DAG;
455455
}
456456

@@ -905,7 +905,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
905905
if (ST.shouldClusterStores())
906906
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
907907
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
908-
DAG->addMutation(createIGroupLPDAGMutation());
908+
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
909909
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
910910
DAG->addMutation(createVOPDPairingMutation());
911911
return DAG;

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -706,7 +706,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
706706
return false;
707707

708708
SavedMutations.swap(DAG.Mutations);
709-
DAG.addMutation(createIGroupLPDAGMutation());
709+
DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
710710

711711
InitialOccupancy = DAG.MinOccupancy;
712712
// Aggressivly try to reduce register pressure in the unclustered high RP
@@ -843,7 +843,7 @@ bool GCNSchedStage::initGCNRegion() {
843843
StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
844844
SavedMutations.clear();
845845
SavedMutations.swap(DAG.Mutations);
846-
DAG.addMutation(createIGroupLPDAGMutation());
846+
DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
847847
}
848848

849849
return true;
@@ -1557,7 +1557,7 @@ void GCNPostScheduleDAGMILive::schedule() {
15571557
if (HasIGLPInstrs) {
15581558
SavedMutations.clear();
15591559
SavedMutations.swap(Mutations);
1560-
addMutation(createIGroupLPDAGMutation());
1560+
addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
15611561
}
15621562

15631563
ScheduleDAGMI::schedule();

0 commit comments

Comments (0)