[AMDGPU] Teach iterative schedulers about IGLP #134953
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

Changes

This adds IGLP mutation to the iterative schedulers (`gcn-iterative-max-occupancy-experimental`, `gcn-iterative-minreg`, and `gcn-iterative-ilp`).

Patch is 64.64 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/134953.diff

9 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index bbd262748d680..19bbe1be727d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2698,6 +2698,13 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
namespace llvm {
+namespace AMDGPU {
+bool isIGLPMutationOnly(unsigned Opcode) {
+ return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT;
+}
+
+} // end namespace AMDGPU
+
/// \p Phase specifes whether or not this is a reentry into the
/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
/// same scheduling region (e.g. pre and post-RA scheduling / multiple
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index aff7096f26d67..b7e8c711c6fcc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -18,6 +18,11 @@ namespace llvm {
namespace AMDGPU {
// The current phase of instruction scheduling
enum class SchedulingPhase { Initial, PreRAReentry, PostRA };
+
+// Return true if the instruction is mutually exclusive with all non-IGLP DAG
+// mutations, requiring all other mutations to be disabled.
+bool isIGLPMutationOnly(unsigned Opcode);
+
} // namespace AMDGPU
std::unique_ptr<ScheduleDAGMutation>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4b5c70f09155f..b469fa6226811 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -602,6 +602,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
return DAG;
}
@@ -613,12 +614,15 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
return DAG;
}
static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
- return new GCNIterativeScheduler(C,
- GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
+ auto *DAG = new GCNIterativeScheduler(
+ C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
+ DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
+ return DAG;
}
static ScheduleDAGInstrs *
@@ -629,6 +633,7 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+ DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
return DAG;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index da065e8d8cb6b..68e07f007e0f1 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "GCNIterativeScheduler.h"
+#include "AMDGPUIGroupLP.h"
#include "GCNSchedStrategy.h"
#include "SIMachineFunctionInfo.h"
@@ -118,6 +119,25 @@ void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
}
#endif
+void GCNIterativeScheduler::swapIGLPMutations(const Region &R, bool IsReentry) {
+ bool HasIGLPInstrs = false;
+
+ for (MachineBasicBlock::iterator I = R.Begin; I != R.End; I++) {
+ if (AMDGPU::isIGLPMutationOnly(I->getOpcode())) {
+ HasIGLPInstrs = true;
+ break;
+ }
+ }
+
+ if (HasIGLPInstrs) {
+ SavedMutations.clear();
+ SavedMutations.swap(Mutations);
+ auto SchedPhase = IsReentry ? AMDGPU::SchedulingPhase::PreRAReentry
+ : AMDGPU::SchedulingPhase::Initial;
+ addMutation(createIGroupLPDAGMutation(SchedPhase));
+ }
+}
+
// DAG builder helper
class GCNIterativeScheduler::BuildDAG {
GCNIterativeScheduler &Sch;
@@ -125,14 +145,15 @@ class GCNIterativeScheduler::BuildDAG {
SmallVector<SUnit*, 8> BotRoots;
public:
- BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
- : Sch(_Sch) {
+ BuildDAG(const Region &R, GCNIterativeScheduler &_Sch, bool IsReentry = false)
+ : Sch(_Sch) {
auto *BB = R.Begin->getParent();
Sch.BaseClass::startBlock(BB);
Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
-
+ Sch.swapIGLPMutations(R, IsReentry);
Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
/*TrackLaneMask*/true);
+ Sch.postProcessDAG();
Sch.Topo.InitDAGTopologicalSorting();
Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
}
@@ -432,13 +453,15 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
auto NewOcc = TargetOcc;
for (auto *R : Regions) {
+ // Always build the DAG to add mutations
+ BuildDAG DAG(*R, *this);
+
if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
- break;
+ continue;
LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
- BuildDAG DAG(*R, *this);
const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
const auto MaxRP = getSchedulePressure(*R, MinSchedule);
LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
@@ -469,8 +492,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
- if (TryMaximizeOccupancy && Occ < TgtOcc)
+ bool IsReentry = false;
+ if (TryMaximizeOccupancy && Occ < TgtOcc) {
Occ = tryMaximizeOccupancy(TgtOcc);
+ IsReentry = true;
+ }
// This is really weird but for some magic scheduling regions twice
// gives performance improvement
@@ -489,7 +515,8 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc);
for (auto *R : Regions) {
OverrideLegacyStrategy Ovr(*R, LStrgy, *this);
-
+ IsReentry |= I > 0;
+ swapIGLPMutations(*R, IsReentry);
Ovr.schedule();
const auto RP = getRegionPressure(*R);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
@@ -556,8 +583,11 @@ void GCNIterativeScheduler::scheduleILP(
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
- if (TryMaximizeOccupancy && Occ < TgtOcc)
+ bool IsReentry = false;
+ if (TryMaximizeOccupancy && Occ < TgtOcc) {
Occ = tryMaximizeOccupancy(TgtOcc);
+ IsReentry = true;
+ }
TgtOcc = std::min(Occ, TgtOcc);
LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, "
@@ -566,7 +596,7 @@ void GCNIterativeScheduler::scheduleILP(
unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy());
for (auto *R : Regions) {
- BuildDAG DAG(*R, *this);
+ BuildDAG DAG(*R, *this, IsReentry);
const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this);
const auto RP = getSchedulePressure(*R, ILPSchedule);
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
index c0228540b7a2f..f731b1fc7e0df 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -77,6 +77,8 @@ class GCNIterativeScheduler : public ScheduleDAGMILive {
const StrategyKind Strategy;
mutable GCNUpwardRPTracker UPTracker;
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
class BuildDAG;
class OverrideLegacyStrategy;
@@ -91,6 +93,7 @@ class GCNIterativeScheduler : public ScheduleDAGMILive {
return getRegionPressure(R.Begin, R.End);
}
+ void swapIGLPMutations(const Region &R, bool IsReentry);
void setBestSchedule(Region &R,
ScheduleRef Schedule,
const GCNRegPressure &MaxRP = GCNRegPressure());
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ea9bc88bbe86b..5e5d06a40aad4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -188,12 +188,6 @@ static void getRegisterPressures(
Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
}
-// Return true if the instruction is mutually exclusive with all non-IGLP DAG
-// mutations, requiring all other mutations to be disabled.
-static bool isIGLPMutationOnly(unsigned Opcode) {
- return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT;
-}
-
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
const RegPressureTracker &RPTracker,
@@ -1163,7 +1157,7 @@ bool GCNSchedStage::initGCNRegion() {
StageID == GCNSchedStageID::ILPInitialSchedule) {
for (auto &I : DAG) {
Unsched.push_back(&I);
- if (isIGLPMutationOnly(I.getOpcode()))
+ if (AMDGPU::isIGLPMutationOnly(I.getOpcode()))
DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
}
} else {
@@ -2048,7 +2042,7 @@ void GCNScheduleDAGMILive::updateRegionBoundaries(
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
- return isIGLPMutationOnly(MI->getOpcode());
+ return AMDGPU::isIGLPMutationOnly(MI->getOpcode());
});
}
diff --git a/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll
index 1113acb3c0305..ba1cb9b26dec6 100644
--- a/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll
+++ b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll
@@ -1,4 +1,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 -misched=gcn-iterative-ilp < %s | FileCheck %s
; Test should not result in build failure
; CHECK-LABEL: shouldNotReApply
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
new file mode 100644
index 0000000000000..0764cd5d34d75
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
@@ -0,0 +1,933 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-minreg < %s | FileCheck -check-prefix=GCN-MINREG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck -check-prefix=GCN-MAXOCC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-ilp < %s | FileCheck -check-prefix=GCN-ILP %s
+
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
+; GCN-MINREG: ; %bb.0: ; %entry
+; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
+; GCN-MINREG-NEXT: v_mov_b32_e32 v2, 1.0
+; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 2.0
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT: v_add_u32_e32 v4, s0, v0
+; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:112
+; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:96
+; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:80
+; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:64
+; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4
+; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:16
+; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:32
+; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:48
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-MINREG-NEXT: v_add_u32_e32 v5, s1, v0
+; GCN-MINREG-NEXT: v_mov_b32_e32 v0, s1
+; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT: s_nop 7
+; GCN-MINREG-NEXT: s_nop 7
+; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112
+; GCN-MINREG-NEXT: ds_write_b128 v5, a[24:27] offset:96
+; GCN-MINREG-NEXT: ds_write_b128 v5, a[20:23] offset:80
+; GCN-MINREG-NEXT: ds_write_b128 v5, a[16:19] offset:64
+; GCN-MINREG-NEXT: ds_write_b128 v5, a[12:15] offset:48
+; GCN-MINREG-NEXT: ds_write_b128 v5, a[8:11] offset:32
+; GCN-MINREG-NEXT: ds_write_b128 v5, a[4:7] offset:16
+; GCN-MINREG-NEXT: ds_write_b128 v5, a[0:3]
+; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:8304
+; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:8288
+; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:8272
+; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:8256
+; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:8240
+; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:8224
+; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:8208
+; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:8192
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT: s_nop 7
+; GCN-MINREG-NEXT: s_nop 7
+; GCN-MINREG-NEXT: s_nop 2
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:8288
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:8304
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:8256
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:8272
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:8224
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:8240
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:8192
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:8208
+; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:24688
+; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:24672
+; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:24656
+; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:24640
+; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:24624
+; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:24608
+; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:24592
+; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:24576
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT: s_nop 7
+; GCN-MINREG-NEXT: s_nop 7
+; GCN-MINREG-NEXT: s_nop 2
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:16480
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:16496
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:16448
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:16464
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:16416
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:16432
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:16384
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:16400
+; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:49264
+; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:49248
+; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:49232
+; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:49216
+; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:49200
+; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:49184
+; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:49168
+; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:49152
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT: s_nop 7
+; GCN-MINREG-NEXT: s_nop 7
+; GCN-MINREG-NEXT: s_nop 2
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:24672
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:24688
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:24640
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:24656
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:24608
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:24624
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:24576
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:24592
+; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:57456
+; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:57440
+; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:57424
+; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:57408
+; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:57344
+; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:57360
+; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:57376
+; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:57392
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT: s_nop 7
+; GCN-MINREG-NEXT: s_nop 7
+; GCN-MINREG-NEXT: s_nop 2
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:32864
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:32880
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:32832
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:32848
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:32800
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:32816
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:32768
+; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:32784
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-MINREG-NEXT: s_endpgm
+;
+; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
+; GCN-MAXOCC: ; %bb.0: ; %entry
+; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GCN-MAXOCC-NEXT: v_and_b32_e32 v1, 0x1ff80, v0
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0
+; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v1
+; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112
+; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96
+; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80
+; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64
+; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0
+; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16
+; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32
+; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48
+; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s1, v1
+; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+...
[truncated]
Does this avoid adding the mutations twice with `gcn-iterative-max-occupancy-experimental`?
@@ -2698,6 +2698,13 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {

namespace llvm {

namespace AMDGPU {
bool isIGLPMutationOnly(unsigned Opcode) {
This function probably fits better in SIInstrInfo.
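For illustration, a minimal sketch of what that alternative placement could look like, assuming a static helper declared on `SIInstrInfo` (the location and signature are hypothetical; this patch keeps the function in the `AMDGPU` namespace of AMDGPUIGroupLP.cpp):

```cpp
// Hypothetical alternative placement (not part of this PR): a static helper on
// SIInstrInfo. The opcode check itself is unchanged from the patch above.
// Assumed declaration in SIInstrInfo.h:
//   static bool isIGLPMutationOnly(unsigned Opcode);
bool SIInstrInfo::isIGLPMutationOnly(unsigned Opcode) {
  // True only for the scheduling-directive pseudos consumed by the IGLP
  // DAG mutation.
  return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT;
}
```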
@@ -118,21 +119,41 @@ void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
}
#endif

void GCNIterativeScheduler::swapIGLPMutations(const Region &R, bool IsReentry) {
I guess with fewer stages it's not worth caching this result like in the default scheduler?
Yeah, that was my thought.
For the multi-stage scheduling approach it makes sense to cache them for facilitating different strategies based on mutation selection. But for these schedulers, it seems we should either apply them or not.
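For context, a rough sketch of the caching approach being contrasted here — roughly what the default scheduler does with its per-region `RegionsWithIGLPInstrs` flags (illustrative only; the names and loop structure are assumptions, not code from this patch):

```cpp
// Illustrative: record once, per region, whether IGLP directives are present,
// so later scheduling stages can consult the flag instead of rescanning.
// Needs llvm/ADT/BitVector.h and llvm/ADT/STLExtras.h.
llvm::BitVector RegionsWithIGLPInstrs(Regions.size(), false);
for (unsigned I = 0, E = Regions.size(); I != E; ++I) {
  RegionsWithIGLPInstrs[I] = llvm::any_of(
      llvm::make_range(Regions[I]->Begin, Regions[I]->End),
      [](const MachineInstr &MI) {
        return AMDGPU::isIGLPMutationOnly(MI.getOpcode());
      });
}
```

The iterative schedulers instead rescan the region in `swapIGLPMutations` right before each DAG build, which matches the discussion above: with fewer stages, the cached flag buys little.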
Ran a couple sanity check tests and things look as expected.
Change-Id: Iee536f6c3238c59304ebe814c56eafa2219ff408
Change-Id: Ibf08571f46c2a378c6e1f5c968128571a5938367
Change-Id: If3650ce24a1a047557b3e40363b72aefd909e873
Change-Id: I963052236780781fd0ae56ce970c1f6179bdb904
force-push for rebase
Change-Id: Ia66c6504e1edaef9bb8ce607c869deac721aff2f
LGTM
Remove newline Co-authored-by: Austin Kerbow <[email protected]>
Thanks for review @kerbowa
* origin/main: [mlir][vector] Prevent folding non memref-type gather into maskedload (llvm#135371) [mlir][SMT] remove custom forall/exists builder because of asan memory leak [bazel] Fix a typo (llvm#135460) [bazel] Add support for SMT Dialect (llvm#135454) [clang] ASTImporter: fix SubstNonTypeTemplateParmExpr source location (llvm#135450) [RISCV] Don't fold offsets into auipc if offset is larger than the reference global variable. (llvm#135297) [gn] port d1fd977 [NFC][LLVM] Apply std::move to object being pushed back in findSymbolCommon (llvm#135290) [AMDGPU] Teach iterative schedulers about IGLP (llvm#134953)
This adds IGLP mutation to the iterative schedulers (`gcn-iterative-max-occupancy-experimental`, `gcn-iterative-minreg`, and `gcn-iterative-ilp`).

The `gcn-iterative-minreg` and `gcn-iterative-ilp` schedulers never actually applied the mutations added, so this also has the effect of teaching them about mutations in general. The `gcn-iterative-max-occupancy-experimental` scheduler has calls to `ScheduleDAGMILive::schedule()`, so, before this, mutations were applied at that point. Now this is done during calls to `BuildDAG`, with IGLP superseding other mutations (similar to the other schedulers). We may end up scheduling regions multiple times, with mutations being applied each time, so we need to track re-entries and use `AMDGPU::SchedulingPhase::PreRAReentry` for the repeat passes.
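To make the last point concrete, here is a condensed, annotated view of how the re-entry flag flows through `scheduleILP` (simplified from the diff above; not the verbatim implementation):

```cpp
// Condensed from the scheduleILP changes: the occupancy-maximization attempt
// builds DAGs in the Initial phase, so any region rebuilt after it is a
// re-entry and uses PreRAReentry — the distinction the
// AMDGPU::SchedulingPhase enum exists for.
bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
  Occ = tryMaximizeOccupancy(TgtOcc); // regions already built/scheduled once here
  IsReentry = true;
}
for (auto *R : Regions) {
  // BuildDAG forwards the flag; swapIGLPMutations(*R, IsReentry) selects
  // SchedulingPhase::PreRAReentry when IsReentry is true.
  BuildDAG DAG(*R, *this, IsReentry);
  const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this);
  const auto RP = getSchedulePressure(*R, ILPSchedule);
  // ...the region is then rescheduled with ILPSchedule (unchanged by this patch).
}
```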