Skip to content

Commit 303b7d0

Browse files
committed
[AMDGPU] Fix Xcnt handling between blocks
The compiler needs to conservatively flush the Xcnt counter on entry to a block when both SMEM and VMEM events are pending.
1 parent 76e2963 commit 303b7d0

File tree

2 files changed

+30
-14
lines changed

2 files changed

+30
-14
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -565,12 +565,12 @@ class SIInsertWaitcnts {
565565
bool isVmemAccess(const MachineInstr &MI) const;
566566
bool generateWaitcntInstBefore(MachineInstr &MI,
567567
WaitcntBrackets &ScoreBrackets,
568-
MachineInstr *OldWaitcntInstr,
569-
bool FlushVmCnt);
568+
MachineInstr *OldWaitcntInstr, bool FlushVmCnt,
569+
bool FlushXCnt);
570570
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
571571
MachineBasicBlock::instr_iterator It,
572572
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
573-
MachineInstr *OldWaitcntInstr);
573+
MachineInstr *OldWaitcntInstr, bool FlushXCnt);
574574
void updateEventWaitcntAfter(MachineInstr &Inst,
575575
WaitcntBrackets *ScoreBrackets);
576576
bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
@@ -1841,12 +1841,13 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
18411841
/// and if so what the value of each counter is.
18421842
/// The "score bracket" is bound by the lower bound and upper bound
18431843
/// scores (*_score_LB and *_score_ub respectively).
1844-
/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
1845-
/// flush the vmcnt counter here.
1844+
/// If FlushVmCnt/FlushXCnt is true, that means that we want to
1845+
/// generate an s_waitcnt to flush the vmcnt/xcnt counter here.
18461846
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
18471847
WaitcntBrackets &ScoreBrackets,
18481848
MachineInstr *OldWaitcntInstr,
1849-
bool FlushVmCnt) {
1849+
bool FlushVmCnt,
1850+
bool FlushXCnt) {
18501851
setForceEmitWaitcnt();
18511852

18521853
assert(!MI.isMetaInstruction());
@@ -2101,18 +2102,26 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
21012102
Wait.BvhCnt = 0;
21022103
}
21032104

2105+
// Conservatively flush the Xcnt Counter at the start of the block.
2106+
if (FlushXCnt) {
2107+
if (ScoreBrackets.hasPendingEvent(SMEM_GROUP) &&
2108+
ScoreBrackets.hasPendingEvent(VMEM_GROUP))
2109+
Wait.XCnt = 0;
2110+
}
2111+
21042112
if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
21052113
Wait.LoadCnt = 0;
21062114

21072115
return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2108-
OldWaitcntInstr);
2116+
OldWaitcntInstr, FlushXCnt);
21092117
}
21102118

21112119
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21122120
MachineBasicBlock::instr_iterator It,
21132121
MachineBasicBlock &Block,
21142122
WaitcntBrackets &ScoreBrackets,
2115-
MachineInstr *OldWaitcntInstr) {
2123+
MachineInstr *OldWaitcntInstr,
2124+
bool FlushXCnt) {
21162125
bool Modified = false;
21172126

21182127
if (OldWaitcntInstr)
@@ -2141,7 +2150,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21412150
}
21422151

21432152
// XCnt may be already consumed by a load wait.
2144-
if (Wait.XCnt != ~0u) {
2153+
// If we need to flush the Xcnt counter, don't
2154+
// combine it with any other wait events.
2155+
if (Wait.XCnt != ~0u && !FlushXCnt) {
21452156
if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
21462157
Wait.XCnt = ~0u;
21472158

@@ -2213,8 +2224,9 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
22132224
ScoreBrackets.simplifyWaitcnt(Wait);
22142225

22152226
auto SuccessorIt = std::next(Inst.getIterator());
2216-
bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2217-
/*OldWaitcntInstr=*/nullptr);
2227+
bool Result =
2228+
generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2229+
/*OldWaitcntInstr=*/nullptr, /*FlushXCnt=*/false);
22182230

22192231
if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
22202232
BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
@@ -2454,6 +2466,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24542466

24552467
// Walk over the instructions.
24562468
MachineInstr *OldWaitcntInstr = nullptr;
2469+
bool FirstInstInBlock = true;
24572470

24582471
for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
24592472
E = Block.instr_end();
@@ -2475,10 +2488,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24752488

24762489
bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
24772490
isPreheaderToFlush(Block, ScoreBrackets);
2491+
bool FlushXCnt = FirstInstInBlock;
2492+
if (FirstInstInBlock)
2493+
FirstInstInBlock = false;
24782494

24792495
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
24802496
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2481-
FlushVmCnt);
2497+
FlushVmCnt, FlushXCnt);
24822498
OldWaitcntInstr = nullptr;
24832499

24842500
// Restore vccz if it's not known to be correct already.
@@ -2567,7 +2583,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
25672583

25682584
// Combine or remove any redundant waitcnts at the end of the block.
25692585
Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2570-
OldWaitcntInstr);
2586+
OldWaitcntInstr, /*FlushXCnt=*/false);
25712587

25722588
LLVM_DEBUG({
25732589
dbgs() << "*** End Block: ";

llvm/test/CodeGen/AMDGPU/wait-xcnt.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -945,7 +945,6 @@ body: |
945945
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
946946
...
947947

948-
# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
949948
---
950949
name: wait_kmcnt_with_outstanding_vmem_2
951950
tracksRegLiveness: true
@@ -970,6 +969,7 @@ body: |
970969
; GCN-NEXT: liveins: $sgpr2
971970
; GCN-NEXT: {{ $}}
972971
; GCN-NEXT: S_WAIT_KMCNT 0
972+
; GCN-NEXT: S_WAIT_XCNT 0
973973
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
974974
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
975975
bb.0:

0 commit comments

Comments
 (0)