Skip to content

Commit 7aac631

Browse files
committed
Second attempt.
This seems much cleaner than flushing the xcnt counter. TODO: add more test cases.
1 parent 8b12569 commit 7aac631

File tree

4 files changed

+36
-44
lines changed

4 files changed

+36
-44
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 35 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -565,12 +565,12 @@ class SIInsertWaitcnts {
565565
bool isVmemAccess(const MachineInstr &MI) const;
566566
bool generateWaitcntInstBefore(MachineInstr &MI,
567567
WaitcntBrackets &ScoreBrackets,
568-
MachineInstr *OldWaitcntInstr, bool FlushVmCnt,
569-
bool FlushXCnt);
568+
MachineInstr *OldWaitcntInstr,
569+
bool FlushVmCnt);
570570
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
571571
MachineBasicBlock::instr_iterator It,
572572
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
573-
MachineInstr *OldWaitcntInstr, bool FlushXCnt);
573+
MachineInstr *OldWaitcntInstr);
574574
void updateEventWaitcntAfter(MachineInstr &Inst,
575575
WaitcntBrackets *ScoreBrackets);
576576
bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
@@ -1291,15 +1291,33 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
12911291
// Wait on XCNT is redundant if we are already waiting for a load to complete.
12921292
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
12931293
// zero.
1294-
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
1295-
return applyWaitcnt(X_CNT, 0);
1294+
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
1295+
bool flag = hasPendingEvent(VMEM_GROUP);
1296+
unsigned lb = getScoreLB(X_CNT);
1297+
applyWaitcnt(X_CNT, 0);
1298+
if (flag) {
1299+
// PendingEvents may need to reactivate the VMEM_GROUP
1300+
PendingEvents |= (1 << VMEM_GROUP);
1301+
setScoreLB(X_CNT, lb);
1302+
}
1303+
// return applyWaitcnt(X_CNT, 0);
1304+
return;
1305+
}
12961306

12971307
// If we have pending store we cannot optimize XCnt because we do not wait for
12981308
// stores. VMEM loads return in order, so if we only have loads XCnt is
12991309
// decremented to the same number as LOADCnt.
13001310
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1301-
!hasPendingEvent(STORE_CNT))
1302-
return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
1311+
!hasPendingEvent(STORE_CNT)) {
1312+
unsigned lb = getScoreLB(X_CNT);
1313+
bool flag = hasPendingEvent(SMEM_GROUP);
1314+
applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
1315+
if (flag) {
1316+
PendingEvents |= (1 << SMEM_GROUP);
1317+
setScoreLB(X_CNT, lb);
1318+
}
1319+
return;
1320+
}
13031321

13041322
applyWaitcnt(X_CNT, Wait.XCnt);
13051323
}
@@ -1841,13 +1859,12 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
18411859
/// and if so what the value of each counter is.
18421860
/// The "score bracket" is bound by the lower bound and upper bound
18431861
/// scores (*_score_LB and *_score_ub respectively).
1844-
/// If FlushVmCnt/FlushXcnt is true, that means that we want to
1845-
/// generate a s_waitcnt to flush the vmcnt/xcnt counter here.
1862+
/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
1863+
/// flush the vmcnt counter here.
18461864
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
18471865
WaitcntBrackets &ScoreBrackets,
18481866
MachineInstr *OldWaitcntInstr,
1849-
bool FlushVmCnt,
1850-
bool FlushXCnt) {
1867+
bool FlushVmCnt) {
18511868
setForceEmitWaitcnt();
18521869

18531870
assert(!MI.isMetaInstruction());
@@ -2102,26 +2119,18 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
21022119
Wait.BvhCnt = 0;
21032120
}
21042121

2105-
// Conservatively flush the Xcnt Counter at the start of the block.
2106-
if (FlushXCnt) {
2107-
if (ScoreBrackets.hasPendingEvent(SMEM_GROUP) &&
2108-
ScoreBrackets.hasPendingEvent(VMEM_GROUP))
2109-
Wait.XCnt = 0;
2110-
}
2111-
21122122
if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
21132123
Wait.LoadCnt = 0;
21142124

21152125
return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2116-
OldWaitcntInstr, FlushXCnt);
2126+
OldWaitcntInstr);
21172127
}
21182128

21192129
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21202130
MachineBasicBlock::instr_iterator It,
21212131
MachineBasicBlock &Block,
21222132
WaitcntBrackets &ScoreBrackets,
2123-
MachineInstr *OldWaitcntInstr,
2124-
bool FlushXCnt) {
2133+
MachineInstr *OldWaitcntInstr) {
21252134
bool Modified = false;
21262135

21272136
if (OldWaitcntInstr)
@@ -2150,9 +2159,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21502159
}
21512160

21522161
// XCnt may be already consumed by a load wait.
2153-
// If we need to flush the Xcnt counter, don't
2154-
// combine it with any other wait events.
2155-
if (Wait.XCnt != ~0u && !FlushXCnt) {
2162+
if (Wait.XCnt != ~0u) {
21562163
if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
21572164
Wait.XCnt = ~0u;
21582165

@@ -2224,9 +2231,8 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
22242231
ScoreBrackets.simplifyWaitcnt(Wait);
22252232

22262233
auto SuccessorIt = std::next(Inst.getIterator());
2227-
bool Result =
2228-
generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2229-
/*OldWaitcntInstr=*/nullptr, /*FlushXCnt=*/false);
2234+
bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2235+
/*OldWaitcntInstr=*/nullptr);
22302236

22312237
if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
22322238
BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
@@ -2466,7 +2472,6 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24662472

24672473
// Walk over the instructions.
24682474
MachineInstr *OldWaitcntInstr = nullptr;
2469-
bool FirstInstInBlock = true;
24702475

24712476
for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
24722477
E = Block.instr_end();
@@ -2488,13 +2493,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24882493

24892494
bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
24902495
isPreheaderToFlush(Block, ScoreBrackets);
2491-
bool FlushXCnt = FirstInstInBlock;
2492-
if (FirstInstInBlock)
2493-
FirstInstInBlock = false;
24942496

24952497
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
24962498
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2497-
FlushVmCnt, FlushXCnt);
2499+
FlushVmCnt);
24982500
OldWaitcntInstr = nullptr;
24992501

25002502
// Restore vccz if it's not known to be correct already.
@@ -2583,7 +2585,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
25832585

25842586
// Combine or remove any redundant waitcnts at the end of the block.
25852587
Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2586-
OldWaitcntInstr, /*FlushXCnt=*/false);
2588+
OldWaitcntInstr);
25872589

25882590
LLVM_DEBUG({
25892591
dbgs() << "*** End Block: ";

llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,6 @@
1010
; Check that we are changing SADDR form of a load to VADDR and do not have to use
1111
; readfirstlane instructions to move address from VGPRs into SGPRs.
1212

13-
; FIXME: Redundant xcnt in the loop header.
14-
; Pending xcnt events should check if they can be folded into soft waitcnts
15-
; before being propagated.
16-
1713
define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg1, ptr nocapture %arg2) {
1814
; GCN-LABEL: test_move_load_address_to_vgpr:
1915
; GCN: ; %bb.0: ; %bb
@@ -28,7 +24,6 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocap
2824
; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
2925
; GCN-NEXT: .LBB0_1: ; %bb3
3026
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
31-
; GCN-NEXT: s_wait_xcnt 0x0
3227
; GCN-NEXT: s_wait_dscnt 0x0
3328
; GCN-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
3429
; GCN-NEXT: s_wait_loadcnt 0x0

llvm/test/CodeGen/AMDGPU/mul.ll

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2583,13 +2583,8 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
25832583
; GFX1250-NEXT: s_branch .LBB15_6
25842584
; GFX1250-NEXT: .LBB15_5:
25852585
; GFX1250-NEXT: v_mov_b32_e32 v0, s7
2586-
; TODO: the xcnt can actually be merged into the loadcnt at the bottom.
2587-
; if instead of checking at the first instruction in the block,
2588-
; if I am able to check at the instruction which needs an xcnt,
2589-
; if it had a pending smem & vmem, then this extra xcnt can be avoided.
25902586
; GFX1250-NEXT: .LBB15_6: ; %endif
25912587
; GFX1250-NEXT: s_wait_kmcnt 0x0
2592-
; GFX1250-NEXT: s_wait_xcnt 0x0
25932588
; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
25942589
; GFX1250-NEXT: s_mov_b32 s2, -1
25952590
; GFX1250-NEXT: s_wait_loadcnt 0x0

llvm/test/CodeGen/AMDGPU/wait-xcnt.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -969,8 +969,8 @@ body: |
969969
; GCN-NEXT: liveins: $sgpr2
970970
; GCN-NEXT: {{ $}}
971971
; GCN-NEXT: S_WAIT_KMCNT 0
972-
; GCN-NEXT: S_WAIT_XCNT 0
973972
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
973+
; GCN-NEXT: S_WAIT_XCNT 0
974974
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
975975
bb.0:
976976
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc

0 commit comments

Comments
 (0)