@@ -565,12 +565,12 @@ class SIInsertWaitcnts {
565565 bool isVmemAccess (const MachineInstr &MI) const ;
566566 bool generateWaitcntInstBefore (MachineInstr &MI,
567567 WaitcntBrackets &ScoreBrackets,
568- MachineInstr *OldWaitcntInstr,
569- bool FlushVmCnt );
568+ MachineInstr *OldWaitcntInstr, bool FlushVmCnt,
569+ bool FlushXCnt );
570570 bool generateWaitcnt (AMDGPU::Waitcnt Wait,
571571 MachineBasicBlock::instr_iterator It,
572572 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
573- MachineInstr *OldWaitcntInstr);
573+ MachineInstr *OldWaitcntInstr, bool FlushXCnt );
574574 void updateEventWaitcntAfter (MachineInstr &Inst,
575575 WaitcntBrackets *ScoreBrackets);
576576 bool isNextENDPGM (MachineBasicBlock::instr_iterator It,
@@ -1841,12 +1841,13 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
18411841// / and if so what the value of each counter is.
18421842// / The "score bracket" is bound by the lower bound and upper bound
18431843// / scores (*_score_LB and *_score_ub respectively).
1844- // / If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
1845- // / flush the vmcnt counter here.
1844+ // / If FlushVmCnt/FlushXCnt is true, that means that we want to
1845+ // / generate an s_waitcnt to flush the vmcnt/xcnt counter here.
18461846bool SIInsertWaitcnts::generateWaitcntInstBefore (MachineInstr &MI,
18471847 WaitcntBrackets &ScoreBrackets,
18481848 MachineInstr *OldWaitcntInstr,
1849- bool FlushVmCnt) {
1849+ bool FlushVmCnt,
1850+ bool FlushXCnt) {
18501851 setForceEmitWaitcnt ();
18511852
18521853 assert (!MI.isMetaInstruction ());
@@ -2101,18 +2102,26 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
21012102 Wait.BvhCnt = 0 ;
21022103 }
21032104
2105+ // Conservatively flush the XCnt counter at the start of the block.
2106+ if (FlushXCnt) {
2107+ if (ScoreBrackets.hasPendingEvent (SMEM_GROUP) &&
2108+ ScoreBrackets.hasPendingEvent (VMEM_GROUP))
2109+ Wait.XCnt = 0 ;
2110+ }
2111+
21042112 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u )
21052113 Wait.LoadCnt = 0 ;
21062114
21072115 return generateWaitcnt (Wait, MI.getIterator (), *MI.getParent (), ScoreBrackets,
2108- OldWaitcntInstr);
2116+ OldWaitcntInstr, FlushXCnt );
21092117}
21102118
21112119bool SIInsertWaitcnts::generateWaitcnt (AMDGPU::Waitcnt Wait,
21122120 MachineBasicBlock::instr_iterator It,
21132121 MachineBasicBlock &Block,
21142122 WaitcntBrackets &ScoreBrackets,
2115- MachineInstr *OldWaitcntInstr) {
2123+ MachineInstr *OldWaitcntInstr,
2124+ bool FlushXCnt) {
21162125 bool Modified = false ;
21172126
21182127 if (OldWaitcntInstr)
@@ -2141,7 +2150,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21412150 }
21422151
21432152 // XCnt may be already consumed by a load wait.
2144- if (Wait.XCnt != ~0u ) {
2153+ // If we need to flush the XCnt counter, don't
2154+ // combine it with any other wait events.
2155+ if (Wait.XCnt != ~0u && !FlushXCnt) {
21452156 if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent (SMEM_GROUP))
21462157 Wait.XCnt = ~0u ;
21472158
@@ -2213,8 +2224,9 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
22132224 ScoreBrackets.simplifyWaitcnt (Wait);
22142225
22152226 auto SuccessorIt = std::next (Inst.getIterator ());
2216- bool Result = generateWaitcnt (Wait, SuccessorIt, Block, ScoreBrackets,
2217- /* OldWaitcntInstr=*/ nullptr );
2227+ bool Result =
2228+ generateWaitcnt (Wait, SuccessorIt, Block, ScoreBrackets,
2229+ /* OldWaitcntInstr=*/ nullptr , /* FlushXCnt=*/ false );
22182230
22192231 if (Result && NeedsEndPGMCheck && isNextENDPGM (SuccessorIt, &Block)) {
22202232 BuildMI (Block, SuccessorIt, Inst.getDebugLoc (), TII->get (AMDGPU::S_NOP))
@@ -2454,6 +2466,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24542466
24552467 // Walk over the instructions.
24562468 MachineInstr *OldWaitcntInstr = nullptr ;
2469+ bool FirstInstInBlock = true ;
24572470
24582471 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin (),
24592472 E = Block.instr_end ();
@@ -2475,10 +2488,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24752488
24762489 bool FlushVmCnt = Block.getFirstTerminator () == Inst &&
24772490 isPreheaderToFlush (Block, ScoreBrackets);
2491+ bool FlushXCnt = FirstInstInBlock;
2492+ if (FirstInstInBlock)
2493+ FirstInstInBlock = false ;
24782494
24792495 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
24802496 Modified |= generateWaitcntInstBefore (Inst, ScoreBrackets, OldWaitcntInstr,
2481- FlushVmCnt);
2497+ FlushVmCnt, FlushXCnt );
24822498 OldWaitcntInstr = nullptr ;
24832499
24842500 // Restore vccz if it's not known to be correct already.
@@ -2567,7 +2583,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
25672583
25682584 // Combine or remove any redundant waitcnts at the end of the block.
25692585 Modified |= generateWaitcnt (Wait, Block.instr_end (), Block, ScoreBrackets,
2570- OldWaitcntInstr);
2586+ OldWaitcntInstr, /* FlushXCnt= */ false );
25712587
25722588 LLVM_DEBUG ({
25732589 dbgs () << " *** End Block: " ;
0 commit comments