@@ -565,12 +565,12 @@ class SIInsertWaitcnts {
565565 bool isVmemAccess (const MachineInstr &MI) const ;
566566 bool generateWaitcntInstBefore (MachineInstr &MI,
567567 WaitcntBrackets &ScoreBrackets,
568- MachineInstr *OldWaitcntInstr, bool FlushVmCnt,
569- bool FlushXCnt );
568+ MachineInstr *OldWaitcntInstr,
569+ bool FlushVmCnt );
570570 bool generateWaitcnt (AMDGPU::Waitcnt Wait,
571571 MachineBasicBlock::instr_iterator It,
572572 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
573- MachineInstr *OldWaitcntInstr, bool FlushXCnt );
573+ MachineInstr *OldWaitcntInstr);
574574 void updateEventWaitcntAfter (MachineInstr &Inst,
575575 WaitcntBrackets *ScoreBrackets);
576576 bool isNextENDPGM (MachineBasicBlock::instr_iterator It,
@@ -1291,15 +1291,33 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
12911291 // Wait on XCNT is redundant if we are already waiting for a load to complete.
12921292 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
12931293 // zero.
1294- if (Wait.KmCnt == 0 && hasPendingEvent (SMEM_GROUP))
1295- return applyWaitcnt (X_CNT, 0 );
1294+ if (Wait.KmCnt == 0 && hasPendingEvent (SMEM_GROUP)) {
1295+ bool flag = hasPendingEvent (VMEM_GROUP);
1296+ unsigned lb = getScoreLB (X_CNT);
1297+ applyWaitcnt (X_CNT, 0 );
1298+ if (flag) {
1299+ // Pendingevents mai i need to reactivate the VMEM_GROUP
1300+ PendingEvents |= (1 << VMEM_GROUP);
1301+ setScoreLB (X_CNT, lb);
1302+ }
1303+ // return applyWaitcnt(X_CNT, 0);
1304+ return ;
1305+ }
12961306
12971307 // If we have pending store we cannot optimize XCnt because we do not wait for
12981308 // stores. VMEM loads retun in order, so if we only have loads XCnt is
12991309 // decremented to the same number as LOADCnt.
13001310 if (Wait.LoadCnt != ~0u && hasPendingEvent (VMEM_GROUP) &&
1301- !hasPendingEvent (STORE_CNT))
1302- return applyWaitcnt (X_CNT, std::min (Wait.XCnt , Wait.LoadCnt ));
1311+ !hasPendingEvent (STORE_CNT)) {
1312+ unsigned lb = getScoreLB (X_CNT);
1313+ bool flag = hasPendingEvent (SMEM_GROUP);
1314+ applyWaitcnt (X_CNT, std::min (Wait.XCnt , Wait.LoadCnt ));
1315+ if (flag) {
1316+ PendingEvents |= (1 << SMEM_GROUP);
1317+ setScoreLB (X_CNT, lb);
1318+ }
1319+ return ;
1320+ }
13031321
13041322 applyWaitcnt (X_CNT, Wait.XCnt );
13051323}
@@ -1841,13 +1859,12 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
18411859// / and if so what the value of each counter is.
18421860// / The "score bracket" is bound by the lower bound and upper bound
18431861// / scores (*_score_LB and *_score_ub respectively).
1844- // / If FlushVmCnt/FlushXcnt is true, that means that we want to
1845- // / generate a s_waitcnt to flush the vmcnt/xcnt counter here.
1862+ // / If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
1863+ // / flush the vmcnt counter here.
18461864bool SIInsertWaitcnts::generateWaitcntInstBefore (MachineInstr &MI,
18471865 WaitcntBrackets &ScoreBrackets,
18481866 MachineInstr *OldWaitcntInstr,
1849- bool FlushVmCnt,
1850- bool FlushXCnt) {
1867+ bool FlushVmCnt) {
18511868 setForceEmitWaitcnt ();
18521869
18531870 assert (!MI.isMetaInstruction ());
@@ -2102,26 +2119,18 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
21022119 Wait.BvhCnt = 0 ;
21032120 }
21042121
2105- // Conservatively flush the Xcnt Counter at the start of the block.
2106- if (FlushXCnt) {
2107- if (ScoreBrackets.hasPendingEvent (SMEM_GROUP) &&
2108- ScoreBrackets.hasPendingEvent (VMEM_GROUP))
2109- Wait.XCnt = 0 ;
2110- }
2111-
21122122 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u )
21132123 Wait.LoadCnt = 0 ;
21142124
21152125 return generateWaitcnt (Wait, MI.getIterator (), *MI.getParent (), ScoreBrackets,
2116- OldWaitcntInstr, FlushXCnt );
2126+ OldWaitcntInstr);
21172127}
21182128
21192129bool SIInsertWaitcnts::generateWaitcnt (AMDGPU::Waitcnt Wait,
21202130 MachineBasicBlock::instr_iterator It,
21212131 MachineBasicBlock &Block,
21222132 WaitcntBrackets &ScoreBrackets,
2123- MachineInstr *OldWaitcntInstr,
2124- bool FlushXCnt) {
2133+ MachineInstr *OldWaitcntInstr) {
21252134 bool Modified = false ;
21262135
21272136 if (OldWaitcntInstr)
@@ -2150,9 +2159,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21502159 }
21512160
21522161 // XCnt may be already consumed by a load wait.
2153- // If we need to flush the Xcnt counter, don't
2154- // combine it with any other wait events.
2155- if (Wait.XCnt != ~0u && !FlushXCnt) {
2162+ if (Wait.XCnt != ~0u ) {
21562163 if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent (SMEM_GROUP))
21572164 Wait.XCnt = ~0u ;
21582165
@@ -2224,9 +2231,8 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
22242231 ScoreBrackets.simplifyWaitcnt (Wait);
22252232
22262233 auto SuccessorIt = std::next (Inst.getIterator ());
2227- bool Result =
2228- generateWaitcnt (Wait, SuccessorIt, Block, ScoreBrackets,
2229- /* OldWaitcntInstr=*/ nullptr , /* FlushXCnt=*/ false );
2234+ bool Result = generateWaitcnt (Wait, SuccessorIt, Block, ScoreBrackets,
2235+ /* OldWaitcntInstr=*/ nullptr );
22302236
22312237 if (Result && NeedsEndPGMCheck && isNextENDPGM (SuccessorIt, &Block)) {
22322238 BuildMI (Block, SuccessorIt, Inst.getDebugLoc (), TII->get (AMDGPU::S_NOP))
@@ -2466,7 +2472,6 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24662472
24672473 // Walk over the instructions.
24682474 MachineInstr *OldWaitcntInstr = nullptr ;
2469- bool FirstInstInBlock = true ;
24702475
24712476 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin (),
24722477 E = Block.instr_end ();
@@ -2488,13 +2493,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24882493
24892494 bool FlushVmCnt = Block.getFirstTerminator () == Inst &&
24902495 isPreheaderToFlush (Block, ScoreBrackets);
2491- bool FlushXCnt = FirstInstInBlock;
2492- if (FirstInstInBlock)
2493- FirstInstInBlock = false ;
24942496
24952497 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
24962498 Modified |= generateWaitcntInstBefore (Inst, ScoreBrackets, OldWaitcntInstr,
2497- FlushVmCnt, FlushXCnt );
2499+ FlushVmCnt);
24982500 OldWaitcntInstr = nullptr ;
24992501
25002502 // Restore vccz if it's not known to be correct already.
@@ -2583,7 +2585,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
25832585
25842586 // Combine or remove any redundant waitcnts at the end of the block.
25852587 Modified |= generateWaitcnt (Wait, Block.instr_end (), Block, ScoreBrackets,
2586- OldWaitcntInstr, /* FlushXCnt= */ false );
2588+ OldWaitcntInstr);
25872589
25882590 LLVM_DEBUG ({
25892591 dbgs () << " *** End Block: " ;
0 commit comments