Skip to content

Commit 7aac631

Browse files
committed
Second attempt.
This seems much cleaner than flushing the xcnt counter. TODO: add more test cases.
1 parent 8b12569 commit 7aac631

File tree

4 files changed

+36
-44
lines changed

4 files changed

+36
-44
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 35 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -565,12 +565,12 @@ class SIInsertWaitcnts {
565565
bool isVmemAccess(const MachineInstr &MI) const;
566566
bool generateWaitcntInstBefore(MachineInstr &MI,
567567
WaitcntBrackets &ScoreBrackets,
568-
MachineInstr *OldWaitcntInstr, bool FlushVmCnt,
569-
bool FlushXCnt);
568+
MachineInstr *OldWaitcntInstr,
569+
bool FlushVmCnt);
570570
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
571571
MachineBasicBlock::instr_iterator It,
572572
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
573-
MachineInstr *OldWaitcntInstr, bool FlushXCnt);
573+
MachineInstr *OldWaitcntInstr);
574574
void updateEventWaitcntAfter(MachineInstr &Inst,
575575
WaitcntBrackets *ScoreBrackets);
576576
bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
@@ -1291,15 +1291,33 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
12911291
// Wait on XCNT is redundant if we are already waiting for a load to complete.
12921292
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
12931293
// zero.
1294-
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
1295-
return applyWaitcnt(X_CNT, 0);
1294+
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
1295+
bool flag = hasPendingEvent(VMEM_GROUP);
1296+
unsigned lb = getScoreLB(X_CNT);
1297+
applyWaitcnt(X_CNT, 0);
1298+
if (flag) {
1299+
// PendingEvents may need to reactivate the VMEM_GROUP
1300+
PendingEvents |= (1 << VMEM_GROUP);
1301+
setScoreLB(X_CNT, lb);
1302+
}
1303+
// return applyWaitcnt(X_CNT, 0);
1304+
return;
1305+
}
12961306

12971307
// If we have pending store we cannot optimize XCnt because we do not wait for
12981308
// stores. VMEM loads return in order, so if we only have loads XCnt is
12991309
// decremented to the same number as LOADCnt.
13001310
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1301-
!hasPendingEvent(STORE_CNT))
1302-
return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
1311+
!hasPendingEvent(STORE_CNT)) {
1312+
unsigned lb = getScoreLB(X_CNT);
1313+
bool flag = hasPendingEvent(SMEM_GROUP);
1314+
applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
1315+
if (flag) {
1316+
PendingEvents |= (1 << SMEM_GROUP);
1317+
setScoreLB(X_CNT, lb);
1318+
}
1319+
return;
1320+
}
13031321

13041322
applyWaitcnt(X_CNT, Wait.XCnt);
13051323
}
@@ -1841,13 +1859,12 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
18411859
/// and if so what the value of each counter is.
18421860
/// The "score bracket" is bound by the lower bound and upper bound
18431861
/// scores (*_score_LB and *_score_ub respectively).
1844-
/// If FlushVmCnt/FlushXcnt is true, that means that we want to
1845-
/// generate a s_waitcnt to flush the vmcnt/xcnt counter here.
1862+
/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
1863+
/// flush the vmcnt counter here.
18461864
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
18471865
WaitcntBrackets &ScoreBrackets,
18481866
MachineInstr *OldWaitcntInstr,
1849-
bool FlushVmCnt,
1850-
bool FlushXCnt) {
1867+
bool FlushVmCnt) {
18511868
setForceEmitWaitcnt();
18521869

18531870
assert(!MI.isMetaInstruction());
@@ -2102,26 +2119,18 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
21022119
Wait.BvhCnt = 0;
21032120
}
21042121

2105-
// Conservatively flush the Xcnt Counter at the start of the block.
2106-
if (FlushXCnt) {
2107-
if (ScoreBrackets.hasPendingEvent(SMEM_GROUP) &&
2108-
ScoreBrackets.hasPendingEvent(VMEM_GROUP))
2109-
Wait.XCnt = 0;
2110-
}
2111-
21122122
if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
21132123
Wait.LoadCnt = 0;
21142124

21152125
return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2116-
OldWaitcntInstr, FlushXCnt);
2126+
OldWaitcntInstr);
21172127
}
21182128

21192129
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21202130
MachineBasicBlock::instr_iterator It,
21212131
MachineBasicBlock &Block,
21222132
WaitcntBrackets &ScoreBrackets,
2123-
MachineInstr *OldWaitcntInstr,
2124-
bool FlushXCnt) {
2133+
MachineInstr *OldWaitcntInstr) {
21252134
bool Modified = false;
21262135

21272136
if (OldWaitcntInstr)
@@ -2150,9 +2159,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21502159
}
21512160

21522161
// XCnt may be already consumed by a load wait.
2153-
// If we need to flush the Xcnt counter, don't
2154-
// combine it with any other wait events.
2155-
if (Wait.XCnt != ~0u && !FlushXCnt) {
2162+
if (Wait.XCnt != ~0u) {
21562163
if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
21572164
Wait.XCnt = ~0u;
21582165

@@ -2224,9 +2231,8 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
22242231
ScoreBrackets.simplifyWaitcnt(Wait);
22252232

22262233
auto SuccessorIt = std::next(Inst.getIterator());
2227-
bool Result =
2228-
generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2229-
/*OldWaitcntInstr=*/nullptr, /*FlushXCnt=*/false);
2234+
bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2235+
/*OldWaitcntInstr=*/nullptr);
22302236

22312237
if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
22322238
BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
@@ -2466,7 +2472,6 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24662472

24672473
// Walk over the instructions.
24682474
MachineInstr *OldWaitcntInstr = nullptr;
2469-
bool FirstInstInBlock = true;
24702475

24712476
for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
24722477
E = Block.instr_end();
@@ -2488,13 +2493,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
24882493

24892494
bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
24902495
isPreheaderToFlush(Block, ScoreBrackets);
2491-
bool FlushXCnt = FirstInstInBlock;
2492-
if (FirstInstInBlock)
2493-
FirstInstInBlock = false;
24942496

24952497
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
24962498
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2497-
FlushVmCnt, FlushXCnt);
2499+
FlushVmCnt);
24982500
OldWaitcntInstr = nullptr;
24992501

25002502
// Restore vccz if it's not known to be correct already.
@@ -2583,7 +2585,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
25832585

25842586
// Combine or remove any redundant waitcnts at the end of the block.
25852587
Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2586-
OldWaitcntInstr, /*FlushXCnt=*/false);
2588+
OldWaitcntInstr);
25872589

25882590
LLVM_DEBUG({
25892591
dbgs() << "*** End Block: ";

llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,6 @@
1010
; Check that we are changing SADDR form of a load to VADDR and do not have to use
1111
; readfirstlane instructions to move address from VGPRs into SGPRs.
1212

13-
; FIXME: Redundant xcnt in the loop header.
14-
; Pending xcnt events should check if they can be folded into soft waitcnts
15-
; before being propagated.
16-
1713
define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg1, ptr nocapture %arg2) {
1814
; GCN-LABEL: test_move_load_address_to_vgpr:
1915
; GCN: ; %bb.0: ; %bb
@@ -28,7 +24,6 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocap
2824
; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
2925
; GCN-NEXT: .LBB0_1: ; %bb3
3026
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
31-
; GCN-NEXT: s_wait_xcnt 0x0
3227
; GCN-NEXT: s_wait_dscnt 0x0
3328
; GCN-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
3429
; GCN-NEXT: s_wait_loadcnt 0x0

llvm/test/CodeGen/AMDGPU/mul.ll

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2583,13 +2583,8 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
25832583
; GFX1250-NEXT: s_branch .LBB15_6
25842584
; GFX1250-NEXT: .LBB15_5:
25852585
; GFX1250-NEXT: v_mov_b32_e32 v0, s7
2586-
; TODO: the xcnt can actually be merged into the loadcnt at the bottom.
2587-
; if instead of checking at the first instruction in the block,
2588-
; if I am able to check at the instruction which needs an xcnt,
2589-
; if it had a pending smem & vmem, then this extra xcnt can be avoided.
25902586
; GFX1250-NEXT: .LBB15_6: ; %endif
25912587
; GFX1250-NEXT: s_wait_kmcnt 0x0
2592-
; GFX1250-NEXT: s_wait_xcnt 0x0
25932588
; GFX1250-NEXT: s_mov_b32 s3, 0x31016000
25942589
; GFX1250-NEXT: s_mov_b32 s2, -1
25952590
; GFX1250-NEXT: s_wait_loadcnt 0x0

llvm/test/CodeGen/AMDGPU/wait-xcnt.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -969,8 +969,8 @@ body: |
969969
; GCN-NEXT: liveins: $sgpr2
970970
; GCN-NEXT: {{ $}}
971971
; GCN-NEXT: S_WAIT_KMCNT 0
972-
; GCN-NEXT: S_WAIT_XCNT 0
973972
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
973+
; GCN-NEXT: S_WAIT_XCNT 0
974974
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
975975
bb.0:
976976
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc

0 commit comments

Comments
 (0)