
[AMDGPU] Add new cache flushing instructions for GFX12 #76944


Merged: 2 commits into llvm:main on Jan 8, 2024.

Conversation

mbrkusanin (Collaborator)

No description provided.

llvmbot (Member) commented Jan 4, 2024

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Mirko Brkušanin (mbrkusanin)

Changes

Patch is 138.81 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/76944.diff

13 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/BUFInstructions.td (+4-2)
  • (modified) llvm/lib/Target/AMDGPU/FLATInstructions.td (+38-1)
  • (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+10)
  • (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+67-1)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll (+10-20)
  • (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+22-44)
  • (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (+4-5)
  • (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+115-198)
  • (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+4-8)
  • (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll (+103-174)
  • (added) llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir (+29)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vflat.s (+27)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt (+27)
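
In brief: the patch adds three GFX12 VFLAT cache-control instructions (global_inv, global_wb, global_wbinv) that take only a cache-policy scope operand, makes SIInsertWaitcnts skip them since they neither load nor store, and switches SIMemoryLegalizer's acquire lowering to global_inv. A hedged sketch of the assembly forms (the global_inv spelling appears in the updated codegen tests below; the global_wb and global_wbinv spellings are assumed to follow the same pattern):

    global_inv scope:SCOPE_DEV      // invalidate caches at device (agent) scope
    global_wb scope:SCOPE_SYS       // write back dirty cache lines at system scope
    global_wbinv scope:SCOPE_SE     // write back, then invalidate, at shader-engine scope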
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 43d35fa5291ca0..15a54856cb2e3a 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1222,8 +1222,10 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
 
 } // End HasD16LoadStore
 
-def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
-                                       int_amdgcn_buffer_wbinvl1>;
+let SubtargetPredicate = isNotGFX12Plus in
+def BUFFER_WBINVL1 : MUBUF_Invalidate <
+  "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1
+>;
 
 let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
 defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN<
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 615f8cd54d8f9c..345564c06af144 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -60,6 +60,7 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
   bits<1> has_sve  = 0; // Scratch VGPR Enable
   bits<1> lds = 0;
   bits<1> sve = 0;
+  bits<1> has_offset = 1;
 
   let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
     !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
@@ -182,7 +183,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   let Inst{51-50} = cpol{4-3}; // scope
   let Inst{62-55} = !if(ps.has_data, vdata{7-0}, ?);
   let Inst{71-64} = !if(ps.has_vaddr, vaddr, ?);
-  let Inst{95-72} = offset;
+  let Inst{95-72} = !if(ps.has_offset, offset, ?);
 }
 
 class GlobalSaddrTable <bit is_saddr, string Name = ""> {
@@ -340,6 +341,34 @@ multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass
     GlobalSaddrTable<1, opName>;
 }
 
+class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = null_frag> :
+  FLAT_Pseudo<opName, (outs), (ins CPol:$cpol), "$cpol", [(node)]> {
+
+  let AsmMatchConverter = "";
+
+  let hasSideEffects = 1;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let is_flat_global = 1;
+
+  let has_offset = 0;
+  let has_saddr = 0;
+  let enabled_saddr = 0;
+  let saddr_value = 0;
+  let has_vdst = 0;
+  let has_data = 0;
+  let has_vaddr = 0;
+  let has_glc = 0;
+  let has_dlc = 0;
+  let glcValue = 0;
+  let dlcValue = 0;
+  let has_sccb = 0;
+  let sccbValue = 0;
+  let has_sve = 0;
+  let lds = 0;
+  let sve = 0;
+}
+
 class FlatScratchInst <string sv_op, string mode> {
   string SVOp = sv_op;
   string Mode = mode;
@@ -928,6 +957,10 @@ defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwor
 
 let SubtargetPredicate = isGFX12Plus in {
   defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
+
+  def GLOBAL_INV    : FLAT_Global_Invalidate_Writeback<"global_inv">;
+  def GLOBAL_WB     : FLAT_Global_Invalidate_Writeback<"global_wb">;
+  def GLOBAL_WBINV  : FLAT_Global_Invalidate_Writeback<"global_wbinv">;
 } // End SubtargetPredicate = isGFX12Plus
 
 } // End is_flat_global = 1
@@ -2662,6 +2695,10 @@ defm GLOBAL_ATOMIC_MAX_NUM_F32     : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_A
 defm GLOBAL_ATOMIC_ADD_F32         : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
 defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">;
 
+defm GLOBAL_INV                    : VFLAT_Real_Base_gfx12<0x02b, "GLOBAL_INV", "global_inv">;
+defm GLOBAL_WB                     : VFLAT_Real_Base_gfx12<0x02c, "GLOBAL_WB", "global_wb">;
+defm GLOBAL_WBINV                  : VFLAT_Real_Base_gfx12<0x04f, "GLOBAL_WBINV", "global_wbinv">;
+
 // ENC_VSCRATCH.
 defm SCRATCH_LOAD_U8               : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
 defm SCRATCH_LOAD_I8               : VSCRATCH_Real_AllAddr_gfx12<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 55ddb540c51e53..1cb1d32707f2d7 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1424,6 +1424,12 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
   });
 }
 
+static bool isCacheInvOrWBInst(MachineInstr &Inst) {
+  auto Opc = Inst.getOpcode();
+  return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
+         Opc == AMDGPU::GLOBAL_WBINV;
+}
+
 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                                WaitcntBrackets *ScoreBrackets) {
   // Now look at the instruction opcode. If it is a memory access
@@ -1439,6 +1445,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
     }
   } else if (TII->isFLAT(Inst)) {
+    // TODO: Track this properly.
+    if (isCacheInvOrWBInst(Inst))
+      return;
+
     assert(Inst.mayLoadOrStore());
 
     int FlatASCount = 0;
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 10ec54d3317fdf..7d47e042da9dc5 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -578,6 +578,14 @@ class SIGfx11CacheControl : public SIGfx10CacheControl {
                                       bool IsNonTemporal) const override;
 };
 
+class SIGfx12CacheControl : public SIGfx11CacheControl {
+public:
+  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
+
+  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+};
+
 class SIMemoryLegalizer final : public MachineFunctionPass {
 private:
 
@@ -857,7 +865,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
     return std::make_unique<SIGfx7CacheControl>(ST);
   if (Generation < AMDGPUSubtarget::GFX11)
     return std::make_unique<SIGfx10CacheControl>(ST);
-  return std::make_unique<SIGfx11CacheControl>(ST);
+  if (Generation < AMDGPUSubtarget::GFX12)
+    return std::make_unique<SIGfx11CacheControl>(ST);
+  return std::make_unique<SIGfx12CacheControl>(ST);
 }
 
 bool SIGfx6CacheControl::enableLoadCacheBypass(
@@ -2132,6 +2142,62 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
   return Changed;
 }
 
+bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+                                        SIAtomicScope Scope,
+                                        SIAtomicAddrSpace AddrSpace,
+                                        Position Pos) const {
+  if (!InsertCacheInv)
+    return false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  /// The scratch address space does not need the global memory cache
+  /// to be flushed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
+    return false;
+
+  AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
+  switch (Scope) {
+  case SIAtomicScope::SYSTEM:
+    ScopeImm = AMDGPU::CPol::SCOPE_SYS;
+    break;
+  case SIAtomicScope::AGENT:
+    ScopeImm = AMDGPU::CPol::SCOPE_DEV;
+    break;
+  case SIAtomicScope::WORKGROUP:
+    // In WGP mode the waves of a work-group can be executing on either CU of
+    // the WGP. Therefore we need to invalidate the L0 which is per CU.
+    // Otherwise in CU mode all waves of a work-group are on the same CU, and so
+    // the L0 does not need to be invalidated.
+    if (ST.isCuModeEnabled())
+      return false;
+
+    ScopeImm = AMDGPU::CPol::SCOPE_SE;
+    break;
+  case SIAtomicScope::WAVEFRONT:
+  case SIAtomicScope::SINGLETHREAD:
+    // No cache to invalidate.
+    return false;
+  default:
+    llvm_unreachable("Unsupported synchronization scope");
+  }
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  return true;
+}
+
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
   if (AtomicPseudoMIs.empty())
     return false;
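
To make the new acquire lowering concrete, here is a minimal sketch (register choices are illustrative; the pattern mirrors the codegen tests below) of an agent-scope atomic in LLVM IR:

    define i32 @agent_rmw(ptr addrspace(1) %p) {
      %v = atomicrmw add ptr addrspace(1) %p, i32 2 syncscope("agent") seq_cst
      ret i32 %v
    }

On GFX12 the acquire side now emits a single global_inv at device scope instead of the former buffer_gl0_inv/buffer_gl1_inv pair:

    global_atomic_add_u32 v0, v1, v0, s[0:1] th:TH_ATOMIC_RETURN
    s_waitcnt vmcnt(0)
    global_inv scope:SCOPE_DEV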
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 8ca09973a8ed7e..904120e7d11894 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -1295,8 +1295,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1347,8 +1346,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1389,8 +1387,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1438,8 +1435,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1491,8 +1487,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1536,8 +1531,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1590,8 +1584,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1)
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1633,8 +1626,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr,
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1682,8 +1674,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v6, vcc_lo
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1736,8 +1727,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 9f97f1f4bace5d..26d981ad7b4bfe 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -240,8 +240,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB0_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -276,8 +275,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB0_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -571,8 +569,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_mov_b32 s13, s7
 ; GFX1264-NEXT:    buffer_atomic_add_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB1_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -610,8 +607,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    s_mov_b32 s9, s7
 ; GFX1232-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB1_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -967,8 +963,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB2_4:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1016,8 +1011,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB2_4:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1284,8 +1278,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB3_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1321,8 +1314,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB3_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1673,8 +1665,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_mov_b32 s9, s7
 ; GFX1264-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1264-NEXT:    s_waitcnt vmcnt(0)
-; GFX1264-NEXT:    buffer_gl0_inv
-; GFX1264-NEXT:    buffer_gl1_inv
+; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB4_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
@@ -1718,8 +1709,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    s_mov_b32 s9, s7
 ; GFX1232-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX1232-NEXT:    s_waitcnt vmcnt(0)
-; GFX1232-NEXT:    buffer_gl0_inv
-; GFX1232-NEXT:    buffer_gl1_inv
+; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB4_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
@@ -1836,8 +1826,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_mov_b32 s5, s1
 ; GFX12-NEXT:    buff...
[truncated]

llvmbot (Member) commented Jan 4, 2024

@llvm/pr-subscribers-mc

Author: Mirko Brkušanin (mbrkusanin)

(Automated patch summary and diff identical to the comment above.)

mbrkusanin force-pushed the gfx12-cache-flushing-instructions branch from 30ab815 to bc73a9c on January 4, 2024 at 14:11.
jayfoad merged commit 7ca4473 into llvm:main on Jan 8, 2024.
mbrkusanin deleted the gfx12-cache-flushing-instructions branch on January 19, 2024 at 09:32.
justinfargnoli pushed a commit to justinfargnoli/llvm-project that referenced this pull request on Jan 28, 2024.