Skip to content

[AMDGPU] Add new cache flushing instructions for GFX12 #76944

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions llvm/lib/Target/AMDGPU/BUFInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1222,8 +1222,10 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <

} // End HasD16LoadStore

def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
int_amdgcn_buffer_wbinvl1>;
let SubtargetPredicate = isNotGFX12Plus in
def BUFFER_WBINVL1 : MUBUF_Invalidate <
"buffer_wbinvl1", int_amdgcn_buffer_wbinvl1
>;

let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN<
Expand Down
39 changes: 38 additions & 1 deletion llvm/lib/Target/AMDGPU/FLATInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
bits<1> has_sve = 0; // Scratch VGPR Enable
bits<1> lds = 0;
bits<1> sve = 0;
bits<1> has_offset = 1;

let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
!if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
Expand Down Expand Up @@ -182,7 +183,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{51-50} = cpol{4-3}; // scope
let Inst{62-55} = !if(ps.has_data, vdata{7-0}, ?);
let Inst{71-64} = !if(ps.has_vaddr, vaddr, ?);
let Inst{95-72} = offset;
let Inst{95-72} = !if(ps.has_offset, offset, ?);
}

class GlobalSaddrTable <bit is_saddr, string Name = ""> {
Expand Down Expand Up @@ -340,6 +341,34 @@ multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass
GlobalSaddrTable<1, opName>;
}

// Pseudo-instruction class for the GFX12 cache maintenance instructions
// (global_inv, global_wb, global_wbinv). These take only a cache-policy
// operand ($cpol, carrying the scope) — no address, no data, no result.
class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = null_frag> :
FLAT_Pseudo<opName, (outs), (ins CPol:$cpol), "$cpol", [(node)]> {

let AsmMatchConverter = "";

// Affects caches, so it must not be deleted as dead code, but it is
// modeled as neither a load nor a store.
let hasSideEffects = 1;
let mayLoad = 0;
let mayStore = 0;
let is_flat_global = 1;

// Disable every optional FLAT operand/encoding field; only $cpol is used.
let has_offset = 0;
let has_saddr = 0;
let enabled_saddr = 0;
let saddr_value = 0;
let has_vdst = 0;
let has_data = 0;
let has_vaddr = 0;
let has_glc = 0;
let has_dlc = 0;
let glcValue = 0;
let dlcValue = 0;
let has_sccb = 0;
let sccbValue = 0;
let has_sve = 0;
let lds = 0;
let sve = 0;
}

class FlatScratchInst <string sv_op, string mode> {
string SVOp = sv_op;
string Mode = mode;
Expand Down Expand Up @@ -928,6 +957,10 @@ defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwor

let SubtargetPredicate = isGFX12Plus in {
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;

def GLOBAL_INV : FLAT_Global_Invalidate_Writeback<"global_inv">;
def GLOBAL_WB : FLAT_Global_Invalidate_Writeback<"global_wb">;
def GLOBAL_WBINV : FLAT_Global_Invalidate_Writeback<"global_wbinv">;
} // End SubtargetPredicate = isGFX12Plus

} // End is_flat_global = 1
Expand Down Expand Up @@ -2662,6 +2695,10 @@ defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_A
defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">;

defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b, "GLOBAL_INV", "global_inv">;
defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c, "GLOBAL_WB", "global_wb">;
defm GLOBAL_WBINV : VFLAT_Real_Base_gfx12<0x04f, "GLOBAL_WBINV", "global_wbinv">;

// ENC_VSCRATCH.
defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
defm SCRATCH_LOAD_I8 : VSCRATCH_Real_AllAddr_gfx12<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>;
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1424,6 +1424,12 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
});
}

// Returns true if \p Inst is one of the GFX12 cache invalidate/writeback
// instructions (global_inv / global_wb / global_wbinv), which the waitcnt
// scoreboard does not currently track (see TODO at the call site).
static bool isCacheInvOrWBInst(MachineInstr &Inst) {
  switch (Inst.getOpcode()) {
  case AMDGPU::GLOBAL_INV:
  case AMDGPU::GLOBAL_WB:
  case AMDGPU::GLOBAL_WBINV:
    return true;
  default:
    return false;
  }
}

void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets) {
// Now look at the instruction opcode. If it is a memory access
Expand All @@ -1439,6 +1445,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
}
} else if (TII->isFLAT(Inst)) {
// TODO: Track this properly.
if (isCacheInvOrWBInst(Inst))
return;

assert(Inst.mayLoadOrStore());

int FlatASCount = 0;
Expand Down
70 changes: 68 additions & 2 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,14 @@ class SIGfx11CacheControl : public SIGfx10CacheControl {
bool IsNonTemporal) const override;
};

// Cache control for GFX12+. Inherits the GFX11 behavior but overrides
// insertAcquire to emit the new scoped GLOBAL_INV cache-invalidate
// instruction (the test updates in this change show it replacing the
// older buffer_gl0_inv/buffer_gl1_inv pair).
class SIGfx12CacheControl : public SIGfx11CacheControl {
public:
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

// Insert a GLOBAL_INV with the cache-policy scope derived from \p Scope,
// before or after \p MI according to \p Pos, when \p AddrSpace includes
// global memory. Returns true if an instruction was inserted.
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

Expand Down Expand Up @@ -857,7 +865,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
return std::make_unique<SIGfx7CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX11)
return std::make_unique<SIGfx10CacheControl>(ST);
return std::make_unique<SIGfx11CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX12)
return std::make_unique<SIGfx11CacheControl>(ST);
return std::make_unique<SIGfx12CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
Expand Down Expand Up @@ -1423,7 +1433,7 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
bool Changed = false;

MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
const DebugLoc &DL = MI->getDebugLoc();

if (Pos == Position::AFTER)
++MI;
Expand Down Expand Up @@ -2132,6 +2142,62 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}

// Implement an acquire operation on GFX12+ by inserting a GLOBAL_INV whose
// cache-policy scope is derived from \p Scope. The invalidate is placed
// before or after \p MI according to \p Pos. Returns true iff an
// instruction was inserted.
bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        Position Pos) const {
  if (!InsertCacheInv)
    return false;

  MachineBasicBlock &MBB = *MI->getParent();
  // Bind by const reference: no need to copy the DebugLoc (matches the
  // same cleanup applied to SIGfx90ACacheControl::insertRelease).
  const DebugLoc &DL = MI->getDebugLoc();

  // The scratch address space does not need the global memory cache
  // to be flushed as all memory operations by the same thread are
  // sequentially consistent, and no other thread can access scratch
  // memory.

  // Other address spaces do not have a cache.
  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
    return false;

  // Default covers the llvm_unreachable path in NDEBUG builds.
  AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
  switch (Scope) {
  case SIAtomicScope::SYSTEM:
    ScopeImm = AMDGPU::CPol::SCOPE_SYS;
    break;
  case SIAtomicScope::AGENT:
    ScopeImm = AMDGPU::CPol::SCOPE_DEV;
    break;
  case SIAtomicScope::WORKGROUP:
    // In WGP mode the waves of a work-group can be executing on either CU of
    // the WGP. Therefore we need to invalidate the L0 which is per CU.
    // Otherwise in CU mode all waves of a work-group are on the same CU, and so
    // the L0 does not need to be invalidated.
    if (ST.isCuModeEnabled())
      return false;

    ScopeImm = AMDGPU::CPol::SCOPE_SE;
    break;
  case SIAtomicScope::WAVEFRONT:
  case SIAtomicScope::SINGLETHREAD:
    // No cache to invalidate.
    return false;
  default:
    llvm_unreachable("Unsupported synchronization scope");
  }

  if (Pos == Position::AFTER)
    ++MI;

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);

  if (Pos == Position::AFTER)
    --MI;

  return true;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
Expand Down
30 changes: 10 additions & 20 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1295,8 +1295,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr
; GFX12-NEXT: v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0
; GFX12-NEXT: global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: buffer_gl0_inv
; GFX12-NEXT: buffer_gl1_inv
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
%result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
Expand Down Expand Up @@ -1347,8 +1346,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(
; GFX12-NEXT: v_mov_b32_e32 v2, 2
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: buffer_gl0_inv
; GFX12-NEXT: buffer_gl1_inv
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
%result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
Expand Down Expand Up @@ -1389,8 +1387,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt
; GFX12-NEXT: v_mov_b32_e32 v2, 2
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: buffer_gl0_inv
; GFX12-NEXT: buffer_gl1_inv
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
%result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
Expand Down Expand Up @@ -1438,8 +1435,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
; GFX12-NEXT: v_mov_b32_e32 v2, 2
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: buffer_gl0_inv
; GFX12-NEXT: buffer_gl1_inv
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
%result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
Expand Down Expand Up @@ -1491,8 +1487,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: buffer_gl0_inv
; GFX12-NEXT: buffer_gl1_inv
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
%result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
Expand Down Expand Up @@ -1536,8 +1531,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: buffer_gl0_inv
; GFX12-NEXT: buffer_gl1_inv
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
%result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
Expand Down Expand Up @@ -1590,8 +1584,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1)
; GFX12-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: buffer_gl0_inv
; GFX12-NEXT: buffer_gl1_inv
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
%result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
Expand Down Expand Up @@ -1633,8 +1626,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr,
; GFX12-NEXT: v_mov_b32_e32 v4, v2
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: buffer_gl0_inv
; GFX12-NEXT: buffer_gl1_inv
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
%result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
Expand Down Expand Up @@ -1682,8 +1674,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v6, vcc_lo
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: buffer_gl0_inv
; GFX12-NEXT: buffer_gl1_inv
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
%result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
Expand Down Expand Up @@ -1736,8 +1727,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: buffer_gl0_inv
; GFX12-NEXT: buffer_gl1_inv
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
%result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
Expand Down
Loading