Commit 7ca4473

mbrkusanin and rovka authored
[AMDGPU] Add new cache flushing instructions for GFX12 (#76944)
Co-authored-by: Diana Picus <[email protected]>
1 parent f0f16be commit 7ca4473

13 files changed: 461 additions & 454 deletions

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 4 additions & 2 deletions
@@ -1222,8 +1222,10 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
 
 } // End HasD16LoadStore
 
-def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
-                                       int_amdgcn_buffer_wbinvl1>;
+let SubtargetPredicate = isNotGFX12Plus in
+def BUFFER_WBINVL1 : MUBUF_Invalidate <
+  "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1
+>;
 
 let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
 defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN<

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 38 additions & 1 deletion
@@ -60,6 +60,7 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
   bits<1> has_sve = 0; // Scratch VGPR Enable
   bits<1> lds = 0;
   bits<1> sve = 0;
+  bits<1> has_offset = 1;
 
   let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
     !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
@@ -182,7 +183,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   let Inst{51-50} = cpol{4-3}; // scope
   let Inst{62-55} = !if(ps.has_data, vdata{7-0}, ?);
   let Inst{71-64} = !if(ps.has_vaddr, vaddr, ?);
-  let Inst{95-72} = offset;
+  let Inst{95-72} = !if(ps.has_offset, offset, ?);
 }
 
 class GlobalSaddrTable <bit is_saddr, string Name = ""> {
@@ -340,6 +341,34 @@ multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass
     GlobalSaddrTable<1, opName>;
 }
 
+class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = null_frag> :
+  FLAT_Pseudo<opName, (outs), (ins CPol:$cpol), "$cpol", [(node)]> {
+
+  let AsmMatchConverter = "";
+
+  let hasSideEffects = 1;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let is_flat_global = 1;
+
+  let has_offset = 0;
+  let has_saddr = 0;
+  let enabled_saddr = 0;
+  let saddr_value = 0;
+  let has_vdst = 0;
+  let has_data = 0;
+  let has_vaddr = 0;
+  let has_glc = 0;
+  let has_dlc = 0;
+  let glcValue = 0;
+  let dlcValue = 0;
+  let has_sccb = 0;
+  let sccbValue = 0;
+  let has_sve = 0;
+  let lds = 0;
+  let sve = 0;
+}
+
 class FlatScratchInst <string sv_op, string mode> {
   string SVOp = sv_op;
   string Mode = mode;
@@ -928,6 +957,10 @@ defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwor
 
 let SubtargetPredicate = isGFX12Plus in {
   defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
+
+  def GLOBAL_INV : FLAT_Global_Invalidate_Writeback<"global_inv">;
+  def GLOBAL_WB : FLAT_Global_Invalidate_Writeback<"global_wb">;
+  def GLOBAL_WBINV : FLAT_Global_Invalidate_Writeback<"global_wbinv">;
 } // End SubtargetPredicate = isGFX12Plus
 
 } // End is_flat_global = 1
@@ -2662,6 +2695,10 @@ defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_A
 defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
 defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">;
 
+defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b, "GLOBAL_INV", "global_inv">;
+defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c, "GLOBAL_WB", "global_wb">;
+defm GLOBAL_WBINV : VFLAT_Real_Base_gfx12<0x04f, "GLOBAL_WBINV", "global_wbinv">;
+
 // ENC_VSCRATCH.
 defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
 defm SCRATCH_LOAD_I8 : VSCRATCH_Real_AllAddr_gfx12<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>;
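
Since FLAT_Global_Invalidate_Writeback strips every operand except the cache-policy word, emitting one of these pseudos from a backend pass boils down to a single BuildMI call carrying a scope immediate. A minimal sketch of that pattern, assuming the usual MachineBasicBlock insertion point and SIInstrInfo pointer available in AMDGPU passes; the helper name is invented for illustration and is not part of this change:

    // Illustrative only: emit one of the GFX12 cache-maintenance pseudos.
    // They take no address, data, or offset operands, just a cache-policy
    // immediate selecting the scope (SCOPE_SE / SCOPE_DEV / SCOPE_SYS).
    static void emitGfx12CacheFlush(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MI,
                                    const DebugLoc &DL, const SIInstrInfo *TII,
                                    unsigned Opcode, AMDGPU::CPol::CPol Scope) {
      BuildMI(MBB, MI, DL, TII->get(Opcode)).addImm(Scope);
    }

    // e.g. emitGfx12CacheFlush(MBB, MI, DL, TII, AMDGPU::GLOBAL_WB,
    //                          AMDGPU::CPol::SCOPE_SYS);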

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 10 additions & 0 deletions
@@ -1424,6 +1424,12 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
   });
 }
 
+static bool isCacheInvOrWBInst(MachineInstr &Inst) {
+  auto Opc = Inst.getOpcode();
+  return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
+         Opc == AMDGPU::GLOBAL_WBINV;
+}
+
 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                                WaitcntBrackets *ScoreBrackets) {
   // Now look at the instruction opcode. If it is a memory access
@@ -1439,6 +1445,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
     }
   } else if (TII->isFLAT(Inst)) {
+    // TODO: Track this properly.
+    if (isCacheInvOrWBInst(Inst))
+      return;
+
     assert(Inst.mayLoadOrStore());
 
     int FlatASCount = 0;

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 68 additions & 2 deletions
@@ -578,6 +578,14 @@ class SIGfx11CacheControl : public SIGfx10CacheControl {
                              bool IsNonTemporal) const override;
 };
 
+class SIGfx12CacheControl : public SIGfx11CacheControl {
+public:
+  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
+
+  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+};
+
 class SIMemoryLegalizer final : public MachineFunctionPass {
 private:
 
@@ -857,7 +865,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
     return std::make_unique<SIGfx7CacheControl>(ST);
   if (Generation < AMDGPUSubtarget::GFX11)
     return std::make_unique<SIGfx10CacheControl>(ST);
-  return std::make_unique<SIGfx11CacheControl>(ST);
+  if (Generation < AMDGPUSubtarget::GFX12)
+    return std::make_unique<SIGfx11CacheControl>(ST);
+  return std::make_unique<SIGfx12CacheControl>(ST);
 }
 
 bool SIGfx6CacheControl::enableLoadCacheBypass(
@@ -1423,7 +1433,7 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
   bool Changed = false;
 
   MachineBasicBlock &MBB = *MI->getParent();
-  DebugLoc DL = MI->getDebugLoc();
+  const DebugLoc &DL = MI->getDebugLoc();
 
   if (Pos == Position::AFTER)
     ++MI;
@@ -2132,6 +2142,62 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
   return Changed;
 }
 
+bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+                                        SIAtomicScope Scope,
+                                        SIAtomicAddrSpace AddrSpace,
+                                        Position Pos) const {
+  if (!InsertCacheInv)
+    return false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  /// The scratch address space does not need the global memory cache
+  /// to be flushed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
+    return false;
+
+  AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
+  switch (Scope) {
+  case SIAtomicScope::SYSTEM:
+    ScopeImm = AMDGPU::CPol::SCOPE_SYS;
+    break;
+  case SIAtomicScope::AGENT:
+    ScopeImm = AMDGPU::CPol::SCOPE_DEV;
+    break;
+  case SIAtomicScope::WORKGROUP:
+    // In WGP mode the waves of a work-group can be executing on either CU of
+    // the WGP. Therefore we need to invalidate the L0 which is per CU.
+    // Otherwise in CU mode all waves of a work-group are on the same CU, and
+    // so the L0 does not need to be invalidated.
+    if (ST.isCuModeEnabled())
+      return false;
+
+    ScopeImm = AMDGPU::CPol::SCOPE_SE;
+    break;
+  case SIAtomicScope::WAVEFRONT:
+  case SIAtomicScope::SINGLETHREAD:
+    // No cache to invalidate.
+    return false;
+  default:
+    llvm_unreachable("Unsupported synchronization scope");
+  }
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  return true;
+}
+
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
   if (AtomicPseudoMIs.empty())
     return false;
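
For context, insertAcquire is invoked by the legalizer on the acquire side of atomics and fences, and on GFX12 the GLOBAL_INV built above is what shows up as global_inv scope:SCOPE_DEV in the test updates below. A hedged illustration of the call shape, with example argument values rather than the ones the legalizer actually computes from the memory operand info:

    // Illustrative only: acquire-side invalidate behind a seq_cst atomic at
    // agent ("device") scope, using the SICacheControl instance (CC) that
    // SICacheControl::create selected for the subtarget.
    Changed |= CC->insertAcquire(MI, SIAtomicScope::AGENT,
                                 SIAtomicAddrSpace::GLOBAL, Position::AFTER);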

llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll

Lines changed: 10 additions & 20 deletions
@@ -1295,8 +1295,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1347,8 +1346,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1389,8 +1387,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1438,8 +1435,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1491,8 +1487,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
   %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1536,8 +1531,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1590,8 +1584,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1)
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1633,8 +1626,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr,
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1682,8 +1674,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v6, vcc_lo
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1736,8 +1727,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    buffer_gl0_inv
-; GFX12-NEXT:    buffer_gl1_inv
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
   %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
