Skip to content

Commit 95b77d9

Browse files
committed
Revert "AMDGPU: Handle legal v2f16/v2bf16 atomicrmw fadd for global/flat (#95394)"
This reverts commit 5021e6d because it breaks tests; see the comment on #95394.
1 parent 7cb5faf commit 95b77d9

10 files changed, with 1346 additions and 6662 deletions.

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1659,13 +1659,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
16591659
});
16601660
}
16611661

1662-
if (ST.hasAtomicBufferGlobalPkAddF16Insts())
1663-
Atomic.legalFor({{V2F16, GlobalPtr}});
1664-
if (ST.hasAtomicGlobalPkAddBF16Inst())
1665-
Atomic.legalFor({{V2BF16, GlobalPtr}});
1666-
if (ST.hasAtomicFlatPkAdd16Insts())
1667-
Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1668-
16691662
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
16701663
// demarshalling
16711664
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1645,7 +1645,6 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc
16451645
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
16461646
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
16471647
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
1648-
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
16491648
}
16501649

16511650
let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
@@ -1670,16 +1669,13 @@ defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat
16701669
}
16711670

16721671
let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
1673-
// FIXME: These do not have signed offsets
16741672
defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>;
16751673
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
1676-
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
1677-
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
16781674
}
16791675

16801676
let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
16811677
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
1682-
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
1678+
16831679
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
16841680

16851681
let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -15942,16 +15942,6 @@ static bool isHalf2OrBFloat2(Type *Ty) {
1594215942
return false;
1594315943
}
1594415944

15945-
static bool isHalf2(Type *Ty) {
15946-
FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
15947-
return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
15948-
}
15949-
15950-
static bool isBFloat2(Type *Ty) {
15951-
FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
15952-
return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
15953-
}
15954-
1595515945
TargetLowering::AtomicExpansionKind
1595615946
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1595715947
unsigned AS = RMW->getPointerAddressSpace();
@@ -16020,27 +16010,10 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1602016010
AS != AMDGPUAS::BUFFER_FAT_POINTER)
1602116011
return AtomicExpansionKind::CmpXChg;
1602216012

16013+
// TODO: gfx940 supports v2f16 and v2bf16
1602316014
if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
1602416015
return AtomicExpansionKind::None;
1602516016

16026-
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16027-
// gfx940, gfx12
16028-
// FIXME: Needs to account for no fine-grained memory
16029-
if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16030-
return AtomicExpansionKind::None;
16031-
} else {
16032-
// gfx90a, gfx940, gfx12
16033-
// FIXME: Needs to account for no fine-grained memory
16034-
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16035-
return AtomicExpansionKind::None;
16036-
16037-
// gfx940, gfx12
16038-
// FIXME: Need to skip buffer_fat_pointer?
16039-
// FIXME: Needs to account for no fine-grained memory
16040-
if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16041-
return AtomicExpansionKind::None;
16042-
}
16043-
1604416017
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
1604516018
return AtomicExpansionKind::CmpXChg;
1604616019

llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -237,10 +237,24 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p
237237
; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset:
238238
; GFX940: ; %bb.0:
239239
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
240+
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024
241+
; GFX940-NEXT: s_mov_b64 s[0:1], 0
242+
; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
243+
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
244+
; GFX940-NEXT: s_waitcnt vmcnt(0)
245+
; GFX940-NEXT: v_mov_b32_e32 v5, v3
246+
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
240247
; GFX940-NEXT: buffer_wbl2 sc1
241-
; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0
248+
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
242249
; GFX940-NEXT: s_waitcnt vmcnt(0)
243250
; GFX940-NEXT: buffer_inv sc1
251+
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
252+
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
253+
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
254+
; GFX940-NEXT: s_cbranch_execnz .LBB17_1
255+
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
256+
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
257+
; GFX940-NEXT: v_mov_b32_e32 v0, v3
244258
; GFX940-NEXT: s_setpc_b64 s[30:31]
245259
%gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
246260
%result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -251,10 +265,23 @@ define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr,
251265
; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset:
252266
; GFX940: ; %bb.0:
253267
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
268+
; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024
269+
; GFX940-NEXT: s_mov_b64 s[0:1], 0
270+
; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
271+
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
272+
; GFX940-NEXT: s_waitcnt vmcnt(0)
273+
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
254274
; GFX940-NEXT: buffer_wbl2 sc1
255-
; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024
275+
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
256276
; GFX940-NEXT: s_waitcnt vmcnt(0)
257277
; GFX940-NEXT: buffer_inv sc1
278+
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
279+
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
280+
; GFX940-NEXT: v_mov_b32_e32 v5, v3
281+
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
282+
; GFX940-NEXT: s_cbranch_execnz .LBB18_1
283+
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
284+
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
258285
; GFX940-NEXT: s_setpc_b64 s[30:31]
259286
%gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
260287
%unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -265,10 +292,24 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half>
265292
; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset:
266293
; GFX940: ; %bb.0:
267294
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295+
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:1024
296+
; GFX940-NEXT: s_mov_b64 s[0:1], 0
297+
; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
298+
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
299+
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
300+
; GFX940-NEXT: v_mov_b32_e32 v5, v3
301+
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
268302
; GFX940-NEXT: buffer_wbl2 sc1
269-
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:1024 sc0
303+
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
270304
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
271305
; GFX940-NEXT: buffer_inv sc1
306+
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
307+
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
308+
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
309+
; GFX940-NEXT: s_cbranch_execnz .LBB19_1
310+
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
311+
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
312+
; GFX940-NEXT: v_mov_b32_e32 v0, v3
272313
; GFX940-NEXT: s_setpc_b64 s[30:31]
273314
%gep = getelementptr <2 x half>, ptr %ptr, i32 256
274315
%result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
@@ -279,10 +320,23 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
279320
; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset:
280321
; GFX940: ; %bb.0:
281322
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
323+
; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:1024
324+
; GFX940-NEXT: s_mov_b64 s[0:1], 0
325+
; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
326+
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
327+
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
328+
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
282329
; GFX940-NEXT: buffer_wbl2 sc1
283-
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024
330+
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
284331
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
285332
; GFX940-NEXT: buffer_inv sc1
333+
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
334+
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
335+
; GFX940-NEXT: v_mov_b32_e32 v5, v3
336+
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
337+
; GFX940-NEXT: s_cbranch_execnz .LBB20_1
338+
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
339+
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
286340
; GFX940-NEXT: s_setpc_b64 s[30:31]
287341
%gep = getelementptr <2 x half>, ptr %ptr, i32 256
288342
%unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst

0 commit comments

Comments
 (0)