Skip to content

Commit c99da46

Browse files
[AMDGPU][GFX12] Add Atomic cond_sub_u32 (#76224)
Co-authored-by: Vang Thao <[email protected]>
1 parent badf0ee commit c99da46

25 files changed

+860
-12
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1182,6 +1182,11 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
11821182

11831183
The iglp_opt strategy implementations are subject to change.
11841184

1185+
llvm.amdgcn.atomic.cond.sub.u32 Provides direct access to flat_atomic_cond_sub_u32, global_atomic_cond_sub_u32
1186+
and ds_cond_sub_u32 based on address space on gfx12 targets. This
1187+
performs subtraction only if the memory value is greater than or
1188+
equal to the data value.
1189+
11851190
============================================== ==========================================================
11861191

11871192
.. TODO::

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1263,6 +1263,7 @@ def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
12631263
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
12641264
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
12651265
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
1266+
def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
12661267
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
12671268
[llvm_anyint_ty],
12681269
[LLVMMatchType<0>, // src(VGPR)
@@ -1299,6 +1300,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
12991300
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
13001301
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
13011302
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
1303+
def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
13021304
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
13031305
[llvm_anyint_ty],
13041306
[LLVMMatchType<0>, // src(VGPR)
@@ -1337,6 +1339,7 @@ def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
13371339
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
13381340
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
13391341
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
1342+
def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
13401343
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
13411344
[llvm_anyint_ty],
13421345
[LLVMMatchType<0>, // src(VGPR)
@@ -1372,6 +1375,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic;
13721375
def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
13731376
def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
13741377
def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
1378+
def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic;
13751379
def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
13761380
[llvm_anyint_ty],
13771381
[LLVMMatchType<0>, // src(VGPR)
@@ -2524,6 +2528,8 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
25242528
def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
25252529
def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
25262530

2531+
def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>;
2532+
25272533
//===----------------------------------------------------------------------===//
25282534
// Deep learning intrinsics.
25292535
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
264264
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
265265
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
266266
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
267+
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32, SIbuffer_atomic_cond_sub_u32>;
267268
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
268269
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>;
269270
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5480,6 +5480,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
54805480
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
54815481
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
54825482
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5483+
NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
54835484

54845485
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
54855486
}

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,7 @@ enum NodeType : unsigned {
593593
BUFFER_ATOMIC_FADD,
594594
BUFFER_ATOMIC_FMIN,
595595
BUFFER_ATOMIC_FMAX,
596+
BUFFER_ATOMIC_COND_SUB_U32,
596597

597598
LAST_AMDGPU_ISD_NUMBER
598599
};

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,9 @@ defm int_amdgcn_flat_atomic_fmin_num : noret_op;
647647
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
648648
defm int_amdgcn_global_atomic_fmin_num : noret_op;
649649
defm int_amdgcn_global_atomic_fmax_num : noret_op;
650+
defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op;
651+
defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op;
652+
defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op;
650653

651654
multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
652655
let HasNoUse = true in

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5893,6 +5893,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
58935893
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
58945894
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
58955895
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
5896+
case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
5897+
case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
5898+
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
58965899
default:
58975900
llvm_unreachable("unhandled atomic opcode");
58985901
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4856,6 +4856,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
48564856
case Intrinsic::amdgcn_flat_atomic_fmax_num:
48574857
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
48584858
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
4859+
case Intrinsic::amdgcn_atomic_cond_sub_u32:
48594860
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
48604861
return getDefaultMappingAllVGPR(MI);
48614862
case Intrinsic::amdgcn_ds_ordered_add:

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,7 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
237237
def : SourceOfDivergence<int_r600_read_tidig_x>;
238238
def : SourceOfDivergence<int_r600_read_tidig_y>;
239239
def : SourceOfDivergence<int_r600_read_tidig_z>;
240+
def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
240241
def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
241242
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
242243
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
@@ -282,6 +283,7 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
282283
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
283284
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
284285
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
286+
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cond_sub_u32>;
285287
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_swap>;
286288
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_add>;
287289
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_sub>;
@@ -298,6 +300,7 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>;
298300
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>;
299301
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>;
300302
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>;
303+
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32>;
301304
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
302305
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
303306
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_sub>;
@@ -314,6 +317,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
314317
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
315318
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
316319
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
320+
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cond_sub_u32>;
317321
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_swap>;
318322
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_add>;
319323
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_sub>;
@@ -330,6 +334,7 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>;
330334
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
331335
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
332336
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
337+
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>;
333338
def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
334339
def : SourceOfDivergence<int_amdgcn_ps_live>;
335340
def : SourceOfDivergence<int_amdgcn_live_mask>;

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1241,6 +1241,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
12411241
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag
12421242
>;
12431243

1244+
let SubtargetPredicate = isGFX12Plus in {
1245+
defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics <
1246+
"buffer_atomic_cond_sub_u32", VGPR_32, i32
1247+
>;
1248+
}
1249+
12441250
//===----------------------------------------------------------------------===//
12451251
// MTBUF Instructions
12461252
//===----------------------------------------------------------------------===//
@@ -1704,6 +1710,13 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">;
17041710
let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
17051711
defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>;
17061712

1713+
// Selection patterns mapping the SIbuffer_atomic_cond_sub_u32 node (i32) onto
// the BUFFER_ATOMIC_COND_SUB_U32_VBUFFER instructions. Everything in this
// block is predicated on isGFX12Plus.
let SubtargetPredicate = isGFX12Plus in {
1714+
// "ret" variant of the pattern.
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["ret"]>;
1715+
1716+
// "noret" variant of the pattern; additionally requires
// HasAtomicCSubNoRtnInsts.
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
1717+
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>;
1718+
}
1719+
17071720
let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
17081721
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">;
17091722
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">;
@@ -2607,6 +2620,7 @@ defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x049,
26072620
defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x034, "buffer_atomic_cmpswap_b32">;
26082621
defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x042, "buffer_atomic_cmpswap_b64">;
26092622
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_gfx11_Renamed<0x050, "buffer_atomic_cmpswap_f32">;
2623+
defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Real_Atomic_gfx12<0x050>;
26102624
defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<0x037, "buffer_atomic_sub_clamp_u32", "buffer_atomic_csub_u32">;
26112625
def : Mnem_gfx11_gfx12<"buffer_atomic_csub", "buffer_atomic_csub_u32">;
26122626
defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x040, "buffer_atomic_dec_u32">;

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,12 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
437437
let has_gds = 0;
438438
}
439439

440+
// Selection pattern for a DS atomic taking one address and one data operand.
// Matches `frag` applied to a DS1Addr1Offset address (i32 pointer plus i32
// offset) and a value of type `vt`, and emits `inst` with that pointer, the
// value (using the register class given by getVregSrcForVT<vt>.ret), the
// offset and the `gds` bit. `complexity` is forwarded to AddedComplexity
// (default 0); `gds` defaults to 0.
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
441+
bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
442+
(inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> {
443+
let AddedComplexity = complexity;
444+
}
445+
440446
defm DS_ADD_U32 : DS_1A1D_NORET_mc<"ds_add_u32">;
441447
defm DS_SUB_U32 : DS_1A1D_NORET_mc<"ds_sub_u32">;
442448
defm DS_RSUB_U32 : DS_1A1D_NORET_mc<"ds_rsub_u32">;
@@ -732,9 +738,22 @@ def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
732738

733739
let SubtargetPredicate = isGFX12Plus in {
734740

741+
defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc<"ds_cond_sub_u32">;
742+
defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32, "ds_cond_sub_u32">;
735743
defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">;
736744
defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_clamp_rtn_u32", VGPR_32, "ds_sub_clamp_u32">;
737745

746+
// Emits two DSAtomicRetPat patterns for an intrinsic whose PatFrag name
// prefix is `frag`:
//   - `frag`_local_addrspace       -> `inst`
//   - `frag`_noret_local_addrspace -> `noRetInst`
// Both patterns use value type `vt`.
multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
747+
ValueType vt, string frag> {
748+
def : DSAtomicRetPat<inst, vt,
749+
!cast<PatFrag>(frag#"_local_addrspace")>;
750+
751+
// The no-return pattern additionally requires HasAtomicCSubNoRtnInsts and is
// given AddedComplexity 1 (the other pattern keeps the default 0).
// NOTE(review): presumably so it is preferred when the result is unused;
// confirm.
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
752+
def : DSAtomicRetPat<noRetInst, vt,
753+
!cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>;
754+
}
755+
756+
defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
738757
} // let SubtargetPredicate = isGFX12Plus
739758

740759
//===----------------------------------------------------------------------===//
@@ -954,12 +973,6 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">;
954973

955974
} // End AddedComplexity = 100
956975

957-
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
958-
bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
959-
(inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> {
960-
let AddedComplexity = complexity;
961-
}
962-
963976
multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
964977
let OtherPredicates = [LDSRequiresM0Init] in {
965978
def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
@@ -1237,7 +1250,9 @@ defm DS_MIN_NUM_F64 : DS_Real_Renamed_gfx12<0x052, DS_MIN_F64, "ds_min_num
12371250
defm DS_MAX_NUM_F64 : DS_Real_Renamed_gfx12<0x053, DS_MAX_F64, "ds_max_num_f64">;
12381251
defm DS_MIN_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x072, DS_MIN_RTN_F64, "ds_min_num_rtn_f64">;
12391252
defm DS_MAX_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x073, DS_MAX_RTN_F64, "ds_max_num_rtn_f64">;
1253+
defm DS_COND_SUB_U32 : DS_Real_gfx12<0x098>;
12401254
defm DS_SUB_CLAMP_U32 : DS_Real_gfx12<0x099>;
1255+
defm DS_COND_SUB_RTN_U32 : DS_Real_gfx12<0x0a8>;
12411256
defm DS_SUB_CLAMP_RTN_U32 : DS_Real_gfx12<0x0a9>;
12421257

12431258
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)