diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index e05f7fc3e7662..848cd7471be23 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1182,6 +1182,11 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                  The iglp_opt strategy implementations are subject to change.
 
+  llvm.amdgcn.atomic.cond.sub.u32                Provides direct access to flat_atomic_cond_sub_u32, global_atomic_cond_sub_u32
+                                                 and ds_cond_sub_u32, selected by the pointer's address space, on gfx12 targets.
+                                                 The subtraction is performed only if the memory value is greater than or equal
+                                                 to the data value; the original memory value is returned.
+
   ==============================================   ==========================================================
 
 .. TODO::

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e5596258847f9..e4479a271cf52 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1246,6 +1246,7 @@ def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1282,6 +1283,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
 def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1320,6 +1322,7 @@ def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -1355,6 +1358,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
+def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic;
 def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
   [llvm_anyint_ty],
   [LLVMMatchType<0>,  // src(VGPR)
@@ -2496,6 +2500,8 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 
+def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty, llvm_anyptr_ty>;
+
 //===----------------------------------------------------------------------===//
 // Deep learning intrinsics.
 //===----------------------------------------------------------------------===//

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 2b85024a9b40b..801c5fa2e1565 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -264,6 +264,7 @@ def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32, SIbuffer_atomic_cond_sub_u32>;
 def : GINodeEquiv;
 def : GINodeEquiv;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 2f663571a8f97..34cc35de882c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5473,6 +5473,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
   NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
   NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
+  NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER:
     break;
   }

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 827fb106b5519..6828db6e0220d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -589,6 +589,7 @@ enum NodeType : unsigned {
   BUFFER_ATOMIC_FADD,
   BUFFER_ATOMIC_FMIN,
   BUFFER_ATOMIC_FMAX,
+  BUFFER_ATOMIC_COND_SUB_U32,
 
   LAST_AMDGPU_ISD_NUMBER
 };

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 36e07d944c942..360aafedc5224 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -647,6 +647,9 @@ defm int_amdgcn_flat_atomic_fmin_num : noret_op;
 defm int_amdgcn_flat_atomic_fmax_num : noret_op;
 defm int_amdgcn_global_atomic_fmin_num : noret_op;
 defm int_amdgcn_global_atomic_fmax_num : noret_op;
+defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op;
+defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op;
+defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op;
 
 multiclass noret_binary_atomic_op {
   let HasNoUse = true in

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index dfbe5c7fed882..0e650a643baa2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5883,6 +5883,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
+  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
   default:
     llvm_unreachable("unhandled atomic opcode");
   }

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 92182ec069426..c4668a74114af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4692,6 +4692,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_flat_atomic_fmax_num:
     case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
     case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
+    case Intrinsic::amdgcn_atomic_cond_sub_u32:
     case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
       return getDefaultMappingAllVGPR(MI);
     case Intrinsic::amdgcn_ds_ordered_add:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 4cc8871a00fe1..adeddefb6a008 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -237,6 +237,7 @@ def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
+def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
@@ -282,6 +283,7 @@ def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cond_sub_u32>;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
@@ -298,6 +300,7 @@ def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32>;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
@@ -314,6 +317,7 @@ def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cond_sub_u32>;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
@@ -330,6 +334,7 @@ def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;

diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 43d35fa5291ca..fb4ef8620b795 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1245,6 +1245,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
   "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag
 >;
 
+let SubtargetPredicate = isGFX12Plus in {
+defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_cond_sub_u32", VGPR_32, i32
+>;
+}
+
 //===----------------------------------------------------------------------===//
 // MTBUF Instructions
 //===----------------------------------------------------------------------===//
@@ -1708,6 +1714,13 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">;
 let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
 defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>;
 
+let SubtargetPredicate = isGFX12Plus in {
+  defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["ret"]>;
+
+  let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+  defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>;
+}
+
 let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
 defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">;
 defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">;
@@ -2610,6 +2623,7 @@ defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x049,
 defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x034, "buffer_atomic_cmpswap_b32">;
 defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x042, "buffer_atomic_cmpswap_b64">;
 defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_gfx11_Renamed<0x050, "buffer_atomic_cmpswap_f32">;
+defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Real_Atomic_gfx12<0x050>;
 defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<0x037, "buffer_atomic_sub_clamp_u32", "buffer_atomic_csub_u32">;
 def : Mnem_gfx11_gfx12<"buffer_atomic_csub", "buffer_atomic_csub_u32">;
 defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x040, "buffer_atomic_dec_u32">;

diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index bc9049b4ef33c..e6cf68ac1fa93 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -438,6 +438,12 @@ class DS_1A1D_PERMUTE
 
+class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
+  bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
+  (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> {
+  let AddedComplexity = complexity;
+}
+
 defm DS_ADD_U32 : DS_1A1D_NORET_mc<"ds_add_u32">;
 defm DS_SUB_U32 : DS_1A1D_NORET_mc<"ds_sub_u32">;
 defm DS_RSUB_U32 : DS_1A1D_NORET_mc<"ds_rsub_u32">;
@@ -733,9 +739,22 @@ def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
 
 let SubtargetPredicate = isGFX12Plus in {
 
+defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc<"ds_cond_sub_u32">;
+defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32, "ds_cond_sub_u32">;
 defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">;
 defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_clamp_rtn_u32", VGPR_32, "ds_sub_clamp_u32">;
 
+multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+                                           ValueType vt, string frag> {
+  def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_addrspace")>;
+
+  let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+  def : DSAtomicRetPat<noRetInst, vt, !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>;
+}
+
+defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32,
+                                       "int_amdgcn_atomic_cond_sub_u32">;
 } // let SubtargetPredicate = isGFX12Plus
 
 //===----------------------------------------------------------------------===//
@@ -955,12 +974,6 @@ defm : DSWritePat_mc ;
 
 } // End AddedComplexity = 100
 
-class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
-  bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
-  (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> {
-  let AddedComplexity = complexity;
-}
-
 multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
   let OtherPredicates = [LDSRequiresM0Init] in {
     def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
@@ -1238,7 +1251,9 @@ defm DS_MIN_NUM_F64 : DS_Real_Renamed_gfx12<0x052, DS_MIN_F64, "ds_min_num
 defm DS_MAX_NUM_F64 : DS_Real_Renamed_gfx12<0x053, DS_MAX_F64, "ds_max_num_f64">;
 defm DS_MIN_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x072, DS_MIN_RTN_F64, "ds_min_num_rtn_f64">;
 defm DS_MAX_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x073, DS_MAX_RTN_F64, "ds_max_num_rtn_f64">;
+defm DS_COND_SUB_U32 : DS_Real_gfx12<0x098>;
 defm DS_SUB_CLAMP_U32 : DS_Real_gfx12<0x099>;
+defm DS_COND_SUB_RTN_U32 : DS_Real_gfx12<0x0a8>;
 defm DS_SUB_CLAMP_RTN_U32 : DS_Real_gfx12<0x0a9>;
 
 //===----------------------------------------------------------------------===//

diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 615f8cd54d8f9..1027ff478248e 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -801,6 +801,7 @@ let SubtargetPredicate = HasFlatAtomicFaddF32Inst in {
 
 let SubtargetPredicate = isGFX12Plus in {
   defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPR_32, i32>;
+  defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_cond_sub_u32", VGPR_32, i32>;
 } // End SubtargetPredicate = isGFX12Plus
 
 defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
@@ -927,6 +928,7 @@ defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ssho
 defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;
 
 let SubtargetPredicate = isGFX12Plus in {
+  defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>;
   defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
 } // End SubtargetPredicate = isGFX12Plus
 
@@ -1074,23 +1076,43 @@ class FlatStoreSignedAtomicPat
   .ret:$data, $offset)
 >;
 
-multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
-                               ValueType data_vt = vt, bit isIntr = 0> {
-  defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_"#vt.Size));
+multiclass FlatAtomicNoRtnPatBase <string inst, string node, ValueType vt,
+                                   ValueType data_vt = vt> {
+
+  defvar noRtnNode = !cast<PatFrags>(node);
 
   let AddedComplexity = 1 in
   def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
     (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
 }
 
-multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
-                             ValueType data_vt = vt, bit isIntr = 0> {
-  defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_"#vt.Size));
+multiclass FlatAtomicNoRtnPatWithAddrSpace <string inst, string node, string addrSpaceSuffix,
+                                            ValueType vt> :
+  FlatAtomicNoRtnPatBase<inst, node # "_noret_" # addrSpaceSuffix, vt>;
+
+multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
+                               ValueType data_vt = vt, bit isIntr = 0> :
+  FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt.Size), vt, data_vt>;
+
+
+multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt,
+                                 ValueType data_vt = vt> {
+
+  defvar rtnNode = !cast<SDPatternOperator>(node);
 
   def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
     (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
 }
 
+multiclass FlatAtomicRtnPatWithAddrSpace <string inst, string intr, string addrSpaceSuffix,
+                                          ValueType vt> :
+  FlatAtomicRtnPatBase<inst, intr # "_" # addrSpaceSuffix, vt>;
+
+multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
+                             ValueType data_vt = vt, bit isIntr = 0> :
+  FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt.Size), vt, data_vt>;
+
+
 multiclass FlatAtomicPat <string inst, string node, ValueType vt,
                           ValueType data_vt = vt, bit isIntr = 0> :
   FlatAtomicRtnPat<inst, node, vt, data_vt, isIntr>,
@@ -1270,6 +1292,13 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_"#as, i64
 defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>;
 } // end foreach as
 
+let SubtargetPredicate = isGFX12Plus in {
+  defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >;
+
+  let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+  defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
+}
+
 def : FlatStorePat ;
 def : FlatStorePat ;
@@ -1531,6 +1560,13 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", "atomic_swap_global", i64>
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_global", i64, v2i64>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
 
+let SubtargetPredicate = isGFX12Plus in {
+  defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+
+  let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+  defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+}
+
 let OtherPredicates = [isGFX12Plus] in {
   defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;
 }
@@ -2600,6 +2636,7 @@ defm FLAT_ATOMIC_OR_B64 : VFLAT_Real_Atomics_gfx12<0x04a, "FLAT_ATOMI
 defm FLAT_ATOMIC_XOR_B64 : VFLAT_Real_Atomics_gfx12<0x04b, "FLAT_ATOMIC_XOR_X2", "flat_atomic_xor_b64", true>;
 defm FLAT_ATOMIC_INC_U64 : VFLAT_Real_Atomics_gfx12<0x04c, "FLAT_ATOMIC_INC_X2", "flat_atomic_inc_u64", true>;
 defm FLAT_ATOMIC_DEC_U64 : VFLAT_Real_Atomics_gfx12<0x04d, "FLAT_ATOMIC_DEC_X2", "flat_atomic_dec_u64", true>;
+defm FLAT_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050, "FLAT_ATOMIC_COND_SUB_U32", "flat_atomic_cond_sub_u32">;
 defm FLAT_ATOMIC_MIN_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_num_f32", true, "flat_atomic_min_f32">;
 defm FLAT_ATOMIC_MAX_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x052, "FLAT_ATOMIC_FMAX",
"flat_atomic_max_num_f32", true, "flat_atomic_max_f32">; defm FLAT_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056, "FLAT_ATOMIC_ADD_F32", "flat_atomic_add_f32">; @@ -2657,6 +2694,7 @@ defm GLOBAL_ATOMIC_OR_B64 : VGLOBAL_Real_Atomics_gfx12<0x04a, "GLOBAL_A defm GLOBAL_ATOMIC_XOR_B64 : VGLOBAL_Real_Atomics_gfx12<0x04b, "GLOBAL_ATOMIC_XOR_X2", "global_atomic_xor_b64", true>; defm GLOBAL_ATOMIC_INC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04c, "GLOBAL_ATOMIC_INC_X2", "global_atomic_inc_u64", true>; defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_ATOMIC_DEC_X2", "global_atomic_dec_u64", true>; +defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050, "GLOBAL_ATOMIC_COND_SUB_U32", "global_atomic_cond_sub_u32">; defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">; defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">; defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 079cae06888cd..cd00ddb7f770c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1311,6 +1311,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); @@ -8562,6 +8563,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_dec: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); + case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + return lowerRawBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); case Intrinsic::amdgcn_struct_buffer_atomic_swap: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: return lowerStructBufferAtomicIntrin(Op, DAG, @@ -8603,6 +8607,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_dec: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); + case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); case Intrinsic::amdgcn_buffer_atomic_cmpswap: { unsigned Slc = Op.getConstantOperandVal(7); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index f07b8fa0ea4cd..8a365cd9007bc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -197,6 +197,7 @@ defm SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">; defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; +defm SIbuffer_atomic_cond_sub_u32 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32">; def SIbuffer_atomic_cmpswap : SDNode 
<"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index ea2a8b75d0740..5dbf1bdd04227 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3768,6 +3768,7 @@ def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll index 59fbd5627ebfc..ea7c3e7175f12 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll @@ -22,7 +22,55 @@ define amdgpu_kernel void @test_atomic_csub_i32(ptr addrspace(1) %ptr, i32 %val) ret void } +; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) +define amdgpu_kernel void @test_ds_atomic_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) #0 { +entry: + %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4 + %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + store i32 %val, ptr addrspace(3) %use + ret void +} + +; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) +define amdgpu_kernel void @test_flat_atomic_cond_sub_u32(ptr %addr, i32 %in, ptr %use) #0 { +entry: + %gep = getelementptr i32, ptr %addr, i32 4 + %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + store i32 %val, ptr %use + ret void +} + +; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) +define amdgpu_kernel void @test_global_atomic_cond_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) #0 { +entry: + %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4 + %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + store i32 %val, ptr addrspace(1) %use + ret void +} + +; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) +define float @test_raw_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +entry: + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) +define float @test_struct_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +entry: + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #1 +declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32) #1 +declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) #1 +declare i32 
@llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32) #1 +declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #1 +declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll new file mode 100644 index 0000000000000..1d324a5df1b86 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -0,0 +1,254 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s + +declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32) +declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32) +declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) + +define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { +; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr %addr, i32 -4 + %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + ret void +} + +define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr %addr, i32 -4 + %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + ret void +} + +define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr %use) { +; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 
0x34 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr %addr, i32 4 + %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + store i32 %val, ptr %use + ret void +} + +define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { +; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4 + %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + ret void +} + +define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4 + %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + ret void +} + +define amdgpu_kernel void 
@global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { +; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4 + %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + store i32 %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { +; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4 + %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + ret void +} + +define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: 
v_dual_mov_b32 v0, s0 +; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4 + %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + ret void +} + +define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { +; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: ds_store_b32 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: ds_store_b32 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4 + %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + store i32 %val, ptr addrspace(3) %use + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll new file mode 100644 index 0000000000000..9f89aa0ebb943 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12 + +define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: raw_buffer_atomic_cond_sub_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { +; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return_forced: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 
@llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) + ret void +} + +define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { +; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return_forced: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) + ret void +} + +define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: struct_buffer_atomic_cond_sub_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { +; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return_forced: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; 
GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) + ret void +} + +define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { +; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return_forced: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) + ret void +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind "target-features"="+atomic-csub-no-rtn-insts" } + diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s index e01eb05e85588..70a041146984a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s @@ -2008,11 +2008,23 @@ flat_atomic_csub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN flat_atomic_sub_clamp_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +ds_cond_sub_rtn_u32 v5, v1, v2 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_cond_sub_u32 v1, v2 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + ds_sub_clamp_rtn_u32 v5, v1, v2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU ds_sub_clamp_u32 v1, v2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +flat_atomic_cond_sub_u32 v[0:1], v2 offset:64 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: 
instruction not supported on this GPU + +global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:64 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s index ba32dc8820eaa..c89d1ba8a4e54 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s @@ -1443,6 +1443,24 @@ ds_storexchg_rtn_b64 v[5:6], v1, v[2:3] offset:0 ds_storexchg_rtn_b64 v[254:255], v255, v[254:255] offset:4 // GFX12: [0x04,0x00,0xb4,0xd9,0xff,0xfe,0x00,0xfe] +ds_cond_sub_rtn_u32 v5, v1, v2 +// GFX12: [0x00,0x00,0xa0,0xda,0x01,0x02,0x00,0x05] + +ds_cond_sub_rtn_u32 v5, v1, v2 offset:65535 +// GFX12: [0xff,0xff,0xa0,0xda,0x01,0x02,0x00,0x05] + +ds_cond_sub_rtn_u32 v5, v1, v2 offset:0 +// GFX12: [0x00,0x00,0xa0,0xda,0x01,0x02,0x00,0x05] + +ds_cond_sub_u32 v1, v2 +// GFX12: [0x00,0x00,0x60,0xda,0x01,0x02,0x00,0x00] + +ds_cond_sub_u32 v1, v2 offset:65535 +// GFX12: [0xff,0xff,0x60,0xda,0x01,0x02,0x00,0x00] + +ds_cond_sub_u32 v1, v2 offset:0 +// GFX12: [0x00,0x00,0x60,0xda,0x01,0x02,0x00,0x00] + ds_sub_clamp_rtn_u32 v5, v1, v2 // GFX12: [0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s index a7a256cfd2b8f..4ee2e5ed294e9 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s @@ -3178,6 +3178,72 @@ buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:8388607 glc dlc buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:8388607 glc slc dlc // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v255, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[12:15], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[96:99], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s101 offset:8388607 +// GFX12: encoding: [0x65,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], m0 offset:8388607 +// GFX12: encoding: [0x7d,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], 0 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], -1 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], 0.5 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], -4.0 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, v0, s[8:11], s3 idxen offset:8388607 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, v0, s[8:11], s3 offen offset:8388607 +// GFX12: 
encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:0 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:7 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RT_RETURN scope:SCOPE_SE +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 glc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 slc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 dlc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 glc slc dlc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + buffer_atomic_dec_u32 v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s index 95d352b421a28..102fe9b15ebd7 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s @@ -117,6 +117,18 @@ flat_atomic_cmpswap_b64 v[1:2], v[3:4], v[5:8] offset:2047 th:TH_ATOMIC_RETURN flat_atomic_cmpswap_b64 v[1:2], v[3:4], v[5:8] offset:2047 th:TH_ATOMIC_RETURN // GFX12: encoding: [0x7c,0x80,0x10,0xec,0x01,0x00,0x90,0x02,0x03,0xff,0x07,0x00] +flat_atomic_cond_sub_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +flat_atomic_cond_sub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +flat_atomic_cond_sub_u32 v[0:1], v2 offset:-64 +// GFX12: encoding: [0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +flat_atomic_cond_sub_u32 v[0:1], v2 offset:64 +// GFX12: encoding: [0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + flat_atomic_dec_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN // GFX12: encoding: [0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] @@ -864,6 +876,30 @@ global_atomic_cmpswap_b64 v[1:2], v[3:4], v[5:8], off offset:2047 th:TH_ATOMIC_R global_atomic_cmpswap_b64 v[1:2], v[3:4], v[5:8], off offset:2047 th:TH_ATOMIC_RETURN // GFX12: encoding: [0x7c,0x80,0x10,0xee,0x01,0x00,0x90,0x02,0x03,0xff,0x07,0 +global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:-64 +// GFX12: encoding: [0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:64 +// GFX12: encoding: [0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + 
+global_atomic_cond_sub_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_cond_sub_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +global_atomic_cond_sub_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_cond_sub_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +global_atomic_cond_sub_u32 v[0:1], v2, off offset:-64 +// GFX12: encoding: [0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_cond_sub_u32 v[0:1], v2, off offset:64 +// GFX12: encoding: [0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + global_atomic_sub_clamp_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN // GFX12: encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt index d3c0e71494990..338442c98c3ef 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt @@ -2079,6 +2079,87 @@ # GFX12: ds_rsub_u64 v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0x08,0xd9,0xff,0x02,0x00,0x00] 0xff,0xff,0x08,0xd9,0xff,0x02,0x00,0x00 +# GFX12: ds_cond_sub_rtn_u32 v0, v1, v2 ; encoding: [0x00,0x00,0xa0,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0xa0,0xda,0x01,0x02,0x00,0x00 + +# GFX12: ds_cond_sub_rtn_u32 v0, v1, v2 offset:4660 ; encoding: [0x34,0x12,0xa0,0xda,0x01,0x02,0x00,0x00] +0x34,0x12,0xa0,0xda,0x01,0x02,0x00,0x00 + +# GFX12: ds_cond_sub_rtn_u32 v0, v1, v2 offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0xa0,0xda,0x01,0x02,0x00,0x00 + +# GFX12: ds_cond_sub_rtn_u32 v0, v254, v253 ; encoding: [0x00,0x00,0xa0,0xda,0xfe,0xfd,0x00,0x00] +0x00,0x00,0xa0,0xda,0xfe,0xfd,0x00,0x00 + +# GFX12: ds_cond_sub_rtn_u32 v0, v254, v253 offset:4660 ; encoding: [0x34,0x12,0xa0,0xda,0xfe,0xfd,0x00,0x00] +0x34,0x12,0xa0,0xda,0xfe,0xfd,0x00,0x00 + +# GFX12: ds_cond_sub_rtn_u32 v0, v254, v253 offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0xfe,0xfd,0x00,0x00] +0xff,0xff,0xa0,0xda,0xfe,0xfd,0x00,0x00 + +# GFX12: ds_cond_sub_rtn_u32 v255, v1, v253 ; encoding: [0x00,0x00,0xa0,0xda,0x01,0xfd,0x00,0xff] +0x00,0x00,0xa0,0xda,0x01,0xfd,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v1, v253 offset:4660 ; encoding: [0x34,0x12,0xa0,0xda,0x01,0xfd,0x00,0xff] +0x34,0x12,0xa0,0xda,0x01,0xfd,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v1, v253 offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0xfd,0x00,0xff] +0xff,0xff,0xa0,0xda,0x01,0xfd,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v254, v2 ; encoding: [0x00,0x00,0xa0,0xda,0xfe,0x02,0x00,0xff] +0x00,0x00,0xa0,0xda,0xfe,0x02,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v254, v2 offset:4660 ; encoding: [0x34,0x12,0xa0,0xda,0xfe,0x02,0x00,0xff] +0x34,0x12,0xa0,0xda,0xfe,0x02,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v254, v2 offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0xfe,0x02,0x00,0xff] +0xff,0xff,0xa0,0xda,0xfe,0x02,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v254, v253 ; encoding: [0x00,0x00,0xa0,0xda,0xfe,0xfd,0x00,0xff] +0x00,0x00,0xa0,0xda,0xfe,0xfd,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v254, v253 offset:4660 ; encoding: 
[0x34,0x12,0xa0,0xda,0xfe,0xfd,0x00,0xff]
+0x34,0x12,0xa0,0xda,0xfe,0xfd,0x00,0xff
+
+# GFX12: ds_cond_sub_rtn_u32 v255, v254, v253 offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0xfe,0xfd,0x00,0xff]
+0xff,0xff,0xa0,0xda,0xfe,0xfd,0x00,0xff
+
+# GFX12: ds_cond_sub_u32 v0, v1 ; encoding: [0x00,0x00,0x60,0xda,0x00,0x01,0x00,0x00]
+0x00,0x00,0x60,0xda,0x00,0x01,0x00,0x00
+
+# GFX12: ds_cond_sub_u32 v0, v1 offset:4660 ; encoding: [0x34,0x12,0x60,0xda,0x00,0x01,0x00,0x00]
+0x34,0x12,0x60,0xda,0x00,0x01,0x00,0x00
+
+# GFX12: ds_cond_sub_u32 v0, v1 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x00,0x01,0x00,0x00]
+0xff,0xff,0x60,0xda,0x00,0x01,0x00,0x00
+
+# GFX12: ds_cond_sub_u32 v0, v254 ; encoding: [0x00,0x00,0x60,0xda,0x00,0xfe,0x00,0x00]
+0x00,0x00,0x60,0xda,0x00,0xfe,0x00,0x00
+
+# GFX12: ds_cond_sub_u32 v0, v254 offset:4660 ; encoding: [0x34,0x12,0x60,0xda,0x00,0xfe,0x00,0x00]
+0x34,0x12,0x60,0xda,0x00,0xfe,0x00,0x00
+
+# GFX12: ds_cond_sub_u32 v0, v254 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x00,0xfe,0x00,0x00]
+0xff,0xff,0x60,0xda,0x00,0xfe,0x00,0x00
+
+# GFX12: ds_cond_sub_u32 v255, v1 ; encoding: [0x00,0x00,0x60,0xda,0xff,0x01,0x00,0x00]
+0x00,0x00,0x60,0xda,0xff,0x01,0x00,0x00
+
+# GFX12: ds_cond_sub_u32 v255, v1 offset:4660 ; encoding: [0x34,0x12,0x60,0xda,0xff,0x01,0x00,0x00]
+0x34,0x12,0x60,0xda,0xff,0x01,0x00,0x00
+
+# GFX12: ds_cond_sub_u32 v255, v1 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0xff,0x01,0x00,0x00]
+0xff,0xff,0x60,0xda,0xff,0x01,0x00,0x00
+
+# GFX12: ds_cond_sub_u32 v255, v254 ; encoding: [0x00,0x00,0x60,0xda,0xff,0xfe,0x00,0x00]
+0x00,0x00,0x60,0xda,0xff,0xfe,0x00,0x00
+
+# GFX12: ds_cond_sub_u32 v255, v254 offset:4660 ; encoding: [0x34,0x12,0x60,0xda,0xff,0xfe,0x00,0x00]
+0x34,0x12,0x60,0xda,0xff,0xfe,0x00,0x00
+
+# GFX12: ds_cond_sub_u32 v255, v254 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0xff,0xfe,0x00,0x00]
+0xff,0xff,0x60,0xda,0xff,0xfe,0x00,0x00
+
 # GFX12: ds_sub_clamp_rtn_u32 v0, v1, v2 ; encoding: [0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00]
 0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt
index ff8437155e12e..dce542e93075d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt
@@ -1986,6 +1986,48 @@
 # GFX12: buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:8388607 scope:SCOPE_SE ; encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x84,0x00,0x00,0xff,0xff,0x7f]
 0x03,0xc0,0x0d,0xc4,0x05,0x10,0x84,0x00,0x00,0xff,0xff,0x7f
 
+# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_cond_sub_u32 v255, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x00,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[12:15], s3 offset:8388607 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x00,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[96:99], s3 offset:8388607 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x00,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s101 offset:8388607 ; encoding: [0x65,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x65,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], m0 offset:8388607 ; encoding: [0x7d,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x7d,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_cond_sub_u32 v5, v0, s[8:11], s3 idxen offset:8388607 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
+0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_cond_sub_u32 v5, v0, s[8:11], s3 offen offset:8388607 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
+0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
+0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
+0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:7 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00
+
+# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x00,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x00,0x14,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x00,0x14,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f
+
 # GFX12: buffer_atomic_dec_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
 0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
index f4038cf10f50d..ed48d280c68b1 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
@@ -69,6 +69,12 @@
 # GFX12: flat_atomic_cmpswap_b64 v[1:2], v[3:4], v[5:8] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xec,0x01,0x00,0x90,0x02,0x03,0xff,0x07,0x00]
 0x7c,0x80,0x10,0xec,0x01,0x00,0x90,0x02,0x03,0xff,0x07,0x00
 
+# GFX12: flat_atomic_cond_sub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: flat_atomic_cond_sub_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
 # GFX12: flat_atomic_dec_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
 0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
@@ -513,6 +519,18 @@
 # GFX12: global_atomic_cmpswap_b64 v[1:2], v[3:4], v[5:8], off offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xee,0x01,0x00,0x90,0x02,0x03,0xff,0x07,0x00]
 0x7c,0x80,0x10,0xee,0x01,0x00,0x90,0x02,0x03,0xff,0x07,0x00
 
+# GFX12: global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_cond_sub_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_cond_sub_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_cond_sub_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
 # GFX12: global_atomic_sub_clamp_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
 0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00