diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 7906e0ee9d785..9efdbd751d96e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -953,6 +953,12 @@ def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority", "Export priority must be explicitly manipulated on GFX11.5" >; +def FeatureVmemWriteVgprInOrder : SubtargetFeature<"vmem-write-vgpr-in-order", + "HasVmemWriteVgprInOrder", + "true", + "VMEM instructions of the same type write VGPR results in order" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -1123,7 +1129,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, - FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, + FeatureVmemWriteVgprInOrder ] >; @@ -1136,7 +1143,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, - FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts + FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, + FeatureVmemWriteVgprInOrder ] >; @@ -1152,7 +1160,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, - FeatureDefaultComponentZero + FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder ] >; @@ -1170,7 +1178,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero + FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero, + FeatureVmemWriteVgprInOrder ] >; @@ -1193,7 +1202,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, - FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts + FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, + FeatureVmemWriteVgprInOrder ] >; @@ -1215,7 +1225,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, - FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, + FeatureVmemWriteVgprInOrder ] >; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 902f51ae358d5..9386bcf0d74b2 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -239,6 +239,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasVALUTransUseHazard = false; bool HasForceStoreSC0SC1 = false; bool HasRequiredExportPriority = false; + bool HasVmemWriteVgprInOrder = false; bool RequiresCOV6 = false; @@ -1285,6 +1286,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasRequiredExportPriority() const { return HasRequiredExportPriority; } + bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; } + /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively. bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 1315aa0855788..13130db884dc1 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1778,11 +1778,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (IsVGPR) { // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the // previous write and this write are the same type of VMEM - // instruction, in which case they're guaranteed to write their - // results in order anyway. + // instruction, in which case they are (in some architectures) + // guaranteed to write their results in order anyway. if (Op.isUse() || !updateVMCntOnly(MI) || ScoreBrackets.hasOtherPendingVmemTypes(RegNo, - getVmemType(MI))) { + getVmemType(MI)) || + !ST->hasVmemWriteVgprInOrder()) { ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait); ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait); @@ -2389,7 +2390,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, } if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) return true; - return HasVMemLoad && UsesVgprLoadedOutside; + return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder(); } bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 23e8f98a7861b..dc3fc6529e24f 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -1398,6 +1398,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall(ptr a ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -2662,6 +2663,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 @@ -4141,6 +4143,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index ec0408236975d..8139f2d2eef3c 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -1255,6 +1255,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -2449,6 +2450,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 @@ -3949,6 +3951,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 @@ -5319,6 +5322,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -6812,6 +6816,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index cd01cc7309fcd..d029aa6769bab 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -1255,6 +1255,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -2449,6 +2450,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 @@ -3949,6 +3951,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 @@ -5319,6 +5322,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -6812,6 +6816,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll index 4c1ae4c228adb..0522d5258b9b5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -128,6 +128,7 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 355c296d122ff..22b718935738b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -745,7 +745,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_clause 0xf +; GFX12-NEXT: s_clause 0x7 ; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28 ; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24 ; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:20 @@ -754,13 +754,21 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:8 ; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:4 ; GFX12-NEXT: global_load_u16 v4, v8, s[0:1] +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2 ; GFX12-NEXT: s_wait_loadcnt 0x4 ; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 142bc37fdeb75..4cc47b09d813d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3563,15 +3563,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 @@ -4371,8 +4375,10 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 @@ -7341,8 +7347,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39 @@ -7364,8 +7372,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index c0649322c8195..7cdf270810dea 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -3091,8 +3091,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192 ; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload +; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) ; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) ; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll index b045dd559aac2..34bcc3f02ac66 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -15,6 +15,7 @@ ; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 ; GCN: s_xor_saveexec_b64 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir index 2417becb7c216..0ddd2aa285b26 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir @@ -1,5 +1,6 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s --- @@ -20,6 +21,13 @@ # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop body: | bb.0: @@ -58,6 +66,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop_noterm +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop_noterm body: | bb.0: @@ -129,6 +144,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop_load +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop_load body: | bb.0: @@ -170,6 +192,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop_no_store +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop_no_store body: | bb.0: @@ -212,6 +241,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10-NOT: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop_no_use +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop_no_use body: | bb.0: @@ -255,6 +291,14 @@ body: | # GFX10-LABEL: bb.1: # GFX10-NOT: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop2 +# GFX12-LABEL: bb.0: +# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop2 body: | bb.0: @@ -294,6 +338,14 @@ body: | # GFX10-LABEL: bb.1: # GFX10-NOT: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop2_store +# GFX12-LABEL: bb.0: +# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop2_store body: | bb.0: @@ -334,6 +386,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop2_use_in_loop +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop2_use_in_loop body: | bb.0: @@ -379,6 +438,15 @@ body: | # GFX10-LABEL: bb.2: # GFX10-NOT: S_WAITCNT 16 # GFX10-LABEL: bb.3: + +# GFX12-LABEL: waitcnt_vm_loop2_nowait +# GFX12-LABEL: bb.0: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.3: name: waitcnt_vm_loop2_nowait body: | bb.0: @@ -427,6 +495,14 @@ body: | # GFX10-LABEL: bb.1: # GFX10-NOT: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop2_reginterval +# GFX12-LABEL: bb.0: +# GFX12: GLOBAL_LOAD_DWORDX4 +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop2_reginterval body: | bb.0: @@ -467,6 +543,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop2_reginterval2 +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop2_reginterval2 body: | bb.0: @@ -513,6 +596,15 @@ body: | # GFX10-NOT: S_WAITCNT 16240 # GFX10-LABEL: bb.2: +# GFX12-LABEL: waitcnt_vm_zero +# GFX12-LABEL: bb.0: +# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN +# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: + name: waitcnt_vm_zero body: | bb.0: @@ -548,6 +640,14 @@ body: | # GFX10-LABEL: bb.1: # GFX10-NOT: S_WAITCNT +# GFX12-LABEL: waitcnt_vm_necessary +# GFX12-LABEL: bb.0: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12: $vgpr4 +# GFX12-NOT: S_WAITCNT +# GFX12-LABEL: bb.1: +# GFX12-NOT: S_WAITCNT + # GFX9-LABEL: waitcnt_vm_necessary # GFX9-LABEL: bb.0: # GFX9: S_WAITCNT 3952 @@ -590,6 +690,13 @@ body: | # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: +# GFX12-LABEL: waitcnt_vm_loop_global_mem +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: + name: waitcnt_vm_loop_global_mem body: | bb.0: @@ -631,6 +738,13 @@ body: | # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: +# GFX12-LABEL: waitcnt_vm_loop_scratch_mem +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: + name: waitcnt_vm_loop_scratch_mem body: | bb.0: @@ -671,6 +785,14 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 11 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop_flat_mem +# GFX12-LABEL: bb.0: +# GFX12: FLAT_LOAD_DWORD +# GFX12-NOT: S_WAIT_LOADCNT_DSCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT_DSCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop_flat_mem body: | bb.0: @@ -713,6 +835,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop_flat_load +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop_flat_load body: | bb.0: