-
Notifications
You must be signed in to change notification settings - Fork 13.6k
release/19.x: [AMDGPU] Remove one case of vmcnt loop header flushing for GFX12 (#105550) #105808
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@arsenm @Pierre-vh @arsenm What do you think about merging this PR to the release branch? |
@llvm/pr-subscribers-backend-amdgpu Author: None (llvmbot) ChangesBackport 6119461 5506831 fa2dccb Requested by: @jayfoad Patch is 28.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/105808.diff 12 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 7906e0ee9d7858..9efdbd751d96e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -953,6 +953,12 @@ def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority",
"Export priority must be explicitly manipulated on GFX11.5"
>;
+def FeatureVmemWriteVgprInOrder : SubtargetFeature<"vmem-write-vgpr-in-order",
+ "HasVmemWriteVgprInOrder",
+ "true",
+ "VMEM instructions of the same type write VGPR results in order"
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -1123,7 +1129,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
- FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureVmemWriteVgprInOrder
]
>;
@@ -1136,7 +1143,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureVmemWriteVgprInOrder
]
>;
@@ -1152,7 +1160,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
- FeatureDefaultComponentZero
+ FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder
]
>;
@@ -1170,7 +1178,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
- FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero
+ FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero,
+ FeatureVmemWriteVgprInOrder
]
>;
@@ -1193,7 +1202,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxHardClauseLength63,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureVmemWriteVgprInOrder
]
>;
@@ -1215,7 +1225,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxHardClauseLength32,
- FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
+ FeatureVmemWriteVgprInOrder
]
>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index def89c785b8552..459686aa3141a9 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -239,6 +239,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasVALUTransUseHazard = false;
bool HasForceStoreSC0SC1 = false;
bool HasRequiredExportPriority = false;
+ bool HasVmemWriteVgprInOrder = false;
bool RequiresCOV6 = false;
@@ -1285,6 +1286,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
+ bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
+
/// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1315aa08557888..13130db884dc1c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1778,11 +1778,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (IsVGPR) {
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
// previous write and this write are the same type of VMEM
- // instruction, in which case they're guaranteed to write their
- // results in order anyway.
+ // instruction, in which case they are (in some architectures)
+ // guaranteed to write their results in order anyway.
if (Op.isUse() || !updateVMCntOnly(MI) ||
ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
- getVmemType(MI))) {
+ getVmemType(MI)) ||
+ !ST->hasVmemWriteVgprInOrder()) {
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
@@ -2389,7 +2390,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
}
if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
return true;
- return HasVMemLoad && UsesVgprLoadedOutside;
+ return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
}
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 23e8f98a7861bc..dc3fc6529e24f4 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -1398,6 +1398,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -2662,6 +2663,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
@@ -4141,6 +4143,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index ec0408236975d1..8139f2d2eef3c3 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -1255,6 +1255,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -2449,6 +2450,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
@@ -3949,6 +3951,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
@@ -5319,6 +5322,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -6812,6 +6816,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index cd01cc7309fcd2..d029aa6769babe 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -1255,6 +1255,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -2449,6 +2450,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
@@ -3949,6 +3951,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
@@ -5319,6 +5322,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -6812,6 +6816,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
index 4c1ae4c228adb3..0522d5258b9b5f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
@@ -128,6 +128,7 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 355c296d122ff2..22b718935738bd 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -745,7 +745,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_clause 0xf
+; GFX12-NEXT: s_clause 0x7
; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28
; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24
; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:20
@@ -754,13 +754,21 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:8
; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:4
; GFX12-NEXT: global_load_u16 v4, v8, s[0:1]
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
; GFX12-NEXT: s_wait_loadcnt 0x4
; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 142bc37fdeb755..4cc47b09d813d6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -3563,15 +3563,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
@@ -4371,8 +4375,10 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
@@ -7341,8 +7347,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39
@@ -7364,8 +7372,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index c0649322c81953..7cdf270810dea0 100644
--...
[truncated]
|
I'm not sure if I should have done three different backport requests for the three commits. It could be confusing if they get squash-and-merged onto the release branch. |
It's fine - I can merge it as three separate commits. |
…lvm#105548) (cherry picked from commit 6119461)
…m#105550) When a loop contains a VMEM load whose result is only used outside the loop, do not bother to flush vmcnt in the loop head on GFX12. A wait for vmcnt will be required inside the loop anyway, because VMEM instructions can write their VGPR results out of order. (cherry picked from commit fa2dccb)
@jayfoad (or anyone else). If you would like to add a note about this fix in the release notes (completely optional). Please reply to this comment with a one or two sentence description of the fix. When you are done, please add the release:note label to this PR. |
Backport 6119461 5506831 fa2dccb
Requested by: @jayfoad