From 839080f126c645cdf91eaa624359bd04e16cc569 Mon Sep 17 00:00:00 2001 From: Robert Imschweiler Date: Tue, 4 Feb 2025 03:48:19 -0600 Subject: [PATCH 1/4] GlobalISel: Fix defined register of invariant.start In contrast to SelectionDAG, GlobalISel created a new virtual register for the return value of invariant.start, leaving subsequent users of the invariant.start value with an undefined reference. A minimal example: ``` %tmp = alloca i32, align 4, addrspace(5) %tmpI = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %tmp) #3 call void @llvm.invariant.end.p5(ptr %tmpI, i64 4, ptr addrspace(5) %tmp) #3 store i32 %i, ptr %tmpI, align 4 ``` Although the return value of invariant.start might not be intended for any use beyond invariant.end, an implicit definition of the corresponding virtual register avoids a segfault in the target instruction selector later. --- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 4 +- .../AArch64/GlobalISel/arm64-irtranslator.ll | 2 +- .../promote-dependency-on-invariant-result.ll | 37 +++++++++++++++++++ 3 files changed, 39 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 3e43299bb8110..362d856e76a8a 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2441,9 +2441,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return true; } case Intrinsic::invariant_start: { - LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL); - Register Undef = MRI->createGenericVirtualRegister(PtrTy); - MIRBuilder.buildUndef(Undef); + MIRBuilder.buildUndef(getOrCreateVReg(CI)); return true; } case Intrinsic::invariant_end: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index 7a67cf3fd4c94..15ee5e48a88e6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -2262,7 +2262,7 @@ declare ptr @llvm.invariant.start.p0(i64, ptr nocapture) readonly nounwind declare void @llvm.invariant.end.p0(ptr, i64, ptr nocapture) nounwind define void @test_invariant_intrin() { ; CHECK-LABEL: name: test_invariant_intrin -; CHECK: %{{[0-9]+}}:_(s64) = G_IMPLICIT_DEF +; CHECK: %{{[0-9]+}}:_(p0) = G_IMPLICIT_DEF ; CHECK-NEXT: RET_ReallyLR %x = alloca %t %inv = call ptr @llvm.invariant.start.p0(i64 8, ptr %x) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll new file mode 100644 index 0000000000000..c922ced4151e1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -global-isel=true --stop-after=instruction-select --verify-machineinstrs -o - %s | FileCheck %s +target datalayout = "A5" + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare ptr @llvm.invariant.start.p5(i64 immarg, ptr addrspace(5) nocapture) #0 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.invariant.end.p5(ptr, i64 immarg, ptr addrspace(5) nocapture) #0 + +; Function Attrs: nounwind +define amdgpu_kernel void @use_invariant_promotable_lds(ptr addrspace(1) %arg) #2 { + ; CHECK-LABEL: name: use_invariant_promotable_lds + ; CHECK: bb.1.bb: + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: early-clobber %13:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (p1) from %ir.arg.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %13, 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.arg.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[DEF]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY2]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.tmpI) + ; CHECK-NEXT: S_ENDPGM 0 +bb: + %i = load i32, ptr addrspace(1) %arg, align 4 + %tmp = alloca i32, align 4, addrspace(5) + %tmpI = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %tmp) #3 + call void @llvm.invariant.end.p5(ptr %tmpI, i64 4, ptr addrspace(5) %tmp) #3 + store i32 %i, ptr %tmpI, align 4 + ret void +} + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(inaccessiblemem: readwrite) } +attributes #2 = { nounwind } +attributes #3 = { nounwind memory(argmem: readwrite) } From d8bdb595bcf88366ab2fbcf7200fcc14d546eba1 Mon Sep 17 00:00:00 2001 From: Robert Imschweiler Date: Tue, 4 Feb 2025 04:46:06 -0600 Subject: [PATCH 2/4] simplify promote-dependency-on-invariant-result.ll --- .../promote-dependency-on-invariant-result.ll | 30 ++++++------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll index c922ced4151e1..0822384e15fb0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll @@ -1,37 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -global-isel=true --stop-after=instruction-select --verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -global-isel=true --stop-after=instruction-select -o - %s | FileCheck %s target datalayout = "A5" -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare ptr @llvm.invariant.start.p5(i64 immarg, ptr addrspace(5) nocapture) #0 +declare ptr @llvm.invariant.start.p5(i64 immarg, ptr addrspace(5) nocapture) +declare void @llvm.invariant.end.p5(ptr, i64 immarg, ptr addrspace(5) nocapture) -; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare void @llvm.invariant.end.p5(ptr, i64 immarg, ptr addrspace(5) nocapture) #0 - -; Function Attrs: nounwind -define amdgpu_kernel void @use_invariant_promotable_lds(ptr addrspace(1) %arg) #2 { +define amdgpu_kernel void @use_invariant_promotable_lds(ptr addrspace(5) %arg, i32 %i) { ; CHECK-LABEL: name: use_invariant_promotable_lds ; CHECK: bb.1.bb: ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: early-clobber %13:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (p1) from %ir.arg.kernarg.offset, align 4, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %13, 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.arg.load, addrspace 1) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 40, 0 :: (dereferenceable invariant load (s32) from %ir.i.kernarg.offset, align 8, addrspace 4) ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[DEF]] - ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY2]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.tmpI) + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY2]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.tmp) ; CHECK-NEXT: S_ENDPGM 0 bb: - %i = load i32, ptr addrspace(1) %arg, align 4 - %tmp = alloca i32, align 4, addrspace(5) - %tmpI = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %tmp) #3 - call void @llvm.invariant.end.p5(ptr %tmpI, i64 4, ptr addrspace(5) %tmp) #3 - store i32 %i, ptr %tmpI, align 4 + %tmp = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %arg) + call void @llvm.invariant.end.p5(ptr %tmp, i64 4, ptr addrspace(5) %arg) + store i32 %i, ptr %tmp, align 4 ret void } - -attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(inaccessiblemem: readwrite) } -attributes #2 = { nounwind } -attributes #3 = { nounwind memory(argmem: readwrite) } From eab32fbd22207d2757af461731cd5bbc0669f9c8 Mon Sep 17 00:00:00 2001 From: Robert Imschweiler Date: Tue, 4 Feb 2025 06:19:54 -0600 Subject: [PATCH 3/4] simplify promote-dependency-on-invariant-result.ll further --- .../promote-dependency-on-invariant-result.ll | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll index 0822384e15fb0..b4648b20d793d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -global-isel=true --stop-after=instruction-select -o - %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -global-isel=true --stop-after=irtranslator -o - %s | FileCheck %s target datalayout = "A5" declare ptr @llvm.invariant.start.p5(i64 immarg, ptr addrspace(5) nocapture) @@ -10,16 +10,15 @@ define amdgpu_kernel void @use_invariant_promotable_lds(ptr addrspace(5) %arg, i ; CHECK: bb.1.bb: ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 40, 0 :: (dereferenceable invariant load (s32) from %ir.i.kernarg.offset, align 8, addrspace 4) - ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[DEF]] - ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY2]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.tmp) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; CHECK-NEXT: %13:_(p4) = nuw nusw G_PTR_ADD [[INT]], [[C]](s64) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD %13(p4) :: (dereferenceable invariant load (p5) from %ir.arg.kernarg.offset, addrspace 4) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF ; CHECK-NEXT: S_ENDPGM 0 bb: %tmp = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %arg) call void @llvm.invariant.end.p5(ptr %tmp, i64 4, ptr addrspace(5) %arg) - store i32 %i, ptr %tmp, align 4 ret void } From 885344e3549f278ee0dd65ec3775fdd6e8fd4aba Mon Sep 17 00:00:00 2001 From: Robert Imschweiler Date: Tue, 4 Feb 2025 07:24:30 -0600 Subject: [PATCH 4/4] add store again to promote-dependency-on-invariant-result.ll --- .../promote-dependency-on-invariant-result.ll | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll index b4648b20d793d..090aa067a5260 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll @@ -1,24 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -global-isel=true --stop-after=irtranslator -o - %s | FileCheck %s -target datalayout = "A5" declare ptr @llvm.invariant.start.p5(i64 immarg, ptr addrspace(5) nocapture) declare void @llvm.invariant.end.p5(ptr, i64 immarg, ptr addrspace(5) nocapture) -define amdgpu_kernel void @use_invariant_promotable_lds(ptr addrspace(5) %arg, i32 %i) { +define void @use_invariant_promotable_lds(ptr addrspace(5) %arg, i32 %i) { ; CHECK-LABEL: name: use_invariant_promotable_lds ; CHECK: bb.1.bb: - ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 - ; CHECK-NEXT: %13:_(p4) = nuw nusw G_PTR_ADD [[INT]], [[C]](s64) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD %13(p4) :: (dereferenceable invariant load (p5) from %ir.arg.kernarg.offset, addrspace 4) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF - ; CHECK-NEXT: S_ENDPGM 0 + ; CHECK-NEXT: G_STORE [[C]](s32), [[DEF]](p0) :: (store (s32) into %ir.tmp) + ; CHECK-NEXT: SI_RETURN bb: %tmp = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %arg) call void @llvm.invariant.end.p5(ptr %tmp, i64 4, ptr addrspace(5) %arg) + store i32 0, ptr %tmp, align 4 ret void }