diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 371e583c22a83..e05f7fc3e7662 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -4114,6 +4114,9 @@ Code object V5 metadata is the same as buffer that conforms to the requirements of the malloc/free device library V1 version implementation. + "hidden_dynamic_lds_size" + The size of the dynamically allocated LDS memory, passed in the kernarg. + "hidden_private_base" The high 32 bits of the flat addressing private aperture base. Only used by GFX8 to allow conversion between private segment diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp index dda3380c04ea9..33eed07c46292 100644 --- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp +++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -134,6 +134,7 @@ bool MetadataVerifier::verifyKernelArgs(msgpack::DocNode &Node) { .Case("hidden_default_queue", true) .Case("hidden_completion_action", true) .Case("hidden_multigrid_sync_arg", true) + .Case("hidden_dynamic_lds_size", true) .Case("hidden_private_base", true) .Case("hidden_shared_base", true) .Case("hidden_queue_ptr", true) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b51a876750b58..74e9cd7d09654 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -646,7 +646,15 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs( Offset += 8; // Skipped. } - Offset += 72; // Reserved. + // Emit the hidden argument for the dynamic LDS size. + if (MFI.isDynamicLDSUsed()) { + emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset, + Args); + } else { + Offset += 4; // Skipped. + } + + Offset += 68; // Reserved. // hidden_private_base and hidden_shared_base are only when the subtarget has // ApertureRegs. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 323462e60a29f..31777295b4f8f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -19,6 +19,26 @@ using namespace llvm; +static const GlobalVariable * +getKernelDynLDSGlobalFromFunction(const Function &F) { + const Module *M = F.getParent(); + SmallString<64> KernelDynLDSName("llvm.amdgcn."); + KernelDynLDSName += F.getName(); + KernelDynLDSName += ".dynlds"; + return M->getNamedGlobal(KernelDynLDSName); +} + +static bool hasLDSKernelArgument(const Function &F) { + for (const Argument &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + if (auto PtrTy = dyn_cast<PointerType>(ArgTy)) { + if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) + return true; + } + } + return false; +} + AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST) : IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())), @@ -65,6 +85,10 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F, Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math"); NoSignedZerosFPMath = NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true"; + + const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F); + if (DynLdsGlobal || hasLDSKernelArgument(F)) + UsesDynamicLDS = true; } unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, @@ -139,15 +163,6 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, return Offset; } -static const GlobalVariable * -getKernelDynLDSGlobalFromFunction(const Function &F) { - const Module *M = F.getParent(); - std::string KernelDynLDSName = "llvm.amdgcn."; - KernelDynLDSName += F.getName(); - KernelDynLDSName += ".dynlds"; - return M->getNamedGlobal(KernelDynLDSName); -} - std::optional<uint32_t> AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) { // TODO: Would be more 
consistent with the abs symbols to use a range @@ -210,3 +225,9 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F, } } } + +void AMDGPUMachineFunction::setUsesDynamicLDS(bool DynLDS) { + UsesDynamicLDS = DynLDS; +} + +bool AMDGPUMachineFunction::isDynamicLDSUsed() const { return UsesDynamicLDS; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 248ee26a47eb1..7efb7f825348e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -46,6 +46,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { /// stages. Align DynLDSAlign; + // Flag to check dynamic LDS usage by kernel. + bool UsesDynamicLDS = false; + // Kernels + shaders. i.e. functions called by the hardware and not called // by other functions. bool IsEntryFunction = false; @@ -119,6 +122,10 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { Align getDynLDSAlign() const { return DynLDSAlign; } void setDynLDSAlign(const Function &F, const GlobalVariable &GV); + + void setUsesDynamicLDS(bool DynLDS); + + bool isDynamicLDSUsed() const; }; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0e857e6ac71b6..b481ae43e8215 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6890,6 +6890,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, // Adjust alignment for that dynamic shared memory array. 
Function &F = DAG.getMachineFunction().getFunction(); MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV)); + MFI->setUsesDynamicLDS(true); return SDValue( DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll new file mode 100644 index 0000000000000..cb15ff9fcb1bc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll @@ -0,0 +1,124 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s + + +; CHECK: amdhsa.kernels: +; CHECK-NEXT: - .args: +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: r +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: b +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 28 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 32 +; CHECK-NEXT: .size: 4 +; 
CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK-NEXT: - .offset: 36 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_x +; CHECK-NEXT: - .offset: 38 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_y +; CHECK-NEXT: - .offset: 40 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_z +; CHECK-NEXT: - .offset: 42 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_x +; CHECK-NEXT: - .offset: 44 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_y +; CHECK-NEXT: - .offset: 46 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_z +; CHECK-NEXT: - .offset: 64 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .offset: 88 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_grid_dims +; CHECK-NEXT: - .offset: 96 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: - .offset: 104 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: - .offset: 112 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: - .offset: 120 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_heap_v1 +; CHECK-NEXT: - .offset: 128 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_default_queue +; CHECK-NEXT: - .offset: 136 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_completion_action +; CHECK: - .offset: 144 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size +; CHECK: - .offset: 224 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_queue_ptr + +; CHECK: .name: test_v5 +; CHECK: .symbol: test_v5.kd + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 2 
+@lds = external hidden addrspace(3) global [0 x i32], align 4 + +define void @funcs_dyn_lds() { + store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4 + ret void +} + +define amdgpu_kernel void @test_v5( + ptr addrspace(1) %r, + ptr addrspace(1) %a, + ptr addrspace(1) %b) #0 { +entry: + %a.val = load half, ptr addrspace(1) %a + %b.val = load half, ptr addrspace(1) %b + %r.val = fadd half %a.val, %b.val + store half %r.val, ptr addrspace(1) %r + call void @funcs_dyn_lds() + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} +!llvm.printf.fmts = !{!1, !2} +!1 = !{!"1:1:4:%d\5Cn"} +!2 = !{!"2:1:8:%g\5Cn"} + +attributes #0 = { optnone noinline } + diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll new file mode 100644 index 0000000000000..16bfe5f019683 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll @@ -0,0 +1,124 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s + + +; CHECK: amdhsa.kernels: +; CHECK-NEXT: - .args: +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: r +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: 
.name: a +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: b +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 28 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 32 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK-NEXT: - .offset: 36 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_x +; CHECK-NEXT: - .offset: 38 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_y +; CHECK-NEXT: - .offset: 40 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_z +; CHECK-NEXT: - .offset: 42 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_x +; CHECK-NEXT: - .offset: 44 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_y +; CHECK-NEXT: - .offset: 46 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_z +; CHECK-NEXT: - .offset: 64 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .offset: 88 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_grid_dims +; CHECK-NEXT: - .offset: 96 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: - .offset: 104 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: - .offset: 112 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: - .offset: 120 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_heap_v1 +; 
CHECK-NEXT: - .offset: 128 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_default_queue +; CHECK-NEXT: - .offset: 136 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_completion_action +; CHECK: - .offset: 144 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size +; CHECK: - .offset: 224 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_queue_ptr + +; CHECK: .name: test_v5 +; CHECK: .symbol: test_v5.kd + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 2 +@lds = external hidden addrspace(3) global [0 x i32], align 4 + +define void @funcs_dyn_lds(ptr addrspace(3) %lds_ptr) { + store i32 1234, ptr addrspace(3) %lds_ptr, align 4 + ret void +} + +define amdgpu_kernel void @test_v5( + ptr addrspace(1) %r, + ptr addrspace(1) %a, + ptr addrspace(1) %b) #0 { +entry: + %a.val = load half, ptr addrspace(1) %a + %b.val = load half, ptr addrspace(1) %b + %r.val = fadd half %a.val, %b.val + store half %r.val, ptr addrspace(1) %r + call void @funcs_dyn_lds(ptr addrspace(3) @lds) + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} +!llvm.printf.fmts = !{!1, !2} +!1 = !{!"1:1:4:%d\5Cn"} +!2 = !{!"2:1:8:%g\5Cn"} + +attributes #0 = { optnone noinline } + diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll new file mode 100644 index 0000000000000..d457c61b8d408 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-kernarg-hidden-args-v5.ll @@ -0,0 +1,125 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK 
%s + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s + + +; CHECK: amdhsa.kernels: +; CHECK-NEXT: - .args: +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: r +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .name: b +; CHECK-NEXT: .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: global_buffer +; CHECK-NEXT: - .address_space: local +; CHECK-NEXT: .name: lds_ptr +; CHECK-NEXT: .offset: 24 +; CHECK-NEXT: .pointee_align: 1 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: dynamic_shared_pointer +; CHECK-NEXT: - .offset: 32 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_x +; CHECK-NEXT: - .offset: 36 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_y +; CHECK-NEXT: - .offset: 40 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_block_count_z +; CHECK-NEXT: - .offset: 44 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_x +; CHECK-NEXT: - .offset: 46 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_y +; CHECK-NEXT: - .offset: 48 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_group_size_z +; CHECK-NEXT: - .offset: 50 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_x +; CHECK-NEXT: - .offset: 52 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_y +; CHECK-NEXT: - .offset: 54 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_remainder_z +; CHECK-NEXT: - .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: 
.value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 88 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .offset: 96 +; CHECK-NEXT: .size: 2 +; CHECK-NEXT: .value_kind: hidden_grid_dims +; CHECK-NEXT: - .offset: 104 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: - .offset: 112 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: - .offset: 120 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: - .offset: 128 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_heap_v1 +; CHECK-NEXT: - .offset: 136 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_default_queue +; CHECK-NEXT: - .offset: 144 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_completion_action +; CHECK: - .offset: 152 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size +; CHECK: - .offset: 232 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_queue_ptr + +; CHECK: .name: test_v5 +; CHECK: .symbol: test_v5.kd + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 2 + +define amdgpu_kernel void @test_v5( + ptr addrspace(1) %r, + ptr addrspace(1) %a, + ptr addrspace(1) %b, + ptr addrspace(3) %lds_ptr) #0 { +entry: + %a.val = load half, ptr addrspace(1) %a + %b.val = load half, ptr addrspace(1) %b + %r.val = fadd half %a.val, %b.val + store half %r.val, ptr addrspace(1) %r + store i32 1234, ptr addrspace(3) %lds_ptr, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} +!llvm.printf.fmts = !{!1, !2} +!1 = !{!"1:1:4:%d\5Cn"} +!2 = !{!"2:1:8:%g\5Cn"} + +attributes #0 = { optnone noinline } + diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll 
index cb30d668674c3..1a2ce636c733c 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll @@ -81,13 +81,16 @@ ; CHECK-NEXT: - .offset: 136 ; CHECK-NEXT: .size: 8 ; CHECK-NEXT: .value_kind: hidden_completion_action +; CHECK: - .offset: 144 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size ; GFX8-NEXT: - .offset: 216 ; GFX8-NEXT: .size: 4 ; GFX8-NEXT: .value_kind: hidden_private_base ; GFX8-NEXT: - .offset: 220 ; GFX8-NEXT: .size: 4 ; GFX8-NEXT: .value_kind: hidden_shared_base -; CHECK: - .offset: 224 +; CHECK: - .offset: 224 ; CHECK-NEXT: .size: 8 ; CHECK-NEXT: .value_kind: hidden_queue_ptr @@ -97,6 +100,7 @@ ; CHECK: amdhsa.version: ; CHECK-NEXT: - 1 ; CHECK-NEXT: - 2 +@lds = external hidden addrspace(3) global [0 x i32], align 4 define amdgpu_kernel void @test_v5( ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -106,6 +110,7 @@ entry: %b.val = load half, ptr addrspace(1) %b %r.val = fadd half %a.val, %b.val store half %r.val, ptr addrspace(1) %r + store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4 ret void }