-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[Offload] Add olGetKernelMaxGroupSize #142950
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-offload @llvm/pr-subscribers-backend-amdgpu Author: Ross Brunton (RossBrunton) — Changes: This is equivalent to cuOccupancyMaxPotentialBlockSize. Co-Authored-By: Callum Fare <[email protected]>. Full diff: https://github.com/llvm/llvm-project/pull/142950.diff — 10 Files Affected:
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 247f9c1bf5b6a..71869cf4a68bb 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -24,6 +24,19 @@ def : Function {
let returns = [];
}
+def : Function {
+ let name = "olKernelMaxGroupSize";
+ let desc = "Get the maximum block size needed to achieve maximum occupancy.";
+ let details = [];
+ let params = [
+ Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+ Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
+ Param<"size_t", "SharedMemory", "dynamic shared memory required", PARAM_IN>,
+ Param<"size_t*", "GroupSize", "maximum block size", PARAM_OUT>
+ ];
+ let returns = [];
+}
+
def : Struct {
let name = "ol_kernel_launch_size_args_t";
let desc = "Size-related arguments for a kernel launch.";
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 7b67cbba43e68..a3f8d4ba52d1e 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -468,6 +468,10 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
return olDestroy(Program);
}
+inline GenericKernelTy *getOmpKernel(ol_kernel_handle_t OlKernel) {
+ return reinterpret_cast<GenericKernelTy *>(OlKernel);
+}
+
Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
ol_kernel_handle_t *Kernel) {
@@ -484,6 +488,20 @@ Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
return Error::success();
}
+Error olKernelMaxGroupSize_impl(ol_kernel_handle_t Kernel,
+ ol_device_handle_t Device,
+ size_t DynamicMemSize, size_t *GroupSize) {
+ auto *KernelImpl = getOmpKernel(Kernel);
+
+ auto Res = KernelImpl->maxGroupSize(*Device->Device, DynamicMemSize);
+ if (auto Err = Res.takeError()) {
+ return Err;
+ }
+ *GroupSize = *Res;
+
+ return Error::success();
+}
+
Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
ol_kernel_handle_t Kernel, const void *ArgumentsData,
size_t ArgumentsSize,
@@ -514,7 +532,7 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
// Don't do anything with pointer indirection; use arg data as-is
LaunchArgs.Flags.IsCUDA = true;
- auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
+ auto *KernelImpl = getOmpKernel(Kernel);
auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
LaunchArgs, AsyncInfoWrapper);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e4c32713e2c15..bed9764bddf55 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -570,6 +570,14 @@ struct AMDGPUKernelTy : public GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
+ /// Return maximum block size for maximum occupancy
+ ///
+ /// TODO: This needs to be implemented for amdgpu
+ Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+ size_t DynamicMemSize) const override {
+ return 1;
+ }
+
/// Print more elaborate kernel launch info for AMDGPU
Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index d2437908a0a6f..5d4e9fa212f52 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -276,6 +276,9 @@ struct GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
+ virtual Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+ size_t DynamicMemSize) const = 0;
+
/// Get the kernel name.
const char *getName() const { return Name; }
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index e5332686fcffb..e6699ee78596d 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -71,6 +71,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
+DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
DLWRAP(cuDeviceCanAccessPeer, 3)
DLWRAP(cuCtxEnablePeerAccess, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 1c5b421768894..2c856c68a9368 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -289,6 +289,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuCtxGetDevice(CUdevice *);
CUresult cuDeviceGet(CUdevice *, int);
@@ -370,5 +371,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
CUresult cuMemGetAllocationGranularity(size_t *granularity,
const CUmemAllocationProp *prop,
CUmemAllocationGranularity_flags option);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+ CUoccupancyB2DSize, size_t, int);
#endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 44ccfc47a21c9..45d9647da9e53 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
+ /// Return maximum block size for maximum occupancy
+ Expected<size_t> maxGroupSize(GenericDeviceTy &,
+ size_t DynamicMemSize) const override {
+ int minGridSize;
+ int maxBlockSize;
+ auto Res = cuOccupancyMaxPotentialBlockSize(
+ &minGridSize, &maxBlockSize, Func, NULL, DynamicMemSize, INT_MAX);
+ if (auto Err = Plugin::check(
+ Res, "error in cuOccupancyMaxPotentialBlockSize: %s")) {
+ return Err;
+ }
+ return maxBlockSize;
+ }
+
private:
/// The CUDA kernel function to execute.
CUfunction Func;
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 9916f4d0ab250..a96aa346d33e5 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -114,6 +114,13 @@ struct GenELF64KernelTy : public GenericKernelTy {
return Plugin::success();
}
+ /// Return maximum block size for maximum occupancy
+ Expected<size_t> maxGroupSize(GenericDeviceTy &Device,
+ size_t DynamicMemSize) const override {
+ // TODO
+ return 1;
+ }
+
private:
/// The kernel function to execute.
void (*Func)(void);
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 2844b675e5de1..ac302d502c30c 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -14,6 +14,7 @@ add_offload_unittest("event"
add_offload_unittest("kernel"
kernel/olGetKernel.cpp
+ kernel/olKernelMaxGroupSize.cpp
kernel/olLaunchKernel.cpp)
add_offload_unittest("memory"
diff --git a/offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp b/offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp
new file mode 100644
index 0000000000000..e83775ae0d896
--- /dev/null
+++ b/offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp
@@ -0,0 +1,37 @@
+//===------- Offload API tests - olKernelMaxGroupSize ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olKernelMaxGroupSizeTest = OffloadKernelTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olKernelMaxGroupSizeTest);
+
+TEST_P(olKernelMaxGroupSizeTest, Success) {
+ size_t Size{0};
+ ASSERT_SUCCESS(olKernelMaxGroupSize(Kernel, Device, 0, &Size));
+ ASSERT_GT(Size, 0);
+}
+
+TEST_P(olKernelMaxGroupSizeTest, NullKernel) {
+ size_t Size;
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+ olKernelMaxGroupSize(nullptr, Device, 0, &Size));
+}
+
+TEST_P(olKernelMaxGroupSizeTest, NullDevice) {
+ size_t Size;
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+ olKernelMaxGroupSize(Kernel, nullptr, 0, &Size));
+}
+
+TEST_P(olKernelMaxGroupSizeTest, NullOutput) {
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+ olKernelMaxGroupSize(Kernel, Device, 0, nullptr));
+}
|
let details = []; | ||
let params = [ | ||
Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>, | ||
Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function takes in a device handle because, going by the HIP implementation, it looks like on AMDGPU it requires certain device properties.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The output for this should be a struct.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any particular reason why? There's only one output value.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, but if we add more stuff we don't want to break the ABI in the future. However, this should probably be merged with some big 'getKernelInfo' type thing if we don't already have it. No reason for a standalone function here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was originally planning on using getKernelInfo
(similar to olGetDeviceInfo
, olGetPlatformInfo
etc.), but the fact that the query requires a device and memory size complicates that.
Unless you meant something similar to stat()
, where it populates a buffer with information that the implementation might find useful? I can see the appeal of that, but it might result in plugins wasting time computing things that won't end up used.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, a kernel implies that it's been successfully loaded onto the device, so it should have a device. Might be reasonable to add it as an argument to the main one. That or we could revisit the other question and add some metadata associated with the kernel.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The question about adding a dedicated kernel_handle_t type that contains a pointer to the GenericKernelTy, the device and maybe even the amount of dynamic memory required? That's my preference and will ease any headaches down the line if we need to add additional information to kernels.
let details = []; | ||
let params = [ | ||
Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>, | ||
Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The output for this should be a struct.
/// Return maximum block size for maximum occupancy | ||
/// | ||
/// TODO: This needs to be implemented for amdgpu | ||
Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice, | ||
size_t DynamicMemSize) const override { | ||
return 1; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Calculating occupancy is difficult on AMD GPUs, I'd need to dig up the code where HIP does it to see how it's done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had a look and found https://github.com/ROCm/hip/blob/854768787ee9bbd6ed22b3e8fd0f139955a57e6a/src/hip_module.cpp#L1015 (which might be a bit out of date). I took one look at it and decided it was probably worth doing as a separate change.
@@ -276,6 +276,9 @@ struct GenericKernelTy { | |||
KernelLaunchParamsTy LaunchParams, | |||
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; | |||
|
|||
virtual Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't think this needs to be size_t, block sizes are coded as 64-bit values.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I chose size_t
since that is consistent with the fields in ol_kernel_launch_size_args_t
and just generally elsewhere in the API. Happy to change it if explicitly specifying uint64_t is preferred though.
Just to summarize and get discussion in one place: As input we have a kernel name/handle, device and local memory size. From that we want to output the maximum group size. In terms of API design, I can see multiple options: typedef void *ol_kernel_handle_t;
[...]
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", &Kernel);
olKernelMaxGroupSize(Kernel, Device, 1024, &GroupSize); struct ol_kernel_handle_impl {
GenericKernelTy *PluginKernel;
ol_program_handle_t Program;
};
typedef ol_kernel_handle_impl *ol_kernel_handle_t;
[...]
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", &Kernel);
olKernelMaxGroupSize(Kernel, 1024, &GroupSize); struct ol_kernel_handle_impl {
GenericKernelTy *PluginKernel;
ol_program_handle_t Program;
size_t DynMemorySize;
};
typedef ol_kernel_handle_impl *ol_kernel_handle_t;
[...]
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", 1024, &Kernel);
olGetKernelInfo(Kernel, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize); struct ol_kernel_handle_impl {
GenericKernelTy *PluginKernel;
ol_program_handle_t Program;
size_t DynMemorySize;
};
typedef ol_kernel_handle_impl *ol_kernel_handle_t;
struct ol_kernel_invocation {
ol_program_handle_t Program;
const char *Name;
size_t DynMemorySize;
};
[...]
ol_kernel_invocation Invocation{Program, "foo", 1024};
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(&Invocation, &Kernel);
olGetKernelInfo(Kernel, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize); struct ol_kernel_handle_impl {
GenericKernelTy *PluginKernel;
ol_program_handle_t Program;
};
struct ol_kernel_invocation {
size_t DynMemorySize;
};
[...]
ol_kernel_invocation Invocation{1024};
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", &Kernel);
olGetKernelInfo(Kernel, &Invocation, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize); struct ol_kernel_invocation {
ol_program_handle_t Program;
const char *Name;
size_t DynMemorySize;
// Should not be used by API users
GenericKernelTy *PluginKernel;
};
[...]
ol_kernel_invocation Invocation{Program, "foo", 1024, nullptr};
size_t GroupSize;
olKernelLoad(&Invocation);
olGetKernelInfo(Invocation, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize); Which of these options looks best to you, @callumfare @jhuber6 ? |
I don't mind the kernel being an opaque handle like the other types as opposed to I don't think we should have any state kept in the kernel (same reason we don't set arguments like UR/OpenCL) so I'm not a fan of any option where we store Option 2 seems best if we change |
This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently only implemented on CUDA; AMDGPU and Host return the legal-but-suboptimal value of `1`. Co-Authored-By: Callum Fare <[email protected]>
After a small discussion internally at Codeplay, I've renamed the function from |
@jhuber6 When you get a chance can you look at the options I posted above and let me know your thoughts? |
So, we should probably just do something similar to HSA and make a generic symbol type then have a single function to get information out of it. |
@jhuber6 So something like option 2, but with I think having an |
This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently only implemented on CUDA; AMDGPU and Host return the legal-but-suboptimal value of `1`.

Co-Authored-By: Callum Fare <[email protected]>