-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[Offload] Add olGetKernelMaxGroupSize #142950
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-offload @llvm/pr-subscribers-backend-amdgpu Author: Ross Brunton (RossBrunton) — Changes: This is equivalent to cuOccupancyMaxPotentialBlockSize. Co-Authored-By: Callum Fare <[email protected]>. Full diff: https://github.com/llvm/llvm-project/pull/142950.diff — 10 Files Affected:
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 247f9c1bf5b6a..71869cf4a68bb 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -24,6 +24,19 @@ def : Function {
let returns = [];
}
+def : Function {
+ let name = "olKernelMaxGroupSize";
+ let desc = "Get the maximum block size needed to achieve maximum occupancy.";
+ let details = [];
+ let params = [
+ Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+ Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
+ Param<"size_t", "SharedMemory", "dynamic shared memory required", PARAM_IN>,
+ Param<"size_t*", "GroupSize", "maximum block size", PARAM_OUT>
+ ];
+ let returns = [];
+}
+
def : Struct {
let name = "ol_kernel_launch_size_args_t";
let desc = "Size-related arguments for a kernel launch.";
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 7b67cbba43e68..a3f8d4ba52d1e 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -468,6 +468,10 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
return olDestroy(Program);
}
+inline GenericKernelTy *getOmpKernel(ol_kernel_handle_t OlKernel) {
+ return reinterpret_cast<GenericKernelTy *>(OlKernel);
+}
+
Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
ol_kernel_handle_t *Kernel) {
@@ -484,6 +488,20 @@ Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
return Error::success();
}
+Error olKernelMaxGroupSize_impl(ol_kernel_handle_t Kernel,
+ ol_device_handle_t Device,
+ size_t DynamicMemSize, size_t *GroupSize) {
+ auto *KernelImpl = getOmpKernel(Kernel);
+
+ auto Res = KernelImpl->maxGroupSize(*Device->Device, DynamicMemSize);
+ if (auto Err = Res.takeError()) {
+ return Err;
+ }
+ *GroupSize = *Res;
+
+ return Error::success();
+}
+
Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
ol_kernel_handle_t Kernel, const void *ArgumentsData,
size_t ArgumentsSize,
@@ -514,7 +532,7 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
// Don't do anything with pointer indirection; use arg data as-is
LaunchArgs.Flags.IsCUDA = true;
- auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
+ auto *KernelImpl = getOmpKernel(Kernel);
auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
LaunchArgs, AsyncInfoWrapper);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e4c32713e2c15..bed9764bddf55 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -570,6 +570,14 @@ struct AMDGPUKernelTy : public GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
+ /// Return maximum block size for maximum occupancy
+ ///
+ /// TODO: This needs to be implemented for amdgpu
+ Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+ size_t DynamicMemSize) const override {
+ return 1;
+ }
+
/// Print more elaborate kernel launch info for AMDGPU
Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index d2437908a0a6f..5d4e9fa212f52 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -276,6 +276,9 @@ struct GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
+ virtual Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+ size_t DynamicMemSize) const = 0;
+
/// Get the kernel name.
const char *getName() const { return Name; }
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index e5332686fcffb..e6699ee78596d 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -71,6 +71,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
+DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
DLWRAP(cuDeviceCanAccessPeer, 3)
DLWRAP(cuCtxEnablePeerAccess, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 1c5b421768894..2c856c68a9368 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -289,6 +289,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuCtxGetDevice(CUdevice *);
CUresult cuDeviceGet(CUdevice *, int);
@@ -370,5 +371,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
CUresult cuMemGetAllocationGranularity(size_t *granularity,
const CUmemAllocationProp *prop,
CUmemAllocationGranularity_flags option);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+ CUoccupancyB2DSize, size_t, int);
#endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 44ccfc47a21c9..45d9647da9e53 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
+ /// Return maximum block size for maximum occupancy
+ Expected<size_t> maxGroupSize(GenericDeviceTy &,
+ size_t DynamicMemSize) const override {
+ int minGridSize;
+ int maxBlockSize;
+ auto Res = cuOccupancyMaxPotentialBlockSize(
+ &minGridSize, &maxBlockSize, Func, NULL, DynamicMemSize, INT_MAX);
+ if (auto Err = Plugin::check(
+ Res, "error in cuOccupancyMaxPotentialBlockSize: %s")) {
+ return Err;
+ }
+ return maxBlockSize;
+ }
+
private:
/// The CUDA kernel function to execute.
CUfunction Func;
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 9916f4d0ab250..a96aa346d33e5 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -114,6 +114,13 @@ struct GenELF64KernelTy : public GenericKernelTy {
return Plugin::success();
}
+ /// Return maximum block size for maximum occupancy
+ Expected<size_t> maxGroupSize(GenericDeviceTy &Device,
+ size_t DynamicMemSize) const override {
+ // TODO
+ return 1;
+ }
+
private:
/// The kernel function to execute.
void (*Func)(void);
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 2844b675e5de1..ac302d502c30c 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -14,6 +14,7 @@ add_offload_unittest("event"
add_offload_unittest("kernel"
kernel/olGetKernel.cpp
+ kernel/olKernelMaxGroupSize.cpp
kernel/olLaunchKernel.cpp)
add_offload_unittest("memory"
diff --git a/offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp b/offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp
new file mode 100644
index 0000000000000..e83775ae0d896
--- /dev/null
+++ b/offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp
@@ -0,0 +1,37 @@
+//===------- Offload API tests - olKernelMaxGroupSize ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olKernelMaxGroupSizeTest = OffloadKernelTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olKernelMaxGroupSizeTest);
+
+TEST_P(olKernelMaxGroupSizeTest, Success) {
+ size_t Size{0};
+ ASSERT_SUCCESS(olKernelMaxGroupSize(Kernel, Device, 0, &Size));
+ ASSERT_GT(Size, 0);
+}
+
+TEST_P(olKernelMaxGroupSizeTest, NullKernel) {
+ size_t Size;
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+ olKernelMaxGroupSize(nullptr, Device, 0, &Size));
+}
+
+TEST_P(olKernelMaxGroupSizeTest, NullDevice) {
+ size_t Size;
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+ olKernelMaxGroupSize(Kernel, nullptr, 0, &Size));
+}
+
+TEST_P(olKernelMaxGroupSizeTest, NullOutput) {
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+ olKernelMaxGroupSize(Kernel, Device, 0, nullptr));
+}
|
let details = []; | ||
let params = [ | ||
Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>, | ||
Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function takes in a device handle because, going by the HIP implementation, it looks like on AMDGPU it requires certain device properties.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The output for this should be a struct.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any particular reason why? There's only one output value.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, but if we add more stuff we don't want to break the ABI in the future. However, this should probably be merged with some big 'getKernelInfo' type thing if we don't already have it. No reason for a standalone function here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was originally planning on using getKernelInfo
(similar to olGetDeviceInfo
, olGetPlatformInfo
etc.), but the fact that the query requires a device and memory size complicates that.
Unless you meant something similar to stat()
, where it populates a buffer with information that the implementation might find useful? I can see the appeal of that, but it might result in plugins wasting time computing things that won't end up used.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, a kernel implies that it's been successfully loaded onto the device, so it should have a device. Might be reasonable to add it as an argument to the main one. That or we could revisit the other question and add some metadata associated with the kernel.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The question about adding a dedicated kernel_handle_t type that contains a pointer to the GenericKernelTy, the device and maybe even the amount of dynamic memory required? That's my preference and will ease any headaches down the line if we need to add additional information to kernels.
let details = []; | ||
let params = [ | ||
Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>, | ||
Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The output for this should be a struct.
/// Return maximum block size for maximum occupancy | ||
/// | ||
/// TODO: This needs to be implemented for amdgpu | ||
Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice, | ||
size_t DynamicMemSize) const override { | ||
return 1; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Calculating occupancy is difficult on AMD GPUs, I'd need to dig up the code where HIP does it to see how it's done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had a look and found https://github.com/ROCm/hip/blob/854768787ee9bbd6ed22b3e8fd0f139955a57e6a/src/hip_module.cpp#L1015 (which might be a bit out of date). I took one look at it and decided it was probably worth doing as a separate change.
@@ -276,6 +276,9 @@ struct GenericKernelTy { | |||
KernelLaunchParamsTy LaunchParams, | |||
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; | |||
|
|||
virtual Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't think this needs to be size_t, block sizes are coded as 64-bit values.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I chose size_t
since that is consistent with the fields in ol_kernel_launch_size_args_t
and just generally elsewhere in the API. Happy to change it if explicitly specifying uint64_t is preferred though.
Just to summarize and get discussion in one place: As input we have a kernel name/handle, device and local memory size. From that we want to output the maximum group size. In terms of API design, I can see multiple options: typedef void *ol_kernel_handle_t;
[...]
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", &Kernel);
olKernelMaxGroupSize(Kernel, Device, 1024, &GroupSize); struct ol_kernel_handle_impl {
GenericKernelTy *PluginKernel;
ol_program_handle_t Program;
};
typedef ol_kernel_handle_impl *ol_kernel_handle_t;
[...]
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", &Kernel);
olKernelMaxGroupSize(Kernel, 1024, &GroupSize); struct ol_kernel_handle_impl {
GenericKernelTy *PluginKernel;
ol_program_handle_t Program;
size_t DynMemorySize;
};
typedef ol_kernel_handle_impl *ol_kernel_handle_t;
[...]
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", 1024, &Kernel);
olGetKernelInfo(Kernel, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize); struct ol_kernel_handle_impl {
GenericKernelTy *PluginKernel;
ol_program_handle_t Program;
size_t DynMemorySize;
};
typedef ol_kernel_handle_impl *ol_kernel_handle_t;
struct ol_kernel_invocation {
ol_program_handle_t Program;
const char *Name;
size_t DynMemorySize;
};
[...]
ol_kernel_invocation Invocation{Program, "foo", 1024};
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(&Invocation, &Kernel);
olGetKernelInfo(Kernel, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize); struct ol_kernel_handle_impl {
GenericKernelTy *PluginKernel;
ol_program_handle_t Program;
};
struct ol_kernel_invocation {
size_t DynMemorySize;
};
[...]
ol_kernel_invocation Invocation{1024};
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", &Kernel);
olGetKernelInfo(Kernel, &Invocation, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize); struct ol_kernel_invocation {
ol_program_handle_t Program;
const char *Name;
size_t DynMemorySize;
// Should not be used by API users
GenericKernelTy *PluginKernel;
};
[...]
ol_kernel_invocation Invocation{Program, "foo", 1024, nullptr};
size_t GroupSize;
olKernelLoad(&Invocation);
olGetKernelInfo(Invocation, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize); Which of these options looks best to you, @callumfare @jhuber6 ? |
I don't mind the kernel being an opaque handle like the other types as opposed to I don't think we should have any state kept in the kernel (same reason we don't set arguments like UR/OpenCL) so I'm not a fan of any option where we store Option 2 seems best if we change |
This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently only implemented on CUDA; AMDGPU and Host return the legal-but-suboptimal value of `1`. Co-Authored-By: Callum Fare <[email protected]>
After a small discussion internally at Codeplay, I've renamed the function from |
@jhuber6 When you get a chance can you look at the options I posted above and let me know your thoughts? |
So, we should probably just do something similar to HSA and make a generic symbol type then have a single function to get information out of it. |
@jhuber6 So something like option 2, but with I think having an |
This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently only implemented on CUDA; AMDGPU and Host return the legal-but-suboptimal value of `1`.

Co-Authored-By: Callum Fare <[email protected]>