[Offload] Use flat array for cuLaunchKernel #95116
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-offload
Author: Johannes Doerfert (jdoerfert)
Changes
We already used a flat array of kernel launch parameters for the AMD GPU launch; now we use the same scheme for the NVIDIA GPU launch as well. The only remaining/required use of the indirection is the host plugin (due to ffi). This allows us to simplify the use for non-OpenMP kernel launches. A standalone sketch of the cuLaunchKernel configuration this relies on follows the diff.
Full diff: https://github.com/llvm/llvm-project/pull/95116.diff
6 Files Affected:
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index e8fc27785b6c2..f3948a32ada8b 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -112,6 +112,16 @@ static_assert(sizeof(KernelArgsTy) ==
(8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
4 * sizeof(void **) + 2 * sizeof(int64_t *)),
"Invalid struct size");
+
+/// Flat array of kernel launch parameters and their total size.
+struct KernelLaunchParamsTy {
+ /// Size of the Data array.
+ size_t Size = 0;
+ /// Flat array of kernel parameters.
+ void *Data = nullptr;
+ /// Ptrs to the Data entries. Only strictly required for the host plugin.
+ void **Ptrs = nullptr;
+};
}
#endif // OMPTARGET_SHARED_API_TYPES_H
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index c6dd954746e4a..43e0bbd85a9d3 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -20,6 +20,7 @@
#include <unistd.h>
#include <unordered_map>
+#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
#include "Shared/Utils.h"
@@ -558,7 +559,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
- uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
+ uint64_t NumBlocks, KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Print more elaborate kernel launch info for AMDGPU
@@ -2802,9 +2804,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
KernelArgsTy KernelArgs = {};
- if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
- /*NumBlocks=*/1ul, KernelArgs,
- /*Args=*/nullptr, AsyncInfoWrapper))
+ if (auto Err =
+ AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
+ /*NumBlocks=*/1ul, KernelArgs,
+ KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
Error Err = Plugin::success();
@@ -3266,18 +3269,18 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
- KernelArgsTy &KernelArgs, void *Args,
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
- const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);
- if (ArgsSize < KernelArgsSize)
+ if (ArgsSize < LaunchParams.Size)
return Plugin::error("Mismatch of kernel arguments size");
// The args size reported by HSA may or may not contain the implicit args.
// For now, assume that HSA does not consider the implicit arguments when
// reporting the arguments of a kernel. In the worst case, we can waste
// 56 bytes per allocation.
- uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize;
+ uint32_t AllArgsSize = LaunchParams.Size + ImplicitArgsSize;
AMDGPUPluginTy &AMDGPUPlugin =
static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
@@ -3302,7 +3305,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Initialize implicit arguments.
utils::AMDGPUImplicitArgsTy *ImplArgs =
reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
- advanceVoidPtr(AllArgs, KernelArgsSize));
+ advanceVoidPtr(AllArgs, LaunchParams.Size));
// Initialize the implicit arguments to zero.
std::memset(ImplArgs, 0, ImplicitArgsSize);
@@ -3310,9 +3313,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Copy the explicit arguments.
// TODO: We should expose the args memory manager alloc to the common part as
// alternative to copying them twice.
- if (KernelArgs.NumArgs)
- std::memcpy(AllArgs, *static_cast<void **>(Args),
- sizeof(void *) * KernelArgs.NumArgs);
+ if (LaunchParams.Size)
+ std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index eda6a4fd541e9..37d16ae3a7027 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -19,6 +19,7 @@
#include <shared_mutex>
#include <vector>
+#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
#include "Shared/EnvironmentVar.h"
@@ -265,7 +266,7 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
- void *Args,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
/// Get the kernel name.
@@ -326,11 +327,12 @@ struct GenericKernelTy {
private:
/// Prepare the arguments before launching the kernel.
- void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
- ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
- llvm::SmallVectorImpl<void *> &Args,
- llvm::SmallVectorImpl<void *> &Ptrs,
- KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
+ KernelLaunchParamsTy
+ prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
+ ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
+ llvm::SmallVectorImpl<void *> &Args,
+ llvm::SmallVectorImpl<void *> &Ptrs,
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
/// Get the number of threads and blocks for the kernel based on the
/// user-defined threads and block clauses.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 913721a15d713..00e12aecf7512 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -268,9 +268,9 @@ struct RecordReplayTy {
OS.close();
}
- void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs,
- uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
- uint64_t LoopTripCount) {
+ void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams,
+ int32_t NumArgs, uint64_t NumTeamsClause,
+ uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
json::Object JsonKernelInfo;
JsonKernelInfo["Name"] = Name;
JsonKernelInfo["NumArgs"] = NumArgs;
@@ -283,7 +283,7 @@ struct RecordReplayTy {
json::Array JsonArgPtrs;
for (int I = 0; I < NumArgs; ++I)
- JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]);
+ JsonArgPtrs.push_back((intptr_t)LaunchParams.Ptrs[I]);
JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
json::Array JsonArgOffsets;
@@ -549,7 +549,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError();
- void *KernelArgsPtr =
+ KernelLaunchParamsTy LaunchParams =
prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args,
Ptrs, *KernelLaunchEnvOrErr);
@@ -564,7 +564,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
if (RecordReplay.isRecording()) {
RecordReplay.saveImage(getName(), getImage());
RecordReplay.saveKernelInput(getName(), getImage());
- RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs,
+ RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs,
NumBlocks, NumThreads, KernelArgs.Tripcount);
}
@@ -573,10 +573,10 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
return Err;
return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
- KernelArgsPtr, AsyncInfoWrapper);
+ LaunchParams, AsyncInfoWrapper);
}
-void *GenericKernelTy::prepareArgs(
+KernelLaunchParamsTy GenericKernelTy::prepareArgs(
GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
llvm::SmallVectorImpl<void *> &Ptrs,
@@ -585,22 +585,22 @@ void *GenericKernelTy::prepareArgs(
NumArgs += KLEOffset;
if (NumArgs == 0)
- return nullptr;
+ return KernelLaunchParamsTy{};
Args.resize(NumArgs);
Ptrs.resize(NumArgs);
if (KernelLaunchEnvironment) {
- Ptrs[0] = KernelLaunchEnvironment;
- Args[0] = &Ptrs[0];
+ Args[0] = KernelLaunchEnvironment;
+ Ptrs[0] = &Args[0];
}
for (uint32_t I = KLEOffset; I < NumArgs; ++I) {
- Ptrs[I] =
+ Args[I] =
(void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
- Args[I] = &Ptrs[I];
+ Ptrs[I] = &Args[I];
}
- return &Args[0];
+ return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
}
uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index b260334baa18b..a8d8846791745 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -16,6 +16,7 @@
#include <string>
#include <unordered_map>
+#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
@@ -149,7 +150,8 @@ struct CUDAKernelTy : public GenericKernelTy {
/// Launch the CUDA kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
- uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
+ uint64_t NumBlocks, KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
private:
@@ -1228,9 +1230,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
KernelArgsTy KernelArgs = {};
- if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
- /*NumBlocks=*/1ul, KernelArgs, nullptr,
- AsyncInfoWrapper))
+ if (auto Err =
+ CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
+ /*NumBlocks=*/1ul, KernelArgs,
+ KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
Error Err = Plugin::success();
@@ -1274,7 +1277,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
- KernelArgsTy &KernelArgs, void *Args,
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
@@ -1285,11 +1289,16 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t MaxDynCGroupMem =
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
- CUresult Res =
- cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
- /*gridDimZ=*/1, NumThreads,
- /*blockDimY=*/1, /*blockDimZ=*/1, MaxDynCGroupMem, Stream,
- (void **)Args, nullptr);
+ void *Config[] = {/* CU_LAUNCH_PARAM_BUFFER_POINTER */ (void *)0x01,
+ LaunchParams.Data,
+ /* CU_LAUNCH_PARAM_BUFFER_SIZE */ (void *)0x02,
+ reinterpret_cast<void *>(&LaunchParams.Size),
+ /* CU_LAUNCH_PARAM_END */ (void *)0x00};
+
+ CUresult Res = cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
+ /*gridDimZ=*/1, NumThreads,
+ /*blockDimY=*/1, /*blockDimZ=*/1,
+ MaxDynCGroupMem, Stream, nullptr, Config);
return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
}
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index ef84cbaf54588..ef2488e42c15b 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -90,7 +90,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
/// Launch the kernel using the libffi.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
- uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
+ uint64_t NumBlocks, KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
// Create a vector of ffi_types, one per argument.
SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer);
@@ -105,7 +106,7 @@ struct GenELF64KernelTy : public GenericKernelTy {
// Call the kernel function through libffi.
long Return;
- ffi_call(&Cif, Func, &Return, (void **)Args);
+ ffi_call(&Cif, Func, &Return, (void **)LaunchParams.Ptrs);
return Plugin::success();
}
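For context, the extra argument of cuLaunchKernel accepts exactly this kind of flat parameter buffer. Below is a minimal standalone sketch (not part of this patch) of the packing scheme the new CUDA launchImpl relies on; KernelFunc, DevPtr, and Stream are assumed to have been created elsewhere through the CUDA driver API (cuModuleGetFunction, cuMemAlloc, cuStreamCreate), and the kernel is assumed to take a pointer and an int.

// Sketch only: pack two kernel arguments into one contiguous buffer and hand
// it to cuLaunchKernel through the extra/Config array instead of kernelParams.
#include <cuda.h>
#include <cstddef>

CUresult launchWithFlatArgs(CUfunction KernelFunc, CUdeviceptr DevPtr,
                            int Count, CUstream Stream) {
  // Lay the kernel parameters out contiguously at their natural alignment;
  // this buffer plays the role of KernelLaunchParamsTy::Data.
  struct {
    CUdeviceptr Ptr;
    int Count;
  } Params = {DevPtr, Count};
  size_t ParamsSize = sizeof(Params);

  // The driver walks the pointer/size pairs until it sees CU_LAUNCH_PARAM_END.
  void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &Params,
                    CU_LAUNCH_PARAM_BUFFER_SIZE, &ParamsSize,
                    CU_LAUNCH_PARAM_END};

  // kernelParams is nullptr; every argument travels through the flat buffer.
  return cuLaunchKernel(KernelFunc, /*gridDimX=*/1, /*gridDimY=*/1,
                        /*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1,
                        /*blockDimZ=*/1, /*sharedMemBytes=*/0, Stream,
                        /*kernelParams=*/nullptr, Config);
}

The driver copies ParamsSize bytes from the buffer into the kernel's parameter space, so the buffer layout must match the kernel's parameter layout; for the OpenMP case, where every argument is pointer-sized, that is exactly the array of void * that prepareArgs now returns in KernelLaunchParamsTy.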
We already used a flat array of kernel launch parameters for the AMD GPU launch; now we use the same scheme for the NVIDIA GPU launch as well. The only remaining/required use of the indirection is the host plugin (due to ffi). This allows us to simplify the use for non-OpenMP kernel launches.
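For illustration of that last point, a hypothetical helper (names such as FlatArgPacker, add, and finalize are made up, not part of the patch or the plugin API) could pack by-value arguments of arbitrary size into the Data buffer and Ptrs array that KernelLaunchParamsTy carries; the struct is repeated from offload/include/Shared/APITypes.h so the sketch is self-contained.

// Hypothetical sketch, assuming the KernelLaunchParamsTy layout added above.
#include <cstddef>
#include <cstring>
#include <vector>

struct KernelLaunchParamsTy {
  size_t Size = 0;        // Size of the Data array.
  void *Data = nullptr;   // Flat array of kernel parameters.
  void **Ptrs = nullptr;  // Pointers to the Data entries (host plugin / ffi).
};

class FlatArgPacker {
  std::vector<char> Buffer;     // Backing storage for Data.
  std::vector<size_t> Offsets;  // Offset of each appended argument.
  std::vector<void *> Ptrs;     // Filled in finalize().

public:
  // Append one by-value argument at its natural alignment, mirroring the
  // layout a device-side parameter block expects.
  template <typename T> void add(const T &Value) {
    size_t Offset = (Buffer.size() + alignof(T) - 1) & ~(alignof(T) - 1);
    Buffer.resize(Offset + sizeof(T));
    std::memcpy(Buffer.data() + Offset, &Value, sizeof(T));
    Offsets.push_back(Offset);
  }

  // Produce the launch parameters; the packer must outlive the launch since
  // Data and Ptrs point into its internal buffers.
  KernelLaunchParamsTy finalize() {
    Ptrs.clear();
    for (size_t Offset : Offsets)
      Ptrs.push_back(Buffer.data() + Offset);
    return KernelLaunchParamsTy{Buffer.size(), Buffer.data(), Ptrs.data()};
  }
};

The OpenMP path does not need this generality because all of its arguments are pointer-sized, which is why prepareArgs can simply report sizeof(void *) * NumArgs as the buffer size.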