Skip to content

[Offload] Use flat array for cuLaunchKernel #95116

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions offload/include/Shared/APITypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,16 @@ static_assert(sizeof(KernelArgsTy) ==
(8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
4 * sizeof(void **) + 2 * sizeof(int64_t *)),
"Invalid struct size");

/// Kernel launch parameters packed into one contiguous buffer, together with
/// the buffer's size in bytes.
struct KernelLaunchParamsTy {
  /// Number of bytes occupied by the Data array.
  size_t Size{0};
  /// Contiguous buffer holding the kernel parameters back to back.
  void *Data{nullptr};
  /// Pointers to the individual Data entries. Only strictly required for the
  /// host plugin.
  void **Ptrs{nullptr};
};
}

#endif // OMPTARGET_SHARED_API_TYPES_H
28 changes: 15 additions & 13 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <unistd.h>
#include <unordered_map>

#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
#include "Shared/Utils.h"
Expand Down Expand Up @@ -558,7 +559,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {

/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

/// Print more elaborate kernel launch info for AMDGPU
Expand Down Expand Up @@ -2802,9 +2804,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);

KernelArgsTy KernelArgs = {};
if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
/*NumBlocks=*/1ul, KernelArgs,
/*Args=*/nullptr, AsyncInfoWrapper))
if (auto Err =
AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
/*NumBlocks=*/1ul, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;

Error Err = Plugin::success();
Expand Down Expand Up @@ -3265,12 +3268,11 @@ struct AMDGPUPluginTy final : public GenericPluginTy {

Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
KernelArgsTy &KernelArgs, void *Args,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
const uint32_t LaunchParamsSize = KernelArgs.NumArgs * sizeof(void *);

if (ArgsSize != LaunchParamsSize &&
ArgsSize != LaunchParamsSize + getImplicitArgsSize())
if (ArgsSize != LaunchParams.Size &&
ArgsSize != LaunchParams.Size + getImplicitArgsSize())
return Plugin::error("Mismatch of kernel arguments size");

AMDGPUPluginTy &AMDGPUPlugin =
Expand All @@ -3294,10 +3296,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
return Err;

utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
if (ArgsSize == LaunchParamsSize + getImplicitArgsSize()) {
if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
// Initialize implicit arguments.
ImplArgs = reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
advanceVoidPtr(AllArgs, LaunchParamsSize));
advanceVoidPtr(AllArgs, LaunchParams.Size));

// Initialize the implicit arguments to zero.
std::memset(ImplArgs, 0, getImplicitArgsSize());
Expand All @@ -3306,8 +3308,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Copy the explicit arguments.
// TODO: We should expose the args memory manager alloc to the common part as
// alternative to copying them twice.
if (LaunchParamsSize)
std::memcpy(AllArgs, *static_cast<void **>(Args), LaunchParamsSize);
if (LaunchParams.Size)
std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);

AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);

Expand Down
14 changes: 8 additions & 6 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <shared_mutex>
#include <vector>

#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
#include "Shared/EnvironmentVar.h"
Expand Down Expand Up @@ -265,7 +266,7 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
void *Args,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;

/// Get the kernel name.
Expand Down Expand Up @@ -326,11 +327,12 @@ struct GenericKernelTy {

private:
/// Prepare the arguments before launching the kernel.
void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
llvm::SmallVectorImpl<void *> &Args,
llvm::SmallVectorImpl<void *> &Ptrs,
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
KernelLaunchParamsTy
prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
llvm::SmallVectorImpl<void *> &Args,
llvm::SmallVectorImpl<void *> &Ptrs,
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;

/// Get the number of threads and blocks for the kernel based on the
/// user-defined threads and block clauses.
Expand Down
28 changes: 14 additions & 14 deletions offload/plugins-nextgen/common/src/PluginInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,9 @@ struct RecordReplayTy {
OS.close();
}

void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs,
uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
uint64_t LoopTripCount) {
void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams,
int32_t NumArgs, uint64_t NumTeamsClause,
uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
json::Object JsonKernelInfo;
JsonKernelInfo["Name"] = Name;
JsonKernelInfo["NumArgs"] = NumArgs;
Expand All @@ -283,7 +283,7 @@ struct RecordReplayTy {

json::Array JsonArgPtrs;
for (int I = 0; I < NumArgs; ++I)
JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]);
JsonArgPtrs.push_back((intptr_t)LaunchParams.Ptrs[I]);
JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));

json::Array JsonArgOffsets;
Expand Down Expand Up @@ -549,7 +549,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError();

void *KernelArgsPtr =
KernelLaunchParamsTy LaunchParams =
prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args,
Ptrs, *KernelLaunchEnvOrErr);

Expand All @@ -564,7 +564,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
if (RecordReplay.isRecording()) {
RecordReplay.saveImage(getName(), getImage());
RecordReplay.saveKernelInput(getName(), getImage());
RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs,
RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs,
NumBlocks, NumThreads, KernelArgs.Tripcount);
}

Expand All @@ -573,10 +573,10 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
return Err;

return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
KernelArgsPtr, AsyncInfoWrapper);
LaunchParams, AsyncInfoWrapper);
}

void *GenericKernelTy::prepareArgs(
KernelLaunchParamsTy GenericKernelTy::prepareArgs(
GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
llvm::SmallVectorImpl<void *> &Ptrs,
Expand All @@ -585,22 +585,22 @@ void *GenericKernelTy::prepareArgs(
NumArgs += KLEOffset;

if (NumArgs == 0)
return nullptr;
return KernelLaunchParamsTy{};

Args.resize(NumArgs);
Ptrs.resize(NumArgs);

if (KernelLaunchEnvironment) {
Ptrs[0] = KernelLaunchEnvironment;
Args[0] = &Ptrs[0];
Args[0] = KernelLaunchEnvironment;
Ptrs[0] = &Args[0];
}

for (uint32_t I = KLEOffset; I < NumArgs; ++I) {
Ptrs[I] =
Args[I] =
(void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
Args[I] = &Ptrs[I];
Ptrs[I] = &Args[I];
}
return &Args[0];
return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
}

uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
Expand Down
4 changes: 4 additions & 0 deletions offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,10 @@ typedef enum CUevent_flags_enum {
CU_EVENT_INTERPROCESS = 0x4
} CUevent_flags;

/// Marker values for the `extra` configuration array of cuLaunchKernel. The
/// array is a sequence of (marker, value) pairs ended by CU_LAUNCH_PARAM_END.
/// The pointers are const so that no translation unit can accidentally
/// reassign a marker.

/// Terminates the configuration array.
static inline void *const CU_LAUNCH_PARAM_END = (void *)0x00;
/// The next array entry is a pointer to the flat kernel argument buffer.
static inline void *const CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
/// The next array entry is a pointer to a size_t holding the buffer size.
static inline void *const CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;

CUresult cuCtxGetDevice(CUdevice *);
CUresult cuDeviceGet(CUdevice *, int);
CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
Expand Down
28 changes: 18 additions & 10 deletions offload/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <string>
#include <unordered_map>

#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"

Expand Down Expand Up @@ -149,7 +150,8 @@ struct CUDAKernelTy : public GenericKernelTy {

/// Launch the CUDA kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

private:
Expand Down Expand Up @@ -1228,9 +1230,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);

KernelArgsTy KernelArgs = {};
if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
/*NumBlocks=*/1ul, KernelArgs, nullptr,
AsyncInfoWrapper))
if (auto Err =
CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
/*NumBlocks=*/1ul, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;

Error Err = Plugin::success();
Expand Down Expand Up @@ -1274,7 +1277,8 @@ struct CUDADeviceTy : public GenericDeviceTy {

Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
KernelArgsTy &KernelArgs, void *Args,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);

Expand All @@ -1285,11 +1289,15 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t MaxDynCGroupMem =
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());

CUresult Res =
cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
/*gridDimZ=*/1, NumThreads,
/*blockDimY=*/1, /*blockDimZ=*/1, MaxDynCGroupMem, Stream,
(void **)Args, nullptr);
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE,
reinterpret_cast<void *>(&LaunchParams.Size),
CU_LAUNCH_PARAM_END};

CUresult Res = cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
/*gridDimZ=*/1, NumThreads,
/*blockDimY=*/1, /*blockDimZ=*/1,
MaxDynCGroupMem, Stream, nullptr, Config);
return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
}

Expand Down
5 changes: 3 additions & 2 deletions offload/plugins-nextgen/host/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ struct GenELF64KernelTy : public GenericKernelTy {

/// Launch the kernel using the libffi.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
// Create a vector of ffi_types, one per argument.
SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer);
Expand All @@ -105,7 +106,7 @@ struct GenELF64KernelTy : public GenericKernelTy {

// Call the kernel function through libffi.
long Return;
ffi_call(&Cif, Func, &Return, (void **)Args);
ffi_call(&Cif, Func, &Return, (void **)LaunchParams.Ptrs);

return Plugin::success();
}
Expand Down
Loading