Skip to content

Commit 54b5c76

Browse files
authored
[Offload] Use flat array for cuLaunchKernel (#95116)
We already used a flat array of kernel launch parameters for the AMD GPU launch, and now we also use this scheme for the NVIDIA GPU launch. The only remaining/required use of the indirection is the host plugin (due to ffi). This allows us to simplify the use for non-OpenMP kernel launches.
1 parent 9ab601f commit 54b5c76

File tree

7 files changed

+72
-45
lines changed

7 files changed

+72
-45
lines changed

offload/include/Shared/APITypes.h

+10
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,16 @@ static_assert(sizeof(KernelArgsTy) ==
116116
(8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
117117
4 * sizeof(void **) + 2 * sizeof(int64_t *)),
118118
"Invalid struct size");
119+
120+
/// Flat array of kernel launch parameters and their total size.
///
/// The struct only stores raw pointers and a byte count; it has no
/// constructor or destructor of its own, and a default-constructed instance
/// (Size 0, null pointers) is what callers pass for a kernel without
/// arguments.
121+
struct KernelLaunchParamsTy {
122+
/// Size of the Data array in bytes. GenericKernelTy::prepareArgs sets it to
/// the number of arguments times sizeof(void *), and the AMDGPU plugin uses
/// it directly as the byte count when copying Data.
123+
size_t Size = 0;
124+
/// Flat array of kernel parameters, i.e. the argument values themselves
/// stored back to back.
125+
void *Data = nullptr;
126+
/// Ptrs to the Data entries (Ptrs[I] holds the address of the I-th entry of
/// Data). Only strictly required for the host plugin, which hands this
/// array of pointers to ffi_call.
127+
void **Ptrs = nullptr;
128+
};
119129
}
120130

121131
#endif // OMPTARGET_SHARED_API_TYPES_H

offload/plugins-nextgen/amdgpu/src/rtl.cpp

+15-13
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <unistd.h>
2121
#include <unordered_map>
2222

23+
#include "Shared/APITypes.h"
2324
#include "Shared/Debug.h"
2425
#include "Shared/Environment.h"
2526
#include "Shared/Utils.h"
@@ -558,7 +559,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
558559

559560
/// Launch the AMDGPU kernel function.
560561
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
561-
uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
562+
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
563+
KernelLaunchParamsTy LaunchParams,
562564
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
563565

564566
/// Print more elaborate kernel launch info for AMDGPU
@@ -2802,9 +2804,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
28022804
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
28032805

28042806
KernelArgsTy KernelArgs = {};
2805-
if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
2806-
/*NumBlocks=*/1ul, KernelArgs,
2807-
/*Args=*/nullptr, AsyncInfoWrapper))
2807+
if (auto Err =
2808+
AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
2809+
/*NumBlocks=*/1ul, KernelArgs,
2810+
KernelLaunchParamsTy{}, AsyncInfoWrapper))
28082811
return Err;
28092812

28102813
Error Err = Plugin::success();
@@ -3265,12 +3268,11 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
32653268

32663269
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
32673270
uint32_t NumThreads, uint64_t NumBlocks,
3268-
KernelArgsTy &KernelArgs, void *Args,
3271+
KernelArgsTy &KernelArgs,
3272+
KernelLaunchParamsTy LaunchParams,
32693273
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
3270-
const uint32_t LaunchParamsSize = KernelArgs.NumArgs * sizeof(void *);
3271-
3272-
if (ArgsSize != LaunchParamsSize &&
3273-
ArgsSize != LaunchParamsSize + getImplicitArgsSize())
3274+
if (ArgsSize != LaunchParams.Size &&
3275+
ArgsSize != LaunchParams.Size + getImplicitArgsSize())
32743276
return Plugin::error("Mismatch of kernel arguments size");
32753277

32763278
AMDGPUPluginTy &AMDGPUPlugin =
@@ -3294,10 +3296,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
32943296
return Err;
32953297

32963298
utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
3297-
if (ArgsSize == LaunchParamsSize + getImplicitArgsSize()) {
3299+
if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
32983300
// Initialize implicit arguments.
32993301
ImplArgs = reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
3300-
advanceVoidPtr(AllArgs, LaunchParamsSize));
3302+
advanceVoidPtr(AllArgs, LaunchParams.Size));
33013303

33023304
// Initialize the implicit arguments to zero.
33033305
std::memset(ImplArgs, 0, getImplicitArgsSize());
@@ -3306,8 +3308,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
33063308
// Copy the explicit arguments.
33073309
// TODO: We should expose the args memory manager alloc to the common part as
33083310
// alternative to copying them twice.
3309-
if (LaunchParamsSize)
3310-
std::memcpy(AllArgs, *static_cast<void **>(Args), LaunchParamsSize);
3311+
if (LaunchParams.Size)
3312+
std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
33113313

33123314
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
33133315

offload/plugins-nextgen/common/include/PluginInterface.h

+8-6
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <shared_mutex>
2020
#include <vector>
2121

22+
#include "Shared/APITypes.h"
2223
#include "Shared/Debug.h"
2324
#include "Shared/Environment.h"
2425
#include "Shared/EnvironmentVar.h"
@@ -265,7 +266,7 @@ struct GenericKernelTy {
265266
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
266267
virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
267268
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
268-
void *Args,
269+
KernelLaunchParamsTy LaunchParams,
269270
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
270271

271272
/// Get the kernel name.
@@ -326,11 +327,12 @@ struct GenericKernelTy {
326327

327328
private:
328329
/// Prepare the arguments before launching the kernel.
329-
void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
330-
ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
331-
llvm::SmallVectorImpl<void *> &Args,
332-
llvm::SmallVectorImpl<void *> &Ptrs,
333-
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
330+
KernelLaunchParamsTy
331+
prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
332+
ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
333+
llvm::SmallVectorImpl<void *> &Args,
334+
llvm::SmallVectorImpl<void *> &Ptrs,
335+
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
334336

335337
/// Get the number of threads and blocks for the kernel based on the
336338
/// user-defined threads and block clauses.

offload/plugins-nextgen/common/src/PluginInterface.cpp

+14-14
Original file line numberDiff line numberDiff line change
@@ -268,9 +268,9 @@ struct RecordReplayTy {
268268
OS.close();
269269
}
270270

271-
void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs,
272-
uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
273-
uint64_t LoopTripCount) {
271+
void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams,
272+
int32_t NumArgs, uint64_t NumTeamsClause,
273+
uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
274274
json::Object JsonKernelInfo;
275275
JsonKernelInfo["Name"] = Name;
276276
JsonKernelInfo["NumArgs"] = NumArgs;
@@ -283,7 +283,7 @@ struct RecordReplayTy {
283283

284284
json::Array JsonArgPtrs;
285285
for (int I = 0; I < NumArgs; ++I)
286-
JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]);
286+
JsonArgPtrs.push_back((intptr_t)LaunchParams.Ptrs[I]);
287287
JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
288288

289289
json::Array JsonArgOffsets;
@@ -549,7 +549,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
549549
if (!KernelLaunchEnvOrErr)
550550
return KernelLaunchEnvOrErr.takeError();
551551

552-
void *KernelArgsPtr =
552+
KernelLaunchParamsTy LaunchParams =
553553
prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args,
554554
Ptrs, *KernelLaunchEnvOrErr);
555555

@@ -564,7 +564,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
564564
if (RecordReplay.isRecording()) {
565565
RecordReplay.saveImage(getName(), getImage());
566566
RecordReplay.saveKernelInput(getName(), getImage());
567-
RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs,
567+
RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs,
568568
NumBlocks, NumThreads, KernelArgs.Tripcount);
569569
}
570570

@@ -573,10 +573,10 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
573573
return Err;
574574

575575
return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
576-
KernelArgsPtr, AsyncInfoWrapper);
576+
LaunchParams, AsyncInfoWrapper);
577577
}
578578

579-
void *GenericKernelTy::prepareArgs(
579+
KernelLaunchParamsTy GenericKernelTy::prepareArgs(
580580
GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
581581
uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
582582
llvm::SmallVectorImpl<void *> &Ptrs,
@@ -585,22 +585,22 @@ void *GenericKernelTy::prepareArgs(
585585
NumArgs += KLEOffset;
586586

587587
if (NumArgs == 0)
588-
return nullptr;
588+
return KernelLaunchParamsTy{};
589589

590590
Args.resize(NumArgs);
591591
Ptrs.resize(NumArgs);
592592

593593
if (KernelLaunchEnvironment) {
594-
Ptrs[0] = KernelLaunchEnvironment;
595-
Args[0] = &Ptrs[0];
594+
Args[0] = KernelLaunchEnvironment;
595+
Ptrs[0] = &Args[0];
596596
}
597597

598598
for (uint32_t I = KLEOffset; I < NumArgs; ++I) {
599-
Ptrs[I] =
599+
Args[I] =
600600
(void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
601-
Args[I] = &Ptrs[I];
601+
Ptrs[I] = &Args[I];
602602
}
603-
return &Args[0];
603+
return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
604604
}
605605

606606
uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h

+4
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,10 @@ typedef enum CUevent_flags_enum {
282282
CU_EVENT_INTERPROCESS = 0x4
283283
} CUevent_flags;
284284

285+
/// Marker values for the configuration array that CUDAKernelTy::launchImpl
/// passes as the last argument of cuLaunchKernel. In that array,
/// CU_LAUNCH_PARAM_BUFFER_POINTER is followed by the pointer to the flat
/// parameter buffer, CU_LAUNCH_PARAM_BUFFER_SIZE is followed by a pointer to
/// the buffer size, and CU_LAUNCH_PARAM_END terminates the array.
static inline void *CU_LAUNCH_PARAM_END = (void *)0x00;
286+
static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
287+
static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
288+
285289
CUresult cuCtxGetDevice(CUdevice *);
286290
CUresult cuDeviceGet(CUdevice *, int);
287291
CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);

offload/plugins-nextgen/cuda/src/rtl.cpp

+18-10
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <string>
1717
#include <unordered_map>
1818

19+
#include "Shared/APITypes.h"
1920
#include "Shared/Debug.h"
2021
#include "Shared/Environment.h"
2122

@@ -149,7 +150,8 @@ struct CUDAKernelTy : public GenericKernelTy {
149150

150151
/// Launch the CUDA kernel function.
151152
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
152-
uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
153+
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
154+
KernelLaunchParamsTy LaunchParams,
153155
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
154156

155157
private:
@@ -1228,9 +1230,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
12281230
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
12291231

12301232
KernelArgsTy KernelArgs = {};
1231-
if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
1232-
/*NumBlocks=*/1ul, KernelArgs, nullptr,
1233-
AsyncInfoWrapper))
1233+
if (auto Err =
1234+
CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
1235+
/*NumBlocks=*/1ul, KernelArgs,
1236+
KernelLaunchParamsTy{}, AsyncInfoWrapper))
12341237
return Err;
12351238

12361239
Error Err = Plugin::success();
@@ -1274,7 +1277,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
12741277

12751278
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
12761279
uint32_t NumThreads, uint64_t NumBlocks,
1277-
KernelArgsTy &KernelArgs, void *Args,
1280+
KernelArgsTy &KernelArgs,
1281+
KernelLaunchParamsTy LaunchParams,
12781282
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
12791283
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
12801284

@@ -1285,11 +1289,15 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
12851289
uint32_t MaxDynCGroupMem =
12861290
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
12871291

1288-
CUresult Res =
1289-
cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
1290-
/*gridDimZ=*/1, NumThreads,
1291-
/*blockDimY=*/1, /*blockDimZ=*/1, MaxDynCGroupMem, Stream,
1292-
(void **)Args, nullptr);
1292+
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
1293+
CU_LAUNCH_PARAM_BUFFER_SIZE,
1294+
reinterpret_cast<void *>(&LaunchParams.Size),
1295+
CU_LAUNCH_PARAM_END};
1296+
1297+
CUresult Res = cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
1298+
/*gridDimZ=*/1, NumThreads,
1299+
/*blockDimY=*/1, /*blockDimZ=*/1,
1300+
MaxDynCGroupMem, Stream, nullptr, Config);
12931301
return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
12941302
}
12951303

offload/plugins-nextgen/host/src/rtl.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
9090

9191
/// Launch the kernel using the libffi.
9292
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
93-
uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
93+
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
94+
KernelLaunchParamsTy LaunchParams,
9495
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
9596
// Create a vector of ffi_types, one per argument.
9697
SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer);
@@ -105,7 +106,7 @@ struct GenELF64KernelTy : public GenericKernelTy {
105106

106107
// Call the kernel function through libffi.
107108
long Return;
108-
ffi_call(&Cif, Func, &Return, (void **)Args);
109+
ffi_call(&Cif, Func, &Return, (void **)LaunchParams.Ptrs);
109110

110111
return Plugin::success();
111112
}

0 commit comments

Comments
 (0)