diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index a84b685eeedce..5b22bbaac144f 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -116,6 +116,16 @@ static_assert(sizeof(KernelArgsTy) == (8 * sizeof(int32_t) + 3 * sizeof(int64_t) + 4 * sizeof(void **) + 2 * sizeof(int64_t *)), "Invalid struct size"); + +/// Flat array of kernel launch parameters and their total size. +struct KernelLaunchParamsTy { + /// Size of the Data array. + size_t Size = 0; + /// Flat array of kernel parameters. + void *Data = nullptr; + /// Ptrs to the Data entries. Only strictly required for the host plugin. + void **Ptrs = nullptr; +}; } #endif // OMPTARGET_SHARED_API_TYPES_H diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 26bca4a3674bd..e678213df18ce 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -20,6 +20,7 @@ #include #include +#include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" #include "Shared/Utils.h" @@ -558,7 +559,8 @@ struct AMDGPUKernelTy : public GenericKernelTy { /// Launch the AMDGPU kernel function. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, - uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, + uint64_t NumBlocks, KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; /// Print more elaborate kernel launch info for AMDGPU @@ -2802,9 +2804,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); KernelArgsTy KernelArgs = {}; - if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u, - /*NumBlocks=*/1ul, KernelArgs, - /*Args=*/nullptr, AsyncInfoWrapper)) + if (auto Err = + AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u, + /*NumBlocks=*/1ul, KernelArgs, + KernelLaunchParamsTy{}, AsyncInfoWrapper)) return Err; Error Err = Plugin::success(); @@ -3265,12 +3268,11 @@ struct AMDGPUPluginTy final : public GenericPluginTy { Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, uint64_t NumBlocks, - KernelArgsTy &KernelArgs, void *Args, + KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const { - const uint32_t LaunchParamsSize = KernelArgs.NumArgs * sizeof(void *); - - if (ArgsSize != LaunchParamsSize && - ArgsSize != LaunchParamsSize + getImplicitArgsSize()) + if (ArgsSize != LaunchParams.Size && + ArgsSize != LaunchParams.Size + getImplicitArgsSize()) return Plugin::error("Mismatch of kernel arguments size"); AMDGPUPluginTy &AMDGPUPlugin = @@ -3294,10 +3296,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, return Err; utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr; - if (ArgsSize == LaunchParamsSize + getImplicitArgsSize()) { + if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) { // Initialize implicit arguments. ImplArgs = reinterpret_cast( - advanceVoidPtr(AllArgs, LaunchParamsSize)); + advanceVoidPtr(AllArgs, LaunchParams.Size)); // Initialize the implicit arguments to zero. std::memset(ImplArgs, 0, getImplicitArgsSize()); @@ -3306,8 +3308,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // Copy the explicit arguments. // TODO: We should expose the args memory manager alloc to the common part as // alternative to copying them twice. - if (LaunchParamsSize) - std::memcpy(AllArgs, *static_cast(Args), LaunchParamsSize); + if (LaunchParams.Size) + std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size); AMDGPUDeviceTy &AMDGPUDevice = static_cast(GenericDevice); diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 88423be039af7..0d2a36a42d5fa 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -19,6 +19,7 @@ #include #include +#include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" #include "Shared/EnvironmentVar.h" @@ -265,7 +266,7 @@ struct GenericKernelTy { AsyncInfoWrapperTy &AsyncInfoWrapper) const; virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, uint64_t NumBlocks, KernelArgsTy &KernelArgs, - void *Args, + KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; /// Get the kernel name. @@ -326,11 +327,12 @@ struct GenericKernelTy { private: /// Prepare the arguments before launching the kernel. - void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs, - ptrdiff_t *ArgOffsets, uint32_t &NumArgs, - llvm::SmallVectorImpl &Args, - llvm::SmallVectorImpl &Ptrs, - KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const; + KernelLaunchParamsTy + prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs, + ptrdiff_t *ArgOffsets, uint32_t &NumArgs, + llvm::SmallVectorImpl &Args, + llvm::SmallVectorImpl &Ptrs, + KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const; /// Get the number of threads and blocks for the kernel based on the /// user-defined threads and block clauses. diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 5a53c479e33d0..94f9d4670b672 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -268,9 +268,9 @@ struct RecordReplayTy { OS.close(); } - void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs, - uint64_t NumTeamsClause, uint32_t ThreadLimitClause, - uint64_t LoopTripCount) { + void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams, + int32_t NumArgs, uint64_t NumTeamsClause, + uint32_t ThreadLimitClause, uint64_t LoopTripCount) { json::Object JsonKernelInfo; JsonKernelInfo["Name"] = Name; JsonKernelInfo["NumArgs"] = NumArgs; @@ -283,7 +283,7 @@ struct RecordReplayTy { json::Array JsonArgPtrs; for (int I = 0; I < NumArgs; ++I) - JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]); + JsonArgPtrs.push_back((intptr_t)LaunchParams.Ptrs[I]); JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs)); json::Array JsonArgOffsets; @@ -549,7 +549,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, if (!KernelLaunchEnvOrErr) return KernelLaunchEnvOrErr.takeError(); - void *KernelArgsPtr = + KernelLaunchParamsTy LaunchParams = prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args, Ptrs, *KernelLaunchEnvOrErr); @@ -564,7 +564,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, if (RecordReplay.isRecording()) { RecordReplay.saveImage(getName(), getImage()); RecordReplay.saveKernelInput(getName(), getImage()); - RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs, + RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs, NumBlocks, NumThreads, KernelArgs.Tripcount); } @@ -573,10 +573,10 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, return Err; return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs, - KernelArgsPtr, AsyncInfoWrapper); + LaunchParams, AsyncInfoWrapper); } -void *GenericKernelTy::prepareArgs( +KernelLaunchParamsTy GenericKernelTy::prepareArgs( GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets, uint32_t &NumArgs, llvm::SmallVectorImpl &Args, llvm::SmallVectorImpl &Ptrs, @@ -585,22 +585,22 @@ void *GenericKernelTy::prepareArgs( NumArgs += KLEOffset; if (NumArgs == 0) - return nullptr; + return KernelLaunchParamsTy{}; Args.resize(NumArgs); Ptrs.resize(NumArgs); if (KernelLaunchEnvironment) { - Ptrs[0] = KernelLaunchEnvironment; - Args[0] = &Ptrs[0]; + Args[0] = KernelLaunchEnvironment; + Ptrs[0] = &Args[0]; } for (uint32_t I = KLEOffset; I < NumArgs; ++I) { - Ptrs[I] = + Args[I] = (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]); - Args[I] = &Ptrs[I]; + Ptrs[I] = &Args[I]; } - return &Args[0]; + return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]}; } uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice, diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index d65e5cf61e096..16c8f7ad46c44 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -282,6 +282,10 @@ typedef enum CUevent_flags_enum { CU_EVENT_INTERPROCESS = 0x4 } CUevent_flags; +static inline void *CU_LAUNCH_PARAM_END = (void *)0x00; +static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01; +static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02; + CUresult cuCtxGetDevice(CUdevice *); CUresult cuDeviceGet(CUdevice *, int); CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice); diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 62460c07415be..b6465d61bd033 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -16,6 +16,7 @@ #include #include +#include "Shared/APITypes.h" #include "Shared/Debug.h" #include "Shared/Environment.h" @@ -149,7 +150,8 @@ struct CUDAKernelTy : public GenericKernelTy { /// Launch the CUDA kernel function. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, - uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, + uint64_t NumBlocks, KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; private: @@ -1228,9 +1230,10 @@ struct CUDADeviceTy : public GenericDeviceTy { AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); KernelArgsTy KernelArgs = {}; - if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u, - /*NumBlocks=*/1ul, KernelArgs, nullptr, - AsyncInfoWrapper)) + if (auto Err = + CUDAKernel.launchImpl(*this, /*NumThread=*/1u, + /*NumBlocks=*/1ul, KernelArgs, + KernelLaunchParamsTy{}, AsyncInfoWrapper)) return Err; Error Err = Plugin::success(); @@ -1274,7 +1277,8 @@ struct CUDADeviceTy : public GenericDeviceTy { Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, uint64_t NumBlocks, - KernelArgsTy &KernelArgs, void *Args, + KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const { CUDADeviceTy &CUDADevice = static_cast(GenericDevice); @@ -1285,11 +1289,15 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t MaxDynCGroupMem = std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize()); - CUresult Res = - cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1, - /*gridDimZ=*/1, NumThreads, - /*blockDimY=*/1, /*blockDimZ=*/1, MaxDynCGroupMem, Stream, - (void **)Args, nullptr); + void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data, + CU_LAUNCH_PARAM_BUFFER_SIZE, + reinterpret_cast(&LaunchParams.Size), + CU_LAUNCH_PARAM_END}; + + CUresult Res = cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1, + /*gridDimZ=*/1, NumThreads, + /*blockDimY=*/1, /*blockDimZ=*/1, + MaxDynCGroupMem, Stream, nullptr, Config); return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName()); } diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index aa59ea618e399..fe296b77c7d55 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -90,7 +90,8 @@ struct GenELF64KernelTy : public GenericKernelTy { /// Launch the kernel using the libffi. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, - uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, + uint64_t NumBlocks, KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override { // Create a vector of ffi_types, one per argument. SmallVector ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer); @@ -105,7 +106,7 @@ struct GenELF64KernelTy : public GenericKernelTy { // Call the kernel function through libffi. long Return; - ffi_call(&Cif, Func, &Return, (void **)Args); + ffi_call(&Cif, Func, &Return, (void **)LaunchParams.Ptrs); return Plugin::success(); }