Skip to content

Commit 3d5c61a

Browse files
committed
[Offload][CUDA] Allow CUDA kernels to use LLVM/Offload
Through the new `-foffload-via-llvm` flag, CUDA kernels can now be lowered to the LLVM/Offload API. On the Clang side, this is simply done by using the OpenMP offload toolchain and emitting calls to `llvm*` functions to orchestrate the kernel launch rather than `cuda*` functions. These `llvm*` functions are implemented on top of the existing LLVM/Offload API. As we are about to redefine the Offload API, this will help us in the design process as a second offload language. We do not support any CUDA APIs yet, however, we could: https://www.osti.gov/servlets/purl/1892137 For proper host execution we need to resurrect/rebase https://tianshilei.me/wp-content/uploads/2021/12/llpp-2021.pdf (which was designed for debugging). ``` ❯❯❯ cat test.cu extern "C" { void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); } __global__ void square(int *A) { *A = 42; } int main(int argc, char **argv) { int DevNo = 0; int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo)); *Ptr = 7; printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr); square<<<1, 1>>>(Ptr); printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr); llvm_omp_target_free_shared(Ptr, DevNo); } ❯❯❯ clang++ test.cu -O3 -o test123 -foffload-via-llvm --offload-arch=native ❯❯❯ llvm-objdump --offloading test123 test123: file format elf64-x86-64 OFFLOADING IMAGE [0]: kind elf arch gfx90a triple amdgcn-amd-amdhsa producer openmp ❯❯❯ LIBOMPTARGET_INFO=16 ./test123 Ptr 0x155448ac8000, *Ptr 7 Ptr 0x155448ac8000, *Ptr 42 ```
1 parent 36618e6 commit 3d5c61a

File tree

23 files changed

+356
-33
lines changed

23 files changed

+356
-33
lines changed

clang/include/clang/Basic/LangOptions.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kern
288288
LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
289289
LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
290290
LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.")
291+
LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.")
291292

292293
LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device")
293294
LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation")

clang/include/clang/Driver/Options.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,6 +1275,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">;
12751275
def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">,
12761276
Flags<[HelpHidden]>,
12771277
HelpText<"Compression level for offload device binaries (HIP only)">;
1278+
1279+
defm offload_via_llvm : BoolFOption<"offload-via-llvm",
1280+
LangOpts<"OffloadViaLLVM">, DefaultFalse,
1281+
PosFlag<SetTrue, [], [ClangOption, CC1Option], "Use">,
1282+
NegFlag<SetFalse, [], [ClangOption], "Don't use">,
1283+
BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>;
12781284
}
12791285

12801286
// CUDA options

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "clang/Basic/Cuda.h"
2020
#include "clang/CodeGen/CodeGenABITypes.h"
2121
#include "clang/CodeGen/ConstantInitBuilder.h"
22+
#include "llvm/ADT/StringRef.h"
2223
#include "llvm/Frontend/Offloading/Utility.h"
2324
#include "llvm/IR/BasicBlock.h"
2425
#include "llvm/IR/Constants.h"
@@ -36,6 +37,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
3637

3738
class CGNVCUDARuntime : public CGCUDARuntime {
3839

40+
/// The prefix used for function calls and section names (CUDA, HIP, LLVM)
41+
StringRef Prefix;
42+
/// TODO: We should transition the OpenMP section to LLVM/Offload
43+
StringRef SectionPrefix;
44+
3945
private:
4046
llvm::IntegerType *IntTy, *SizeTy;
4147
llvm::Type *VoidTy;
@@ -191,15 +197,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
191197
} // end anonymous namespace
192198

193199
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
194-
if (CGM.getLangOpts().HIP)
195-
return ((Twine("hip") + Twine(FuncName)).str());
196-
return ((Twine("cuda") + Twine(FuncName)).str());
200+
return (Prefix + FuncName).str();
197201
}
198202
std::string
199203
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
200-
if (CGM.getLangOpts().HIP)
201-
return ((Twine("__hip") + Twine(FuncName)).str());
202-
return ((Twine("__cuda") + Twine(FuncName)).str());
204+
return ("__" + Prefix + FuncName).str();
203205
}
204206

205207
static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
@@ -227,6 +229,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
227229
SizeTy = CGM.SizeTy;
228230
VoidTy = CGM.VoidTy;
229231
PtrTy = CGM.UnqualPtrTy;
232+
233+
if (CGM.getLangOpts().OffloadViaLLVM) {
234+
Prefix = "llvm";
235+
SectionPrefix = "omp";
236+
} else if (CGM.getLangOpts().HIP)
237+
SectionPrefix = Prefix = "hip";
238+
else
239+
SectionPrefix = Prefix = "cuda";
230240
}
231241

232242
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -305,7 +315,8 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
305315
}
306316
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
307317
CudaFeature::CUDA_USES_NEW_LAUNCH) ||
308-
(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
318+
(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) ||
319+
(CGF.getLangOpts().OffloadViaLLVM))
309320
emitDeviceStubBodyNew(CGF, Args);
310321
else
311322
emitDeviceStubBodyLegacy(CGF, Args);
@@ -1129,8 +1140,9 @@ void CGNVCUDARuntime::transformManagedVars() {
11291140
// registered. The linker will provide a pointer to this section so we can
11301141
// register the symbols with the linked device image.
11311142
void CGNVCUDARuntime::createOffloadingEntries() {
1132-
StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
1133-
: "cuda_offloading_entries";
1143+
SmallVector<char, 32> Out;
1144+
StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out);
1145+
11341146
llvm::Module &M = CGM.getModule();
11351147
for (KernelInfo &I : EmittedKernels)
11361148
llvm::offloading::emitOffloadingEntry(
@@ -1199,7 +1211,9 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
11991211
}
12001212
return nullptr;
12011213
}
1202-
if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
1214+
if (CGM.getLangOpts().OffloadViaLLVM)
1215+
createOffloadingEntries();
1216+
else if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
12031217
createOffloadingEntries();
12041218
else
12051219
return makeModuleCtorFunction();

clang/lib/Driver/Driver.cpp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -792,11 +792,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
792792
}) ||
793793
C.getInputArgs().hasArg(options::OPT_hip_link) ||
794794
C.getInputArgs().hasArg(options::OPT_hipstdpar);
795+
bool UseLLVMOffload = C.getInputArgs().hasArg(
796+
options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false);
795797
if (IsCuda && IsHIP) {
796798
Diag(clang::diag::err_drv_mix_cuda_hip);
797799
return;
798800
}
799-
if (IsCuda) {
801+
if (IsCuda && !UseLLVMOffload) {
800802
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
801803
const llvm::Triple &HostTriple = HostTC->getTriple();
802804
auto OFK = Action::OFK_Cuda;
@@ -818,7 +820,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
818820
CudaInstallation.WarnIfUnsupportedVersion();
819821
}
820822
C.addOffloadDeviceToolChain(CudaTC.get(), OFK);
821-
} else if (IsHIP) {
823+
} else if (IsHIP && !UseLLVMOffload) {
822824
if (auto *OMPTargetArg =
823825
C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) {
824826
Diag(clang::diag::err_drv_unsupported_opt_for_language_mode)
@@ -842,10 +844,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
842844
// We need to generate an OpenMP toolchain if the user specified targets with
843845
// the -fopenmp-targets option or used --offload-arch with OpenMP enabled.
844846
bool IsOpenMPOffloading =
845-
C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
846-
options::OPT_fno_openmp, false) &&
847-
(C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
848-
C.getInputArgs().hasArg(options::OPT_offload_arch_EQ));
847+
((IsCuda || IsHIP) && UseLLVMOffload) ||
848+
(C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
849+
options::OPT_fno_openmp, false) &&
850+
(C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
851+
C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)));
849852
if (IsOpenMPOffloading) {
850853
// We expect that -fopenmp-targets is always used in conjunction with the
851854
// option -fopenmp specifying a valid runtime with offloading support, i.e.
@@ -873,7 +876,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
873876
for (StringRef T : OpenMPTargets->getValues())
874877
OpenMPTriples.insert(T);
875878
} else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) &&
876-
!IsHIP && !IsCuda) {
879+
((!IsHIP && !IsCuda) || UseLLVMOffload)) {
877880
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
878881
auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs());
879882
auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(),
@@ -4138,6 +4141,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
41384141

41394142
bool UseNewOffloadingDriver =
41404143
C.isOffloadingHostKind(Action::OFK_OpenMP) ||
4144+
Args.hasFlag(options::OPT_foffload_via_llvm,
4145+
options::OPT_fno_offload_via_llvm, false) ||
41414146
Args.hasFlag(options::OPT_offload_new_driver,
41424147
options::OPT_no_offload_new_driver, false);
41434148

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1125,6 +1125,18 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
11251125
CmdArgs.push_back("__clang_openmp_device_functions.h");
11261126
}
11271127

1128+
if (Args.hasArg(options::OPT_foffload_via_llvm)) {
1129+
CmdArgs.push_back("-include");
1130+
SmallString<128> P(D.ResourceDir);
1131+
llvm::sys::path::append(P, "include");
1132+
llvm::sys::path::append(P, "openmp_wrappers");
1133+
if (JA.isDeviceOffloading(Action::OFK_OpenMP))
1134+
llvm::sys::path::append(P, "__llvm_offload_device.h");
1135+
else
1136+
llvm::sys::path::append(P, "__llvm_offload_host.h");
1137+
CmdArgs.push_back(Args.MakeArgString(P));
1138+
}
1139+
11281140
// Add -i* options, and automatically translate to
11291141
// -include-pch/-include-pth for transparent PCH support. It's
11301142
// wonky, but we include looking for .gch so we can support seamless
@@ -6672,11 +6684,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
66726684
Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions,
66736685
options::OPT_fno_openmp_extensions);
66746686
}
6675-
6676-
// Forward the new driver to change offloading code generation.
6677-
if (Args.hasFlag(options::OPT_offload_new_driver,
6678-
options::OPT_no_offload_new_driver, false))
6687+
// Forward the offload runtime change to code generation, liboffload implies
6688+
// new driver. Otherwise, check if we should forward the new driver to change
6689+
// offloading code generation.
6690+
if (Args.hasFlag(options::OPT_foffload_via_llvm,
6691+
options::OPT_fno_offload_via_llvm, false)) {
6692+
CmdArgs.push_back("--offload-new-driver");
6693+
CmdArgs.push_back("-foffload-via-llvm");
6694+
} else if (Args.hasFlag(options::OPT_offload_new_driver,
6695+
options::OPT_no_offload_new_driver, false)) {
66796696
CmdArgs.push_back("--offload-new-driver");
6697+
}
66806698

66816699
SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType);
66826700

clang/lib/Driver/ToolChains/CommonArgs.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1144,8 +1144,13 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
11441144
bool ForceStaticHostRuntime, bool IsOffloadingHost,
11451145
bool GompNeedsRT) {
11461146
if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
1147-
options::OPT_fno_openmp, false))
1147+
options::OPT_fno_openmp, false)) {
1148+
// We need libomptarget (liboffload) if it's the chosen offloading runtime.
1149+
if (Args.hasFlag(options::OPT_foffload_via_llvm,
1150+
options::OPT_fno_offload_via_llvm, false))
1151+
CmdArgs.push_back("-lomptarget");
11481152
return false;
1153+
}
11491154

11501155
Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args);
11511156

clang/lib/Headers/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,9 @@ set(openmp_wrapper_files
323323
openmp_wrappers/__clang_openmp_device_functions.h
324324
openmp_wrappers/complex_cmath.h
325325
openmp_wrappers/new
326+
openmp_wrappers/__llvm_offload.h
327+
openmp_wrappers/__llvm_offload_host.h
328+
openmp_wrappers/__llvm_offload_device.h
326329
)
327330

328331
set(llvm_libc_wrapper_files

clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,11 @@
1010
#ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
1111
#define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
1212

13-
#ifndef _OPENMP
14-
#error "This file is for OpenMP compilation only."
15-
#endif
16-
1713
#ifdef __cplusplus
1814
extern "C" {
1915
#endif
2016

17+
#ifdef __NVPTX__
2118
#pragma omp begin declare variant match( \
2219
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
2320

@@ -34,6 +31,7 @@ extern "C" {
3431
#undef __CUDA__
3532

3633
#pragma omp end declare variant
34+
#endif
3735

3836
#ifdef __AMDGCN__
3937
#pragma omp begin declare variant match(device = {arch(amdgcn)})
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
2+
*
3+
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
* See https://llvm.org/LICENSE.txt for license information.
5+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
*
7+
*===-----------------------------------------------------------------------===
8+
*/
9+
10+
#include <stdlib.h>
11+
12+
#define __host__ __attribute__((host))
13+
#define __device__ __attribute__((device))
14+
#define __global__ __attribute__((global))
15+
#define __shared__ __attribute__((shared))
16+
#define __constant__ __attribute__((constant))
17+
#define __managed__ __attribute__((managed))
18+
19+
extern "C" {
20+
21+
typedef struct dim3 {
22+
dim3() {}
23+
dim3(unsigned x) : x(x) {}
24+
unsigned x = 0, y = 0, z = 0;
25+
} dim3;
26+
27+
// TODO: For some reason the CUDA device compilation requires this declaration
28+
// to be present but it should not.
29+
unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
30+
size_t sharedMem = 0, void *stream = 0);
31+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
2+
*
3+
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
* See https://llvm.org/LICENSE.txt for license information.
5+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
*
7+
*===-----------------------------------------------------------------------===
8+
*/
9+
10+
#include "__llvm_offload.h"
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
2+
*
3+
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
* See https://llvm.org/LICENSE.txt for license information.
5+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
*
7+
*===-----------------------------------------------------------------------===
8+
*/
9+
10+
#include "__llvm_offload.h"
11+
12+
extern "C" {
13+
unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
14+
void **args, size_t sharedMem = 0, void *stream = 0);
15+
}

clang/lib/Sema/SemaCUDA.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1059,6 +1059,9 @@ void SemaCUDA::inheritTargetAttrs(FunctionDecl *FD,
10591059
}
10601060

10611061
std::string SemaCUDA::getConfigureFuncName() const {
1062+
if (getLangOpts().OffloadViaLLVM)
1063+
return "__llvmPushCallConfiguration";
1064+
10621065
if (getLangOpts().HIP)
10631066
return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration"
10641067
: "hipConfigureCall";
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \
2+
// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
3+
// RUN: | FileCheck -check-prefix BINDINGS %s
4+
5+
// BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]"
6+
// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_35:.+]]"
7+
// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_35]]"], output: "[[CUBIN_SM_35:.+]]"
8+
// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_70:.+]]"
9+
// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_70:.+]]"], output: "[[CUBIN_SM_70:.+]]"
10+
// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[CUBIN_SM_35]]", "[[CUBIN_SM_70]]"], output: "[[BINARY:.+]]"
11+
// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[BINARY]]"], output: "[[HOST_OBJ:.+]]"
12+
// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
13+
14+
// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \
15+
// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
16+
// RUN: | FileCheck -check-prefix BINDINGS-DEVICE %s
17+
18+
// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX:.+]]"
19+
// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX]]"], output: "[[CUBIN:.+]]"
20+
21+
// RUN: %clang -### -target x86_64-linux-gnu -ccc-print-bindings --offload-link -foffload-via-llvm %s 2>&1 | FileCheck -check-prefix DEVICE-LINK %s
22+
23+
// DEVICE-LINK: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[INPUT:.+]]"], output: "a.out"

offload/include/Shared/APITypes.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,9 @@ struct KernelArgsTy {
102102
0; // Tripcount for the teams / distribute loop, 0 otherwise.
103103
struct {
104104
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
105-
uint64_t Unused : 63;
106-
} Flags = {0, 0};
105+
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
106+
uint64_t Unused : 62;
107+
} Flags = {0, 0, 0};
107108
uint32_t NumTeams[3] = {0, 0,
108109
0}; // The number of teams (for x,y,z dimension).
109110
uint32_t ThreadLimit[3] = {0, 0,

offload/include/omptarget.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t {
107107

108108
inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,
109109
nullptr, nullptr, nullptr, nullptr,
110-
0, {0,0}, {1, 0, 0}, {1, 0, 0}, 0};
110+
0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
111111

112112
struct DeviceTy;
113113

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3268,6 +3268,11 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
32683268
uint32_t NumThreads, uint64_t NumBlocks,
32693269
KernelArgsTy &KernelArgs, void *Args,
32703270
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
3271+
if (KernelArgs.Flags.IsCUDA) {
3272+
// For CUDA kernels we compute the number of arguments here.
3273+
KernelArgs.NumArgs = (ArgsSize - ImplicitArgsSize) / sizeof(void *);
3274+
}
3275+
32713276
const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);
32723277

32733278
if (ArgsSize < KernelArgsSize)
@@ -3310,9 +3315,14 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
33103315
// Copy the explicit arguments.
33113316
// TODO: We should expose the args memory manager alloc to the common part as
33123317
// alternative to copying them twice.
3313-
if (KernelArgs.NumArgs)
3318+
if (KernelArgs.NumArgs && !KernelArgs.Flags.IsCUDA) {
33143319
std::memcpy(AllArgs, *static_cast<void **>(Args),
33153320
sizeof(void *) * KernelArgs.NumArgs);
3321+
} else {
3322+
for (uint32_t I = 0; I < KernelArgs.NumArgs; ++I)
3323+
std::memcpy(advanceVoidPtr(AllArgs, sizeof(void *) * I),
3324+
static_cast<void **>(Args)[I], sizeof(void *));
3325+
}
33163326

33173327
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
33183328

0 commit comments

Comments
 (0)