Skip to content

Commit 038c7d5

Browse files
committed
FIXES
1 parent 3d5c61a commit 038c7d5

File tree

11 files changed

+78
-34
lines changed

11 files changed

+78
-34
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "CGCXXABI.h"
1616
#include "CodeGenFunction.h"
1717
#include "CodeGenModule.h"
18+
#include "clang/AST/CharUnits.h"
1819
#include "clang/AST/Decl.h"
1920
#include "clang/Basic/Cuda.h"
2021
#include "clang/CodeGen/CodeGenABITypes.h"
@@ -138,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime {
138139
return DummyFunc;
139140
}
140141

142+
Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args);
143+
Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
144+
FunctionArgList &Args);
141145
void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
142146
void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
143147
std::string getDeviceSideName(const NamedDecl *ND) override;
@@ -322,12 +326,30 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
322326
emitDeviceStubBodyLegacy(CGF, Args);
323327
}
324328

325-
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
326-
// array and kernels are launched using cudaLaunchKernel().
327-
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
328-
FunctionArgList &Args) {
329-
// Build the shadow stack entry at the very start of the function.
329+
/// CUDA passes the arguments with a level of indirection. For example, a
330+
/// (void*, short, void*) is passed as {void **, short *, void **} to the launch
331+
/// function. For the LLVM/offload launch we flatten the arguments into the
332+
/// struct directly, thus pass {void *, short, void *}
333+
Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
334+
FunctionArgList &Args) {
335+
SmallVector<llvm::Type *> ArgTypes;
336+
for (auto &Arg : Args)
337+
ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType()));
338+
339+
llvm::StructType *ST = llvm::StructType::create(ArgTypes);
340+
Address KernelArgs = CGF.CreateTempAllocaWithoutCast(
341+
ST, CharUnits::fromQuantity(16), "kernel_args");
342+
343+
for (unsigned i = 0; i < Args.size(); ++i) {
344+
auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
345+
CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i));
346+
}
330347

348+
return KernelArgs;
349+
}
350+
351+
Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
352+
FunctionArgList &Args) {
331353
// Calculate amount of space we will need for all arguments. If we have no
332354
// args, allocate a single pointer so we still have a valid pointer to the
333355
// argument array that we can pass to runtime, even if it will be unused.
@@ -342,6 +364,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
342364
VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
343365
PtrTy, KernelArgs.emitRawPointer(CGF), i));
344366
}
367+
return KernelArgs;
368+
}
369+
370+
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
371+
// array and kernels are launched using cudaLaunchKernel().
372+
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
373+
FunctionArgList &Args) {
374+
// Build the shadow stack entry at the very start of the function.
375+
Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM
376+
? prepareKernelArgsLLVMOffload(CGF, Args)
377+
: prepareKernelArgs(CGF, Args);
345378

346379
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
347380

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,15 +1126,19 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
11261126
}
11271127

11281128
if (Args.hasArg(options::OPT_foffload_via_llvm)) {
1129-
CmdArgs.push_back("-include");
1129+
// Add llvm_wrappers/* to our system include path. This lets us wrap
1130+
// standard library headers and other headers.
11301131
SmallString<128> P(D.ResourceDir);
11311132
llvm::sys::path::append(P, "include");
1132-
llvm::sys::path::append(P, "openmp_wrappers");
1133+
llvm::sys::path::append(P, "llvm_offload_wrappers");
1134+
CmdArgs.push_back("-internal-isystem");
1135+
CmdArgs.push_back(Args.MakeArgString(P));
1136+
1137+
CmdArgs.push_back("-include");
11331138
if (JA.isDeviceOffloading(Action::OFK_OpenMP))
1134-
llvm::sys::path::append(P, "__llvm_offload_device.h");
1139+
CmdArgs.push_back("__llvm_offload_device.h");
11351140
else
1136-
llvm::sys::path::append(P, "__llvm_offload_host.h");
1137-
CmdArgs.push_back(Args.MakeArgString(P));
1141+
CmdArgs.push_back("__llvm_offload_host.h");
11381142
}
11391143

11401144
// Add -i* options, and automatically translate to

clang/lib/Headers/CMakeLists.txt

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -323,9 +323,12 @@ set(openmp_wrapper_files
323323
openmp_wrappers/__clang_openmp_device_functions.h
324324
openmp_wrappers/complex_cmath.h
325325
openmp_wrappers/new
326-
openmp_wrappers/__llvm_offload.h
327-
openmp_wrappers/__llvm_offload_host.h
328-
openmp_wrappers/__llvm_offload_device.h
326+
)
327+
328+
set(llvm_offload_wrapper_files
329+
llvm_offload_wrappers/__llvm_offload.h
330+
llvm_offload_wrappers/__llvm_offload_host.h
331+
llvm_offload_wrappers/__llvm_offload_device.h
329332
)
330333

331334
set(llvm_libc_wrapper_files
@@ -378,7 +381,7 @@ endfunction(clang_generate_header)
378381
# Copy header files from the source directory to the build directory
379382
foreach( f ${files} ${cuda_wrapper_files} ${cuda_wrapper_bits_files}
380383
${ppc_wrapper_files} ${openmp_wrapper_files} ${zos_wrapper_files} ${hlsl_files}
381-
${llvm_libc_wrapper_files})
384+
${llvm_libc_wrapper_files} ${llvm_offload_wrapper_files})
382385
copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f})
383386
endforeach( f )
384387

@@ -504,6 +507,7 @@ add_header_target("hlsl-resource-headers" ${hlsl_files})
504507
add_header_target("opencl-resource-headers" ${opencl_files})
505508
add_header_target("llvm-libc-resource-headers" ${llvm_libc_wrapper_files})
506509
add_header_target("openmp-resource-headers" ${openmp_wrapper_files})
510+
add_header_target("llvm-offload-resource-headers" ${llvm_libc_wrapper_files})
507511
add_header_target("windows-resource-headers" ${windows_only_files})
508512
add_header_target("utility-resource-headers" ${utility_files})
509513

@@ -545,6 +549,11 @@ install(
545549
DESTINATION ${header_install_dir}/openmp_wrappers
546550
COMPONENT clang-resource-headers)
547551

552+
install(
553+
FILES ${llvm_offload_wrapper_files}
554+
DESTINATION ${header_install_dir}/llvm_offload_wrappers
555+
COMPONENT clang-resource-headers)
556+
548557
install(
549558
FILES ${zos_wrapper_files}
550559
DESTINATION ${header_install_dir}/zos_wrappers
@@ -707,8 +716,8 @@ install(
707716
COMPONENT openmp-resource-headers)
708717

709718
install(
710-
FILES ${openmp_wrapper_files}
711-
DESTINATION ${header_install_dir}/openmp_wrappers
719+
FILES ${llvm_offload_wrapper_files}
720+
DESTINATION ${header_install_dir}/llvm_offload_wrappers
712721
EXCLUDE_FROM_ALL
713722
COMPONENT openmp-resource-headers)
714723

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3268,13 +3268,13 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
32683268
uint32_t NumThreads, uint64_t NumBlocks,
32693269
KernelArgsTy &KernelArgs, void *Args,
32703270
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
3271+
uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);
3272+
32713273
if (KernelArgs.Flags.IsCUDA) {
3272-
// For CUDA kernels we compute the number of arguments here.
3273-
KernelArgs.NumArgs = (ArgsSize - ImplicitArgsSize) / sizeof(void *);
3274+
// For CUDA kernels we compute the kernel argument size explicitly.
3275+
KernelArgsSize = ArgsSize - ImplicitArgsSize;
32743276
}
32753277

3276-
const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);
3277-
32783278
if (ArgsSize < KernelArgsSize)
32793279
return Plugin::error("Mismatch of kernel arguments size");
32803280

@@ -3315,14 +3315,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
33153315
// Copy the explicit arguments.
33163316
// TODO: We should expose the args memory manager alloc to the common part as
33173317
// alternative to copying them twice.
3318-
if (KernelArgs.NumArgs && !KernelArgs.Flags.IsCUDA) {
3319-
std::memcpy(AllArgs, *static_cast<void **>(Args),
3320-
sizeof(void *) * KernelArgs.NumArgs);
3321-
} else {
3322-
for (uint32_t I = 0; I < KernelArgs.NumArgs; ++I)
3323-
std::memcpy(advanceVoidPtr(AllArgs, sizeof(void *) * I),
3324-
static_cast<void **>(Args)[I], sizeof(void *));
3325-
}
3318+
if (KernelArgsSize)
3319+
std::memcpy(AllArgs, *static_cast<void **>(Args), KernelArgsSize);
33263320

33273321
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
33283322

offload/src/KernelLanguage/API.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams,
5858
KernelArgsTy *Args);
5959

6060
unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
61-
void **args, size_t sharedMem, void *stream) {
61+
void *args, size_t sharedMem, void *stream) {
6262
KernelArgsTy Args = {};
6363
Args.DynCGroupMem = sharedMem;
6464
Args.NumTeams[0] = gridDim.x;
@@ -67,7 +67,7 @@ unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
6767
Args.ThreadLimit[0] = blockDim.x;
6868
Args.ThreadLimit[1] = blockDim.y;
6969
Args.ThreadLimit[2] = blockDim.z;
70-
Args.ArgPtrs = args;
70+
Args.ArgPtrs = &args;
7171
Args.Flags.IsCUDA = true;
7272
int rv = __tgt_target_kernel(nullptr, 0, gridDim.x,
7373
blockDim.x, func, &Args);

offload/test/lit.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def evaluate_bool_env(env):
6666
config.name = 'libomptarget :: ' + config.libomptarget_current_target
6767

6868
# suffixes: A list of file extensions to treat as test files.
69-
config.suffixes = ['.c', '.cpp', '.cc', '.f90']
69+
config.suffixes = ['.c', '.cpp', '.cc', '.f90', '.cu']
7070

7171
# excludes: A list of directories to exclude from the testuites.
7272
config.excludes = ['Inputs']

offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ int main(int argc, char **argv) {
2222
int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
2323
*Ptr = 0;
2424
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
25-
// CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
25+
// CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
2626
square<<<7, 6>>>(Ptr);
2727
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
2828
// CHECK: Ptr [[Ptr]], *Ptr: 42

offload/test/offloading/CUDA/basic_launch_multi_arg.cu

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@ void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
1313
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
1414
}
1515

16-
__global__ void square(int *Dst, int *Src, short Q, short P) {
16+
__global__ void square(int *Dst, short Q, int *Src, short P) {
1717
*Dst = (Src[0] + Src[1]) * (Q + P);
18+
Src[0] = Q;
19+
Src[1] = P;
1820
}
1921

2022
int main(int argc, char **argv) {
@@ -25,9 +27,11 @@ int main(int argc, char **argv) {
2527
Src[0] = -2;
2628
Src[1] = 8;
2729
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
30+
printf("%i : %i\n", Src[0], Src[1]);
2831
// CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
29-
square<<<1, 1>>>(Ptr, Src, 3, 4);
32+
square<<<1, 1>>>(Ptr, 3, Src, 4);
3033
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
34+
printf("%i : %i\n", Src[0], Src[1]);
3135
// CHECK: Ptr [[Ptr]], *Ptr: 42
3236
llvm_omp_target_free_shared(Ptr, DevNo);
3337
}

0 commit comments

Comments
 (0)