1515#include " CGCXXABI.h"
1616#include " CodeGenFunction.h"
1717#include " CodeGenModule.h"
18+ #include " clang/AST/CharUnits.h"
1819#include " clang/AST/Decl.h"
1920#include " clang/Basic/Cuda.h"
2021#include " clang/CodeGen/CodeGenABITypes.h"
2122#include " clang/CodeGen/ConstantInitBuilder.h"
23+ #include " llvm/ADT/StringRef.h"
2224#include " llvm/Frontend/Offloading/Utility.h"
2325#include " llvm/IR/BasicBlock.h"
2426#include " llvm/IR/Constants.h"
@@ -36,6 +38,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
3638
3739class CGNVCUDARuntime : public CGCUDARuntime {
3840
41+ // / The prefix used for function calls and section names (CUDA, HIP, LLVM)
42+ StringRef Prefix;
43+ // / TODO: We should transition the OpenMP section to LLVM/Offload
44+ StringRef SectionPrefix;
45+
3946private:
4047 llvm::IntegerType *IntTy, *SizeTy;
4148 llvm::Type *VoidTy;
@@ -132,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime {
132139 return DummyFunc;
133140 }
134141
142+ Address prepareKernelArgs (CodeGenFunction &CGF, FunctionArgList &Args);
143+ Address prepareKernelArgsLLVMOffload (CodeGenFunction &CGF,
144+ FunctionArgList &Args);
135145 void emitDeviceStubBodyLegacy (CodeGenFunction &CGF, FunctionArgList &Args);
136146 void emitDeviceStubBodyNew (CodeGenFunction &CGF, FunctionArgList &Args);
137147 std::string getDeviceSideName (const NamedDecl *ND) override ;
@@ -191,15 +201,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
191201} // end anonymous namespace
192202
193203std::string CGNVCUDARuntime::addPrefixToName (StringRef FuncName) const {
194- if (CGM.getLangOpts ().HIP )
195- return ((Twine (" hip" ) + Twine (FuncName)).str ());
196- return ((Twine (" cuda" ) + Twine (FuncName)).str ());
204+ return (Prefix + FuncName).str ();
197205}
198206std::string
199207CGNVCUDARuntime::addUnderscoredPrefixToName (StringRef FuncName) const {
200- if (CGM.getLangOpts ().HIP )
201- return ((Twine (" __hip" ) + Twine (FuncName)).str ());
202- return ((Twine (" __cuda" ) + Twine (FuncName)).str ());
208+ return (" __" + Prefix + FuncName).str ();
203209}
204210
205211static std::unique_ptr<MangleContext> InitDeviceMC (CodeGenModule &CGM) {
@@ -227,6 +233,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
227233 SizeTy = CGM.SizeTy ;
228234 VoidTy = CGM.VoidTy ;
229235 PtrTy = CGM.UnqualPtrTy ;
236+
237+ if (CGM.getLangOpts ().OffloadViaLLVM ) {
238+ Prefix = " llvm" ;
239+ SectionPrefix = " omp" ;
240+ } else if (CGM.getLangOpts ().HIP )
241+ SectionPrefix = Prefix = " hip" ;
242+ else
243+ SectionPrefix = Prefix = " cuda" ;
230244}
231245
232246llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn () const {
@@ -305,18 +319,58 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
305319 }
306320 if (CudaFeatureEnabled (CGM.getTarget ().getSDKVersion (),
307321 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
308- (CGF.getLangOpts ().HIP && CGF.getLangOpts ().HIPUseNewLaunchAPI ))
322+ (CGF.getLangOpts ().HIP && CGF.getLangOpts ().HIPUseNewLaunchAPI ) ||
323+ (CGF.getLangOpts ().OffloadViaLLVM ))
309324 emitDeviceStubBodyNew (CGF, Args);
310325 else
311326 emitDeviceStubBodyLegacy (CGF, Args);
312327}
313328
314- // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
315- // array and kernels are launched using cudaLaunchKernel().
316- void CGNVCUDARuntime::emitDeviceStubBodyNew (CodeGenFunction &CGF,
317- FunctionArgList &Args) {
318- // Build the shadow stack entry at the very start of the function.
329+ // / CUDA passes the arguments with a level of indirection. For example, a
330+ // / (void*, short, void*) is passed as {void **, short *, void **} to the launch
331+ // / function. For the LLVM/offload launch we flatten the arguments into the
332+ // / struct directly. In addition, we include the size of the arguments, thus
333+ // / pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
334+ // / nullptr}. The last nullptr needs to be initialized to an array of pointers
335+ // / pointing to the arguments if we want to offload to the host.
336+ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload (CodeGenFunction &CGF,
337+ FunctionArgList &Args) {
338+ SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
339+ for (auto &Arg : Args)
340+ ArgTypes.push_back (CGF.ConvertTypeForMem (Arg->getType ()));
341+ llvm::StructType *KernelArgsTy = llvm::StructType::create (ArgTypes);
342+
343+ auto *Int64Ty = CGF.Builder .getInt64Ty ();
344+ KernelLaunchParamsTypes.push_back (Int64Ty);
345+ KernelLaunchParamsTypes.push_back (PtrTy);
346+ KernelLaunchParamsTypes.push_back (PtrTy);
347+
348+ llvm::StructType *KernelLaunchParamsTy =
349+ llvm::StructType::create (KernelLaunchParamsTypes);
350+ Address KernelArgs = CGF.CreateTempAllocaWithoutCast (
351+ KernelArgsTy, CharUnits::fromQuantity (16 ), " kernel_args" );
352+ Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast (
353+ KernelLaunchParamsTy, CharUnits::fromQuantity (16 ),
354+ " kernel_launch_params" );
355+
356+ auto KernelArgsSize = CGM.getDataLayout ().getTypeAllocSize (KernelArgsTy);
357+ CGF.Builder .CreateStore (llvm::ConstantInt::get (Int64Ty, KernelArgsSize),
358+ CGF.Builder .CreateStructGEP (KernelLaunchParams, 0 ));
359+ CGF.Builder .CreateStore (KernelArgs.emitRawPointer (CGF),
360+ CGF.Builder .CreateStructGEP (KernelLaunchParams, 1 ));
361+ CGF.Builder .CreateStore (llvm::Constant::getNullValue (PtrTy),
362+ CGF.Builder .CreateStructGEP (KernelLaunchParams, 2 ));
363+
364+ for (unsigned i = 0 ; i < Args.size (); ++i) {
365+ auto *ArgVal = CGF.Builder .CreateLoad (CGF.GetAddrOfLocalVar (Args[i]));
366+ CGF.Builder .CreateStore (ArgVal, CGF.Builder .CreateStructGEP (KernelArgs, i));
367+ }
319368
369+ return KernelLaunchParams;
370+ }
371+
372+ Address CGNVCUDARuntime::prepareKernelArgs (CodeGenFunction &CGF,
373+ FunctionArgList &Args) {
320374 // Calculate amount of space we will need for all arguments. If we have no
321375 // args, allocate a single pointer so we still have a valid pointer to the
322376 // argument array that we can pass to runtime, even if it will be unused.
@@ -331,6 +385,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
331385 VoidVarPtr, CGF.Builder .CreateConstGEP1_32 (
332386 PtrTy, KernelArgs.emitRawPointer (CGF), i));
333387 }
388+ return KernelArgs;
389+ }
390+
391+ // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
392+ // array and kernels are launched using cudaLaunchKernel().
393+ void CGNVCUDARuntime::emitDeviceStubBodyNew (CodeGenFunction &CGF,
394+ FunctionArgList &Args) {
395+ // Build the shadow stack entry at the very start of the function.
396+ Address KernelArgs = CGF.getLangOpts ().OffloadViaLLVM
397+ ? prepareKernelArgsLLVMOffload (CGF, Args)
398+ : prepareKernelArgs (CGF, Args);
334399
335400 llvm::BasicBlock *EndBlock = CGF.createBasicBlock (" setup.end" );
336401
@@ -1129,8 +1194,9 @@ void CGNVCUDARuntime::transformManagedVars() {
11291194// registered. The linker will provide a pointer to this section so we can
11301195// register the symbols with the linked device image.
11311196void CGNVCUDARuntime::createOffloadingEntries () {
1132- StringRef Section = CGM.getLangOpts ().HIP ? " hip_offloading_entries"
1133- : " cuda_offloading_entries" ;
1197+ SmallVector<char , 32 > Out;
1198+ StringRef Section = (SectionPrefix + " _offloading_entries" ).toStringRef (Out);
1199+
11341200 llvm::Module &M = CGM.getModule ();
11351201 for (KernelInfo &I : EmittedKernels)
11361202 llvm::offloading::emitOffloadingEntry (
@@ -1199,7 +1265,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
11991265 }
12001266 return nullptr ;
12011267 }
1202- if (CGM.getLangOpts ().OffloadingNewDriver && RelocatableDeviceCode)
1268+ if (CGM.getLangOpts ().OffloadViaLLVM ||
1269+ (CGM.getLangOpts ().OffloadingNewDriver && RelocatableDeviceCode))
12031270 createOffloadingEntries ();
12041271 else
12051272 return makeModuleCtorFunction ();
0 commit comments