15
15
#include " CGCXXABI.h"
16
16
#include " CodeGenFunction.h"
17
17
#include " CodeGenModule.h"
18
+ #include " clang/AST/CharUnits.h"
18
19
#include " clang/AST/Decl.h"
19
20
#include " clang/Basic/Cuda.h"
20
21
#include " clang/CodeGen/CodeGenABITypes.h"
21
22
#include " clang/CodeGen/ConstantInitBuilder.h"
23
+ #include " llvm/ADT/StringRef.h"
22
24
#include " llvm/Frontend/Offloading/Utility.h"
23
25
#include " llvm/IR/BasicBlock.h"
24
26
#include " llvm/IR/Constants.h"
@@ -36,6 +38,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
36
38
37
39
class CGNVCUDARuntime : public CGCUDARuntime {
38
40
41
+ // / The prefix used for function calls and section names (CUDA, HIP, LLVM)
42
+ StringRef Prefix;
43
+ // / TODO: We should transition the OpenMP section to LLVM/Offload
44
+ StringRef SectionPrefix;
45
+
39
46
private:
40
47
llvm::IntegerType *IntTy, *SizeTy;
41
48
llvm::Type *VoidTy;
@@ -132,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime {
132
139
return DummyFunc;
133
140
}
134
141
142
+ Address prepareKernelArgs (CodeGenFunction &CGF, FunctionArgList &Args);
143
+ Address prepareKernelArgsLLVMOffload (CodeGenFunction &CGF,
144
+ FunctionArgList &Args);
135
145
void emitDeviceStubBodyLegacy (CodeGenFunction &CGF, FunctionArgList &Args);
136
146
void emitDeviceStubBodyNew (CodeGenFunction &CGF, FunctionArgList &Args);
137
147
std::string getDeviceSideName (const NamedDecl *ND) override ;
@@ -191,15 +201,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
191
201
} // end anonymous namespace
192
202
193
203
std::string CGNVCUDARuntime::addPrefixToName (StringRef FuncName) const {
194
- if (CGM.getLangOpts ().HIP )
195
- return ((Twine (" hip" ) + Twine (FuncName)).str ());
196
- return ((Twine (" cuda" ) + Twine (FuncName)).str ());
204
+ return (Prefix + FuncName).str ();
197
205
}
198
206
std::string
199
207
CGNVCUDARuntime::addUnderscoredPrefixToName (StringRef FuncName) const {
200
- if (CGM.getLangOpts ().HIP )
201
- return ((Twine (" __hip" ) + Twine (FuncName)).str ());
202
- return ((Twine (" __cuda" ) + Twine (FuncName)).str ());
208
+ return (" __" + Prefix + FuncName).str ();
203
209
}
204
210
205
211
static std::unique_ptr<MangleContext> InitDeviceMC (CodeGenModule &CGM) {
@@ -227,6 +233,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
227
233
SizeTy = CGM.SizeTy ;
228
234
VoidTy = CGM.VoidTy ;
229
235
PtrTy = CGM.UnqualPtrTy ;
236
+
237
+ if (CGM.getLangOpts ().OffloadViaLLVM ) {
238
+ Prefix = " llvm" ;
239
+ SectionPrefix = " omp" ;
240
+ } else if (CGM.getLangOpts ().HIP )
241
+ SectionPrefix = Prefix = " hip" ;
242
+ else
243
+ SectionPrefix = Prefix = " cuda" ;
230
244
}
231
245
232
246
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn () const {
@@ -305,18 +319,58 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
305
319
}
306
320
if (CudaFeatureEnabled (CGM.getTarget ().getSDKVersion (),
307
321
CudaFeature::CUDA_USES_NEW_LAUNCH) ||
308
- (CGF.getLangOpts ().HIP && CGF.getLangOpts ().HIPUseNewLaunchAPI ))
322
+ (CGF.getLangOpts ().HIP && CGF.getLangOpts ().HIPUseNewLaunchAPI ) ||
323
+ (CGF.getLangOpts ().OffloadViaLLVM ))
309
324
emitDeviceStubBodyNew (CGF, Args);
310
325
else
311
326
emitDeviceStubBodyLegacy (CGF, Args);
312
327
}
313
328
314
- // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
315
- // array and kernels are launched using cudaLaunchKernel().
316
- void CGNVCUDARuntime::emitDeviceStubBodyNew (CodeGenFunction &CGF,
317
- FunctionArgList &Args) {
318
- // Build the shadow stack entry at the very start of the function.
329
+ // / CUDA passes the arguments with a level of indirection. For example, a
330
+ // / (void*, short, void*) is passed as {void **, short *, void **} to the launch
331
+ // / function. For the LLVM/offload launch we flatten the arguments into the
332
+ // / struct directly. In addition, we include the size of the arguments, thus
333
+ // / pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
334
+ // / nullptr}. The last nullptr needs to be initialized to an array of pointers
335
+ // / pointing to the arguments if we want to offload to the host.
336
+ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload (CodeGenFunction &CGF,
337
+ FunctionArgList &Args) {
338
+ SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
339
+ for (auto &Arg : Args)
340
+ ArgTypes.push_back (CGF.ConvertTypeForMem (Arg->getType ()));
341
+ llvm::StructType *KernelArgsTy = llvm::StructType::create (ArgTypes);
342
+
343
+ auto *Int64Ty = CGF.Builder .getInt64Ty ();
344
+ KernelLaunchParamsTypes.push_back (Int64Ty);
345
+ KernelLaunchParamsTypes.push_back (PtrTy);
346
+ KernelLaunchParamsTypes.push_back (PtrTy);
347
+
348
+ llvm::StructType *KernelLaunchParamsTy =
349
+ llvm::StructType::create (KernelLaunchParamsTypes);
350
+ Address KernelArgs = CGF.CreateTempAllocaWithoutCast (
351
+ KernelArgsTy, CharUnits::fromQuantity (16 ), " kernel_args" );
352
+ Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast (
353
+ KernelLaunchParamsTy, CharUnits::fromQuantity (16 ),
354
+ " kernel_launch_params" );
355
+
356
+ auto KernelArgsSize = CGM.getDataLayout ().getTypeAllocSize (KernelArgsTy);
357
+ CGF.Builder .CreateStore (llvm::ConstantInt::get (Int64Ty, KernelArgsSize),
358
+ CGF.Builder .CreateStructGEP (KernelLaunchParams, 0 ));
359
+ CGF.Builder .CreateStore (KernelArgs.emitRawPointer (CGF),
360
+ CGF.Builder .CreateStructGEP (KernelLaunchParams, 1 ));
361
+ CGF.Builder .CreateStore (llvm::Constant::getNullValue (PtrTy),
362
+ CGF.Builder .CreateStructGEP (KernelLaunchParams, 2 ));
363
+
364
+ for (unsigned i = 0 ; i < Args.size (); ++i) {
365
+ auto *ArgVal = CGF.Builder .CreateLoad (CGF.GetAddrOfLocalVar (Args[i]));
366
+ CGF.Builder .CreateStore (ArgVal, CGF.Builder .CreateStructGEP (KernelArgs, i));
367
+ }
319
368
369
+ return KernelLaunchParams;
370
+ }
371
+
372
+ Address CGNVCUDARuntime::prepareKernelArgs (CodeGenFunction &CGF,
373
+ FunctionArgList &Args) {
320
374
// Calculate amount of space we will need for all arguments. If we have no
321
375
// args, allocate a single pointer so we still have a valid pointer to the
322
376
// argument array that we can pass to runtime, even if it will be unused.
@@ -331,6 +385,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
331
385
VoidVarPtr, CGF.Builder .CreateConstGEP1_32 (
332
386
PtrTy, KernelArgs.emitRawPointer (CGF), i));
333
387
}
388
+ return KernelArgs;
389
+ }
390
+
391
+ // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
392
+ // array and kernels are launched using cudaLaunchKernel().
393
+ void CGNVCUDARuntime::emitDeviceStubBodyNew (CodeGenFunction &CGF,
394
+ FunctionArgList &Args) {
395
+ // Build the shadow stack entry at the very start of the function.
396
+ Address KernelArgs = CGF.getLangOpts ().OffloadViaLLVM
397
+ ? prepareKernelArgsLLVMOffload (CGF, Args)
398
+ : prepareKernelArgs (CGF, Args);
334
399
335
400
llvm::BasicBlock *EndBlock = CGF.createBasicBlock (" setup.end" );
336
401
@@ -1129,8 +1194,9 @@ void CGNVCUDARuntime::transformManagedVars() {
1129
1194
// registered. The linker will provide a pointer to this section so we can
1130
1195
// register the symbols with the linked device image.
1131
1196
void CGNVCUDARuntime::createOffloadingEntries () {
1132
- StringRef Section = CGM.getLangOpts ().HIP ? " hip_offloading_entries"
1133
- : " cuda_offloading_entries" ;
1197
+ SmallVector<char , 32 > Out;
1198
+ StringRef Section = (SectionPrefix + " _offloading_entries" ).toStringRef (Out);
1199
+
1134
1200
llvm::Module &M = CGM.getModule ();
1135
1201
for (KernelInfo &I : EmittedKernels)
1136
1202
llvm::offloading::emitOffloadingEntry (
@@ -1199,7 +1265,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
1199
1265
}
1200
1266
return nullptr ;
1201
1267
}
1202
- if (CGM.getLangOpts ().OffloadingNewDriver && RelocatableDeviceCode)
1268
+ if (CGM.getLangOpts ().OffloadViaLLVM ||
1269
+ (CGM.getLangOpts ().OffloadingNewDriver && RelocatableDeviceCode))
1203
1270
createOffloadingEntries ();
1204
1271
else
1205
1272
return makeModuleCtorFunction ();
0 commit comments