Skip to content

Commit 00167a3

Browse files
author
anikelal
committed
[Clang][OpenCL][AMDGPU] Allow a kernel to call another kernel
This feature is currently not supported in the compiler. To facilitate this, we emit a stub version of each kernel function body with a different name-mangling scheme and replace the respective kernel call sites appropriately. Fixes #60313. D120566 was an earlier attempt to upstream a solution for this issue.
1 parent 53c0a25 commit 00167a3

15 files changed

+178
-41
lines changed

clang/include/clang/AST/GlobalDecl.h

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,19 @@ class GlobalDecl {
7171
GlobalDecl(const FunctionDecl *D, unsigned MVIndex = 0)
7272
: MultiVersionIndex(MVIndex) {
7373
if (!D->hasAttr<CUDAGlobalAttr>()) {
74+
if (D->hasAttr<OpenCLKernelAttr>()) {
75+
Value.setPointerAndInt(D, unsigned(KernelReferenceKind::Kernel));
76+
return;
77+
}
7478
Init(D);
7579
return;
7680
}
7781
Value.setPointerAndInt(D, unsigned(getDefaultKernelReference(D)));
7882
}
7983
GlobalDecl(const FunctionDecl *D, KernelReferenceKind Kind)
8084
: Value(D, unsigned(Kind)) {
81-
assert(D->hasAttr<CUDAGlobalAttr>() && "Decl is not a GPU kernel!");
85+
assert((D->hasAttr<CUDAGlobalAttr>() && "Decl is not a GPU kernel!") ||
86+
(D->hasAttr<OpenCLKernelAttr>() && "Decl is not a OpenCL kernel!"));
8287
}
8388
GlobalDecl(const NamedDecl *D) { Init(D); }
8489
GlobalDecl(const BlockDecl *D) { Init(D); }
@@ -130,13 +135,15 @@ class GlobalDecl {
130135
}
131136

132137
KernelReferenceKind getKernelReferenceKind() const {
133-
assert(((isa<FunctionDecl>(getDecl()) &&
134-
cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>()) ||
135-
(isa<FunctionTemplateDecl>(getDecl()) &&
136-
cast<FunctionTemplateDecl>(getDecl())
137-
->getTemplatedDecl()
138-
->hasAttr<CUDAGlobalAttr>())) &&
139-
"Decl is not a GPU kernel!");
138+
assert((((isa<FunctionDecl>(getDecl()) &&
139+
cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>()) ||
140+
(isa<FunctionTemplateDecl>(getDecl()) &&
141+
cast<FunctionTemplateDecl>(getDecl())
142+
->getTemplatedDecl()
143+
->hasAttr<CUDAGlobalAttr>())) &&
144+
"Decl is not a GPU kernel!") ||
145+
(isDeclOpenCLKernel() && "Decl is not a OpenCL kernel!"));
146+
140147
return static_cast<KernelReferenceKind>(Value.getInt());
141148
}
142149

@@ -196,13 +203,21 @@ class GlobalDecl {
196203
}
197204

198205
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind) {
199-
assert(isa<FunctionDecl>(getDecl()) &&
200-
cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>() &&
201-
"Decl is not a GPU kernel!");
206+
assert((isa<FunctionDecl>(getDecl()) &&
207+
cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>() &&
208+
"Decl is not a GPU kernel!") ||
209+
(isDeclOpenCLKernel() && "Decl is not a OpenCL kernel!"));
202210
GlobalDecl Result(*this);
203211
Result.Value.setInt(unsigned(Kind));
204212
return Result;
205213
}
214+
215+
bool isDeclOpenCLKernel() const {
216+
auto FD = dyn_cast<FunctionDecl>(getDecl());
217+
if (FD)
218+
return FD->hasAttr<OpenCLKernelAttr>();
219+
return FD;
220+
}
206221
};
207222

208223
} // namespace clang

clang/lib/AST/Expr.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -693,7 +693,8 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
693693
GD = GlobalDecl(CD, Ctor_Base);
694694
else if (const CXXDestructorDecl *DD = dyn_cast<CXXDestructorDecl>(ND))
695695
GD = GlobalDecl(DD, Dtor_Base);
696-
else if (ND->hasAttr<CUDAGlobalAttr>())
696+
else if (ND->hasAttr<CUDAGlobalAttr>() ||
697+
ND->hasAttr<OpenCLKernelAttr>())
697698
GD = GlobalDecl(cast<FunctionDecl>(ND));
698699
else
699700
GD = GlobalDecl(ND);

clang/lib/AST/ItaniumMangle.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,7 @@ class CXXNameMangler {
526526
void mangleSourceName(const IdentifierInfo *II);
527527
void mangleRegCallName(const IdentifierInfo *II);
528528
void mangleDeviceStubName(const IdentifierInfo *II);
529+
void mangleOCLDeviceStubName(const IdentifierInfo *II);
529530
void mangleSourceNameWithAbiTags(
530531
const NamedDecl *ND, const AbiTagList *AdditionalAbiTags = nullptr);
531532
void mangleLocalName(GlobalDecl GD,
@@ -1561,8 +1562,13 @@ void CXXNameMangler::mangleUnqualifiedName(
15611562
bool IsDeviceStub =
15621563
FD && FD->hasAttr<CUDAGlobalAttr>() &&
15631564
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
1565+
bool IsOCLDeviceStub =
1566+
FD && FD->hasAttr<OpenCLKernelAttr>() &&
1567+
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
15641568
if (IsDeviceStub)
15651569
mangleDeviceStubName(II);
1570+
else if (IsOCLDeviceStub)
1571+
mangleOCLDeviceStubName(II);
15661572
else if (IsRegCall)
15671573
mangleRegCallName(II);
15681574
else
@@ -1780,6 +1786,15 @@ void CXXNameMangler::mangleDeviceStubName(const IdentifierInfo *II) {
17801786
<< II->getName();
17811787
}
17821788

1789+
void CXXNameMangler::mangleOCLDeviceStubName(const IdentifierInfo *II) {
1790+
// <source-name> ::= <positive length number> __clang_ocl_kern_imp_
1791+
// <identifier> <number> ::= [n] <non-negative decimal integer> <identifier>
1792+
// ::= <unqualified source code identifier>
1793+
StringRef OCLDeviceStubNamePrefix = "__clang_ocl_kern_imp_";
1794+
Out << II->getLength() + OCLDeviceStubNamePrefix.size() - 1
1795+
<< OCLDeviceStubNamePrefix << II->getName();
1796+
}
1797+
17831798
void CXXNameMangler::mangleSourceName(const IdentifierInfo *II) {
17841799
// <source-name> ::= <positive length number> <identifier>
17851800
// <number> ::= [n] <non-negative decimal integer>

clang/lib/AST/Mangle.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ class ASTNameGenerator::Implementation {
539539
GD = GlobalDecl(CtorD, Ctor_Complete);
540540
else if (const auto *DtorD = dyn_cast<CXXDestructorDecl>(D))
541541
GD = GlobalDecl(DtorD, Dtor_Complete);
542-
else if (D->hasAttr<CUDAGlobalAttr>())
542+
else if (D->hasAttr<CUDAGlobalAttr>() || D->hasAttr<OpenCLKernelAttr>())
543543
GD = GlobalDecl(cast<FunctionDecl>(D));
544544
else
545545
GD = GlobalDecl(D);

clang/lib/AST/MicrosoftMangle.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1161,9 +1161,15 @@ void MicrosoftCXXNameMangler::mangleUnqualifiedName(GlobalDecl GD,
11611161
->getTemplatedDecl()
11621162
->hasAttr<CUDAGlobalAttr>())) &&
11631163
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
1164+
bool IsOCLDeviceStub =
1165+
ND && (isa<FunctionDecl>(ND) && ND->hasAttr<OpenCLKernelAttr>()) &&
1166+
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
11641167
if (IsDeviceStub)
11651168
mangleSourceName(
11661169
(llvm::Twine("__device_stub__") + II->getName()).str());
1170+
else if (IsOCLDeviceStub)
1171+
mangleSourceName(
1172+
(llvm::Twine("__clang_ocl_kern_imp_") + II->getName()).str());
11671173
else
11681174
mangleSourceName(II->getName());
11691175
break;

clang/lib/CodeGen/CGBlocks.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ CGBlockInfo::CGBlockInfo(const BlockDecl *block, StringRef name)
4848
BlockByrefHelpers::~BlockByrefHelpers() {}
4949

5050
/// Build the given block as a global block.
51-
static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM,
51+
static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, GlobalDecl GD,
5252
const CGBlockInfo &blockInfo,
5353
llvm::Constant *blockFn);
5454

@@ -1085,8 +1085,10 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
10851085
blockAddr.getPointer(), ConvertType(blockInfo.getBlockExpr()->getType()));
10861086

10871087
if (IsOpenCL) {
1088-
CGM.getOpenCLRuntime().recordBlockInfo(blockInfo.BlockExpression, InvokeFn,
1089-
result, blockInfo.StructureType);
1088+
CGM.getOpenCLRuntime().recordBlockInfo(
1089+
blockInfo.BlockExpression, InvokeFn, result, blockInfo.StructureType,
1090+
CurGD && CurGD.isDeclOpenCLKernel() &&
1091+
(CurGD.getKernelReferenceKind() == KernelReferenceKind::Kernel));
10901092
}
10911093

10921094
return result;
@@ -1285,7 +1287,7 @@ CodeGenModule::GetAddrOfGlobalBlock(const BlockExpr *BE,
12851287
return getAddrOfGlobalBlockIfEmitted(BE);
12861288
}
12871289

1288-
static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM,
1290+
static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, GlobalDecl GD,
12891291
const CGBlockInfo &blockInfo,
12901292
llvm::Constant *blockFn) {
12911293
assert(blockInfo.CanBeGlobal);
@@ -1378,7 +1380,9 @@ static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM,
13781380
CGM.getOpenCLRuntime().recordBlockInfo(
13791381
blockInfo.BlockExpression,
13801382
cast<llvm::Function>(blockFn->stripPointerCasts()), Result,
1381-
literal->getValueType());
1383+
literal->getValueType(),
1384+
GD && GD.isDeclOpenCLKernel() &&
1385+
(GD.getKernelReferenceKind() == KernelReferenceKind::Kernel));
13821386
return Result;
13831387
}
13841388

@@ -1487,7 +1491,7 @@ llvm::Function *CodeGenFunction::GenerateBlockFunction(
14871491
auto GenVoidPtrTy = getContext().getLangOpts().OpenCL
14881492
? CGM.getOpenCLRuntime().getGenericVoidPointerType()
14891493
: VoidPtrTy;
1490-
buildGlobalBlock(CGM, blockInfo,
1494+
buildGlobalBlock(CGM, CurGD, blockInfo,
14911495
llvm::ConstantExpr::getPointerCast(fn, GenVoidPtrTy));
14921496
}
14931497

clang/lib/CodeGen/CGCall.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2343,6 +2343,15 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
23432343
// Collect function IR attributes from the CC lowering.
23442344
// We'll collect the parameter and result attributes later.
23452345
CallingConv = FI.getEffectiveCallingConvention();
2346+
GlobalDecl GD = CalleeInfo.getCalleeDecl();
2347+
const Decl *TargetDecl = CalleeInfo.getCalleeDecl().getDecl();
2348+
if (TargetDecl) {
2349+
if (auto FD = dyn_cast<FunctionDecl>(TargetDecl)) {
2350+
if (FD->hasAttr<OpenCLKernelAttr>() &&
2351+
GD.getKernelReferenceKind() == KernelReferenceKind::Stub)
2352+
CallingConv = llvm::CallingConv::C;
2353+
}
2354+
}
23462355
if (FI.isNoReturn())
23472356
FuncAttrs.addAttribute(llvm::Attribute::NoReturn);
23482357
if (FI.isCmseNSCall())
@@ -2352,8 +2361,6 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
23522361
AddAttributesFromFunctionProtoType(getContext(), FuncAttrs,
23532362
CalleeInfo.getCalleeFunctionProtoType());
23542363

2355-
const Decl *TargetDecl = CalleeInfo.getCalleeDecl().getDecl();
2356-
23572364
// Attach assumption attributes to the declaration. If this is a call
23582365
// site, attach assumptions from the caller to the call as well.
23592366
AddAttributesFromOMPAssumes(FuncAttrs, TargetDecl);

clang/lib/CodeGen/CGExpr.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5692,7 +5692,10 @@ CGCallee CodeGenFunction::EmitCallee(const Expr *E) {
56925692
// Resolve direct calls.
56935693
} else if (auto DRE = dyn_cast<DeclRefExpr>(E)) {
56945694
if (auto FD = dyn_cast<FunctionDecl>(DRE->getDecl())) {
5695-
return EmitDirectCallee(*this, FD);
5695+
auto CalleeDecl = FD->hasAttr<OpenCLKernelAttr>()
5696+
? GlobalDecl(FD, KernelReferenceKind::Stub)
5697+
: FD;
5698+
return EmitDirectCallee(*this, CalleeDecl);
56965699
}
56975700
} else if (auto ME = dyn_cast<MemberExpr>(E)) {
56985701
if (auto FD = dyn_cast<FunctionDecl>(ME->getMemberDecl())) {

clang/lib/CodeGen/CGOpenCLRuntime.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,14 +126,21 @@ static const BlockExpr *getBlockExpr(const Expr *E) {
126126
/// corresponding block expression.
127127
void CGOpenCLRuntime::recordBlockInfo(const BlockExpr *E,
128128
llvm::Function *InvokeF,
129-
llvm::Value *Block, llvm::Type *BlockTy) {
130-
assert(!EnqueuedBlockMap.contains(E) && "Block expression emitted twice");
129+
llvm::Value *Block, llvm::Type *BlockTy,
130+
bool isBlkExprInOCLKern) {
131+
132+
// FIXME: Since OpenCL Kernels are emitted twice (kernel version and stub
133+
// version), their constituent BlockExprs will also be emitted twice.
134+
assert((!EnqueuedBlockMap.contains(E) ||
135+
EnqueuedBlockMap[E].isBlkExprInOCLKern != isBlkExprInOCLKern) &&
136+
"Block expression emitted twice");
131137
assert(isa<llvm::Function>(InvokeF) && "Invalid invoke function");
132138
assert(Block->getType()->isPointerTy() && "Invalid block literal type");
133139
EnqueuedBlockMap[E].InvokeFunc = InvokeF;
134140
EnqueuedBlockMap[E].BlockArg = Block;
135141
EnqueuedBlockMap[E].BlockTy = BlockTy;
136142
EnqueuedBlockMap[E].KernelHandle = nullptr;
143+
EnqueuedBlockMap[E].isBlkExprInOCLKern = isBlkExprInOCLKern;
137144
}
138145

139146
llvm::Function *CGOpenCLRuntime::getInvokeFunction(const Expr *E) {

clang/lib/CodeGen/CGOpenCLRuntime.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class CGOpenCLRuntime {
4646
llvm::Value *KernelHandle; /// Enqueued block kernel reference.
4747
llvm::Value *BlockArg; /// The first argument to enqueued block kernel.
4848
llvm::Type *BlockTy; /// Type of the block argument.
49+
bool isBlkExprInOCLKern; /// Does the BlockExpr reside in an OpenCL Kernel.
4950
};
5051
/// Maps block expression to block information.
5152
llvm::DenseMap<const Expr *, EnqueuedBlockInfo> EnqueuedBlockMap;
@@ -93,7 +94,8 @@ class CGOpenCLRuntime {
9394
/// \param InvokeF invoke function emitted for the block expression.
9495
/// \param Block block literal emitted for the block expression.
9596
void recordBlockInfo(const BlockExpr *E, llvm::Function *InvokeF,
96-
llvm::Value *Block, llvm::Type *BlockTy);
97+
llvm::Value *Block, llvm::Type *BlockTy,
98+
bool isBlkExprInOCLKern);
9799

98100
/// \return LLVM block invoke function emitted for an expression derived from
99101
/// the block expression.

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1887,6 +1887,9 @@ static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD,
18871887
} else if (FD && FD->hasAttr<CUDAGlobalAttr>() &&
18881888
GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
18891889
Out << "__device_stub__" << II->getName();
1890+
} else if (FD && FD->hasAttr<OpenCLKernelAttr>() &&
1891+
GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
1892+
Out << "__clang_ocl_kern_imp_" << II->getName();
18901893
} else {
18911894
Out << II->getName();
18921895
}
@@ -3850,6 +3853,10 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
38503853

38513854
// Ignore declarations, they will be emitted on their first use.
38523855
if (const auto *FD = dyn_cast<FunctionDecl>(Global)) {
3856+
3857+
if (FD->hasAttr<OpenCLKernelAttr>() && FD->doesThisDeclarationHaveABody())
3858+
addDeferredDeclToEmit(GlobalDecl(FD, KernelReferenceKind::Stub));
3859+
38533860
// Update deferred annotations with the latest declaration if the function
38543861
// was already used or defined.
38553862
if (FD->hasAttr<AnnotateAttr>()) {
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s
2+
3+
// CHECK: define dso_local amdgpu_kernel void @callee_kern({{.*}})
4+
__attribute__((noinline)) kernel void callee_kern(global int *A){
5+
*A = 1;
6+
}
7+
8+
__attribute__((noinline)) kernel void ext_callee_kern(global int *A);
9+
10+
// CHECK: define dso_local void @callee_func({{.*}})
11+
__attribute__((noinline)) void callee_func(global int *A){
12+
*A = 2;
13+
}
14+
15+
// CHECK: define dso_local amdgpu_kernel void @caller_kern({{.*}})
16+
kernel void caller_kern(global int* A){
17+
callee_kern(A);
18+
// CHECK: tail call void @__clang_ocl_kern_imp_callee_kern({{.*}})
19+
ext_callee_kern(A);
20+
// CHECK: tail call void @__clang_ocl_kern_imp_ext_callee_kern({{.*}})
21+
callee_func(A);
22+
// CHECK: tail call void @callee_func({{.*}})
23+
24+
}
25+
26+
// CHECK: define dso_local void @__clang_ocl_kern_imp_callee_kern({{.*}})
27+
28+
// CHECK: declare void @__clang_ocl_kern_imp_ext_callee_kern({{.*}})
29+
30+
// CHECK: define dso_local void @caller_func({{.*}})
31+
void caller_func(global int* A){
32+
callee_kern(A);
33+
// CHECK: tail call void @__clang_ocl_kern_imp_callee_kern({{.*}}) #7
34+
ext_callee_kern(A);
35+
// CHECK: tail call void @__clang_ocl_kern_imp_ext_callee_kern({{.*}}) #8
36+
callee_func(A);
37+
// CHECK: tail call void @callee_func({{.*}})
38+
}
39+
40+
// CHECK: define dso_local void @__clang_ocl_kern_imp_caller_kern({{.*}})
41+
// CHECK: tail call void @__clang_ocl_kern_imp_callee_kern({{.*}})
42+
// CHECK: tail call void @__clang_ocl_kern_imp_ext_callee_kern({{.*}})
43+
// CHECK: tail call void @callee_func({{.*}})

clang/test/CodeGenOpenCL/reflect.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ bool device_function() {
1313
}
1414

1515
// CHECK-LABEL: define dso_local spir_kernel void @kernel_function(
16-
// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
16+
// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 {
1717
// CHECK-NEXT: entry:
1818
// CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4
1919
// CHECK-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR]], align 4

clang/test/CodeGenOpenCL/spir-calling-conv.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ kernel void foo(global int *A)
1111
// CHECK: %{{[a-z0-9_]+}} = tail call spir_func i32 @get_dummy_id(i32 noundef 0)
1212
A[id] = id;
1313
bar(A);
14-
// CHECK: tail call spir_kernel void @bar(ptr addrspace(1) noundef align 4 %A)
14+
// CHECK: tail call void @__clang_ocl_kern_imp_bar(ptr addrspace(1) noundef align 4 %A)
1515
}
1616

1717
// CHECK: declare spir_func i32 @get_dummy_id(i32 noundef)
18-
// CHECK: declare spir_kernel void @bar(ptr addrspace(1) noundef align 4)
18+
// CHECK: declare void @__clang_ocl_kern_imp_bar(ptr addrspace(1) noundef align 4)

0 commit comments

Comments
 (0)