Skip to content

Commit c1bf88b

Browse files
committed
[OpenMP] Introduce the initial support for OpenMP kernel language
This patch starts the support for OpenMP kernel language, basically to write OpenMP target region in SIMT style, similar to kernel languages such as CUDA. What included in this first patch is the `ompx_bare` clause for `target teams` directive. When `ompx_bare` exists, globalization is disabled such that local variables will not be globalized. The runtime init/deinit function calls will not be emitted. That being said, almost all OpenMP executable directives are not supported in the region, such as parallel, task. This patch doesn't include the Sema checks for that, so the use of them is UB. Simple directives, such as atomic, can be used. We provide a set of APIs (for C, they are prefix with `ompx_`; for C++, they are in `ompx` namespace) to get thread id, block id, etc. For more details, you can refer to https://tianshilei.me/wp-content/uploads/llvm-hpc-2022.pdf?swcfpc=1.
1 parent c88c281 commit c1bf88b

20 files changed

+1145
-657
lines changed

clang/include/clang/AST/OpenMPClause.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9220,6 +9220,27 @@ class OMPXAttributeClause
92209220
}
92219221
};
92229222

9223+
/// This represents 'ompx_bare' clause in the '#pragma omp target teams ...'
9224+
/// directive.
9225+
///
9226+
/// \code
9227+
/// #pragma omp target teams ompx_bare
9228+
/// \endcode
9229+
/// In this example directive '#pragma omp target teams' has a 'ompx_bare'
9230+
/// clause.
9231+
class OMPXBareClause : public OMPNoChildClause<llvm::omp::OMPC_ompx_bare> {
9232+
public:
9233+
/// Build 'ompx_bare' clause.
9234+
///
9235+
/// \param StartLoc Starting location of the clause.
9236+
/// \param EndLoc Ending location of the clause.
9237+
OMPXBareClause(SourceLocation StartLoc, SourceLocation EndLoc)
9238+
: OMPNoChildClause(StartLoc, EndLoc) {}
9239+
9240+
/// Build an empty clause.
9241+
OMPXBareClause() = default;
9242+
};
9243+
92239244
} // namespace clang
92249245

92259246
#endif // LLVM_CLANG_AST_OPENMPCLAUSE_H

clang/include/clang/AST/RecursiveASTVisitor.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3890,6 +3890,11 @@ bool RecursiveASTVisitor<Derived>::VisitOMPXAttributeClause(
38903890
return true;
38913891
}
38923892

3893+
template <typename Derived>
3894+
bool RecursiveASTVisitor<Derived>::VisitOMPXBareClause(OMPXBareClause *C) {
3895+
return true;
3896+
}
3897+
38933898
// FIXME: look at the following tricky-seeming exprs to see if we
38943899
// need to recurse on anything. These are ones that have methods
38953900
// returning decls or qualtypes or nestednamespecifier -- though I'm

clang/include/clang/Basic/DiagnosticParseKinds.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1360,6 +1360,8 @@ def warn_clause_expected_string : Warning<
13601360
"expected string literal in 'clause %0' - ignoring">, InGroup<IgnoredPragmas>;
13611361
def err_omp_unexpected_clause : Error<
13621362
"unexpected OpenMP clause '%0' in directive '#pragma omp %1'">;
1363+
def err_omp_unexpected_clause_extension_only : Error<
1364+
"OpenMP clause '%0' is only available as extension, use '-fopenmp-extensions'">;
13631365
def err_omp_immediate_directive : Error<
13641366
"'#pragma omp %0' %select{|with '%2' clause }1cannot be an immediate substatement">;
13651367
def err_omp_expected_identifier_for_critical : Error<
@@ -1452,6 +1454,8 @@ def warn_unknown_declare_variant_isa_trait
14521454
"spelling or consider restricting the context selector with the "
14531455
"'arch' selector further">,
14541456
InGroup<SourceUsesOpenMP>;
1457+
def note_ompx_bare_clause : Note<
1458+
"OpenMP extension clause '%0' only allowed with '#pragma omp %1'">;
14551459
def note_omp_declare_variant_ctx_options
14561460
: Note<"context %select{set|selector|property}0 options are: %1">;
14571461
def warn_omp_declare_variant_expected

clang/include/clang/Sema/Sema.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12447,6 +12447,10 @@ class Sema final {
1244712447
SourceLocation LParenLoc,
1244812448
SourceLocation EndLoc);
1244912449

12450+
/// Called on a well-formed 'ompx_bare' clause.
12451+
OMPClause *ActOnOpenMPXBareClause(SourceLocation StartLoc,
12452+
SourceLocation EndLoc);
12453+
1245012454
/// The kind of conversion being performed.
1245112455
enum CheckedConversionKind {
1245212456
/// An implicit conversion.

clang/lib/AST/OpenMPClause.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) {
170170
case OMPC_affinity:
171171
case OMPC_when:
172172
case OMPC_bind:
173+
case OMPC_ompx_bare:
173174
break;
174175
default:
175176
break;
@@ -2546,6 +2547,10 @@ void OMPClausePrinter::VisitOMPXAttributeClause(OMPXAttributeClause *Node) {
25462547
OS << ")";
25472548
}
25482549

2550+
void OMPClausePrinter::VisitOMPXBareClause(OMPXBareClause *Node) {
2551+
OS << "ompx_bare";
2552+
}
2553+
25492554
void OMPTraitInfo::getAsVariantMatchInfo(ASTContext &ASTCtx,
25502555
VariantMatchInfo &VMI) const {
25512556
for (const OMPTraitSet &Set : Sets) {

clang/lib/AST/StmtProfile.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -930,6 +930,7 @@ void OMPClauseProfiler::VisitOMPDoacrossClause(const OMPDoacrossClause *C) {
930930
}
931931
void OMPClauseProfiler::VisitOMPXAttributeClause(const OMPXAttributeClause *C) {
932932
}
933+
void OMPClauseProfiler::VisitOMPXBareClause(const OMPXBareClause *C) {}
933934
} // namespace
934935

935936
void

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -551,10 +551,9 @@ CGOpenMPRuntimeGPU::getExecutionMode() const {
551551
return CurrentExecutionMode;
552552
}
553553

554-
static CGOpenMPRuntimeGPU::DataSharingMode
555-
getDataSharingMode(CodeGenModule &CGM) {
556-
return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
557-
: CGOpenMPRuntimeGPU::Generic;
554+
CGOpenMPRuntimeGPU::DataSharingMode
555+
CGOpenMPRuntimeGPU::getDataSharingMode() const {
556+
return CurrentDataSharingMode;
558557
}
559558

560559
/// Check for inner (nested) SPMD construct, if any
@@ -752,6 +751,9 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
752751
EntryFunctionState EST;
753752
WrapperFunctionsMap.clear();
754753

754+
[[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
755+
assert(!IsBareKernel && "bare kernel should not be at generic mode");
756+
755757
// Emit target region as a standalone region.
756758
class NVPTXPrePostActionTy : public PrePostActionTy {
757759
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
@@ -760,15 +762,13 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
760762
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
761763
: EST(EST) {}
762764
void Enter(CodeGenFunction &CGF) override {
763-
auto &RT =
764-
static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
765+
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
765766
RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
766767
// Skip target region initialization.
767768
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
768769
}
769770
void Exit(CodeGenFunction &CGF) override {
770-
auto &RT =
771-
static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
771+
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
772772
RT.clearLocThreadIdInsertPt(CGF);
773773
RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
774774
}
@@ -807,25 +807,39 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
807807
ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
808808
EntryFunctionState EST;
809809

810+
bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
811+
810812
// Emit target region as a standalone region.
811813
class NVPTXPrePostActionTy : public PrePostActionTy {
812814
CGOpenMPRuntimeGPU &RT;
813815
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
816+
bool IsBareKernel;
817+
DataSharingMode Mode;
814818

815819
public:
816820
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
817-
CGOpenMPRuntimeGPU::EntryFunctionState &EST)
818-
: RT(RT), EST(EST) {}
821+
CGOpenMPRuntimeGPU::EntryFunctionState &EST,
822+
bool IsBareKernel)
823+
: RT(RT), EST(EST), IsBareKernel(IsBareKernel),
824+
Mode(RT.CurrentDataSharingMode) {}
819825
void Enter(CodeGenFunction &CGF) override {
826+
if (IsBareKernel) {
827+
RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
828+
return;
829+
}
820830
RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
821831
// Skip target region initialization.
822832
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
823833
}
824834
void Exit(CodeGenFunction &CGF) override {
835+
if (IsBareKernel) {
836+
RT.CurrentDataSharingMode = Mode;
837+
return;
838+
}
825839
RT.clearLocThreadIdInsertPt(CGF);
826840
RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
827841
}
828-
} Action(*this, EST);
842+
} Action(*this, EST, IsBareKernel);
829843
CodeGen.setAction(Action);
830844
IsInTTDRegion = true;
831845
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
@@ -843,7 +857,8 @@ void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
843857
assert(!ParentName.empty() && "Invalid target region parent name!");
844858

845859
bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
846-
if (Mode)
860+
bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
861+
if (Mode || IsBareKernel)
847862
emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
848863
CodeGen);
849864
else
@@ -867,6 +882,9 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
867882
if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
868883
return;
869884

885+
if (CGM.getLangOpts().OpenMPCUDAMode)
886+
CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
887+
870888
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
871889
"__omp_rtl_debug_kind");
872890
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
@@ -1030,7 +1048,7 @@ llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
10301048
void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
10311049
SourceLocation Loc,
10321050
bool WithSPMDCheck) {
1033-
if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1051+
if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic &&
10341052
getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
10351053
return;
10361054

@@ -1142,7 +1160,7 @@ void CGOpenMPRuntimeGPU::getKmpcFreeShared(
11421160

11431161
void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
11441162
bool WithSPMDCheck) {
1145-
if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1163+
if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic &&
11461164
getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
11471165
return;
11481166

@@ -1178,11 +1196,18 @@ void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
11781196
if (!CGF.HaveInsertPoint())
11791197
return;
11801198

1199+
bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1200+
11811201
Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
11821202
/*Name=*/".zero.addr");
11831203
CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
11841204
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1185-
OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1205+
// We don't emit any thread id function call in bare kernel, but because the
1206+
// outlined function has a pointer argument, we emit a nullptr here.
1207+
if (IsBareKernel)
1208+
OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
1209+
else
1210+
OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
11861211
OutlinedFnArgs.push_back(ZeroAddr.getPointer());
11871212
OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
11881213
emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
@@ -3273,7 +3298,7 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
32733298

32743299
void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
32753300
const Decl *D) {
3276-
if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3301+
if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
32773302
return;
32783303

32793304
assert(D && "Expected function or captured|block decl.");
@@ -3382,7 +3407,7 @@ Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
33823407
VarTy, Align);
33833408
}
33843409

3385-
if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3410+
if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
33863411
return Address::invalid();
33873412

33883413
VD = VD->getCanonicalDecl();

clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,18 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
3232
/// Unknown execution mode (orphaned directive).
3333
EM_Unknown,
3434
};
35+
36+
/// Target codegen is specialized based on two data-sharing modes: CUDA, in
37+
/// which the local variables are actually global threadlocal, and Generic, in
38+
/// which the local variables are placed in global memory if they may escape
39+
/// their declaration context.
40+
enum DataSharingMode {
41+
/// CUDA data sharing mode.
42+
DS_CUDA,
43+
/// Generic data-sharing mode.
44+
DS_Generic,
45+
};
46+
3547
private:
3648
/// Parallel outlined function work for workers to execute.
3749
llvm::SmallVector<llvm::Function *, 16> Work;
@@ -42,6 +54,8 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
4254

4355
ExecutionMode getExecutionMode() const;
4456

57+
DataSharingMode getDataSharingMode() const;
58+
4559
/// Get barrier to synchronize all threads in a block.
4660
void syncCTAThreads(CodeGenFunction &CGF);
4761

@@ -297,17 +311,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
297311
Address getAddressOfLocalVariable(CodeGenFunction &CGF,
298312
const VarDecl *VD) override;
299313

300-
/// Target codegen is specialized based on two data-sharing modes: CUDA, in
301-
/// which the local variables are actually global threadlocal, and Generic, in
302-
/// which the local variables are placed in global memory if they may escape
303-
/// their declaration context.
304-
enum DataSharingMode {
305-
/// CUDA data sharing mode.
306-
CUDA,
307-
/// Generic data-sharing mode.
308-
Generic,
309-
};
310-
311314
/// Cleans up references to the objects in finished function.
312315
///
313316
void functionFinished(CodeGenFunction &CGF) override;
@@ -343,6 +346,10 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
343346
/// to emit optimized code.
344347
ExecutionMode CurrentExecutionMode = EM_Unknown;
345348

349+
/// Track the data sharing mode when codegening directives within a target
350+
/// region.
351+
DataSharingMode CurrentDataSharingMode = DataSharingMode::DS_Generic;
352+
346353
/// true if currently emitting code for target/teams/distribute region, false
347354
/// - otherwise.
348355
bool IsInTTDRegion = false;

clang/lib/Parse/ParseOpenMP.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3416,6 +3416,17 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
34163416
case OMPC_ompx_attribute:
34173417
Clause = ParseOpenMPOMPXAttributesClause(WrongDirective);
34183418
break;
3419+
case OMPC_ompx_bare:
3420+
if (WrongDirective)
3421+
Diag(Tok, diag::note_ompx_bare_clause)
3422+
<< getOpenMPClauseName(CKind) << "target teams";
3423+
if (!ErrorFound && !getLangOpts().OpenMPExtensions) {
3424+
Diag(Tok, diag::err_omp_unexpected_clause_extension_only)
3425+
<< getOpenMPClauseName(CKind) << getOpenMPDirectiveName(DKind);
3426+
ErrorFound = true;
3427+
}
3428+
Clause = ParseOpenMPClause(CKind, WrongDirective);
3429+
break;
34193430
default:
34203431
break;
34213432
}

clang/lib/Sema/SemaOpenMP.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17553,6 +17553,9 @@ OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind,
1755317553
case OMPC_partial:
1755417554
Res = ActOnOpenMPPartialClause(nullptr, StartLoc, /*LParenLoc=*/{}, EndLoc);
1755517555
break;
17556+
case OMPC_ompx_bare:
17557+
Res = ActOnOpenMPXBareClause(StartLoc, EndLoc);
17558+
break;
1755617559
case OMPC_if:
1755717560
case OMPC_final:
1755817561
case OMPC_num_threads:
@@ -24279,3 +24282,8 @@ OMPClause *Sema::ActOnOpenMPXAttributeClause(ArrayRef<const Attr *> Attrs,
2427924282
SourceLocation EndLoc) {
2428024283
return new (Context) OMPXAttributeClause(Attrs, StartLoc, LParenLoc, EndLoc);
2428124284
}
24285+
24286+
OMPClause *Sema::ActOnOpenMPXBareClause(SourceLocation StartLoc,
24287+
SourceLocation EndLoc) {
24288+
return new (Context) OMPXBareClause(StartLoc, EndLoc);
24289+
}

clang/lib/Sema/TreeTransform.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2391,6 +2391,15 @@ class TreeTransform {
23912391
EndLoc);
23922392
}
23932393

2394+
/// Build a new OpenMP 'ompx_bare' clause.
2395+
///
2396+
/// By default, performs semantic analysis to build the new OpenMP clause.
2397+
/// Subclasses may override this routine to provide different behavior.
2398+
OMPClause *RebuildOMPXBareClause(SourceLocation StartLoc,
2399+
SourceLocation EndLoc) {
2400+
return getSema().ActOnOpenMPXBareClause(StartLoc, EndLoc);
2401+
}
2402+
23942403
/// Build a new OpenMP 'align' clause.
23952404
///
23962405
/// By default, performs semantic analysis to build the new OpenMP clause.
@@ -10804,6 +10813,11 @@ TreeTransform<Derived>::TransformOMPXAttributeClause(OMPXAttributeClause *C) {
1080410813
NewAttrs, C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
1080510814
}
1080610815

10816+
template <typename Derived>
10817+
OMPClause *TreeTransform<Derived>::TransformOMPXBareClause(OMPXBareClause *C) {
10818+
return getDerived().RebuildOMPXBareClause(C->getBeginLoc(), C->getEndLoc());
10819+
}
10820+
1080710821
//===----------------------------------------------------------------------===//
1080810822
// Expression transformation
1080910823
//===----------------------------------------------------------------------===//

clang/lib/Serialization/ASTReader.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10446,6 +10446,9 @@ OMPClause *OMPClauseReader::readClause() {
1044610446
case llvm::omp::OMPC_ompx_attribute:
1044710447
C = new (Context) OMPXAttributeClause();
1044810448
break;
10449+
case llvm::omp::OMPC_ompx_bare:
10450+
C = new (Context) OMPXBareClause();
10451+
break;
1044910452
#define OMP_CLAUSE_NO_CLASS(Enum, Str) \
1045010453
case llvm::omp::Enum: \
1045110454
break;
@@ -11547,6 +11550,8 @@ void OMPClauseReader::VisitOMPXAttributeClause(OMPXAttributeClause *C) {
1154711550
C->setLocEnd(Record.readSourceLocation());
1154811551
}
1154911552

11553+
void OMPClauseReader::VisitOMPXBareClause(OMPXBareClause *C) {}
11554+
1155011555
OMPTraitInfo *ASTRecordReader::readOMPTraitInfo() {
1155111556
OMPTraitInfo &TI = getContext().getNewOMPTraitInfo();
1155211557
TI.Sets.resize(readUInt32());

0 commit comments

Comments
 (0)