Skip to content

Commit 74a68b7

Browse files
authored
[SYCL] Adjust parallel-for range global size to improve group size selection (#2703)
This change rounds up a parallel-for range to be a multiple of 32. This value can be changed later when we have better strategies for selecting work-group sizes. It works well for now. The rounding-up improves performance by 8-10x for the odd cases when the original range is a prime number. It has negligible performance impact cases where the range is already a multiple of 32. Signed-off-by: rdeodhar [email protected]
1 parent a5fde5a commit 74a68b7

File tree

12 files changed

+550
-26
lines changed

12 files changed

+550
-26
lines changed

clang/include/clang/Sema/Sema.h

+6
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,9 @@ class SYCLIntegrationHeader {
346346
/// Registers a specialization constant to emit info for it into the header.
347347
void addSpecConstant(StringRef IDName, QualType IDType);
348348

349+
/// Notes that this_item is called within the kernel.
350+
void setCallsThisItem(bool B);
351+
349352
private:
350353
// Kernel actual parameter descriptor.
351354
struct KernelParamDesc {
@@ -382,6 +385,9 @@ class SYCLIntegrationHeader {
382385
/// Descriptor of kernel actual parameters.
383386
SmallVector<KernelParamDesc, 8> Params;
384387

388+
// Whether kernel calls this_item()
389+
bool CallsThisItem;
390+
385391
KernelDesc() = default;
386392
};
387393

clang/lib/Sema/SemaSYCL.cpp

+116-12
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,23 @@ class Util {
9999
/// \param Tmpl whether the class is template instantiation or simple record
100100
static bool isSyclType(const QualType &Ty, StringRef Name, bool Tmpl = false);
101101

102+
/// Checks whether given function is a standard SYCL API function with given
103+
/// name.
104+
/// \param FD the function being checked.
105+
/// \param Name the function name to be checked against.
106+
static bool isSyclFunction(const FunctionDecl *FD, StringRef Name);
107+
102108
/// Checks whether given clang type is a full specialization of the SYCL
103109
/// specialization constant class.
104110
static bool isSyclSpecConstantType(const QualType &Ty);
105111

112+
// Checks declaration context hierarchy.
113+
/// \param DC the context of the item to be checked.
114+
/// \param Scopes the declaration scopes leading from the item context to the
115+
/// translation unit (excluding the latter)
116+
static bool matchContext(const DeclContext *DC,
117+
ArrayRef<Util::DeclContextDesc> Scopes);
118+
106119
/// Checks whether given clang type is declared in the given hierarchy of
107120
/// declaration contexts.
108121
/// \param Ty the clang type being checked
@@ -487,6 +500,21 @@ class MarkDeviceFunction : public RecursiveASTVisitor<MarkDeviceFunction> {
487500
FunctionDecl *FD = WorkList.back().first;
488501
FunctionDecl *ParentFD = WorkList.back().second;
489502

503+
// To implement rounding-up of a parallel-for range the
504+
// SYCL header implementation modifies the kernel call like this:
505+
// auto Wrapper = [=](TransformedArgType Arg) {
506+
// if (Arg[0] >= NumWorkItems[0])
507+
// return;
508+
// Arg.set_allowed_range(NumWorkItems);
509+
// KernelFunc(Arg);
510+
// };
511+
//
512+
// This transformation leads to a condition where a kernel body
513+
// function becomes callable from a new kernel body function.
514+
// Hence this test.
515+
if ((ParentFD == KernelBody) && isSYCLKernelBodyFunction(FD))
516+
KernelBody = FD;
517+
490518
if ((ParentFD == SYCLKernel) && isSYCLKernelBodyFunction(FD)) {
491519
assert(!KernelBody && "inconsistent call graph - only one kernel body "
492520
"function can be called");
@@ -2667,15 +2695,63 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler {
26672695
return !SemaRef.getASTContext().hasSameType(FD->getType(), Ty);
26682696
}
26692697

2698+
// Sets a flag if the kernel is a parallel_for that calls the
2699+
// free function API "this_item".
2700+
void setThisItemIsCalled(const CXXRecordDecl *KernelObj,
2701+
FunctionDecl *KernelFunc) {
2702+
if (getKernelInvocationKind(KernelFunc) != InvokeParallelFor)
2703+
return;
2704+
2705+
const CXXMethodDecl *WGLambdaFn = getOperatorParens(KernelObj);
2706+
if (!WGLambdaFn)
2707+
return;
2708+
2709+
// The call graph for this translation unit.
2710+
CallGraph SYCLCG;
2711+
SYCLCG.addToCallGraph(SemaRef.getASTContext().getTranslationUnitDecl());
2712+
using ChildParentPair =
2713+
std::pair<const FunctionDecl *, const FunctionDecl *>;
2714+
llvm::SmallPtrSet<const FunctionDecl *, 16> Visited;
2715+
llvm::SmallVector<ChildParentPair, 16> WorkList;
2716+
WorkList.push_back({WGLambdaFn, nullptr});
2717+
2718+
while (!WorkList.empty()) {
2719+
const FunctionDecl *FD = WorkList.back().first;
2720+
WorkList.pop_back();
2721+
if (!Visited.insert(FD).second)
2722+
continue; // We've already seen this Decl
2723+
2724+
// Check whether this call is to sycl::this_item().
2725+
if (Util::isSyclFunction(FD, "this_item")) {
2726+
Header.setCallsThisItem(true);
2727+
return;
2728+
}
2729+
2730+
CallGraphNode *N = SYCLCG.getNode(FD);
2731+
if (!N)
2732+
continue;
2733+
2734+
for (const CallGraphNode *CI : *N) {
2735+
if (auto *Callee = dyn_cast<FunctionDecl>(CI->getDecl())) {
2736+
Callee = Callee->getMostRecentDecl();
2737+
if (!Visited.count(Callee))
2738+
WorkList.push_back({Callee, FD});
2739+
}
2740+
}
2741+
}
2742+
}
2743+
26702744
public:
26712745
static constexpr const bool VisitInsideSimpleContainers = false;
26722746
SyclKernelIntHeaderCreator(Sema &S, SYCLIntegrationHeader &H,
26732747
const CXXRecordDecl *KernelObj, QualType NameType,
2674-
StringRef Name, StringRef StableName)
2748+
StringRef Name, StringRef StableName,
2749+
FunctionDecl *KernelFunc)
26752750
: SyclKernelFieldHandler(S), Header(H) {
26762751
bool IsSIMDKernel = isESIMDKernelType(KernelObj);
26772752
Header.startKernel(Name, NameType, StableName, KernelObj->getLocation(),
26782753
IsSIMDKernel);
2754+
setThisItemIsCalled(KernelObj, KernelFunc);
26792755
}
26802756

26812757
bool handleSyclAccessorType(const CXXRecordDecl *RD,
@@ -3123,7 +3199,7 @@ void Sema::ConstructOpenCLKernel(FunctionDecl *KernelCallerFunc,
31233199
SyclKernelIntHeaderCreator int_header(
31243200
*this, getSyclIntegrationHeader(), KernelObj,
31253201
calculateKernelNameType(Context, KernelCallerFunc), KernelName,
3126-
StableName);
3202+
StableName, KernelCallerFunc);
31273203

31283204
KernelObjVisitor Visitor{*this};
31293205
Visitor.VisitRecordBases(KernelObj, kernel_decl, kernel_body, int_header);
@@ -3842,6 +3918,9 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) {
38423918
O << " __SYCL_DLL_LOCAL\n";
38433919
O << " static constexpr bool isESIMD() { return " << K.IsESIMDKernel
38443920
<< "; }\n";
3921+
O << " __SYCL_DLL_LOCAL\n";
3922+
O << " static constexpr bool callsThisItem() { return ";
3923+
O << K.CallsThisItem << "; }\n";
38453924
O << "};\n";
38463925
CurStart += N;
38473926
}
@@ -3900,6 +3979,12 @@ void SYCLIntegrationHeader::addSpecConstant(StringRef IDName, QualType IDType) {
39003979
SpecConsts.emplace_back(std::make_pair(IDType, IDName.str()));
39013980
}
39023981

3982+
void SYCLIntegrationHeader::setCallsThisItem(bool B) {
3983+
KernelDesc *K = getCurKernelDesc();
3984+
assert(K && "no kernels");
3985+
K->CallsThisItem = B;
3986+
}
3987+
39033988
SYCLIntegrationHeader::SYCLIntegrationHeader(DiagnosticsEngine &_Diag,
39043989
bool _UnnamedLambdaSupport,
39053990
Sema &_S)
@@ -3967,6 +4052,21 @@ bool Util::isSyclType(const QualType &Ty, StringRef Name, bool Tmpl) {
39674052
return matchQualifiedTypeName(Ty, Scopes);
39684053
}
39694054

4055+
bool Util::isSyclFunction(const FunctionDecl *FD, StringRef Name) {
4056+
if (!FD->isFunctionOrMethod() || !FD->getIdentifier() ||
4057+
FD->getName().empty() || Name != FD->getName())
4058+
return false;
4059+
4060+
const DeclContext *DC = FD->getDeclContext();
4061+
if (DC->isTranslationUnit())
4062+
return false;
4063+
4064+
std::array<DeclContextDesc, 2> Scopes = {
4065+
Util::DeclContextDesc{clang::Decl::Kind::Namespace, "cl"},
4066+
Util::DeclContextDesc{clang::Decl::Kind::Namespace, "sycl"}};
4067+
return matchContext(DC, Scopes);
4068+
}
4069+
39704070
bool Util::isAccessorPropertyListType(const QualType &Ty) {
39714071
const StringRef &Name = "accessor_property_list";
39724072
std::array<DeclContextDesc, 4> Scopes = {
@@ -3977,21 +4077,15 @@ bool Util::isAccessorPropertyListType(const QualType &Ty) {
39774077
return matchQualifiedTypeName(Ty, Scopes);
39784078
}
39794079

3980-
bool Util::matchQualifiedTypeName(const QualType &Ty,
3981-
ArrayRef<Util::DeclContextDesc> Scopes) {
3982-
// The idea: check the declaration context chain starting from the type
4080+
bool Util::matchContext(const DeclContext *Ctx,
4081+
ArrayRef<Util::DeclContextDesc> Scopes) {
4082+
// The idea: check the declaration context chain starting from the item
39834083
// itself. At each step check the context is of expected kind
39844084
// (namespace) and name.
3985-
const CXXRecordDecl *RecTy = Ty->getAsCXXRecordDecl();
3986-
3987-
if (!RecTy)
3988-
return false; // only classes/structs supported
3989-
const auto *Ctx = cast<DeclContext>(RecTy);
39904085
StringRef Name = "";
39914086

39924087
for (const auto &Scope : llvm::reverse(Scopes)) {
39934088
clang::Decl::Kind DK = Ctx->getDeclKind();
3994-
39954089
if (DK != Scope.first)
39964090
return false;
39974091

@@ -4005,11 +4099,21 @@ bool Util::matchQualifiedTypeName(const QualType &Ty,
40054099
Name = cast<NamespaceDecl>(Ctx)->getName();
40064100
break;
40074101
default:
4008-
llvm_unreachable("matchQualifiedTypeName: decl kind not supported");
4102+
llvm_unreachable("matchContext: decl kind not supported");
40094103
}
40104104
if (Name != Scope.second)
40114105
return false;
40124106
Ctx = Ctx->getParent();
40134107
}
40144108
return Ctx->isTranslationUnit();
40154109
}
4110+
4111+
bool Util::matchQualifiedTypeName(const QualType &Ty,
4112+
ArrayRef<Util::DeclContextDesc> Scopes) {
4113+
const CXXRecordDecl *RecTy = Ty->getAsCXXRecordDecl();
4114+
4115+
if (!RecTy)
4116+
return false; // only classes/structs supported
4117+
const auto *Ctx = cast<DeclContext>(RecTy);
4118+
return Util::matchContext(Ctx, Scopes);
4119+
}

clang/test/CodeGenSYCL/Inputs/sycl.hpp

+12
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,18 @@ struct id {
118118
int Data;
119119
};
120120

121+
template <int dim> struct item {
122+
template <typename... T>
123+
item(T... args) {} // fake constructor
124+
private:
125+
// Some fake field added to see using of item arguments in the
126+
// kernel wrapper
127+
int Data;
128+
};
129+
130+
template <int Dims> item<Dims>
131+
this_item() { return item<Dims>{}; }
132+
121133
template <int dim>
122134
struct range {
123135
template <typename... T>

clang/test/CodeGenSYCL/kernel-by-reference.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,15 @@ int simple_add(int i) {
1515
int main() {
1616
queue q;
1717
#if defined(SYCL2020)
18-
// expected-warning@Inputs/sycl.hpp:286 {{Passing kernel functions by value is deprecated in SYCL 2020}}
18+
// expected-warning@Inputs/sycl.hpp:298 {{Passing kernel functions by value is deprecated in SYCL 2020}}
1919
// expected-note@+3 {{in instantiation of function template specialization}}
2020
#endif
2121
q.submit([&](handler &h) {
2222
h.single_task_2017<class sycl2017>([]() { simple_add(10); });
2323
});
2424

2525
#if defined(SYCL2017)
26-
// expected-warning@Inputs/sycl.hpp:281 {{Passing of kernel functions by reference is a SYCL 2020 extension}}
26+
// expected-warning@Inputs/sycl.hpp:293 {{Passing of kernel functions by reference is a SYCL 2020 extension}}
2727
// expected-note@+3 {{in instantiation of function template specialization}}
2828
#endif
2929
q.submit([&](handler &h) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
// RUN: %clang_cc1 -fsycl -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown-sycldevice -fsycl-int-header=%t.h %s -fsyntax-only
2+
// RUN: FileCheck -input-file=%t.h %s
3+
4+
// This test checks that compiler generates correct kernel description
5+
// for parallel_for kernels that use the this_item API.
6+
7+
// CHECK: __SYCL_INLINE_NAMESPACE(cl) {
8+
// CHECK-NEXT: namespace sycl {
9+
// CHECK-NEXT: namespace detail {
10+
11+
// CHECK: static constexpr
12+
// CHECK-NEXT: const char* const kernel_names[] = {
13+
// CHECK-NEXT: "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3GNU",
14+
// CHECK-NEXT: "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3EMU",
15+
// CHECK-NEXT: "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3OWL",
16+
// CHECK-NEXT: "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3RAT"
17+
// CHECK-NEXT: };
18+
19+
// CHECK:template <> struct KernelInfo<class GNU> {
20+
// CHECK-NEXT: __SYCL_DLL_LOCAL
21+
// CHECK-NEXT: static constexpr const char* getName() { return "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3GNU"; }
22+
// CHECK-NEXT: __SYCL_DLL_LOCAL
23+
// CHECK-NEXT: static constexpr unsigned getNumParams() { return 0; }
24+
// CHECK-NEXT: __SYCL_DLL_LOCAL
25+
// CHECK-NEXT: static constexpr const kernel_param_desc_t& getParamDesc(unsigned i) {
26+
// CHECK-NEXT: return kernel_signatures[i+0];
27+
// CHECK-NEXT: }
28+
// CHECK-NEXT: __SYCL_DLL_LOCAL
29+
// CHECK-NEXT: static constexpr bool isESIMD() { return 0; }
30+
// CHECK-NEXT: __SYCL_DLL_LOCAL
31+
// CHECK-NEXT: static constexpr bool callsThisItem() { return 0; }
32+
// CHECK-NEXT:};
33+
// CHECK-NEXT:template <> struct KernelInfo<class EMU> {
34+
// CHECK-NEXT: __SYCL_DLL_LOCAL
35+
// CHECK-NEXT: static constexpr const char* getName() { return "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3EMU"; }
36+
// CHECK-NEXT: __SYCL_DLL_LOCAL
37+
// CHECK-NEXT: static constexpr unsigned getNumParams() { return 0; }
38+
// CHECK-NEXT: __SYCL_DLL_LOCAL
39+
// CHECK-NEXT: static constexpr const kernel_param_desc_t& getParamDesc(unsigned i) {
40+
// CHECK-NEXT: return kernel_signatures[i+0];
41+
// CHECK-NEXT: }
42+
// CHECK-NEXT: __SYCL_DLL_LOCAL
43+
// CHECK-NEXT: static constexpr bool isESIMD() { return 0; }
44+
// CHECK-NEXT: __SYCL_DLL_LOCAL
45+
// CHECK-NEXT: static constexpr bool callsThisItem() { return 1; }
46+
// CHECK-NEXT:};
47+
// CHECK-NEXT:template <> struct KernelInfo<class OWL> {
48+
// CHECK-NEXT: __SYCL_DLL_LOCAL
49+
// CHECK-NEXT: static constexpr const char* getName() { return "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3OWL"; }
50+
// CHECK-NEXT: __SYCL_DLL_LOCAL
51+
// CHECK-NEXT: static constexpr unsigned getNumParams() { return 0; }
52+
// CHECK-NEXT: __SYCL_DLL_LOCAL
53+
// CHECK-NEXT: static constexpr const kernel_param_desc_t& getParamDesc(unsigned i) {
54+
// CHECK-NEXT: return kernel_signatures[i+0];
55+
// CHECK-NEXT: }
56+
// CHECK-NEXT: __SYCL_DLL_LOCAL
57+
// CHECK-NEXT: static constexpr bool isESIMD() { return 0; }
58+
// CHECK-NEXT: __SYCL_DLL_LOCAL
59+
// CHECK-NEXT: static constexpr bool callsThisItem() { return 0; }
60+
// CHECK-NEXT:};
61+
// CHECK-NEXT:template <> struct KernelInfo<class RAT> {
62+
// CHECK-NEXT: __SYCL_DLL_LOCAL
63+
// CHECK-NEXT: static constexpr const char* getName() { return "_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE3RAT"; }
64+
// CHECK-NEXT: __SYCL_DLL_LOCAL
65+
// CHECK-NEXT: static constexpr unsigned getNumParams() { return 0; }
66+
// CHECK-NEXT: __SYCL_DLL_LOCAL
67+
// CHECK-NEXT: static constexpr const kernel_param_desc_t& getParamDesc(unsigned i) {
68+
// CHECK-NEXT: return kernel_signatures[i+0];
69+
// CHECK-NEXT: }
70+
// CHECK-NEXT: __SYCL_DLL_LOCAL
71+
// CHECK-NEXT: static constexpr bool isESIMD() { return 0; }
72+
// CHECK-NEXT: __SYCL_DLL_LOCAL
73+
// CHECK-NEXT: static constexpr bool callsThisItem() { return 1; }
74+
// CHECK-NEXT:};
75+
76+
#include "sycl.hpp"
77+
78+
using namespace cl::sycl;
79+
80+
SYCL_EXTERNAL item<1> g() { return this_item<1>(); }
81+
SYCL_EXTERNAL item<1> f() { return g(); }
82+
83+
// This is a similar-looking this_item function but not the real one.
84+
template <int Dims> item<Dims> this_item(int i) { return item<1>{i}; }
85+
86+
// This is a method named this_item but not the real one.
87+
class C {
88+
public:
89+
template <int Dims> item<Dims> this_item() { return item<1>{66}; };
90+
};
91+
92+
int main() {
93+
queue myQueue;
94+
myQueue.submit([&](::handler &cgh) {
95+
// This kernel does not call sycl::this_item
96+
cgh.parallel_for<class GNU>(range<1>(1),
97+
[=](item<1> I) { this_item<1>(55); });
98+
99+
// This kernel calls sycl::this_item
100+
cgh.parallel_for<class EMU>(range<1>(1),
101+
[=](::item<1> I) { this_item<1>(); });
102+
103+
// This kernel does not call sycl::this_item
104+
cgh.parallel_for<class OWL>(range<1>(1), [=](id<1> I) {
105+
class C c;
106+
c.this_item<1>();
107+
});
108+
109+
// This kernel calls sycl::this_item
110+
cgh.parallel_for<class RAT>(range<1>(1), [=](id<1> I) { f(); });
111+
});
112+
113+
return 0;
114+
}

sycl/doc/EnvironmentVariables.md

+2
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ subject to change. Do not rely on these variables in production code.
2929
| SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |
3030
| SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) |
3131
| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
32+
| SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE | Any(\*) | Enables tracing of parallel_for invocations with rounded-up ranges. |
33+
| SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING | Any(\*) | Disables automatic rounding-up of parallel_for invocation ranges. |
3234

3335
`(*) Note: Any means this environment variable is effective when set to any non-null value.`
3436

0 commit comments

Comments
 (0)