Skip to content

Commit ec4b348

Browse files
[SYCL] Use unnamed lambdas to name fill/copy/etc. kernels when possible (#10862)
This is an attempt to fix #469 in a scenario when unnamed lambda feature is enabled.
1 parent e3786df commit ec4b348

File tree

3 files changed

+100
-20
lines changed

3 files changed

+100
-20
lines changed

sycl/include/sycl/handler.hpp

Lines changed: 50 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,46 @@
8888
#endif
8989
#define _KERNELFUNCPARAM(a) _KERNELFUNCPARAMTYPE a
9090

91+
#if defined(__SYCL_UNNAMED_LAMBDA__)
92+
// We can't use nested types (e.g. struct S defined inside main() routine) to
93+
// name kernels. At the same time, we have to provide a unique kernel name for
94+
// sycl::fill and the only thing we can use to introduce that uniqueness (in
95+
// general) is the template parameter T which might be exactly that nested type.
96+
// That means we cannot support sycl::fill(void *, T&, size_t) for such types in
97+
// general. However, we can do better than that when unnamed lambdas are
98+
// enabled, so do it here! See also https://github.com/intel/llvm/issues/469.
9199
template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
92100
sycl::access::target AccessTarget,
93101
sycl::access::placeholder IsPlaceholder>
94-
class __fill;
102+
using __fill = sycl::detail::auto_name;
103+
template <typename T> using __usmfill = sycl::detail::auto_name;
104+
template <typename T> using __usmfill2d = sycl::detail::auto_name;
105+
template <typename T> using __usmmemcpy2d = sycl::detail::auto_name;
106+
107+
template <typename T_Src, typename T_Dst, int Dims,
108+
sycl::access::mode AccessMode, sycl::access::target AccessTarget,
109+
sycl::access::placeholder IsPlaceholder>
110+
using __copyAcc2Ptr = sycl::detail::auto_name;
95111

112+
template <typename T_Src, typename T_Dst, int Dims,
113+
sycl::access::mode AccessMode, sycl::access::target AccessTarget,
114+
sycl::access::placeholder IsPlaceholder>
115+
using __copyPtr2Acc = sycl::detail::auto_name;
116+
117+
template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
118+
sycl::access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
119+
sycl::access::mode AccessMode_Dst,
120+
sycl::access::target AccessTarget_Dst,
121+
sycl::access::placeholder IsPlaceholder_Src,
122+
sycl::access::placeholder IsPlaceholder_Dst>
123+
using __copyAcc2Acc = sycl::detail::auto_name;
124+
#else
125+
// Limited fallback path for when unnamed lambdas aren't available. Cannot
126+
// handle nested types.
127+
template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
128+
sycl::access::target AccessTarget,
129+
sycl::access::placeholder IsPlaceholder>
130+
class __fill;
96131
template <typename T> class __usmfill;
97132
template <typename T> class __usmfill2d;
98133
template <typename T> class __usmmemcpy2d;
@@ -114,6 +149,7 @@ template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
114149
sycl::access::placeholder IsPlaceholder_Src,
115150
sycl::access::placeholder IsPlaceholder_Dst>
116151
class __copyAcc2Acc;
152+
#endif
117153

118154
// For unit testing purposes
119155
class MockHandler;
@@ -860,9 +896,8 @@ class __SYCL_EXPORT handler {
860896
return false;
861897

862898
range<1> LinearizedRange(Src.size());
863-
parallel_for<
864-
class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
865-
ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
899+
parallel_for<__copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
900+
ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
866901
LinearizedRange, [=](id<1> Id) {
867902
size_t Index = Id[0];
868903
id<DimSrc> SrcId = detail::getDelinearizedId(Src.get_range(), Index);
@@ -889,9 +924,8 @@ class __SYCL_EXPORT handler {
889924
if (!MIsHost)
890925
return false;
891926

892-
single_task<
893-
class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
894-
ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
927+
single_task<__copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
928+
ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
895929
[=]() { *(Dst.get_pointer()) = *(Src.get_pointer()); });
896930
return true;
897931
}
@@ -908,8 +942,7 @@ class __SYCL_EXPORT handler {
908942
copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
909943
TDst *Dst) {
910944
range<Dim> Range = Src.get_range();
911-
parallel_for<
912-
class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
945+
parallel_for<__copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
913946
Range, [=](id<Dim> Index) {
914947
const size_t LinearIndex = detail::getLinearIndex(Index, Range);
915948
using TSrcNonConst = typename std::remove_const_t<TSrc>;
@@ -927,7 +960,7 @@ class __SYCL_EXPORT handler {
927960
std::enable_if_t<Dim == 0>
928961
copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
929962
TDst *Dst) {
930-
single_task<class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
963+
single_task<__copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
931964
[=]() {
932965
using TSrcNonConst = typename std::remove_const_t<TSrc>;
933966
*(reinterpret_cast<TSrcNonConst *>(Dst)) = *(Src.get_pointer());
@@ -944,8 +977,7 @@ class __SYCL_EXPORT handler {
944977
copyPtrToAccHost(TSrc *Src,
945978
accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
946979
range<Dim> Range = Dst.get_range();
947-
parallel_for<
948-
class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
980+
parallel_for<__copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
949981
Range, [=](id<Dim> Index) {
950982
const size_t LinearIndex = detail::getLinearIndex(Index, Range);
951983
Dst[Index] = (reinterpret_cast<const TDst *>(Src))[LinearIndex];
@@ -962,7 +994,7 @@ class __SYCL_EXPORT handler {
962994
std::enable_if_t<Dim == 0>
963995
copyPtrToAccHost(TSrc *Src,
964996
accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
965-
single_task<class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
997+
single_task<__copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
966998
[=]() {
967999
*(Dst.get_pointer()) = *(reinterpret_cast<const TDst *>(Src));
9681000
});
@@ -2551,13 +2583,11 @@ class __SYCL_EXPORT handler {
25512583
*PatternPtr = Pattern;
25522584
} else if constexpr (Dims == 0) {
25532585
// Special case for zero-dim accessors.
2554-
parallel_for<
2555-
class __fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2586+
parallel_for<__fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
25562587
range<1>(1), [=](id<1>) { Dst = Pattern; });
25572588
} else {
25582589
range<Dims> Range = Dst.get_range();
2559-
parallel_for<
2560-
class __fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
2590+
parallel_for<__fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
25612591
Range, [=](id<Dims> Index) { Dst[Index] = Pattern; });
25622592
}
25632593
}
@@ -2572,7 +2602,7 @@ class __SYCL_EXPORT handler {
25722602
throwIfActionIsCreated();
25732603
static_assert(is_device_copyable<T>::value,
25742604
"Pattern must be device copyable");
2575-
parallel_for<class __usmfill<T>>(range<1>(Count), [=](id<1> Index) {
2605+
parallel_for<__usmfill<T>>(range<1>(Count), [=](id<1> Index) {
25762606
T *CastedPtr = static_cast<T *>(Ptr);
25772607
CastedPtr[Index] = Pattern;
25782608
});
@@ -3277,7 +3307,7 @@ class __SYCL_EXPORT handler {
32773307
// Limit number of work items to be resistant to big copies.
32783308
id<2> Chunk = computeFallbackKernelBounds(Height, Width);
32793309
id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
3280-
parallel_for<class __usmmemcpy2d<T>>(
3310+
parallel_for<__usmmemcpy2d<T>>(
32813311
range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
32823312
T *CastedDest = static_cast<T *>(Dest);
32833313
const T *CastedSrc = static_cast<const T *>(Src);
@@ -3323,7 +3353,7 @@ class __SYCL_EXPORT handler {
33233353
// Limit number of work items to be resistant to big fill operations.
33243354
id<2> Chunk = computeFallbackKernelBounds(Height, Width);
33253355
id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
3326-
parallel_for<class __usmfill2d<T>>(
3356+
parallel_for<__usmfill2d<T>>(
33273357
range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
33283358
T *CastedDest = static_cast<T *>(Dest);
33293359
for (uint32_t I = 0; I < Iterations[0]; ++I) {
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// RUN: %{build} -fno-sycl-unnamed-lambda -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include <sycl/sycl.hpp>
5+
6+
// Second test for https://github.com/intel/llvm/issues/469. Verify that the
7+
// mode without unnamed lambdas support still has some limited support.
8+
9+
int main(int argc, char *argv[]) {
10+
sycl::queue q;
11+
void *p = sycl::aligned_alloc_device(alignof(int), sizeof(int), q);
12+
13+
q.fill(p, static_cast<int>(-1), 1).wait();
14+
// Same sizeof/alignment but different type to ensure no kernel name
15+
// collisions happen.
16+
q.fill(p, static_cast<unsigned int>(2), 1).wait();
17+
18+
sycl::free(p, q);
19+
return 0;
20+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include <sycl/sycl.hpp>
5+
6+
// Test for https://github.com/intel/llvm/issues/469.
7+
8+
#if !defined(__SYCL_UNNAMED_LAMBDA__)
9+
#error "This test verifies unnamed lambda code path!"
10+
#endif
11+
12+
int main(int argc, char *argv[]) {
13+
struct Simple {
14+
int a;
15+
};
16+
17+
// Same layout but different name to ensure no kernel name collisions happen.
18+
struct Simple2 {
19+
int c;
20+
};
21+
22+
sycl::queue q;
23+
void *p = sycl::aligned_alloc_device(alignof(Simple), sizeof(Simple), q);
24+
25+
q.fill(p, Simple{1}, 1).wait();
26+
q.fill(p, Simple2{2}, 1).wait();
27+
28+
sycl::free(p, q);
29+
return 0;
30+
}

0 commit comments

Comments
 (0)