Skip to content

[SYCL] Use unnamed lambdas to name fill/copy/etc. kernels when possible #10862

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 50 additions & 20 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,46 @@
#endif
#define _KERNELFUNCPARAM(a) _KERNELFUNCPARAMTYPE a

#if defined(__SYCL_UNNAMED_LAMBDA__)
// We can't use nested types (e.g. struct S defined inside main() routine) to
// name kernels. At the same time, we have to provide a unique kernel name for
// sycl::fill and the only thing we can use to introduce that uniqueness (in
// general) is the template parameter T which might be exactly that nested type.
// That means we cannot support sycl::fill(void *, T&, size_t) for such types in
// general. However, we can do better than that when unnamed lambdas are
// enabled, so do it here! See also https://github.com/intel/llvm/issues/469.
template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
sycl::access::target AccessTarget,
sycl::access::placeholder IsPlaceholder>
class __fill;
using __fill = sycl::detail::auto_name;
template <typename T> using __usmfill = sycl::detail::auto_name;
template <typename T> using __usmfill2d = sycl::detail::auto_name;
template <typename T> using __usmmemcpy2d = sycl::detail::auto_name;

template <typename T_Src, typename T_Dst, int Dims,
sycl::access::mode AccessMode, sycl::access::target AccessTarget,
sycl::access::placeholder IsPlaceholder>
using __copyAcc2Ptr = sycl::detail::auto_name;

template <typename T_Src, typename T_Dst, int Dims,
sycl::access::mode AccessMode, sycl::access::target AccessTarget,
sycl::access::placeholder IsPlaceholder>
using __copyPtr2Acc = sycl::detail::auto_name;

template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
sycl::access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
sycl::access::mode AccessMode_Dst,
sycl::access::target AccessTarget_Dst,
sycl::access::placeholder IsPlaceholder_Src,
sycl::access::placeholder IsPlaceholder_Dst>
using __copyAcc2Acc = sycl::detail::auto_name;
#else
// Limited fallback path for when unnamed lambdas aren't available. Cannot
// handle nested types.
template <typename DataT, int Dimensions, sycl::access::mode AccessMode,
sycl::access::target AccessTarget,
sycl::access::placeholder IsPlaceholder>
class __fill;
template <typename T> class __usmfill;
template <typename T> class __usmfill2d;
template <typename T> class __usmmemcpy2d;
Expand All @@ -114,6 +149,7 @@ template <typename T_Src, int Dims_Src, sycl::access::mode AccessMode_Src,
sycl::access::placeholder IsPlaceholder_Src,
sycl::access::placeholder IsPlaceholder_Dst>
class __copyAcc2Acc;
#endif

// For unit testing purposes
class MockHandler;
Expand Down Expand Up @@ -860,9 +896,8 @@ class __SYCL_EXPORT handler {
return false;

range<1> LinearizedRange(Src.size());
parallel_for<
class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
parallel_for<__copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
LinearizedRange, [=](id<1> Id) {
size_t Index = Id[0];
id<DimSrc> SrcId = detail::getDelinearizedId(Src.get_range(), Index);
Expand All @@ -889,9 +924,8 @@ class __SYCL_EXPORT handler {
if (!MIsHost)
return false;

single_task<
class __copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
single_task<__copyAcc2Acc<TSrc, DimSrc, ModeSrc, TargetSrc, TDst, DimDst,
ModeDst, TargetDst, IsPHSrc, IsPHDst>>(
[=]() { *(Dst.get_pointer()) = *(Src.get_pointer()); });
return true;
}
Expand All @@ -908,8 +942,7 @@ class __SYCL_EXPORT handler {
copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
TDst *Dst) {
range<Dim> Range = Src.get_range();
parallel_for<
class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
parallel_for<__copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
Range, [=](id<Dim> Index) {
const size_t LinearIndex = detail::getLinearIndex(Index, Range);
using TSrcNonConst = typename std::remove_const_t<TSrc>;
Expand All @@ -927,7 +960,7 @@ class __SYCL_EXPORT handler {
std::enable_if_t<Dim == 0>
copyAccToPtrHost(accessor<TSrc, Dim, AccMode, AccTarget, IsPH> Src,
TDst *Dst) {
single_task<class __copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
single_task<__copyAcc2Ptr<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
[=]() {
using TSrcNonConst = typename std::remove_const_t<TSrc>;
*(reinterpret_cast<TSrcNonConst *>(Dst)) = *(Src.get_pointer());
Expand All @@ -944,8 +977,7 @@ class __SYCL_EXPORT handler {
copyPtrToAccHost(TSrc *Src,
accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
range<Dim> Range = Dst.get_range();
parallel_for<
class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
parallel_for<__copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
Range, [=](id<Dim> Index) {
const size_t LinearIndex = detail::getLinearIndex(Index, Range);
Dst[Index] = (reinterpret_cast<const TDst *>(Src))[LinearIndex];
Expand All @@ -962,7 +994,7 @@ class __SYCL_EXPORT handler {
std::enable_if_t<Dim == 0>
copyPtrToAccHost(TSrc *Src,
accessor<TDst, Dim, AccMode, AccTarget, IsPH> Dst) {
single_task<class __copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
single_task<__copyPtr2Acc<TSrc, TDst, Dim, AccMode, AccTarget, IsPH>>(
[=]() {
*(Dst.get_pointer()) = *(reinterpret_cast<const TDst *>(Src));
});
Expand Down Expand Up @@ -2521,13 +2553,11 @@ class __SYCL_EXPORT handler {
*PatternPtr = Pattern;
} else if constexpr (Dims == 0) {
// Special case for zero-dim accessors.
parallel_for<
class __fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
parallel_for<__fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
range<1>(1), [=](id<1>) { Dst = Pattern; });
} else {
range<Dims> Range = Dst.get_range();
parallel_for<
class __fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
parallel_for<__fill<T, Dims, AccessMode, AccessTarget, IsPlaceholder>>(
Range, [=](id<Dims> Index) { Dst[Index] = Pattern; });
}
}
Expand All @@ -2542,7 +2572,7 @@ class __SYCL_EXPORT handler {
throwIfActionIsCreated();
static_assert(is_device_copyable<T>::value,
"Pattern must be device copyable");
parallel_for<class __usmfill<T>>(range<1>(Count), [=](id<1> Index) {
parallel_for<__usmfill<T>>(range<1>(Count), [=](id<1> Index) {
T *CastedPtr = static_cast<T *>(Ptr);
CastedPtr[Index] = Pattern;
});
Expand Down Expand Up @@ -3235,7 +3265,7 @@ class __SYCL_EXPORT handler {
// Limit number of work items to be resistant to big copies.
id<2> Chunk = computeFallbackKernelBounds(Height, Width);
id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
parallel_for<class __usmmemcpy2d<T>>(
parallel_for<__usmmemcpy2d<T>>(
range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
T *CastedDest = static_cast<T *>(Dest);
const T *CastedSrc = static_cast<const T *>(Src);
Expand Down Expand Up @@ -3281,7 +3311,7 @@ class __SYCL_EXPORT handler {
// Limit number of work items to be resistant to big fill operations.
id<2> Chunk = computeFallbackKernelBounds(Height, Width);
id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
parallel_for<class __usmfill2d<T>>(
parallel_for<__usmfill2d<T>>(
range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) {
T *CastedDest = static_cast<T *>(Dest);
for (uint32_t I = 0; I < Iterations[0]; ++I) {
Expand Down
20 changes: 20 additions & 0 deletions sycl/test-e2e/Basic/memop/memop_no_unnamed_lambda.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// RUN: %{build} -fno-sycl-unnamed-lambda -o %t.out
// RUN: %{run} %t.out

#include <sycl/sycl.hpp>

// Second test for https://github.com/intel/llvm/issues/469. Verify that the
// mode without unnamed lambdas support still has some limited support.

int main(int argc, char *argv[]) {
sycl::queue q;
void *p = sycl::aligned_alloc_device(alignof(int), sizeof(int), q);

q.fill(p, static_cast<int>(-1), 1).wait();
// Same sizeof/alignment but different type to ensure no kernel name
// collisions happen.
q.fill(p, static_cast<unsigned int>(2), 1).wait();

sycl::free(p, q);
return 0;
}
30 changes: 30 additions & 0 deletions sycl/test-e2e/Basic/memop/memop_unnamed_lambda.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include <sycl/sycl.hpp>

// Test for https://github.com/intel/llvm/issues/469.

#if !defined(__SYCL_UNNAMED_LAMBDA__)
#error "This test verifies unnamed lambda code path!"
#endif

int main(int argc, char *argv[]) {
struct Simple {
int a;
};

// Same layout but different name to ensure no kernel name collisions happen.
struct Simple2 {
int c;
};

sycl::queue q;
void *p = sycl::aligned_alloc_device(alignof(Simple), sizeof(Simple), q);

q.fill(p, Simple{1}, 1).wait();
q.fill(p, Simple2{2}, 1).wait();

sycl::free(p, q);
return 0;
}