diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 33cfcdc70057d..39028feeda701 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -88,11 +88,46 @@ #endif #define _KERNELFUNCPARAM(a) _KERNELFUNCPARAMTYPE a +#if defined(__SYCL_UNNAMED_LAMBDA__) +// We can't use nested types (e.g. struct S defined inside main() routine) to +// name kernels. At the same time, we have to provide a unique kernel name for +// sycl::fill and the only thing we can use to introduce that uniqueness (in +// general) is the template parameter T which might be exactly that nested type. +// That means we cannot support sycl::fill(void *, T&, size_t) for such types in +// general. However, we can do better than that when unnamed lambdas are +// enabled, so do it here! See also https://github.com/intel/llvm/issues/469. template -class __fill; +using __fill = sycl::detail::auto_name; +template using __usmfill = sycl::detail::auto_name; +template using __usmfill2d = sycl::detail::auto_name; +template using __usmmemcpy2d = sycl::detail::auto_name; + +template +using __copyAcc2Ptr = sycl::detail::auto_name; +template +using __copyPtr2Acc = sycl::detail::auto_name; + +template +using __copyAcc2Acc = sycl::detail::auto_name; +#else +// Limited fallback path for when unnamed lambdas aren't available. Cannot +// handle nested types. +template +class __fill; template class __usmfill; template class __usmfill2d; template class __usmmemcpy2d; @@ -114,6 +149,7 @@ template class __copyAcc2Acc; +#endif // For unit testing purposes class MockHandler; @@ -860,9 +896,8 @@ class __SYCL_EXPORT handler { return false; range<1> LinearizedRange(Src.size()); - parallel_for< - class __copyAcc2Acc>( + parallel_for<__copyAcc2Acc>( LinearizedRange, [=](id<1> Id) { size_t Index = Id[0]; id SrcId = detail::getDelinearizedId(Src.get_range(), Index); @@ -889,9 +924,8 @@ class __SYCL_EXPORT handler { if (!MIsHost) return false; - single_task< - class __copyAcc2Acc>( + single_task<__copyAcc2Acc>( [=]() { *(Dst.get_pointer()) = *(Src.get_pointer()); }); return true; } @@ -908,8 +942,7 @@ class __SYCL_EXPORT handler { copyAccToPtrHost(accessor Src, TDst *Dst) { range Range = Src.get_range(); - parallel_for< - class __copyAcc2Ptr>( + parallel_for<__copyAcc2Ptr>( Range, [=](id Index) { const size_t LinearIndex = detail::getLinearIndex(Index, Range); using TSrcNonConst = typename std::remove_const_t; @@ -927,7 +960,7 @@ class __SYCL_EXPORT handler { std::enable_if_t copyAccToPtrHost(accessor Src, TDst *Dst) { - single_task>( + single_task<__copyAcc2Ptr>( [=]() { using TSrcNonConst = typename std::remove_const_t; *(reinterpret_cast(Dst)) = *(Src.get_pointer()); @@ -944,8 +977,7 @@ class __SYCL_EXPORT handler { copyPtrToAccHost(TSrc *Src, accessor Dst) { range Range = Dst.get_range(); - parallel_for< - class __copyPtr2Acc>( + parallel_for<__copyPtr2Acc>( Range, [=](id Index) { const size_t LinearIndex = detail::getLinearIndex(Index, Range); Dst[Index] = (reinterpret_cast(Src))[LinearIndex]; @@ -962,7 +994,7 @@ class __SYCL_EXPORT handler { std::enable_if_t copyPtrToAccHost(TSrc *Src, accessor Dst) { - single_task>( + single_task<__copyPtr2Acc>( [=]() { *(Dst.get_pointer()) = *(reinterpret_cast(Src)); }); @@ -2521,13 +2553,11 @@ class __SYCL_EXPORT handler { *PatternPtr = Pattern; } else if constexpr (Dims == 0) { // Special case for zero-dim accessors. - parallel_for< - class __fill>( + parallel_for<__fill>( range<1>(1), [=](id<1>) { Dst = Pattern; }); } else { range Range = Dst.get_range(); - parallel_for< - class __fill>( + parallel_for<__fill>( Range, [=](id Index) { Dst[Index] = Pattern; }); } } @@ -2542,7 +2572,7 @@ class __SYCL_EXPORT handler { throwIfActionIsCreated(); static_assert(is_device_copyable::value, "Pattern must be device copyable"); - parallel_for>(range<1>(Count), [=](id<1> Index) { + parallel_for<__usmfill>(range<1>(Count), [=](id<1> Index) { T *CastedPtr = static_cast(Ptr); CastedPtr[Index] = Pattern; }); @@ -3235,7 +3265,7 @@ class __SYCL_EXPORT handler { // Limit number of work items to be resistant to big copies. id<2> Chunk = computeFallbackKernelBounds(Height, Width); id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk; - parallel_for>( + parallel_for<__usmmemcpy2d>( range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) { T *CastedDest = static_cast(Dest); const T *CastedSrc = static_cast(Src); @@ -3281,7 +3311,7 @@ class __SYCL_EXPORT handler { // Limit number of work items to be resistant to big fill operations. id<2> Chunk = computeFallbackKernelBounds(Height, Width); id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk; - parallel_for>( + parallel_for<__usmfill2d>( range<2>{Chunk[0], Chunk[1]}, [=](id<2> Index) { T *CastedDest = static_cast(Dest); for (uint32_t I = 0; I < Iterations[0]; ++I) { diff --git a/sycl/test-e2e/Basic/memop/memop_no_unnamed_lambda.cpp b/sycl/test-e2e/Basic/memop/memop_no_unnamed_lambda.cpp new file mode 100644 index 0000000000000..ead3a2f2d6590 --- /dev/null +++ b/sycl/test-e2e/Basic/memop/memop_no_unnamed_lambda.cpp @@ -0,0 +1,20 @@ +// RUN: %{build} -fno-sycl-unnamed-lambda -o %t.out +// RUN: %{run} %t.out + +#include + +// Second test for https://github.com/intel/llvm/issues/469. Verify that the +// mode without unnamed lambdas support still has some limited support. + +int main(int argc, char *argv[]) { + sycl::queue q; + void *p = sycl::aligned_alloc_device(alignof(int), sizeof(int), q); + + q.fill(p, static_cast(-1), 1).wait(); + // Same sizeof/alignment but different type to ensure no kernel name + // collisions happen. + q.fill(p, static_cast(2), 1).wait(); + + sycl::free(p, q); + return 0; +} diff --git a/sycl/test-e2e/Basic/memop/memop_unnamed_lambda.cpp b/sycl/test-e2e/Basic/memop/memop_unnamed_lambda.cpp new file mode 100644 index 0000000000000..b9a2654c1807a --- /dev/null +++ b/sycl/test-e2e/Basic/memop/memop_unnamed_lambda.cpp @@ -0,0 +1,30 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include + +// Test for https://github.com/intel/llvm/issues/469. + +#if !defined(__SYCL_UNNAMED_LAMBDA__) +#error "This test verifies unnamed lambda code path!" +#endif + +int main(int argc, char *argv[]) { + struct Simple { + int a; + }; + + // Same layout but different name to ensure no kernel name collisions happen. + struct Simple2 { + int c; + }; + + sycl::queue q; + void *p = sycl::aligned_alloc_device(alignof(Simple), sizeof(Simple), q); + + q.fill(p, Simple{1}, 1).wait(); + q.fill(p, Simple2{2}, 1).wait(); + + sycl::free(p, q); + return 0; +}