diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 53bec90a35bb6..48044a804eb4e 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -15,7 +15,7 @@ include(AddSYCLExecutable) set(SYCL_MAJOR_VERSION 5) set(SYCL_MINOR_VERSION 1) set(SYCL_PATCH_VERSION 0) -set(SYCL_DEV_ABI_VERSION 0) +set(SYCL_DEV_ABI_VERSION 1) if (SYCL_ADD_DEV_VERSION_POSTFIX) set(SYCL_VERSION_POSTFIX "-${SYCL_DEV_ABI_VERSION}") endif() diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp index 3d3078ac95bc9..a73ea6ab687f7 100644 --- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp +++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp @@ -432,6 +432,29 @@ template struct AreAllButLastReductions { static constexpr bool value = !std::is_base_of::value; }; +/// Predicate returning true if all template type parameters except the last two +/// are reductions. +template +struct AreAllButLastTwoReductions { + static constexpr bool value = + std::is_base_of::value && + AreAllButLastTwoReductions::value; +}; + +/// Helper specialization of AreAllButLastTwoReductions for two elements. +/// Returns true if the template parameters are not a reduction. +template struct AreAllButLastTwoReductions { + static constexpr bool value = + !std::is_base_of::value && + !std::is_base_of::value; +}; + +/// Helper specialization of AreAllButLastTwoReductions for one element only. +/// Returns true if the template parameter is not a reduction. +template struct AreAllButLastTwoReductions { + static constexpr bool value = !std::is_base_of::value; +}; + /// This class encapsulates the reduction variable/accessor, /// the reduction operator and an optional operator identity. template MHostKernel; shared_ptr_class MSyclKernel; vector_class MArgs; @@ -128,7 +129,8 @@ class CGExecKernel : public CG { detail::OSModuleHandle MOSModuleHandle; vector_class> MStreams; - CGExecKernel(NDRDescT NDRDesc, unique_ptr_class HKernel, + CGExecKernel(NDRDescT NDRDesc, const property_list &PropList, + unique_ptr_class HKernel, shared_ptr_class SyclKernel, vector_class> ArgsStorage, vector_class AccStorage, @@ -142,10 +144,10 @@ class CGExecKernel : public CG { : CG(Type, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), std::move(loc)), - MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)), - MSyclKernel(std::move(SyclKernel)), MArgs(std::move(Args)), - MKernelName(std::move(KernelName)), MOSModuleHandle(OSModuleHandle), - MStreams(std::move(Streams)) { + MNDRDesc(std::move(NDRDesc)), MPropList(PropList), + MHostKernel(std::move(HKernel)), MSyclKernel(std::move(SyclKernel)), + MArgs(std::move(Args)), MKernelName(std::move(KernelName)), + MOSModuleHandle(OSModuleHandle), MStreams(std::move(Streams)) { assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL) && "Wrong type of exec kernel CG."); } diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index ef60964e3d332..5f1d10aa17f5e 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -270,6 +270,7 @@ std::tuple...> tuple_select_elements(TupleT Tuple, std::index_sequence); template struct AreAllButLastReductions; +template struct AreAllButLastTwoReductions; } // namespace detail } // namespace ONEAPI @@ -716,8 +717,8 @@ class __SYCL_EXPORT handler { /// \param NumWorkItems is a range defining indexing space. /// \param KernelFunc is a SYCL kernel function. template - void parallel_for_lambda_impl(range NumWorkItems, - KernelType KernelFunc) { + void parallel_for_lambda_impl(range NumWorkItems, KernelType KernelFunc, + const property_list &PropList = {}) { throwIfActionIsCreated(); using LambdaArgType = sycl::detail::lambda_arg_type>; @@ -813,10 +814,12 @@ class __SYCL_EXPORT handler { range AdjustedRange = NumWorkItems; AdjustedRange.set_range_dim0(NewValX); #ifdef __SYCL_DEVICE_ONLY__ + (void)PropList; kernel_parallel_for(Wrapper); #else detail::checkValueRange(AdjustedRange); MNDRDesc.set(std::move(AdjustedRange)); + MPropList = PropList; StoreLambda( std::move(Wrapper)); MCGType = detail::CG::KERNEL; @@ -828,10 +831,12 @@ class __SYCL_EXPORT handler { { #ifdef __SYCL_DEVICE_ONLY__ (void)NumWorkItems; + (void)PropList; kernel_parallel_for(KernelFunc); #else detail::checkValueRange(NumWorkItems); MNDRDesc.set(std::move(NumWorkItems)); + MPropList = PropList; StoreLambda( std::move(KernelFunc)); MCGType = detail::CG::KERNEL; @@ -847,12 +852,14 @@ class __SYCL_EXPORT handler { /// \param NumWorkItems is a range defining indexing space. /// \param Kernel is a SYCL kernel function. template - void parallel_for_impl(range NumWorkItems, kernel Kernel) { + void parallel_for_impl(range NumWorkItems, kernel Kernel, + const property_list &PropList = {}) { throwIfActionIsCreated(); verifyKernelInvoc(Kernel); MKernel = detail::getSyclObjImpl(std::move(Kernel)); detail::checkValueRange(NumWorkItems); MNDRDesc.set(std::move(NumWorkItems)); + MPropList = PropList; MCGType = detail::CG::KERNEL; extractArgsAndReqs(); MKernelName = getKernelName(); @@ -996,19 +1003,22 @@ class __SYCL_EXPORT handler { /// \param KernelFunc is a SYCL kernel function. template #ifdef __SYCL_NONCONST_FUNCTOR__ - void single_task(KernelType KernelFunc) { + void single_task(KernelType KernelFunc, const property_list &PropList = {}) { #else - void single_task(const KernelType &KernelFunc) { + void single_task(const KernelType &KernelFunc, + const property_list &PropList = {}) { #endif throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; #ifdef __SYCL_DEVICE_ONLY__ + (void)PropList; kernel_single_task(KernelFunc); #else // No need to check if range is out of INT_MAX limits as it's compile-time // known constant. MNDRDesc.set(range<1>{1}); + MPropList = PropList; StoreLambda(KernelFunc); MCGType = detail::CG::KERNEL; @@ -1017,29 +1027,38 @@ class __SYCL_EXPORT handler { template #ifdef __SYCL_NONCONST_FUNCTOR__ - void parallel_for(range<1> NumWorkItems, KernelType KernelFunc) { + void parallel_for(range<1> NumWorkItems, KernelType KernelFunc, + const property_list &PropList = {}) { #else - void parallel_for(range<1> NumWorkItems, const KernelType &KernelFunc) { + void parallel_for(range<1> NumWorkItems, const KernelType &KernelFunc, + const property_list &PropList = {}) { #endif - parallel_for_lambda_impl(NumWorkItems, std::move(KernelFunc)); + parallel_for_lambda_impl(NumWorkItems, std::move(KernelFunc), + PropList); } template #ifdef __SYCL_NONCONST_FUNCTOR__ - void parallel_for(range<2> NumWorkItems, KernelType KernelFunc) { + void parallel_for(range<2> NumWorkItems, KernelType KernelFunc, + const property_list &PropList = {}) { #else - void parallel_for(range<2> NumWorkItems, const KernelType &KernelFunc) { + void parallel_for(range<2> NumWorkItems, const KernelType &KernelFunc, + const property_list &PropList = {}) { #endif - parallel_for_lambda_impl(NumWorkItems, std::move(KernelFunc)); + parallel_for_lambda_impl(NumWorkItems, std::move(KernelFunc), + PropList); } template #ifdef __SYCL_NONCONST_FUNCTOR__ - void parallel_for(range<3> NumWorkItems, KernelType KernelFunc) { + void parallel_for(range<3> NumWorkItems, KernelType KernelFunc, + const property_list &PropList = {}) { #else - void parallel_for(range<3> NumWorkItems, const KernelType &KernelFunc) { + void parallel_for(range<3> NumWorkItems, const KernelType &KernelFunc, + const property_list &PropList = {}) { #endif - parallel_for_lambda_impl(NumWorkItems, std::move(KernelFunc)); + parallel_for_lambda_impl(NumWorkItems, std::move(KernelFunc), + PropList); } /// Defines and invokes a SYCL kernel on host device. @@ -1110,7 +1129,8 @@ class __SYCL_EXPORT handler { template void parallel_for(range NumWorkItems, id WorkItemOffset, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1118,8 +1138,10 @@ class __SYCL_EXPORT handler { #ifdef __SYCL_DEVICE_ONLY__ (void)NumWorkItems; (void)WorkItemOffset; + (void)PropList; kernel_parallel_for(KernelFunc); #else + MPropList = PropList; detail::checkValueRange(NumWorkItems, WorkItemOffset); MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); StoreLambda(std::move(KernelFunc)); @@ -1141,8 +1163,8 @@ class __SYCL_EXPORT handler { /// \param KernelFunc is a SYCL kernel function. template - void parallel_for(nd_range ExecutionRange, - _KERNELFUNCPARAM(KernelFunc)) { + void parallel_for(nd_range ExecutionRange, _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1150,8 +1172,10 @@ class __SYCL_EXPORT handler { sycl::detail::lambda_arg_type>; #ifdef __SYCL_DEVICE_ONLY__ (void)ExecutionRange; + (void)PropList; kernel_parallel_for(KernelFunc); #else + MPropList = PropList; detail::checkValueRange(ExecutionRange); MNDRDesc.set(std::move(ExecutionRange)); StoreLambda(std::move(KernelFunc)); @@ -1168,7 +1192,9 @@ class __SYCL_EXPORT handler { detail::enable_if_t parallel_for(nd_range Range, Reduction Redu, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { + MPropList = PropList; ONEAPI::detail::reduCGFunc(*this, KernelFunc, Range, Redu, Redu.getUserAccessor()); } @@ -1182,7 +1208,9 @@ class __SYCL_EXPORT handler { detail::enable_if_t parallel_for(nd_range Range, Reduction Redu, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { + MPropList = PropList; ONEAPI::detail::reduCGFunc(*this, KernelFunc, Range, Redu, Redu.getUSMPointer()); } @@ -1202,7 +1230,9 @@ class __SYCL_EXPORT handler { detail::enable_if_t parallel_for(nd_range Range, Reduction Redu, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { + MPropList = PropList; shared_ptr_class QueueCopy = MQueue; auto RWAcc = Redu.getReadWriteScalarAcc(*this); ONEAPI::detail::reduCGFunc(*this, KernelFunc, Range, Redu, @@ -1238,7 +1268,8 @@ class __SYCL_EXPORT handler { int Dims, typename Reduction> detail::enable_if_t parallel_for(nd_range Range, Reduction Redu, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { // This parallel_for() is lowered to the following sequence: // 1) Call a kernel that a) call user's lambda function and b) performs // one iteration of reduction, storing the partial reductions/sums @@ -1260,6 +1291,7 @@ class __SYCL_EXPORT handler { // the main kernel, but simply generate Range.get_global_range.size() number // of partial sums, leaving the reduction work to the additional/aux // kernels. + MPropList = PropList; constexpr bool HFR = Reduction::has_fast_reduce; size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type); // TODO: currently the maximal work group size is determined for the given @@ -1302,20 +1334,16 @@ class __SYCL_EXPORT handler { } // end while (NWorkItems > 1) } - // This version of parallel_for may handle one or more reductions packed in - // \p Rest argument. Note thought that the last element in \p Rest pack is - // the kernel function. - // TODO: this variant is currently enabled for 2+ reductions only as the - // versions handling 1 reduction variable are more efficient right now. - template - std::enable_if_t<(sizeof...(RestT) >= 3 && - ONEAPI::detail::AreAllButLastReductions::value)> - parallel_for(nd_range Range, RestT... Rest) { + void parallel_for_impl(nd_range Range, RestT... Rest) { std::tuple ArgsTuple(Rest...); constexpr size_t NumArgs = sizeof...(RestT); - auto KernelFunc = std::get(ArgsTuple); - auto ReduIndices = std::make_index_sequence(); + constexpr size_t Offset = hasPropList ? 2 : 1; + auto KernelFunc = std::get(ArgsTuple); + auto ReduIndices = std::make_index_sequence(); auto ReduTuple = ONEAPI::detail::tuple_select_elements(ArgsTuple, ReduIndices); @@ -1350,6 +1378,36 @@ class __SYCL_EXPORT handler { } // end while (NWorkItems > 1) } + // This version of parallel_for may handle one or more reductions packed in + // \p Rest argument. Note though that the last element in \p Rest pack is + // the kernel function. + // TODO: this variant is currently enabled for 2+ reductions only as the + // versions handling 1 reduction variable are more efficient right now. + template + std::enable_if_t<(sizeof...(RestT) >= 3 && + ONEAPI::detail::AreAllButLastReductions::value)> + parallel_for(nd_range Range, RestT... Rest) { + parallel_for_impl(Range, Rest...); + } + + // This version of parallel_for may handle one or more reductions packed in + // \p Rest argument. Note though that the last two elements in \p Rest pack + // are the kernel function and the property list. + // TODO: this variant is currently enabled for 2+ reductions only as the + // versions handling 1 reduction variable are more efficient right now. + template + std::enable_if_t< + (sizeof...(RestT) >= 4 && + ONEAPI::detail::AreAllButLastTwoReductions::value)> + parallel_for(nd_range Range, RestT... Rest) { + std::tuple ArgsTuple(Rest...); + constexpr size_t NumArgs = sizeof...(RestT); + MPropList = std::get(ArgsTuple); + parallel_for_impl(Range, Rest...); + } + /// Hierarchical kernel invocation method of a kernel defined as a lambda /// encoding the body of each work-group to launch. /// @@ -1363,7 +1421,8 @@ class __SYCL_EXPORT handler { template void parallel_for_work_group(range NumWorkGroups, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1371,10 +1430,12 @@ class __SYCL_EXPORT handler { sycl::detail::lambda_arg_type>; #ifdef __SYCL_DEVICE_ONLY__ (void)NumWorkGroups; + (void)PropList; kernel_parallel_for_work_group(KernelFunc); #else detail::checkValueRange(NumWorkGroups); MNDRDesc.setNumWorkGroups(NumWorkGroups); + MPropList = PropList; StoreLambda(std::move(KernelFunc)); MCGType = detail::CG::KERNEL; #endif // __SYCL_DEVICE_ONLY__ @@ -1396,7 +1457,8 @@ class __SYCL_EXPORT handler { int Dims> void parallel_for_work_group(range NumWorkGroups, range WorkGroupSize, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1405,12 +1467,14 @@ class __SYCL_EXPORT handler { #ifdef __SYCL_DEVICE_ONLY__ (void)NumWorkGroups; (void)WorkGroupSize; + (void)PropList; kernel_parallel_for_work_group(KernelFunc); #else nd_range ExecRange = nd_range(NumWorkGroups * WorkGroupSize, WorkGroupSize); detail::checkValueRange(ExecRange); MNDRDesc.set(std::move(ExecRange)); + MPropList = PropList; StoreLambda(std::move(KernelFunc)); MCGType = detail::CG::KERNEL; #endif // __SYCL_DEVICE_ONLY__ @@ -1422,28 +1486,32 @@ class __SYCL_EXPORT handler { /// cannot be called on host. /// /// \param Kernel is a SYCL kernel object. - void single_task(kernel Kernel) { + void single_task(kernel Kernel, const property_list &PropList = {}) { throwIfActionIsCreated(); verifyKernelInvoc(Kernel); // No need to check if range is out of INT_MAX limits as it's compile-time // known constant MNDRDesc.set(range<1>{1}); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); MCGType = detail::CG::KERNEL; extractArgsAndReqs(); MKernelName = getKernelName(); } - void parallel_for(range<1> NumWorkItems, kernel Kernel) { - parallel_for_impl(NumWorkItems, Kernel); + void parallel_for(range<1> NumWorkItems, kernel Kernel, + const property_list &PropList = {}) { + parallel_for_impl(NumWorkItems, Kernel, PropList); } - void parallel_for(range<2> NumWorkItems, kernel Kernel) { - parallel_for_impl(NumWorkItems, Kernel); + void parallel_for(range<2> NumWorkItems, kernel Kernel, + const property_list &PropList = {}) { + parallel_for_impl(NumWorkItems, Kernel, PropList); } - void parallel_for(range<3> NumWorkItems, kernel Kernel) { - parallel_for_impl(NumWorkItems, Kernel); + void parallel_for(range<3> NumWorkItems, kernel Kernel, + const property_list &PropList = {}) { + parallel_for_impl(NumWorkItems, Kernel, PropList); } /// Defines and invokes a SYCL kernel function for the specified range and @@ -1456,12 +1524,13 @@ class __SYCL_EXPORT handler { /// \param Kernel is a SYCL kernel function. template void parallel_for(range NumWorkItems, id WorkItemOffset, - kernel Kernel) { + kernel Kernel, const property_list &PropList = {}) { throwIfActionIsCreated(); verifyKernelInvoc(Kernel); MKernel = detail::getSyclObjImpl(std::move(Kernel)); detail::checkValueRange(NumWorkItems, WorkItemOffset); MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); + MPropList = PropList; MCGType = detail::CG::KERNEL; extractArgsAndReqs(); MKernelName = getKernelName(); @@ -1475,12 +1544,15 @@ class __SYCL_EXPORT handler { /// \param NDRange is a ND-range defining global and local sizes as /// well as offset. /// \param Kernel is a SYCL kernel function. - template void parallel_for(nd_range NDRange, kernel Kernel) { + template + void parallel_for(nd_range NDRange, kernel Kernel, + const property_list &PropList = {}) { throwIfActionIsCreated(); verifyKernelInvoc(Kernel); MKernel = detail::getSyclObjImpl(std::move(Kernel)); detail::checkValueRange(NDRange); MNDRDesc.set(std::move(NDRange)); + MPropList = PropList; MCGType = detail::CG::KERNEL; extractArgsAndReqs(); MKernelName = getKernelName(); @@ -1493,17 +1565,20 @@ class __SYCL_EXPORT handler { /// \param KernelFunc is a lambda that is used if device, queue is bound to, /// is a host device. template - void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc)) { + void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; #ifdef __SYCL_DEVICE_ONLY__ (void)Kernel; + (void)PropList; kernel_single_task(KernelFunc); #else // No need to check if range is out of INT_MAX limits as it's compile-time // known constant MNDRDesc.set(range<1>{1}); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); MCGType = detail::CG::KERNEL; if (!MIsHost && !lambdaAndKernelHaveEqualName()) { @@ -1533,7 +1608,8 @@ class __SYCL_EXPORT handler { template void parallel_for(kernel Kernel, range NumWorkItems, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1541,10 +1617,12 @@ class __SYCL_EXPORT handler { #ifdef __SYCL_DEVICE_ONLY__ (void)Kernel; (void)NumWorkItems; + (void)PropList; kernel_parallel_for(KernelFunc); #else detail::checkValueRange(NumWorkItems); MNDRDesc.set(std::move(NumWorkItems)); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); MCGType = detail::CG::KERNEL; if (!MIsHost && !lambdaAndKernelHaveEqualName()) { @@ -1568,7 +1646,8 @@ class __SYCL_EXPORT handler { template void parallel_for(kernel Kernel, range NumWorkItems, - id WorkItemOffset, _KERNELFUNCPARAM(KernelFunc)) { + id WorkItemOffset, _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1577,10 +1656,12 @@ class __SYCL_EXPORT handler { (void)Kernel; (void)NumWorkItems; (void)WorkItemOffset; + (void)PropList; kernel_parallel_for(KernelFunc); #else detail::checkValueRange(NumWorkItems, WorkItemOffset); MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); MCGType = detail::CG::KERNEL; if (!MIsHost && !lambdaAndKernelHaveEqualName()) { @@ -1604,7 +1685,8 @@ class __SYCL_EXPORT handler { template void parallel_for(kernel Kernel, nd_range NDRange, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1613,10 +1695,12 @@ class __SYCL_EXPORT handler { #ifdef __SYCL_DEVICE_ONLY__ (void)Kernel; (void)NDRange; + (void)PropList; kernel_parallel_for(KernelFunc); #else detail::checkValueRange(NDRange); MNDRDesc.set(std::move(NDRange)); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); MCGType = detail::CG::KERNEL; if (!MIsHost && !lambdaAndKernelHaveEqualName()) { @@ -1644,7 +1728,8 @@ class __SYCL_EXPORT handler { template void parallel_for_work_group(kernel Kernel, range NumWorkGroups, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1653,10 +1738,12 @@ class __SYCL_EXPORT handler { #ifdef __SYCL_DEVICE_ONLY__ (void)Kernel; (void)NumWorkGroups; + (void)PropList; kernel_parallel_for_work_group(KernelFunc); #else detail::checkValueRange(NumWorkGroups); MNDRDesc.setNumWorkGroups(NumWorkGroups); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); StoreLambda(std::move(KernelFunc)); MCGType = detail::CG::KERNEL; @@ -1682,7 +1769,8 @@ class __SYCL_EXPORT handler { int Dims> void parallel_for_work_group(kernel Kernel, range NumWorkGroups, range WorkGroupSize, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1692,12 +1780,14 @@ class __SYCL_EXPORT handler { (void)Kernel; (void)NumWorkGroups; (void)WorkGroupSize; + (void)PropList; kernel_parallel_for_work_group(KernelFunc); #else nd_range ExecRange = nd_range(NumWorkGroups * WorkGroupSize, WorkGroupSize); detail::checkValueRange(ExecRange); MNDRDesc.set(std::move(ExecRange)); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); StoreLambda(std::move(KernelFunc)); MCGType = detail::CG::KERNEL; @@ -2066,6 +2156,7 @@ class __SYCL_EXPORT handler { detail::code_location MCodeLoc = {}; bool MIsFinalized = false; event MLastEvent; + property_list MPropList; // Make queue_impl class friend to be able to call finalize method. friend class detail::queue_impl; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 2e0e28a3538f7..523a307c423da 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1641,8 +1641,8 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) { pi_result ExecCGCommand::SetKernelParamsAndLaunch( CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc, - std::vector &RawEvents, RT::PiEvent &Event, - ProgramManager::KernelArgMask EliminatedArgMask) { + const property_list &PropList, std::vector &RawEvents, + RT::PiEvent &Event, ProgramManager::KernelArgMask EliminatedArgMask) { vector_class &Args = ExecKernel->MArgs; // TODO this is not necessary as long as we can guarantee that the arguments // are already sorted (e. g. handle the sorting in handler if necessary due @@ -1884,6 +1884,7 @@ cl_int ExecCGCommand::enqueueImp() { CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); NDRDescT &NDRDesc = ExecKernel->MNDRDesc; + const property_list &PropList = ExecKernel->MPropList; if (MQueue->is_host()) { for (ArgDesc &Arg : ExecKernel->MArgs) @@ -1950,11 +1951,11 @@ cl_int ExecCGCommand::enqueueImp() { if (KernelMutex != nullptr) { // For cacheable kernels, we use per-kernel mutex std::lock_guard Lock(*KernelMutex); - Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents, - Event, EliminatedArgMask); + Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, PropList, + RawEvents, Event, EliminatedArgMask); } else { - Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents, - Event, EliminatedArgMask); + Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, PropList, + RawEvents, Event, EliminatedArgMask); } if (PI_SUCCESS != Error) { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 8713096be9127..e63664e66512c 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -520,8 +520,8 @@ class ExecCGCommand : public Command { pi_result SetKernelParamsAndLaunch( CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc, - std::vector &RawEvents, RT::PiEvent &Event, - ProgramManager::KernelArgMask EliminatedArgMask); + const property_list &, std::vector &RawEvents, + RT::PiEvent &Event, ProgramManager::KernelArgMask EliminatedArgMask); std::unique_ptr MCommandGroup; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 5e63addf8e446..77231bf6dfb7b 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -33,8 +33,8 @@ event handler::finalize() { case detail::CG::KERNEL: case detail::CG::RUN_ON_HOST_INTEL: { CommandGroup.reset(new detail::CGExecKernel( - std::move(MNDRDesc), std::move(MHostKernel), std::move(MKernel), - std::move(MArgsStorage), std::move(MAccStorage), + std::move(MNDRDesc), std::move(MPropList), std::move(MHostKernel), + std::move(MKernel), std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), std::move(MRequirements), std::move(MEvents), std::move(MArgs), std::move(MKernelName), std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType, diff --git a/sycl/test/abi/layout_handler.cpp b/sycl/test/abi/layout_handler.cpp index 9ba991356e6ad..2f623607363a9 100644 --- a/sycl/test/abi/layout_handler.cpp +++ b/sycl/test/abi/layout_handler.cpp @@ -186,5 +186,19 @@ void foo() { // CHECK-NEXT: 544 | std::__shared_ptr::element_type * _M_ptr // CHECK-NEXT: 552 | class std::__shared_count<__gnu_cxx::_S_atomic> _M_refcount // CHECK-NEXT: 552 | _Sp_counted_base<(enum __gnu_cxx::_Lock_policy)2U> * _M_pi -// CHECK-NEXT: | [sizeof=560, dsize=560, align=8, -// CHECK-NEXT: | nvsize=560, nvalign=8] +// CHECK-NEXT: 560 | class sycl::property_list MPropList +// CHECK-NEXT: 560 | class sycl::detail::PropertyListBase (base) +// CHECK-NEXT: 560 | class std::bitset<7> MDataLessProps +// CHECK-NEXT: 560 | struct std::_Base_bitset<1> (base) +// CHECK-NEXT: 560 | std::_Base_bitset<1>::_WordT _M_w +// CHECK-NEXT: 568 | class std::vector > MPropsWithData +// CHECK-NEXT: 568 | struct std::_Vector_base, class std::allocator > > (base) +// CHECK-NEXT: 568 | struct std::_Vector_base, class std::allocator > >::_Vector_impl _M_impl +// CHECK-NEXT: 568 | class std::allocator > (base) (empty) +// CHECK-NEXT: 568 | class __gnu_cxx::new_allocator > (base) (empty) +// CHECK-NEXT: 568 | struct std::_Vector_base, class std::allocator > >::_Vector_impl_data (base) +// CHECK-NEXT: 568 | std::_Vector_base, class std::allocator > >::pointer _M_start +// CHECK-NEXT: 576 | std::_Vector_base, class std::allocator > >::pointer _M_finish +// CHECK-NEXT: 584 | std::_Vector_base, class std::allocator > >::pointer _M_end_of_storage +// CHECK-NEXT: | [sizeof=592, dsize=592, align=8, +// CHECK-NEXT: | nvsize=592, nvalign=8] diff --git a/sycl/test/abi/symbol_size_alignment.cpp b/sycl/test/abi/symbol_size_alignment.cpp index 4ff2cdac83d8e..81fd10efde62a 100644 --- a/sycl/test/abi/symbol_size_alignment.cpp +++ b/sycl/test/abi/symbol_size_alignment.cpp @@ -62,7 +62,7 @@ int main() { check(); check, 272, 8>(); #else - check(); + check(); check(); check, 240, 8>(); #endif diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index 01d9e8a20bd6f..d8568fda7b7df 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -33,13 +33,13 @@ class MockHandler : public sycl::handler { case detail::CG::KERNEL: case detail::CG::RUN_ON_HOST_INTEL: { CommandGroup.reset(new detail::CGExecKernel( - std::move(CGH->MNDRDesc), std::move(CGH->MHostKernel), - std::move(CGH->MKernel), std::move(CGH->MArgsStorage), - std::move(CGH->MAccStorage), std::move(CGH->MSharedPtrStorage), - std::move(CGH->MRequirements), std::move(CGH->MEvents), - std::move(CGH->MArgs), std::move(CGH->MKernelName), - std::move(CGH->MOSModuleHandle), std::move(CGH->MStreamStorage), - CGH->MCGType, CGH->MCodeLoc)); + std::move(CGH->MNDRDesc), std::move(CGH->MPropList), + std::move(CGH->MHostKernel), std::move(CGH->MKernel), + std::move(CGH->MArgsStorage), std::move(CGH->MAccStorage), + std::move(CGH->MSharedPtrStorage), std::move(CGH->MRequirements), + std::move(CGH->MEvents), std::move(CGH->MArgs), + std::move(CGH->MKernelName), std::move(CGH->MOSModuleHandle), + std::move(CGH->MStreamStorage), CGH->MCGType, CGH->MCodeLoc)); break; } default: