diff --git a/README.md b/README.md index 9287b18e707d0..8d460f10cd349 100644 --- a/README.md +++ b/README.md @@ -57,8 +57,8 @@ is something we are interested in expanding on. | Recording an in-order queue preserves linear dependencies | Not implemented | | Using `handler::parallel_for` in a graph node | Implemented | | Using `handler::single_task` in a graph node | Implemented | -| Using `handler::memcpy` in a graph node | Implemented for USM, not implemented for buffer accessors | -| Using `handler::copy` in a graph node | Not implemented | +| Using `handler::memcpy` in a graph node | Implemented | +| Using `handler::copy` in a graph node | Implemented | | Using `handler::host_task` in a graph node | Not implemented | | Using `handler::fill` in a graph node | Implemented for USM, not implemented for buffer accessors | | Using `handler::memset` in a graph node | Not implemented | diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index 023e0f555b64c..fe8043a5696e8 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -148,6 +148,8 @@ _PI_API(piextCommandBufferRelease) _PI_API(piextCommandBufferFinalize) _PI_API(piextCommandBufferNDRangeKernel) _PI_API(piextCommandBufferMemcpyUSM) +_PI_API(piextCommandBufferMemBufferCopy) +_PI_API(piextCommandBufferMemBufferCopyRect) _PI_API(piextEnqueueCommandBuffer) _PI_API(piPluginGetLastError) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 8c3b471c42202..1f8ccd5b8cd63 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -2183,6 +2183,50 @@ __SYCL_EXPORT pi_result piextCommandBufferMemcpyUSM( const pi_ext_sync_point *sync_point_wait_list, pi_ext_sync_point *sync_point); +/// API to append a mem buffer copy command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param src_buffer is the data to be copied +/// \param dst_buffer is the location the data will be copied +/// \param src_offset offset into \p src_buffer +/// \param dst_offset offset into \p dst_buffer +/// \param size is number of bytes to copy +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferMemBufferCopy( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to append a rectangular mem buffer copy command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param src_buffer is the data to be copied +/// \param dst_buffer is the location the data will be copied +/// \param src_origin offset for the start of the region to copy in src_buffer +/// \param dst_origin offset for the start of the region to copy in dst_buffer +/// \param region The size of the region to be copied +/// \param src_row_pitch Row pitch for the src data +/// \param src_slice_pitch Slice pitch for the src data +/// \param dst_row_pitch Row pitch for the dst data +/// \param dst_slice_pitch Slice pitch for the dst data +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + /// API to submit the command-buffer to queue for execution, returns an error if /// command-buffer not finalized or another instance of same command-buffer /// currently executing. diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 0aa8a64427705..4de1bfb00dae3 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -5845,6 +5845,28 @@ pi_result cuda_piextCommandBufferMemcpyUSM( return {}; } +pi_result cuda_piextCommandBufferMemBufferCopy( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + sycl::detail::pi::die("command-buffer API not implemented in CUDA backend"); + return {}; +} + +pi_result cuda_piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + sycl::detail::pi::die("command-buffer API not implemented in CUDA backend"); + return {}; +} + pi_result cuda_piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, pi_queue queue, pi_uint32 num_events_in_wait_list, @@ -6050,6 +6072,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextCommandBufferRelease, cuda_piextCommandBufferRelease) _PI_CL(piextCommandBufferNDRangeKernel, cuda_piextCommandBufferNDRangeKernel) _PI_CL(piextCommandBufferMemcpyUSM, cuda_piextCommandBufferMemcpyUSM) + _PI_CL(piextCommandBufferMemBufferCopy, cuda_piextCommandBufferMemBufferCopy) + _PI_CL(piextCommandBufferMemBufferCopyRect, + cuda_piextCommandBufferMemBufferCopyRect) _PI_CL(piextEnqueueCommandBuffer, cuda_piextEnqueueCommandBuffer) _PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj) diff --git a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp index 4f22a2425033c..841aceabc4078 100644 --- a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp +++ b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp @@ -2136,6 +2136,26 @@ piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, void *dst_ptr, DIE_NO_IMPLEMENTATION; } +pi_result piextCommandBufferMemBufferCopy( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + DIE_NO_IMPLEMENTATION; +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, pi_queue queue, pi_uint32 num_events_in_wait_list, diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 4d3d98e601721..56a234818977b 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -5593,6 +5593,28 @@ hip_piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, return {}; } +pi_result hip_piextCommandBufferMemBufferCopy( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result hip_piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + pi_result hip_piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, pi_queue queue, pi_uint32 num_events_in_wait_list, @@ -5798,6 +5820,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextCommandBufferRelease, hip_piextCommandBufferRelease) _PI_CL(piextCommandBufferNDRangeKernel, hip_piextCommandBufferNDRangeKernel) _PI_CL(piextCommandBufferMemcpyUSM, hip_piextCommandBufferMemcpyUSM) + _PI_CL(piextCommandBufferMemBufferCopy, hip_piextCommandBufferMemBufferCopy) + _PI_CL(piextCommandBufferMemBufferCopyRect, + hip_piextCommandBufferMemBufferCopyRect) _PI_CL(piextEnqueueCommandBuffer, hip_piextEnqueueCommandBuffer) _PI_CL(piextKernelSetArgMemObj, hip_piextKernelSetArgMemObj) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 8761dc4404356..141a67723ced2 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -9058,21 +9058,16 @@ pi_result piextCommandBufferNDRangeKernel( return PI_SUCCESS; } -pi_result piextCommandBufferMemcpyUSM( - pi_ext_command_buffer CommandBuffer, void *DstPtr, const void *SrcPtr, +// Helper function for common code when enqueuing memory operations to a command +// buffer. +static pi_result enqueueCommandBufferMemCopyHelper( + pi_ext_command_buffer CommandBuffer, void *Dst, const void *Src, size_t Size, pi_uint32 NumSyncPointsInWaitList, const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { - if (!DstPtr) { - return PI_ERROR_INVALID_VALUE; - } - std::vector ZeEventList; pi_result Res = getEventsFromSyncPoints(CommandBuffer->SyncPoints, NumSyncPointsInWaitList, SyncPointWaitList, ZeEventList); - if (Res) { - return Res; - } pi_event LaunchEvent; Res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent); @@ -9080,8 +9075,8 @@ pi_result piextCommandBufferMemcpyUSM( return PI_ERROR_OUT_OF_HOST_MEMORY; ZE_CALL(zeCommandListAppendMemoryCopy, - (CommandBuffer->ZeCommandList, DstPtr, SrcPtr, Size, - LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + (CommandBuffer->ZeCommandList, Dst, Src, Size, LaunchEvent->ZeEvent, + ZeEventList.size(), ZeEventList.data())); urPrint("calling zeCommandListAppendMemoryCopy() with" " ZeEvent %#lx\n", @@ -9093,6 +9088,140 @@ pi_result piextCommandBufferMemcpyUSM( return PI_SUCCESS; } +// Helper function for common code when enqueuing rectangular memory operations +// to a command buffer. +static pi_result enqueueCommandBufferMemCopyRectHelper( + pi_ext_command_buffer CommandBuffer, void *Dst, const void *Src, + pi_buff_rect_offset SrcOrigin, pi_buff_rect_offset DstOrigin, + pi_buff_rect_region Region, size_t SrcRowPitch, size_t DstRowPitch, + size_t SrcSlicePitch, size_t DstSlicePitch, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + PI_ASSERT(Region && SrcOrigin && DstOrigin, PI_ERROR_INVALID_VALUE); + + uint32_t SrcOriginX = ur_cast(SrcOrigin->x_bytes); + uint32_t SrcOriginY = ur_cast(SrcOrigin->y_scalar); + uint32_t SrcOriginZ = ur_cast(SrcOrigin->z_scalar); + + uint32_t SrcPitch = SrcRowPitch; + if (SrcPitch == 0) + SrcPitch = ur_cast(Region->width_bytes); + + if (SrcSlicePitch == 0) + SrcSlicePitch = ur_cast(Region->height_scalar) * SrcPitch; + + uint32_t DstOriginX = ur_cast(DstOrigin->x_bytes); + uint32_t DstOriginY = ur_cast(DstOrigin->y_scalar); + uint32_t DstOriginZ = ur_cast(DstOrigin->z_scalar); + + uint32_t DstPitch = DstRowPitch; + if (DstPitch == 0) + DstPitch = ur_cast(Region->width_bytes); + + if (DstSlicePitch == 0) + DstSlicePitch = ur_cast(Region->height_scalar) * DstPitch; + + uint32_t Width = ur_cast(Region->width_bytes); + uint32_t Height = ur_cast(Region->height_scalar); + uint32_t Depth = ur_cast(Region->depth_scalar); + + const ze_copy_region_t ZeSrcRegion = {SrcOriginX, SrcOriginY, SrcOriginZ, + Width, Height, Depth}; + const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ, + Width, Height, Depth}; + + std::vector ZeEventList; + pi_result Res = getEventsFromSyncPoints(CommandBuffer->SyncPoints, + NumSyncPointsInWaitList, + SyncPointWaitList, ZeEventList); + + pi_event LaunchEvent; + Res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent); + if (Res) + return PI_ERROR_OUT_OF_HOST_MEMORY; + + ZE_CALL(zeCommandListAppendMemoryCopyRegion, + (CommandBuffer->ZeCommandList, Dst, &ZeDstRegion, DstPitch, + DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, SrcSlicePitch, + LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + + urPrint("calling zeCommandListAppendMemoryCopyRegion() with" + " ZeEvent %#lx\n", + ur_cast(LaunchEvent->ZeEvent)); + + // Get sync point and register the event with it. + *SyncPoint = CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + return PI_SUCCESS; +} + +pi_result piextCommandBufferMemcpyUSM( + pi_ext_command_buffer CommandBuffer, void *DstPtr, const void *SrcPtr, + size_t Size, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + if (!DstPtr) { + return PI_ERROR_INVALID_VALUE; + } + + return enqueueCommandBufferMemCopyHelper(CommandBuffer, DstPtr, SrcPtr, Size, + NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferCopy( + pi_ext_command_buffer CommandBuffer, pi_mem SrcMem, pi_mem DstMem, + size_t SrcOffset, size_t DstOffset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); + + auto SrcBuffer = ur_cast(SrcMem); + auto DstBuffer = ur_cast(DstMem); + + std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex> LockAll( + SrcLock, DstBuffer->Mutex); + + char *ZeHandleSrc; + PI_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, _pi_mem::read_only, + CommandBuffer->Device)); + char *ZeHandleDst; + PI_CALL(DstBuffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, + CommandBuffer->Device)); + + return enqueueCommandBufferMemCopyHelper( + CommandBuffer, ZeHandleDst, ZeHandleSrc, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer CommandBuffer, pi_mem SrcMem, pi_mem DstMem, + pi_buff_rect_offset SrcOrigin, pi_buff_rect_offset DstOrigin, + pi_buff_rect_region Region, size_t SrcRowPitch, size_t SrcSlicePitch, + size_t DstRowPitch, size_t DstSlicePitch, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); + + auto SrcBuffer = ur_cast(SrcMem); + auto DstBuffer = ur_cast(DstMem); + + std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex> LockAll( + SrcLock, DstBuffer->Mutex); + + char *ZeHandleSrc; + PI_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, _pi_mem::read_only, + CommandBuffer->Device)); + char *ZeHandleDst; + PI_CALL(DstBuffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, + CommandBuffer->Device)); + + return enqueueCommandBufferMemCopyRectHelper( + CommandBuffer, ZeHandleDst, ZeHandleSrc, SrcOrigin, DstOrigin, Region, + SrcRowPitch, DstRowPitch, SrcSlicePitch, DstSlicePitch, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 5100f762725b3..0b56b08200dcd 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -2320,6 +2320,28 @@ piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, void *dst_ptr, return {}; } +pi_result piextCommandBufferMemBufferCopy( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + // Not implemented + return {}; +} + +pi_result piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + // Not implemented + return {}; +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, pi_queue queue, pi_uint32 num_events_in_wait_list, @@ -2530,6 +2552,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextCommandBufferRelease, piextCommandBufferRelease) _PI_CL(piextCommandBufferNDRangeKernel, piextCommandBufferNDRangeKernel) _PI_CL(piextCommandBufferMemcpyUSM, piextCommandBufferMemcpyUSM) + _PI_CL(piextCommandBufferMemBufferCopy, piextCommandBufferMemBufferCopy) + _PI_CL(piextCommandBufferMemBufferCopyRect, + piextCommandBufferMemBufferCopyRect) _PI_CL(piextEnqueueCommandBuffer, piextEnqueueCommandBuffer) _PI_CL(piextKernelSetArgMemObj, piextKernelSetArgMemObj) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index ef7fdf09c9796..b1bb693191249 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -48,24 +48,24 @@ void connect_to_exit_nodes( } } -/// Recursive check if a graph node or its successors contains a given kernel -/// argument. -/// @param[in] Arg The kernel argument to check for. +/// Recursive check if a graph node or its successors contains a given +/// requirement. +/// @param[in] Req The requirement to check for. /// @param[in] CurrentNode The current graph node being checked. /// @param[in,out] Deps The unique list of dependencies which have been -/// identified for this arg. +/// identified for this requirement. /// @return True if a dependency was added in this node or any of its /// successors. -bool check_for_arg(const sycl::detail::ArgDesc &Arg, - const std::shared_ptr &CurrentNode, - std::set> &Deps) { +bool check_for_requirement(sycl::detail::AccessorImplHost *Req, + const std::shared_ptr &CurrentNode, + std::set> &Deps) { bool SuccessorAddedDep = false; for (auto &Successor : CurrentNode->MSuccessors) { - SuccessorAddedDep |= check_for_arg(Arg, Successor, Deps); + SuccessorAddedDep |= check_for_requirement(Req, Successor, Deps); } if (!CurrentNode->is_empty() && Deps.find(CurrentNode) == Deps.end() && - CurrentNode->has_arg(Arg) && !SuccessorAddedDep) { + CurrentNode->has_requirement(Req) && !SuccessorAddedDep) { Deps.insert(CurrentNode); return true; } @@ -164,36 +164,32 @@ graph_impl::add(sycl::detail::CG::CGTYPE CGType, const std::vector> &Dep) { // Copy deps so we can modify them auto Deps = Dep; - if (CGType == sycl::detail::CG::Kernel) { - // A unique set of dependencies obtained by checking kernel arguments - // for accessors - std::set> UniqueDeps; - const auto &Args = - static_cast(CommandGroup.get())->MArgs; - for (auto &Arg : Args) { - if (Arg.MType != sycl::detail::kernel_param_kind_t::kind_accessor) { - continue; - } - // Look through the graph for nodes which share this argument - for (auto NodePtr : MRoots) { - check_for_arg(Arg, NodePtr, UniqueDeps); - } - } - // Add any deps determined from accessor arguments into the dependency list - Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end()); + // A unique set of dependencies obtained by checking requirements and events + std::set> UniqueDeps; + const auto &Requirements = CommandGroup->MRequirements; + for (auto &Req : Requirements) { + // Look through the graph for nodes which share this requirement + for (auto NodePtr : MRoots) { + check_for_requirement(Req, NodePtr, UniqueDeps); + } } // Add any nodes specified by event dependencies into the dependency list for (auto Dep : CommandGroup->MEvents) { if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) { - Deps.push_back(NodeImpl->second); + if (UniqueDeps.find(NodeImpl->second) == UniqueDeps.end()) { + UniqueDeps.insert(NodeImpl->second); + } } else { throw sycl::exception(errc::invalid, "Event dependency from handler::depends_on does " "not correspond to a node within the graph"); } } + // Add any deps determined from requirements and events into the dependency + // list + Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end()); const std::shared_ptr &NodeImpl = std::make_shared(CGType, std::move(CommandGroup)); diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 67f0e888d7a91..b3c75c7f2e2c5 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include @@ -90,23 +91,14 @@ class node_impl { Schedule.push_front(NodeImpl); } - /// Checks if this node has an argument. - /// @param Arg Argument to lookup. - /// @return True if \p Arg is used in node, false otherwise. - bool has_arg(const sycl::detail::ArgDesc &Arg) { - // TODO: Handle types other than exec kernel - assert(MCGType == sycl::detail::CG::Kernel); - const auto &Args = - static_cast(MCommandGroup.get())->MArgs; - for (auto &NodeArg : Args) { - if (Arg.MType == NodeArg.MType && Arg.MSize == NodeArg.MSize) { - // Args are actually void** so we need to dereference them to compare - // actual values - void *IncomingPtr = *static_cast(Arg.MPtr); - void *ArgPtr = *static_cast(NodeArg.MPtr); - if (IncomingPtr == ArgPtr) { - return true; - } + /// Checks if this node has a given requirement. + /// @param Requirement Requirement to lookup. + /// @return True if \p Requirement is present in node, false otherwise. + bool has_requirement(sycl::detail::AccessorImplHost *IncomingReq) { + for (sycl::detail::AccessorImplHost *CurrentReq : + MCommandGroup->MRequirements) { + if (IncomingReq->MSYCLMemObj == CurrentReq->MSYCLMemObj) { + return true; } } return false; diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index abf5a400ca748..6a26fde95decf 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -1224,6 +1224,68 @@ void MemoryManager::copy_from_device_global( } // Command buffer methods +void MemoryManager::ext_oneapi_copy_cmd_buffer( + sycl::detail::ContextImplPtr Context, RT::PiExtCommandBuffer CommandBuffer, + SYCLMemObjI *SYCLMemObj, void *SrcMem, unsigned int DimSrc, + sycl::range<3> SrcSize, sycl::range<3> SrcAccessRange, + sycl::id<3> SrcOffset, unsigned int SrcElemSize, void *DstMem, + unsigned int DimDst, sycl::range<3> DstSize, sycl::range<3> DstAccessRange, + sycl::id<3> DstOffset, unsigned int DstElemSize, + std::vector Deps, RT::PiExtSyncPoint *OutSyncPoint) { + assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + + const PluginPtr &Plugin = Context->getPlugin(); + + detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType(); + TermPositions SrcPos, DstPos; + prepTermPositions(SrcPos, DimSrc, MemType); + prepTermPositions(DstPos, DimDst, MemType); + + size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize; + size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize; + size_t SrcAccessRangeWidthBytes = SrcAccessRange[SrcPos.XTerm] * SrcElemSize; + size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize; + size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize; + + if (MemType == detail::SYCLMemObjI::MemObjType::Buffer) { + if (1 == DimDst && 1 == DimSrc) { + Plugin->call( + CommandBuffer, RT::cast(SrcMem), + RT::cast(DstMem), SrcXOffBytes, DstXOffBytes, + SrcAccessRangeWidthBytes, Deps.size(), Deps.data(), OutSyncPoint); + } else { + // passing 0 for pitches not allowed. Because clEnqueueCopyBufferRect will + // calculate both src and dest pitch using region[0], which is not correct + // if src and dest are not the same size. + size_t SrcRowPitch = SrcSzWidthBytes; + size_t SrcSlicePitch = (DimSrc <= 1) + ? SrcSzWidthBytes + : SrcSzWidthBytes * SrcSize[SrcPos.YTerm]; + size_t DstRowPitch = DstSzWidthBytes; + size_t DstSlicePitch = (DimDst <= 1) + ? DstSzWidthBytes + : DstSzWidthBytes * DstSize[DstPos.YTerm]; + + pi_buff_rect_offset_struct SrcOrigin{ + SrcXOffBytes, SrcOffset[SrcPos.YTerm], SrcOffset[SrcPos.ZTerm]}; + pi_buff_rect_offset_struct DstOrigin{ + DstXOffBytes, DstOffset[DstPos.YTerm], DstOffset[DstPos.ZTerm]}; + pi_buff_rect_region_struct Region{SrcAccessRangeWidthBytes, + SrcAccessRange[SrcPos.YTerm], + SrcAccessRange[SrcPos.ZTerm]}; + + Plugin->call( + CommandBuffer, RT::cast(SrcMem), + RT::cast(DstMem), &SrcOrigin, &DstOrigin, &Region, + SrcRowPitch, SrcSlicePitch, DstRowPitch, DstSlicePitch, Deps.size(), + Deps.data(), OutSyncPoint); + } + } else { + throw sycl::exception(sycl::errc::invalid, + "Images are not supported in Graphs"); + } +} + void MemoryManager::ext_oneapi_copy_usm_cmd_buffer( ContextImplPtr Context, const void *SrcMem, RT::PiExtCommandBuffer CommandBuffer, size_t Len, void *DstMem, diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index 578730d45c4fc..f967c0fd3b162 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -175,6 +175,16 @@ class __SYCL_EXPORT MemoryManager { const std::vector &DepEvents, RT::PiEvent *OutEvent); // Command buffer extension methods + static void ext_oneapi_copy_cmd_buffer( + sycl::detail::ContextImplPtr Context, + RT::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, + void *SrcMem, unsigned int DimSrc, sycl::range<3> SrcSize, + sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset, + unsigned int SrcElemSize, void *DstMem, unsigned int DimDst, + sycl::range<3> DstSize, sycl::range<3> DstAccessRange, + sycl::id<3> DstOffset, unsigned int DstElemSize, + std::vector Deps, RT::PiExtSyncPoint *OutSyncPoint); + static void ext_oneapi_copy_usm_cmd_buffer( ContextImplPtr Context, const void *SrcMem, RT::PiExtCommandBuffer CommandBuffer, size_t Len, void *DstMem, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index e50f3823cbbdc..4adcd934bd4f4 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2521,6 +2521,25 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { MEvent->setSyncPoint(OutSyncPoint); return PI_SUCCESS; } + case CG::CGTYPE::CopyAccToAcc: { + CGCopy *Copy = (CGCopy *)MCommandGroup.get(); + Requirement *ReqSrc = (Requirement *)(Copy->getSrc()); + Requirement *ReqDst = (Requirement *)(Copy->getDst()); + + AllocaCommandBase *AllocaCmdSrc = getAllocaForReq(ReqSrc); + AllocaCommandBase *AllocaCmdDst = getAllocaForReq(ReqDst); + + MemoryManager::ext_oneapi_copy_cmd_buffer( + MQueue->getContextImplPtr(), MCommandBuffer, + AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), + ReqSrc->MDims, ReqSrc->MMemoryRange, ReqSrc->MAccessRange, + ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(), + ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange, + ReqDst->MOffset, ReqDst->MElemSize, std::move(MSyncPointDeps), + &OutSyncPoint); + MEvent->setSyncPoint(OutSyncPoint); + return PI_SUCCESS; + } default: throw runtime_error("CG type not implemented for command buffers.", PI_ERROR_INVALID_OPERATION); diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy.cpp new file mode 100644 index 0000000000000..6b3b0501f49e0 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_copy.cpp @@ -0,0 +1,123 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests adding a buffer copy node using the explicit API and submitting +// the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + const T ModValue = 7; + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + // Create reference data for output + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + for (unsigned i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceA[j] = ReferenceB[j]; + ReferenceA[j] += ModValue; + ReferenceB[j] = ReferenceA[j]; + ReferenceB[j] += ModValue; + ReferenceC[j] = ReferenceB[j]; + } + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + buffer BufferA{DataA}; + BufferA.set_write_back(false); + buffer BufferB{DataB}; + BufferB.set_write_back(false); + buffer BufferC{DataC}; + BufferC.set_write_back(false); + + // Copy from B to A + auto NodeA = Graph.add([&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccB, AccA); + }); + + // Read & write A + auto NodeB = Graph.add( + [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccA[LinID] += ModValue; + }); + }, + {exp_ext::property::node::depends_on(NodeA)}); + + // Read & write B + auto NodeModB = Graph.add( + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccB[LinID] += ModValue; + }); + }, + {exp_ext::property::node::depends_on(NodeA)}); + + // memcpy from A to B + auto NodeC = Graph.add( + [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccA, AccB); + }, + {exp_ext::property::node::depends_on(NodeB, NodeModB)}); + + // Read and write B + auto NodeD = Graph.add( + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccB[LinID] += ModValue; + }); + }, + {exp_ext::property::node::depends_on(NodeC)}); + + // Copy from B to C + Graph.add( + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + auto AccC = BufferC.get_access(CGH); + CGH.copy(AccB, AccC); + }, + {exp_ext::property::node::depends_on(NodeD)}); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + + host_accessor HostAccA(BufferA); + host_accessor HostAccB(BufferB); + host_accessor HostAccC(BufferC); + + for (size_t i = 0; i < Size; i++) { + assert(ReferenceA[i] == HostAccA[i]); + assert(ReferenceB[i] == HostAccB[i]); + assert(ReferenceC[i] == HostAccC[i]); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_2d.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_2d.cpp new file mode 100644 index 0000000000000..df11345306217 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_2d.cpp @@ -0,0 +1,120 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests adding buffer 2d copy nodes using the explicit API and submitting +// the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + const T ModValue = 7; + std::vector DataA(Size * Size), DataB(Size * Size), DataC(Size * Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + // Create reference data for output + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + for (unsigned i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size * Size; j++) { + ReferenceA[j] = ReferenceB[j]; + ReferenceA[j] += ModValue; + ReferenceB[j] = ReferenceA[j]; + ReferenceB[j] += ModValue; + ReferenceC[j] = ReferenceB[j]; + } + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + // Make the buffers 2D so we can test the rect copy path + buffer BufferA{DataA.data(), range<2>(Size, Size)}; + BufferA.set_write_back(false); + buffer BufferB{DataB.data(), range<2>(Size, Size)}; + BufferB.set_write_back(false); + buffer BufferC{DataC.data(), range<2>(Size, Size)}; + BufferC.set_write_back(false); + + // Copy from B to A + auto NodeA = Graph.add([&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccB, AccA); + }); + + // Read & write A + auto NodeB = Graph.add( + [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.parallel_for(range<2>(Size, Size), + [=](item<2> id) { AccA[id] += ModValue; }); + }, + {exp_ext::property::node::depends_on(NodeA)}); + + // Read & write B + auto NodeModB = Graph.add( + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<2>(Size, Size), + [=](item<2> id) { AccB[id] += ModValue; }); + }, + {exp_ext::property::node::depends_on(NodeA)}); + + // memcpy from A to B + auto NodeC = Graph.add( + [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccA, AccB); + }, + {exp_ext::property::node::depends_on(NodeB, NodeModB)}); + + // Read and write B + auto NodeD = Graph.add( + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<2>(Size, Size), + [=](item<2> id) { AccB[id] += ModValue; }); + }, + {exp_ext::property::node::depends_on(NodeC)}); + + // Copy from B to C + Graph.add( + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + auto AccC = BufferC.get_access(CGH); + CGH.copy(AccB, AccC); + }, + {exp_ext::property::node::depends_on(NodeD)}); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + + host_accessor HostAccA(BufferA); + host_accessor HostAccB(BufferB); + host_accessor HostAccC(BufferC); + + for (size_t i = 0; i < Size; i++) { + for (size_t j = 0; j < Size; j++) { + assert(ReferenceA[i * Size + j] == HostAccA[i][j]); + assert(ReferenceB[i * Size + j] == HostAccB[i][j]); + assert(ReferenceC[i * Size + j] == HostAccC[i][j]); + } + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy.cpp new file mode 100644 index 0000000000000..64b2380c0cab3 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy.cpp @@ -0,0 +1,117 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests adding buffer copy nodes using the record and replay API and submitting +// the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + const T ModValue = 7; + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + // Create reference data for output + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + for (unsigned i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceA[j] = ReferenceB[j]; + ReferenceA[j] += ModValue; + ReferenceB[j] = ReferenceA[j]; + ReferenceB[j] += ModValue; + ReferenceC[j] = ReferenceB[j]; + } + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + buffer BufferA{DataA}; + BufferA.set_write_back(false); + buffer BufferB{DataB}; + BufferB.set_write_back(false); + buffer BufferC{DataC}; + BufferC.set_write_back(false); + + Graph.begin_recording(Queue); + + // Copy from B to A + Queue.submit([&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccB, AccA); + }); + + // Read & write A + Queue.submit([&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccA[LinID] += ModValue; + }); + }); + + // Read & write B + Queue.submit([&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccB[LinID] += ModValue; + }); + }); + + // memcpy from A to B + Queue.submit([&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccA, AccB); + }); + + // Read and write B + Queue.submit([&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccB[LinID] += ModValue; + }); + }); + + // Copy from B to C + Queue.submit([&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + auto AccC = BufferC.get_access(CGH); + CGH.copy(AccB, AccC); + }); + + Graph.end_recording(Queue); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + + host_accessor HostAccA(BufferA); + host_accessor HostAccB(BufferB); + host_accessor HostAccC(BufferC); + + for (size_t i = 0; i < Size; i++) { + assert(ReferenceA[i] == HostAccA[i]); + assert(ReferenceB[i] == HostAccB[i]); + assert(ReferenceC[i] == HostAccC[i]); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_2d.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_2d.cpp new file mode 100644 index 0000000000000..8e0df866b3041 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_2d.cpp @@ -0,0 +1,114 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests adding buffer 2d copy nodes using the record and replay API +// and submitting the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + const T ModValue = 7; + std::vector DataA(Size * Size), DataB(Size * Size), DataC(Size * Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + // Create reference data for output + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + for (unsigned i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size * Size; j++) { + ReferenceA[j] = ReferenceB[j]; + ReferenceA[j] += ModValue; + ReferenceB[j] = ReferenceA[j]; + ReferenceB[j] += ModValue; + ReferenceC[j] = ReferenceB[j]; + } + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + // Make the buffers 2D so we can test the rect copy path + buffer BufferA{DataA.data(), range<2>(Size, Size)}; + BufferA.set_write_back(false); + buffer BufferB{DataB.data(), range<2>(Size, Size)}; + BufferB.set_write_back(false); + buffer BufferC{DataC.data(), range<2>(Size, Size)}; + BufferC.set_write_back(false); + + Graph.begin_recording(Queue); + + // Copy from B to A + Queue.submit([&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccB, AccA); + }); + + // Read & write A + Queue.submit([&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.parallel_for(range<2>(Size, Size), + [=](item<2> id) { AccA[id] += ModValue; }); + }); + + // Read & write B + Queue.submit([&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<2>(Size, Size), + [=](item<2> id) { AccB[id] += ModValue; }); + }); + + // memcpy from A to B + Queue.submit([&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccA, AccB); + }); + + // Read and write B + Queue.submit([&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<2>(Size, Size), + [=](item<2> id) { AccB[id] += ModValue; }); + }); + + // Copy from B to C + Queue.submit([&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + auto AccC = BufferC.get_access(CGH); + CGH.copy(AccB, AccC); + }); + + Graph.end_recording(Queue); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + + host_accessor HostAccA(BufferA); + host_accessor HostAccB(BufferB); + host_accessor HostAccC(BufferC); + + for (size_t i = 0; i < Size; i++) { + for (size_t j = 0; j < Size; j++) { + assert(ReferenceA[i * Size + j] == HostAccA[i][j]); + assert(ReferenceB[i * Size + j] == HostAccB[i][j]); + assert(ReferenceC[i * Size + j] == HostAccC[i][j]); + } + } + + return 0; +} diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 7db83ecdb4304..b02dbd119db2a 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -85,6 +85,8 @@ piTearDown piclProgramCreateWithSource piextCommandBufferCreate piextCommandBufferFinalize +piextCommandBufferMemBufferCopy +piextCommandBufferMemBufferCopyRect piextCommandBufferMemcpyUSM piextCommandBufferNDRangeKernel piextCommandBufferRelease diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index f276ec47a80f5..0a707f2892718 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -36,6 +36,8 @@ piTearDown piclProgramCreateWithSource piextCommandBufferCreate piextCommandBufferFinalize +piextCommandBufferMemBufferCopy +piextCommandBufferMemBufferCopyRect piextCommandBufferMemcpyUSM piextCommandBufferNDRangeKernel piextCommandBufferRelease diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 75d02522585df..464572462a40b 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3843,6 +3843,7 @@ _ZN4sycl3_V16detail13MemoryManager20allocateMemSubBufferESt10shared_ptrINS1_12co _ZN4sycl3_V16detail13MemoryManager21copy_to_device_globalEPKvbSt10shared_ptrINS1_10queue_implEEmmS4_lRKSt6vectorIP9_pi_eventSaISA_EEPSA_ _ZN4sycl3_V16detail13MemoryManager23copy_from_device_globalEPKvbSt10shared_ptrINS1_10queue_implEEmmPvlRKSt6vectorIP9_pi_eventSaISB_EEPSB_ _ZN4sycl3_V16detail13MemoryManager24allocateInteropMemObjectESt10shared_ptrINS1_12context_implEEPvRKS3_INS1_10event_implEERKS5_RKNS0_13property_listERP9_pi_event +_ZN4sycl3_V16detail13MemoryManager26ext_oneapi_copy_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPvjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjSA_jSC_SC_SE_jSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager30ext_oneapi_copy_usm_cmd_bufferESt10shared_ptrINS1_12context_implEEPKvP22_pi_ext_command_buffermPvSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager3mapEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEENS0_6access4modeEjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjSt6vectorIP9_pi_eventSaISH_EERSH_ _ZN4sycl3_V16detail13MemoryManager4copyEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEEjNS0_5rangeILi3EEESA_NS0_2idILi3EEEjS5_S8_jSA_SA_SC_jSt6vectorIP9_pi_eventSaISF_EERSF_ diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index 7449b727bbedb..0e6456d350df6 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -1183,6 +1183,26 @@ inline pi_result mock_piextEnqueueCommandBuffer( return PI_SUCCESS; } +inline pi_result mock_piextCommandBufferMemBufferCopy( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + +inline pi_result mock_piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + inline pi_result mock_piTearDown(void *PluginParameter) { return PI_SUCCESS; } inline pi_result mock_piPluginGetLastError(char **message) {