diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index c97e82fb24733..0880b3c0788ba 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -158,4 +158,19 @@ _PI_API(piextEnqueueDeviceGlobalVariableRead) _PI_API(piPluginGetBackendOption) +// command-buffer Extension +_PI_API(piextCommandBufferCreate) +_PI_API(piextCommandBufferRetain) +_PI_API(piextCommandBufferRelease) +_PI_API(piextCommandBufferFinalize) +_PI_API(piextCommandBufferNDRangeKernel) +_PI_API(piextCommandBufferMemcpyUSM) +_PI_API(piextCommandBufferMemBufferCopy) +_PI_API(piextCommandBufferMemBufferCopyRect) +_PI_API(piextCommandBufferMemBufferWrite) +_PI_API(piextCommandBufferMemBufferWriteRect) +_PI_API(piextCommandBufferMemBufferRead) +_PI_API(piextCommandBufferMemBufferReadRect) +_PI_API(piextEnqueueCommandBuffer) + #undef _PI_API diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 7ad983ec5331d..3283261c6748d 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -96,9 +96,10 @@ // piextQueueGetNativeHandle // 14.33 Added new parameter (memory object properties) to // piextKernelSetArgMemObj +// 14.34 Added command-buffer extension methods #define _PI_H_VERSION_MAJOR 14 -#define _PI_H_VERSION_MINOR 33 +#define _PI_H_VERSION_MINOR 34 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -477,6 +478,7 @@ typedef enum { PI_COMMAND_TYPE_SVM_MEMFILL = 0x120B, PI_COMMAND_TYPE_SVM_MAP = 0x120C, PI_COMMAND_TYPE_SVM_UNMAP = 0x120D, + PI_COMMAND_TYPE_EXT_COMMAND_BUFFER = 0x12A8, PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_READ = 0x418E, PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_WRITE = 0x418F } _pi_command_type; @@ -2128,6 +2130,228 @@ __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, uint64_t *HostTime); +/// Command buffer extension +struct _pi_ext_command_buffer; +struct _pi_ext_sync_point; +using pi_ext_command_buffer = _pi_ext_command_buffer *; +using pi_ext_sync_point = pi_uint32; + +typedef enum { + PI_EXT_STRUCTURE_TYPE_COMMAND_BUFFER_DESC = 0 +} pi_ext_structure_type; + +struct pi_ext_command_buffer_desc final { + pi_ext_structure_type stype; + const void *pNext; + pi_queue_properties *properties; +}; + +/// API to create a command-buffer. +/// \param context The context to associate the command-buffer with. +/// \param device The device to associate the command-buffer with. +/// \param desc Descriptor for the new command-buffer. +/// \param ret_command_buffer Pointer to fill with the address of the new +/// command-buffer. +__SYCL_EXPORT pi_result +piextCommandBufferCreate(pi_context context, pi_device device, + const pi_ext_command_buffer_desc *desc, + pi_ext_command_buffer *ret_command_buffer); + +/// API to increment the reference count of the command-buffer +/// \param command_buffer The command_buffer to retain. +__SYCL_EXPORT pi_result +piextCommandBufferRetain(pi_ext_command_buffer command_buffer); + +/// API to decrement the reference count of the command-buffer. After the +/// command_buffer reference count becomes zero and has finished execution, the +/// command-buffer is deleted. +/// \param command_buffer The command_buffer to release. +__SYCL_EXPORT pi_result +piextCommandBufferRelease(pi_ext_command_buffer command_buffer); + +/// API to stop command-buffer recording such that no more commands can be +/// appended, and makes the command-buffer ready to enqueue on a command-queue. +/// \param command_buffer The command_buffer to finalize. +__SYCL_EXPORT pi_result +piextCommandBufferFinalize(pi_ext_command_buffer command_buffer); + +/// API to append a kernel execution command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param kernel The kernel to append. +/// \param work_dim Dimension of the kernel execution. +/// \param global_work_offset Offset to use when executing kernel. +/// \param global_work_size Global work size to use when executing kernel. +/// \param local_work_size Local work size to use when executing kernel. +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this kernel execution. +__SYCL_EXPORT pi_result piextCommandBufferNDRangeKernel( + pi_ext_command_buffer command_buffer, pi_kernel kernel, pi_uint32 work_dim, + const size_t *global_work_offset, const size_t *global_work_size, + const size_t *local_work_size, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to append a USM memcpy command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param dst_ptr is the location the data will be copied +/// \param src_ptr is the data to be copied +/// \param size is number of bytes to copy +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferMemcpyUSM( + pi_ext_command_buffer command_buffer, void *dst_ptr, const void *src_ptr, + size_t size, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to append a mem buffer copy command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param src_buffer is the data to be copied +/// \param dst_buffer is the location the data will be copied +/// \param src_offset offset into \p src_buffer +/// \param dst_offset offset into \p dst_buffer +/// \param size is number of bytes to copy +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferMemBufferCopy( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to append a rectangular mem buffer copy command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param src_buffer is the data to be copied +/// \param dst_buffer is the location the data will be copied +/// \param src_origin offset for the start of the region to copy in src_buffer +/// \param dst_origin offset for the start of the region to copy in dst_buffer +/// \param region The size of the region to be copied +/// \param src_row_pitch Row pitch for the src data +/// \param src_slice_pitch Slice pitch for the src data +/// \param dst_row_pitch Row pitch for the dst data +/// \param dst_slice_pitch Slice pitch for the dst data +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to append a mem buffer read command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param buffer is the data to be read +/// \param offset offset into \p buffer +/// \param size is number of bytes to read +/// \param dst is the pointer to the destination +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferMemBufferRead( + pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset, + size_t size, void *dst, pi_uint32 num_events_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to append a rectangular mem buffer read command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param buffer is the data to be read +/// \param buffer_offset offset for the start of the region to read in buffer +/// \param host_offset offset for the start of the region to be written from ptr +/// \param region The size of the region to read +/// \param buffer_row_pitch Row pitch for the source buffer data +/// \param buffer_slice_pitch Slice pitch for the source buffer data +/// \param host_row_pitch Row pitch for the destination data ptr +/// \param host_slice_pitch Slice pitch for the destination data ptr +/// \param ptr is the location the data will be written +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferMemBufferReadRect( + pi_ext_command_buffer command_buffer, pi_mem buffer, + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + void *ptr, pi_uint32 num_events_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to append a mem buffer write command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param buffer is the location to write the data +/// \param offset offset into \p buffer +/// \param size is number of bytes to write +/// \param ptr is the pointer to the source +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferMemBufferWrite( + pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset, + size_t size, const void *ptr, pi_uint32 num_events_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to append a rectangular mem buffer write command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param buffer is the location to write the data +/// \param buffer_offset offset for the start of the region to write in buffer +/// \param host_offset offset for the start of the region to be read from ptr +/// \param region The size of the region to write +/// \param buffer_row_pitch Row pitch for the buffer data +/// \param buffer_slice_pitch Slice pitch for the buffer data +/// \param host_row_pitch Row pitch for the source data ptr +/// \param host_slice_pitch Slice pitch for the source data ptr +/// \param ptr is the pointer to the source +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferMemBufferWriteRect( + pi_ext_command_buffer command_buffer, pi_mem buffer, + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + const void *ptr, pi_uint32 num_events_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to submit the command-buffer to queue for execution, returns an error if +/// the command-buffer is not finalized or another instance of the same +/// command-buffer is currently executing. +/// \param command_buffer The command-buffer to be submitted. +/// \param queue The PI queue to submit on. +/// \param num_events_in_wait_list The number of events that this execution +/// depends on. +/// \param event_wait_list List of pi_events to wait on. +/// \param event The pi_event associated with this enqueue. +__SYCL_EXPORT pi_result +piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, pi_queue queue, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); + struct _pi_plugin { // PI version supported by host passed to the plugin. The Plugin // checks and writes the appropriate Function Pointers in diff --git a/sycl/include/sycl/detail/pi.hpp b/sycl/include/sycl/detail/pi.hpp index ed46c0547d4f7..3898771754943 100644 --- a/sycl/include/sycl/detail/pi.hpp +++ b/sycl/include/sycl/detail/pi.hpp @@ -148,6 +148,9 @@ using PiMemObjectType = ::pi_mem_type; using PiMemImageChannelOrder = ::pi_image_channel_order; using PiMemImageChannelType = ::pi_image_channel_type; using PiKernelCacheConfig = ::pi_kernel_cache_config; +using PiExtSyncPoint = ::pi_ext_sync_point; +using PiExtCommandBuffer = ::pi_ext_command_buffer; +using PiExtCommandBufferDesc = ::pi_ext_command_buffer_desc; __SYCL_EXPORT void contextSetExtendedDeleter(const sycl::context &constext, pi_context_extended_deleter func, diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index 2570b6f7e7348..e25856515f2f5 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -79,6 +79,8 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/tracing.cpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" "../unified_runtime/ur/adapters/cuda/usm.cpp" + "../unified_runtime/ur/adapters/cuda/command_buffer.hpp" + "../unified_runtime/ur/adapters/cuda/command_buffer.cpp" # --- "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 9af47b47a6b2a..074959f91c4d4 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -186,6 +186,19 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer) _PI_CL(piPluginGetBackendOption, pi2ur::piPluginGetBackendOption) + // command-buffer + _PI_CL(piextCommandBufferCreate, pi2ur::piextCommandBufferCreate) + _PI_CL(piextCommandBufferRetain, pi2ur::piextCommandBufferRetain) + _PI_CL(piextCommandBufferRelease, pi2ur::piextCommandBufferRelease) + _PI_CL(piextCommandBufferNDRangeKernel, + pi2ur::piextCommandBufferNDRangeKernel) + _PI_CL(piextCommandBufferMemcpyUSM, pi2ur::piextCommandBufferMemcpyUSM) + _PI_CL(piextCommandBufferMemBufferCopy, + pi2ur::piextCommandBufferMemBufferCopy) + _PI_CL(piextCommandBufferMemBufferCopyRect, + pi2ur::piextCommandBufferMemBufferCopyRect) + _PI_CL(piextEnqueueCommandBuffer, pi2ur::piextEnqueueCommandBuffer) + #undef _PI_CL return PI_SUCCESS; diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 8fb4664199286..b65c867c71a03 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -25,6 +25,7 @@ #define _PI_CUDA_PLUGIN_VERSION_STRING \ _PI_PLUGIN_VERSION_STRING(_PI_CUDA_PLUGIN_VERSION) +#include #include #include #include @@ -76,4 +77,8 @@ struct _pi_sampler : ur_sampler_handle_t_ { using ur_sampler_handle_t_::ur_sampler_handle_t_; }; +struct _pi_ext_command_buffer : ur_exp_command_buffer_handle_t_ { + using ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_; +}; + #endif // PI_CUDA_HPP diff --git a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp index 1248f0ac3f402..c390102c2131f 100644 --- a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp +++ b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp @@ -2099,6 +2099,86 @@ pi_result piextEnqueueDeviceGlobalVariableRead(pi_queue, pi_program, DIE_NO_IMPLEMENTATION; } +pi_result piextCommandBufferCreate(pi_context, pi_device, + const pi_ext_command_buffer_desc *, + pi_ext_command_buffer *) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextCommandBufferRetain(pi_ext_command_buffer) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextCommandBufferRelease(pi_ext_command_buffer) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextCommandBufferFinalize(pi_ext_command_buffer) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextCommandBufferNDRangeKernel(pi_ext_command_buffer, pi_kernel, + pi_uint32, const size_t *, + const size_t *, const size_t *, + pi_uint32, const pi_ext_sync_point *, + pi_ext_sync_point *) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextCommandBufferMemcpyUSM(pi_ext_command_buffer, void *, + const void *, size_t, pi_uint32, + const pi_ext_sync_point *, + pi_ext_sync_point *) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextCommandBufferMemBufferCopy(pi_ext_command_buffer, pi_mem, pi_mem, + size_t, size_t, size_t, pi_uint32, + const pi_ext_sync_point *, + pi_ext_sync_point *) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer, pi_mem, pi_mem, pi_buff_rect_offset, + pi_buff_rect_offset, pi_buff_rect_region, size_t, size_t, size_t, size_t, + pi_uint32, const pi_ext_sync_point *, pi_ext_sync_point *) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextCommandBufferMemBufferRead(pi_ext_command_buffer, pi_mem, size_t, + size_t, void *, pi_uint32, + const pi_ext_sync_point *, + pi_ext_sync_point *) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextCommandBufferMemBufferReadRect( + pi_ext_command_buffer, pi_mem, pi_buff_rect_offset, pi_buff_rect_offset, + pi_buff_rect_region, size_t, size_t, size_t, size_t, void *, pi_uint32, + const pi_ext_sync_point *, pi_ext_sync_point *) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextCommandBufferMemBufferWrite(pi_ext_command_buffer, pi_mem, + size_t, size_t, const void *, + pi_uint32, const pi_ext_sync_point *, + pi_ext_sync_point *) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextCommandBufferMemBufferWriteRect( + pi_ext_command_buffer, pi_mem, pi_buff_rect_offset, pi_buff_rect_offset, + pi_buff_rect_region, size_t, size_t, size_t, size_t, const void *, + pi_uint32, const pi_ext_sync_point *, pi_ext_sync_point *) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer, pi_queue, pi_uint32, + const pi_event *, pi_event *) { + DIE_NO_IMPLEMENTATION; +} + pi_result piextPluginGetOpaqueData(void *, void **OpaqueDataReturn) { *OpaqueDataReturn = reinterpret_cast(PiESimdDeviceAccess); return PI_SUCCESS; diff --git a/sycl/plugins/esimd_emulator/pi_esimd_emulator.hpp b/sycl/plugins/esimd_emulator/pi_esimd_emulator.hpp index 9733275314d87..625ed7527cc8a 100755 --- a/sycl/plugins/esimd_emulator/pi_esimd_emulator.hpp +++ b/sycl/plugins/esimd_emulator/pi_esimd_emulator.hpp @@ -218,4 +218,6 @@ struct _pi_kernel : _pi_object { _pi_kernel() {} }; +struct _pi_ext_command_buffer {}; + #include diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index ded91d264f46b..753cc29c58199 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -5593,6 +5593,223 @@ pi_result hip_piextEnqueueWriteHostPipe( sycl::detail::pi::die("hip_piextEnqueueWriteHostPipe not implemented"); return {}; } +pi_result +hip_piextCommandBufferCreate(pi_context context, pi_device device, + const pi_ext_command_buffer_desc *desc, + pi_ext_command_buffer *ret_command_buffer) { + (void)context; + (void)device; + (void)desc; + (void)ret_command_buffer; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result hip_piextCommandBufferRetain(pi_ext_command_buffer command_buffer) { + (void)command_buffer; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result hip_piextCommandBufferRelease(pi_ext_command_buffer command_buffer) { + (void)command_buffer; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result hip_piextCommandBufferFinalize(pi_ext_command_buffer command_buffer) { + (void)command_buffer; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result hip_piextCommandBufferNDRangeKernel( + pi_ext_command_buffer command_buffer, pi_kernel kernel, pi_uint32 work_dim, + const size_t *global_work_offset, const size_t *global_work_size, + const size_t *local_work_size, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)kernel; + (void)work_dim; + (void)global_work_offset; + (void)global_work_size; + (void)local_work_size; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result +hip_piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, + void *dst_ptr, const void *src_ptr, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)dst_ptr; + (void)src_ptr; + (void)size; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result hip_piextCommandBufferMemBufferCopy( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)src_buffer; + (void)dst_buffer; + (void)src_offset; + (void)dst_offset; + (void)size; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result hip_piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)src_buffer; + (void)dst_buffer; + (void)src_origin; + (void)dst_origin; + (void)region; + (void)src_row_pitch; + (void)src_slice_pitch; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result hip_piextCommandBufferMemBufferRead( + pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset, + size_t size, void *dst, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)buffer; + (void)offset; + (void)size; + (void)dst; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result hip_piextCommandBufferMemBufferReadRect( + pi_ext_command_buffer command_buffer, pi_mem buffer, + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + void *ptr, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)buffer; + (void)buffer_offset; + (void)host_offset; + (void)region; + (void)buffer_row_pitch; + (void)buffer_slice_pitch; + (void)host_row_pitch; + (void)host_slice_pitch; + (void)ptr; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result hip_piextCommandBufferMemBufferWrite( + pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset, + size_t size, const void *ptr, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)buffer; + (void)offset; + (void)size; + (void)ptr; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result hip_piextCommandBufferMemBufferWriteRect( + pi_ext_command_buffer command_buffer, pi_mem buffer, + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + const void *ptr, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)buffer; + (void)buffer_offset; + (void)host_offset; + (void)region; + (void)buffer_row_pitch; + (void)buffer_slice_pitch; + (void)host_row_pitch; + (void)host_slice_pitch; + (void)ptr; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} + +pi_result hip_piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, + pi_queue queue, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event) { + (void)command_buffer; + (void)queue; + (void)num_events_in_wait_list; + (void)event_wait_list; + (void)event; + + sycl::detail::pi::die("command-buffer API not implemented in HIP backend"); + return {}; +} // This API is called by Sycl RT to notify the end of the plugin lifetime. // Windows: dynamically loaded plugins might have been unloaded already @@ -5784,6 +6001,23 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueReadHostPipe, hip_piextEnqueueReadHostPipe) _PI_CL(piextEnqueueWriteHostPipe, hip_piextEnqueueWriteHostPipe) + // command-buffer + _PI_CL(piextCommandBufferCreate, hip_piextCommandBufferCreate) + _PI_CL(piextCommandBufferRetain, hip_piextCommandBufferRetain) + _PI_CL(piextCommandBufferRelease, hip_piextCommandBufferRelease) + _PI_CL(piextCommandBufferNDRangeKernel, hip_piextCommandBufferNDRangeKernel) + _PI_CL(piextCommandBufferMemcpyUSM, hip_piextCommandBufferMemcpyUSM) + _PI_CL(piextCommandBufferMemBufferCopy, hip_piextCommandBufferMemBufferCopy) + _PI_CL(piextCommandBufferMemBufferCopyRect, + hip_piextCommandBufferMemBufferCopyRect) + _PI_CL(piextCommandBufferMemBufferRead, hip_piextCommandBufferMemBufferRead) + _PI_CL(piextCommandBufferMemBufferReadRect, + hip_piextCommandBufferMemBufferReadRect) + _PI_CL(piextCommandBufferMemBufferWrite, hip_piextCommandBufferMemBufferWrite) + _PI_CL(piextCommandBufferMemBufferWriteRect, + hip_piextCommandBufferMemBufferWriteRect) + _PI_CL(piextEnqueueCommandBuffer, hip_piextEnqueueCommandBuffer) + _PI_CL(piextKernelSetArgMemObj, hip_piextKernelSetArgMemObj) _PI_CL(piextKernelSetArgSampler, hip_piextKernelSetArgSampler) _PI_CL(piPluginGetLastError, hip_piPluginGetLastError) diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp index 4b3c5f53298d9..b5648a8de42ed 100644 --- a/sycl/plugins/hip/pi_hip.hpp +++ b/sycl/plugins/hip/pi_hip.hpp @@ -954,6 +954,8 @@ struct _pi_sampler { pi_uint32 get_reference_count() const noexcept { return refCount_; } }; +struct _pi_ext_command_buffer {}; + // ------------------------------------------------------------- // Helper types and functions // diff --git a/sycl/plugins/level_zero/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt index 242fafecd7395..6aa44b0f9d6eb 100755 --- a/sycl/plugins/level_zero/CMakeLists.txt +++ b/sycl/plugins/level_zero/CMakeLists.txt @@ -113,6 +113,7 @@ add_sycl_plugin(level_zero "../unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp" + "../unified_runtime/ur/adapters/level_zero/ur_level_zero_command_buffer.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_common.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp" @@ -125,6 +126,7 @@ add_sycl_plugin(level_zero "../unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp" + "../unified_runtime/ur/adapters/level_zero/ur_level_zero_command_buffer.cpp" # Following are the PI Level-Zero Plugin only codes. "pi_level_zero.cpp" "pi_level_zero.hpp" diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 7df0ff7d21eab..88891bc585f68 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1067,6 +1067,120 @@ pi_result piextProgramSetSpecializationConstant(pi_program Prog, SpecValue); } +// Command buffer extension +pi_result piextCommandBufferCreate(pi_context Context, pi_device Device, + const pi_ext_command_buffer_desc *Desc, + pi_ext_command_buffer *RetCommandBuffer) { + return pi2ur::piextCommandBufferCreate(Context, Device, Desc, + RetCommandBuffer); +} + +pi_result piextCommandBufferRetain(pi_ext_command_buffer CommandBuffer) { + return pi2ur::piextCommandBufferRetain(CommandBuffer); +} + +pi_result piextCommandBufferRelease(pi_ext_command_buffer CommandBuffer) { + return pi2ur::piextCommandBufferRelease(CommandBuffer); +} + +pi_result piextCommandBufferFinalize(pi_ext_command_buffer CommandBuffer) { + return pi2ur::piextCommandBufferFinalize(CommandBuffer); +} + +pi_result piextCommandBufferNDRangeKernel( + pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferNDRangeKernel( + CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemcpyUSM( + pi_ext_command_buffer CommandBuffer, void *DstPtr, const void *SrcPtr, + size_t Size, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemcpyUSM(CommandBuffer, DstPtr, SrcPtr, Size, + NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferCopy( + pi_ext_command_buffer CommandBuffer, pi_mem SrcMem, pi_mem DstMem, + size_t SrcOffset, size_t DstOffset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferCopy( + CommandBuffer, SrcMem, DstMem, SrcOffset, DstOffset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer CommandBuffer, pi_mem SrcMem, pi_mem DstMem, + pi_buff_rect_offset SrcOrigin, pi_buff_rect_offset DstOrigin, + pi_buff_rect_region Region, size_t SrcRowPitch, size_t SrcSlicePitch, + size_t DstRowPitch, size_t DstSlicePitch, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferCopyRect( + CommandBuffer, SrcMem, DstMem, SrcOrigin, DstOrigin, Region, SrcRowPitch, + SrcSlicePitch, DstRowPitch, DstSlicePitch, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferRead( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, size_t Offset, + size_t Size, void *Dst, pi_uint32 NumEventsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferRead(CommandBuffer, Buffer, Offset, + Size, Dst, NumEventsInWaitList, + SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferReadRect( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_ext_sync_point *SyncPointWaitList, + pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferReadRect( + CommandBuffer, Buffer, BufferOffset, HostOffset, Region, BufferRowPitch, + BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, NumEventsInWaitList, + SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferWrite( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, size_t Offset, + size_t Size, const void *Ptr, pi_uint32 NumEventsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferWrite(CommandBuffer, Buffer, Offset, + Size, Ptr, NumEventsInWaitList, + SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferWriteRect( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_ext_sync_point *SyncPointWaitList, + pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferWriteRect( + CommandBuffer, Buffer, BufferOffset, HostOffset, Region, BufferRowPitch, + BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, NumEventsInWaitList, + SyncPointWaitList, SyncPoint); +} + +pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, + pi_queue Queue, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + return pi2ur::piextEnqueueCommandBuffer( + CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); +} + const char SupportedVersion[] = _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { // missing diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 9f04858b472e4..f0bd3b98fb0f9 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -2282,6 +2282,227 @@ pi_result piextKernelGetNativeHandle(pi_kernel kernel, return piextGetNativeHandle(kernel, nativeHandle); } +// command-buffer extension +pi_result piextCommandBufferCreate(pi_context context, pi_device device, + const pi_ext_command_buffer_desc *desc, + pi_ext_command_buffer *ret_command_buffer) { + (void)context; + (void)device; + (void)desc; + (void)ret_command_buffer; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + +pi_result piextCommandBufferRetain(pi_ext_command_buffer command_buffer) { + (void)command_buffer; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + +pi_result piextCommandBufferRelease(pi_ext_command_buffer command_buffer) { + (void)command_buffer; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + +pi_result piextCommandBufferFinalize(pi_ext_command_buffer command_buffer) { + (void)command_buffer; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + +pi_result piextCommandBufferNDRangeKernel( + pi_ext_command_buffer command_buffer, pi_kernel kernel, pi_uint32 work_dim, + const size_t *global_work_offset, const size_t *global_work_size, + const size_t *local_work_size, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)kernel; + (void)work_dim; + (void)global_work_offset; + (void)global_work_size; + (void)local_work_size; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + +pi_result +piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, void *dst_ptr, + const void *src_ptr, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)dst_ptr; + (void)src_ptr; + (void)size; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + +pi_result piextCommandBufferMemBufferCopy( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)src_buffer; + (void)dst_buffer; + (void)src_offset; + (void)dst_offset; + (void)size; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + +pi_result piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)src_buffer; + (void)dst_buffer; + (void)src_origin; + (void)dst_origin; + (void)region; + (void)src_row_pitch; + (void)src_slice_pitch; + (void)dst_row_pitch; + (void)dst_slice_pitch; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + +pi_result piextCommandBufferMemBufferRead( + pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset, + size_t size, void *dst, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)buffer; + (void)offset; + (void)size; + (void)dst; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + +pi_result piextCommandBufferMemBufferReadRect( + pi_ext_command_buffer command_buffer, pi_mem buffer, + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + void *ptr, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)buffer; + (void)buffer_offset; + (void)host_offset; + (void)region; + (void)buffer_row_pitch; + (void)buffer_slice_pitch; + (void)host_row_pitch; + (void)host_slice_pitch; + (void)ptr; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + +pi_result piextCommandBufferMemBufferWrite( + pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset, + size_t size, const void *ptr, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)buffer; + (void)offset; + (void)size; + (void)ptr; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + +pi_result piextCommandBufferMemBufferWriteRect( + pi_ext_command_buffer command_buffer, pi_mem buffer, + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + const void *ptr, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + (void)command_buffer; + (void)buffer; + (void)buffer_offset; + (void)host_offset; + (void)region; + (void)buffer_row_pitch; + (void)buffer_slice_pitch; + (void)host_row_pitch; + (void)host_slice_pitch; + (void)ptr; + (void)num_sync_points_in_wait_list; + (void)sync_point_wait_list; + (void)sync_point; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + +pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer, + pi_queue queue, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event) { + (void)command_buffer; + (void)queue; + (void)num_events_in_wait_list; + (void)event_wait_list; + (void)event; + + // Not implemented + return PI_ERROR_INVALID_OPERATION; +} + // This API is called by Sycl RT to notify the end of the plugin lifetime. // Windows: dynamically loaded plugins might have been unloaded already // when this is called. Sycl RT holds onto the PI plugin so it can be @@ -2495,6 +2716,17 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueReadHostPipe, piextEnqueueReadHostPipe) _PI_CL(piextEnqueueWriteHostPipe, piextEnqueueWriteHostPipe) + // command-buffer + _PI_CL(piextCommandBufferCreate, piextCommandBufferCreate) + _PI_CL(piextCommandBufferRetain, piextCommandBufferRetain) + _PI_CL(piextCommandBufferRelease, piextCommandBufferRelease) + _PI_CL(piextCommandBufferNDRangeKernel, piextCommandBufferNDRangeKernel) + _PI_CL(piextCommandBufferMemcpyUSM, piextCommandBufferMemcpyUSM) + _PI_CL(piextCommandBufferMemBufferCopy, piextCommandBufferMemBufferCopy) + _PI_CL(piextCommandBufferMemBufferCopyRect, + piextCommandBufferMemBufferCopyRect) + _PI_CL(piextEnqueueCommandBuffer, piextEnqueueCommandBuffer) + _PI_CL(piextKernelSetArgMemObj, piextKernelSetArgMemObj) _PI_CL(piextKernelSetArgSampler, piextKernelSetArgSampler) _PI_CL(piPluginGetLastError, piPluginGetLastError) diff --git a/sycl/plugins/opencl/pi_opencl.hpp b/sycl/plugins/opencl/pi_opencl.hpp index 7835df8c4cb6e..54b1ad90abcaf 100644 --- a/sycl/plugins/opencl/pi_opencl.hpp +++ b/sycl/plugins/opencl/pi_opencl.hpp @@ -114,5 +114,6 @@ inline const OpenCLVersion V2_2(2, 2); inline const OpenCLVersion V3_0(3, 0); } // namespace OCLV +struct _pi_ext_command_buffer {}; #endif // PI_OPENCL_HPP diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 46a4fa72c8a1a..7eedc741d3117 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 7e16bb37cbb12450637e595749c3617151cbe851) + set(UNIFIED_RUNTIME_TAG c7b0caf4e3ce5330bd0669db6e8e498b48e8ad27) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime @@ -97,6 +97,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED "ur/adapters/level_zero/ur_level_zero_queue.hpp" "ur/adapters/level_zero/ur_level_zero_sampler.hpp" "ur/adapters/level_zero/ur_level_zero_usm.hpp" + "ur/adapters/level_zero/ur_level_zero_command_buffer.hpp" "ur/adapters/level_zero/ur_level_zero.cpp" "ur/adapters/level_zero/ur_level_zero_common.cpp" "ur/adapters/level_zero/ur_level_zero_context.cpp" @@ -109,6 +110,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED "ur/adapters/level_zero/ur_level_zero_queue.cpp" "ur/adapters/level_zero/ur_level_zero_sampler.cpp" "ur/adapters/level_zero/ur_level_zero_usm.cpp" + "ur/adapters/level_zero/ur_level_zero_command_buffer.cpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES @@ -155,6 +157,8 @@ if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS) "ur/adapters/cuda/tracing.cpp" "ur/adapters/cuda/ur_interface_loader.cpp" "ur/adapters/cuda/usm.cpp" + "ur/adapters/cuda/command_buffer.hpp" + "ur/adapters/cuda/command_buffer.cpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 24b5826134ce2..79b6d44a4d0d8 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -4075,4 +4075,251 @@ inline pi_result piSamplerRelease(pi_sampler Sampler) { // Sampler /////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +// Command-buffer extension + +inline pi_result +piextCommandBufferCreate(pi_context Context, pi_device Device, + const pi_ext_command_buffer_desc *Desc, + pi_ext_command_buffer *RetCommandBuffer) { + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_device_handle_t UrDevice = reinterpret_cast(Device); + const ur_exp_command_buffer_desc_t *UrDesc = + reinterpret_cast(Desc); + ur_exp_command_buffer_handle_t *UrCommandBuffer = + reinterpret_cast(RetCommandBuffer); + + HANDLE_ERRORS( + urCommandBufferCreateExp(UrContext, UrDevice, UrDesc, UrCommandBuffer)); + + return PI_SUCCESS; +} + +inline pi_result piextCommandBufferRetain(pi_ext_command_buffer CommandBuffer) { + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + + HANDLE_ERRORS(urCommandBufferRetainExp(UrCommandBuffer)); + + return PI_SUCCESS; +} + +inline pi_result +piextCommandBufferRelease(pi_ext_command_buffer CommandBuffer) { + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + + HANDLE_ERRORS(urCommandBufferReleaseExp(UrCommandBuffer)); + + return PI_SUCCESS; +} + +inline pi_result +piextCommandBufferFinalize(pi_ext_command_buffer CommandBuffer) { + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + + HANDLE_ERRORS(urCommandBufferFinalizeExp(UrCommandBuffer)); + + return PI_SUCCESS; +} + +inline pi_result piextCommandBufferNDRangeKernel( + pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + + HANDLE_ERRORS(urCommandBufferAppendKernelLaunchExp( + UrCommandBuffer, UrKernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint)); + + return PI_SUCCESS; +} + +inline pi_result piextCommandBufferMemcpyUSM( + pi_ext_command_buffer CommandBuffer, void *DstPtr, const void *SrcPtr, + size_t Size, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + + HANDLE_ERRORS(urCommandBufferAppendMemcpyUSMExp( + UrCommandBuffer, DstPtr, SrcPtr, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint)); + + return PI_SUCCESS; +} + +inline pi_result piextCommandBufferMemBufferCopy( + pi_ext_command_buffer CommandBuffer, pi_mem SrcMem, pi_mem DstMem, + size_t SrcOffset, size_t DstOffset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + + ur_mem_handle_t UrSrcMem = reinterpret_cast(SrcMem); + ur_mem_handle_t UrDstMem = reinterpret_cast(DstMem); + + HANDLE_ERRORS(urCommandBufferAppendMembufferCopyExp( + UrCommandBuffer, UrSrcMem, UrDstMem, SrcOffset, DstOffset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint)); + + return PI_SUCCESS; +} + +inline pi_result piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer CommandBuffer, pi_mem SrcMem, pi_mem DstMem, + pi_buff_rect_offset SrcOrigin, pi_buff_rect_offset DstOrigin, + pi_buff_rect_region Region, size_t SrcRowPitch, size_t SrcSlicePitch, + size_t DstRowPitch, size_t DstSlicePitch, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + + ur_mem_handle_t UrSrcMem = reinterpret_cast(SrcMem); + ur_mem_handle_t UrDstMem = reinterpret_cast(DstMem); + + ur_rect_offset_t UrSrcOrigin{SrcOrigin->x_bytes, SrcOrigin->y_scalar, + SrcOrigin->z_scalar}; + ur_rect_offset_t UrDstOrigin{DstOrigin->x_bytes, DstOrigin->y_scalar, + DstOrigin->z_scalar}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth_scalar; + UrRegion.height = Region->height_scalar; + UrRegion.width = Region->width_bytes; + + HANDLE_ERRORS(urCommandBufferAppendMembufferCopyRectExp( + UrCommandBuffer, UrSrcMem, UrDstMem, UrSrcOrigin, UrDstOrigin, UrRegion, + SrcRowPitch, SrcSlicePitch, DstRowPitch, DstSlicePitch, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint)); + + return PI_SUCCESS; +} + +inline pi_result piextCommandBufferMemBufferReadRect( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, void *Ptr, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + ur_rect_offset_t UrBufferOffset{BufferOffset->x_bytes, BufferOffset->y_scalar, + BufferOffset->z_scalar}; + ur_rect_offset_t UrHostOffset{HostOffset->x_bytes, HostOffset->y_scalar, + HostOffset->z_scalar}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth_scalar; + UrRegion.height = Region->height_scalar; + UrRegion.width = Region->width_bytes; + + HANDLE_ERRORS(urCommandBufferAppendMembufferReadRectExp( + UrCommandBuffer, UrBuffer, UrBufferOffset, UrHostOffset, UrRegion, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint)); + + return PI_SUCCESS; +} + +inline pi_result piextCommandBufferMemBufferRead( + pi_ext_command_buffer CommandBuffer, pi_mem Src, size_t Offset, size_t Size, + void *Dst, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + PI_ASSERT(Src, PI_ERROR_INVALID_MEM_OBJECT); + + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + ur_mem_handle_t UrBuffer = reinterpret_cast(Src); + + HANDLE_ERRORS(urCommandBufferAppendMembufferReadExp( + UrCommandBuffer, UrBuffer, Offset, Size, Dst, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint)); + + return PI_SUCCESS; +} + +inline pi_result piextCommandBufferMemBufferWriteRect( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + ur_rect_offset_t UrBufferOffset{BufferOffset->x_bytes, BufferOffset->y_scalar, + BufferOffset->z_scalar}; + ur_rect_offset_t UrHostOffset{HostOffset->x_bytes, HostOffset->y_scalar, + HostOffset->z_scalar}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth_scalar; + UrRegion.height = Region->height_scalar; + UrRegion.width = Region->width_bytes; + + HANDLE_ERRORS(urCommandBufferAppendMembufferWriteRectExp( + UrCommandBuffer, UrBuffer, UrBufferOffset, UrHostOffset, UrRegion, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, + const_cast(Ptr), NumSyncPointsInWaitList, SyncPointWaitList, + SyncPoint)); + + return PI_SUCCESS; +} + +inline pi_result piextCommandBufferMemBufferWrite( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, size_t Offset, + size_t Size, const void *Ptr, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + + HANDLE_ERRORS(urCommandBufferAppendMembufferWriteExp( + UrCommandBuffer, UrBuffer, Offset, Size, const_cast(Ptr), + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint)); + + return PI_SUCCESS; +} + +inline pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, + pi_queue Queue, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + const ur_event_handle_t *UrEventWaitList = + reinterpret_cast(EventWaitList); + ur_event_handle_t *UrEvent = reinterpret_cast(Event); + + HANDLE_ERRORS(urCommandBufferEnqueueExp( + UrCommandBuffer, UrQueue, NumEventsInWaitList, UrEventWaitList, UrEvent)); + + return PI_SUCCESS; +} + +// Command-buffer extension +/////////////////////////////////////////////////////////////////////////////// + } // namespace pi2ur diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index d89be52061a0d..b4680e716ac01 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -974,6 +974,120 @@ pi_result piextMemImageCreateWithNativeHandle( NativeHandle, Context, OwnNativeHandle, ImageFormat, ImageDesc, Img); } +// Command buffer extension +pi_result piextCommandBufferCreate(pi_context Context, pi_device Device, + const pi_ext_command_buffer_desc *Desc, + pi_ext_command_buffer *RetCommandBuffer) { + return pi2ur::piextCommandBufferCreate(Context, Device, Desc, + RetCommandBuffer); +} + +pi_result piextCommandBufferRetain(pi_ext_command_buffer CommandBuffer) { + return pi2ur::piextCommandBufferRetain(CommandBuffer); +} + +pi_result piextCommandBufferRelease(pi_ext_command_buffer CommandBuffer) { + return pi2ur::piextCommandBufferRelease(CommandBuffer); +} + +pi_result piextCommandBufferFinalize(pi_ext_command_buffer CommandBuffer) { + return pi2ur::piextCommandBufferFinalize(CommandBuffer); +} + +pi_result piextCommandBufferNDRangeKernel( + pi_ext_command_buffer CommandBuffer, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferNDRangeKernel( + CommandBuffer, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemcpyUSM( + pi_ext_command_buffer CommandBuffer, void *DstPtr, const void *SrcPtr, + size_t Size, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemcpyUSM(CommandBuffer, DstPtr, SrcPtr, Size, + NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferCopy( + pi_ext_command_buffer CommandBuffer, pi_mem SrcMem, pi_mem DstMem, + size_t SrcOffset, size_t DstOffset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferCopy( + CommandBuffer, SrcMem, DstMem, SrcOffset, DstOffset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer CommandBuffer, pi_mem SrcMem, pi_mem DstMem, + pi_buff_rect_offset SrcOrigin, pi_buff_rect_offset DstOrigin, + pi_buff_rect_region Region, size_t SrcRowPitch, size_t SrcSlicePitch, + size_t DstRowPitch, size_t DstSlicePitch, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferCopyRect( + CommandBuffer, SrcMem, DstMem, SrcOrigin, DstOrigin, Region, SrcRowPitch, + SrcSlicePitch, DstRowPitch, DstSlicePitch, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferRead( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, size_t Offset, + size_t Size, void *Dst, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferRead( + CommandBuffer, Buffer, Offset, Size, Dst, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferReadRect( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, void *Ptr, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferReadRect( + CommandBuffer, Buffer, BufferOffset, HostOffset, Region, BufferRowPitch, + BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferWrite( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, size_t Offset, + size_t Size, const void *Ptr, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferWrite( + CommandBuffer, Buffer, Offset, Size, Ptr, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferMemBufferWriteRect( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferWriteRect( + CommandBuffer, Buffer, BufferOffset, HostOffset, Region, BufferRowPitch, + BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, + pi_queue Queue, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + return pi2ur::piextEnqueueCommandBuffer( + CommandBuffer, Queue, NumEventsInWaitList, EventWaitList, Event); +} + __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, uint64_t *HostTime) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp new file mode 100644 index 0000000000000..9ba6795b4be8c --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp @@ -0,0 +1,250 @@ +//===--------- command_buffer.cpp - CUDA Adapter ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "command_buffer.hpp" +#include "common.hpp" + +/// Stub implementations of UR experimental feature command-buffers + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_exp_command_buffer_desc_t *pCommandBufferDesc, + ur_exp_command_buffer_handle_t *phCommandBuffer) { + (void)hContext; + (void)hDevice; + (void)pCommandBufferDesc; + (void)phCommandBuffer; + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + (void)hCommandBuffer; + + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + (void)hCommandBuffer; + + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + (void)hCommandBuffer; + + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, + uint32_t workDim, const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hKernel; + (void)workDim; + (void)pGlobalWorkOffset; + (void)pGlobalWorkSize; + (void)pLocalWorkSize; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp( + ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, + size_t size, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)pDst; + (void)pSrc; + (void)size; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, + ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hSrcMem; + (void)hDstMem; + (void)srcOffset; + (void)dstOffset; + (void)size; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, + ur_mem_handle_t hDstMem, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hSrcMem; + (void)hDstMem; + (void)srcOrigin; + (void)dstOrigin; + (void)region; + (void)srcRowPitch; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT +ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + size_t offset, size_t size, const void *pSrc, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hBuffer; + (void)offset; + (void)size; + (void)pSrc; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT +ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hBuffer; + (void)offset; + (void)size; + (void)pDst; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT +ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hBuffer; + (void)bufferOffset; + (void)hostOffset; + (void)region; + (void)bufferRowPitch; + (void)bufferSlicePitch; + (void)hostRowPitch; + (void)hostSlicePitch; + (void)pSrc; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT +ur_result_t UR_APICALL urCommandBufferAppendMembufferReadRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pDst, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hBuffer; + (void)bufferOffset; + (void)hostOffset; + (void)region; + (void)bufferRowPitch; + (void)bufferSlicePitch; + (void)hostRowPitch; + (void)hostSlicePitch; + (void)pDst; + + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + (void)hCommandBuffer; + (void)hQueue; + (void)numEventsInWaitList; + (void)phEventWaitList; + (void)phEvent; + + sycl::detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp new file mode 100644 index 0000000000000..cd088c0d287cc --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp @@ -0,0 +1,13 @@ +//===--------- command_buffer.hpp - CUDA Adapter ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +/// Stub implementation of command-buffers for CUDA + +struct ur_exp_command_buffer_handle_t_ {}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index f8e806b0626a0..132c7775bbad5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -258,6 +258,36 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( return UR_RESULT_SUCCESS; } +UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_command_buffer_exp_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + pDdiTable->pfnCreateExp = urCommandBufferCreateExp; + pDdiTable->pfnRetainExp = urCommandBufferRetainExp; + pDdiTable->pfnReleaseExp = urCommandBufferReleaseExp; + pDdiTable->pfnFinalizeExp = urCommandBufferFinalizeExp; + pDdiTable->pfnAppendKernelLaunchExp = urCommandBufferAppendKernelLaunchExp; + pDdiTable->pfnAppendMemcpyUSMExp = urCommandBufferAppendMemcpyUSMExp; + pDdiTable->pfnAppendMembufferCopyExp = urCommandBufferAppendMembufferCopyExp; + pDdiTable->pfnAppendMembufferCopyRectExp = + urCommandBufferAppendMembufferCopyRectExp; + pDdiTable->pfnAppendMembufferReadExp = urCommandBufferAppendMembufferReadExp; + pDdiTable->pfnAppendMembufferReadRectExp = + urCommandBufferAppendMembufferReadRectExp; + pDdiTable->pfnAppendMembufferWriteExp = + urCommandBufferAppendMembufferWriteExp; + pDdiTable->pfnAppendMembufferWriteRectExp = + urCommandBufferAppendMembufferWriteRectExp; + pDdiTable->pfnEnqueueExp = urCommandBufferEnqueueExp; + + return retVal; +} + #if defined(__cplusplus) } // extern "C" #endif diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_command_buffer.cpp new file mode 100644 index 0000000000000..9fdeab956305f --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_command_buffer.cpp @@ -0,0 +1,752 @@ +//===--------- ur_level_zero_command_buffer.cpp - Level Zero Adapter -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "ur_level_zero_command_buffer.hpp" +#include "ur_level_zero.hpp" + +/* Command-buffer Extension + + The UR interface for submitting a UR command-buffer takes a list + of events to wait on, and returns an event representing the completion of + that particular submission of the command-buffer. + + However, in `zeCommandQueueExecuteCommandLists` there are no parameters to + take a waitlist and also the only sync primitive returned is to block on + host. + + In order to get the UR command-buffer enqueue semantics we want with L0 + this adapter adds extra commands to the L0 command-list representing a + UR command-buffer. + + Prefix - Commands added to the start of the L0 command-list by L0 adapter. + Suffix - Commands added to the end of the L0 command-list by L0 adapter. + + These extra commands operate on L0 event synchronisation primitives used by + the command-list to interact with the external UR wait-list and UR return + event required for the enqueue interface. + + The `ur_exp_command_buffer_handle_t` class for this adapter contains a + SignalEvent which signals the completion of the command-list in the suffix, + and is reset in the prefix. This signal is detected by a new UR return event + created on UR command-buffer enqueue. + + There is also a WaitEvent used by the `ur_exp_command_buffer_handle_t` class + in the prefix to wait on any dependencies passed in the enqueue wait-list. + + ┌──────────┬────────────────────────────────────────────────┬─────────┐ + │ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │ + └──────────┴────────────────────────────────────────────────┴─────────┘ + + ┌───────────────────┬──────────────────────────────┐ + Prefix │Reset signal event │ Barrier waiting on wait event│ + └───────────────────┴──────────────────────────────┘ + + ┌─────────────────────────────────────────┐ + Suffix │Signal the UR command-buffer signal event│ + └─────────────────────────────────────────┘ + + + For a call to `urCommandBufferEnqueueExp` with an event_list `EL`, + command-buffer `CB`, and return event `RE` our implementation has to create + and submit two new command-lists for the above approach to work. One before + the command-list with extra commands associated with `CB`, and the other + after `CB`. + + Command-list created on `urCommandBufferEnqueueExp` to execution before `CB`: + ┌───────────────────────────────────────────────────────────┐ + │Barrier on `EL` than signals `CB` WaitEvent when completed │ + └───────────────────────────────────────────────────────────┘ + + Command-list created on `urCommandBufferEnqueueExp` to execution after `CB`: + ┌─────────────────────────────────────────────────────────────┐ + │Barrier on `CB` SignalEvent that signals `RE` when completed │ + └─────────────────────────────────────────────────────────────┘ + +Drawbacks +--------- + +There are two drawbacks to this approach: + +1. We use 3x the command-list resources, if there are many UR command-buffers +in flight, this may exhaust L0 driver resources. + +2. Each command list is submitted individually with a +`ur_queue_handle_t_::executeCommandList` call which introduces serialization in +the submission pipeline that is heavier than having a barrier or a +waitForEvents on the same list. Resulting in additional latency when executing +graphs. + +*/ + +ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( + ur_context_handle_t Context, ur_device_handle_t Device, + ze_command_list_handle_t CommandList, + ZeStruct ZeDesc, + const ur_exp_command_buffer_desc_t *Desc) + : Context(Context), Device(Device), ZeCommandList(CommandList), + ZeCommandListDesc(ZeDesc), QueueProperties(), SyncPoints(), + NextSyncPoint(0), CommandListMap() { + (void)Desc; + urContextRetain(Context); + urDeviceRetain(Device); +} + +// The ur_exp_command_buffer_handle_t_ destructor release all the memory objects +// allocated for command_buffer managment +ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { + // Release the memory allocated to the Context stored in the command_buffer + urContextRelease(Context); + + // Release the device + urDeviceRelease(Device); + + // Release the memory allocated to the CommandList stored in the + // command_buffer + if (ZeCommandList) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); + } + + // Release additional signal and wait events used by command_buffer + if (SignalEvent) { + CleanupCompletedEvent(SignalEvent, false); + urEventReleaseInternal(SignalEvent); + } + if (WaitEvent) { + CleanupCompletedEvent(WaitEvent, false); + urEventReleaseInternal(WaitEvent); + } + + // Release events added to the command_buffer + for (auto &Sync : SyncPoints) { + auto &Event = Sync.second; + CleanupCompletedEvent(Event, false); + urEventReleaseInternal(Event); + } + + // Release Fences allocated to command_buffer + for (auto it = CommandListMap.begin(); it != CommandListMap.end(); ++it) { + if (it->second.ZeFence != nullptr) { + ZE_CALL_NOCHECK(zeFenceDestroy, (it->second.ZeFence)); + } + } +} + +/// Helper function for calculating work dimensions for kernels +ur_result_t calculateKernelWorkDimensions( + ur_kernel_handle_t Kernel, ur_device_handle_t Device, + ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3], + uint32_t WorkDim, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize) { + // global_work_size of unused dimensions must be set to 1 + UR_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1, + UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1, + UR_RESULT_ERROR_INVALID_VALUE); + + if (LocalWorkSize) { + WG[0] = ur_cast(LocalWorkSize[0]); + WG[1] = ur_cast(LocalWorkSize[1]); + WG[2] = ur_cast(LocalWorkSize[2]); + } else { + // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize + // values do not fit to 32-bit that the API only supports currently. + bool SuggestGroupSize = true; + for (int I : {0, 1, 2}) { + if (GlobalWorkSize[I] > UINT32_MAX) { + SuggestGroupSize = false; + } + } + if (SuggestGroupSize) { + ZE2UR_CALL(zeKernelSuggestGroupSize, + (Kernel->ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1], + GlobalWorkSize[2], &WG[0], &WG[1], &WG[2])); + } else { + for (int I : {0, 1, 2}) { + // Try to find a I-dimension WG size that the GlobalWorkSize[I] is + // fully divisable with. Start with the max possible size in + // each dimension. + uint32_t GroupSize[] = { + Device->ZeDeviceComputeProperties->maxGroupSizeX, + Device->ZeDeviceComputeProperties->maxGroupSizeY, + Device->ZeDeviceComputeProperties->maxGroupSizeZ}; + GroupSize[I] = std::min(size_t(GroupSize[I]), GlobalWorkSize[I]); + while (GlobalWorkSize[I] % GroupSize[I]) { + --GroupSize[I]; + } + if (GlobalWorkSize[I] / GroupSize[I] > UINT32_MAX) { + urPrint("urCommandBufferAppendKernelLaunchExp: can't find a WG size " + "suitable for global work size > UINT32_MAX\n"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + WG[I] = GroupSize[I]; + } + urPrint( + "urCommandBufferAppendKernelLaunchExp: using computed WG size = {%d, " + "%d, %d}\n", + WG[0], WG[1], WG[2]); + } + } + + // TODO: assert if sizes do not fit into 32-bit? + switch (WorkDim) { + case 3: + ZeThreadGroupDimensions.groupCountX = + ur_cast(GlobalWorkSize[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + ur_cast(GlobalWorkSize[1] / WG[1]); + ZeThreadGroupDimensions.groupCountZ = + ur_cast(GlobalWorkSize[2] / WG[2]); + break; + case 2: + ZeThreadGroupDimensions.groupCountX = + ur_cast(GlobalWorkSize[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + ur_cast(GlobalWorkSize[1] / WG[1]); + WG[2] = 1; + break; + case 1: + ZeThreadGroupDimensions.groupCountX = + ur_cast(GlobalWorkSize[0] / WG[0]); + WG[1] = WG[2] = 1; + break; + + default: + urPrint("urCommandBufferAppendKernelLaunchExp: unsupported work_dim\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + // Error handling for non-uniform group size case + if (GlobalWorkSize[0] != + size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) { + urPrint("urCommandBufferAppendKernelLaunchExp: invalid work_dim. The range " + "is not a " + "multiple of the group size in the 1st dimension\n"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (GlobalWorkSize[1] != + size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) { + urPrint("urCommandBufferAppendKernelLaunchExp: invalid work_dim. The range " + "is not a " + "multiple of the group size in the 2nd dimension\n"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (GlobalWorkSize[2] != + size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) { + urPrint("urCommandBufferAppendKernelLaunchExp: invalid work_dim. The range " + "is not a " + "multiple of the group size in the 3rd dimension\n"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + + return UR_RESULT_SUCCESS; +} + +/// Helper function for finding the Level Zero events associated with the +/// commands in a command-buffer, each event is pointed to by a sync-point in +/// the wait list. +/// +/// @param[in] CommandBuffer to lookup the L0 events from. +/// @param[in] NumSyncPointsInWaitList Length of \p SyncPointWaitList. +/// @param[in] SyncPointWaitList List of sync points in \p CommandBuffer +/// to find the L0 events for. +/// @param[out] ZeEventList Return parameter for the L0 events associated with +/// each sync-point in \p SyncPointWaitList. +/// +/// @return UR_RESULT_SUCCESS or an error code on failure +static ur_result_t getEventsFromSyncPoints( + const ur_exp_command_buffer_handle_t &CommandBuffer, + size_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + std::vector &ZeEventList) { + // Map of ur_exp_command_buffer_sync_point_t to ur_event_handle_t defining + // the event associated with each sync-point + auto SyncPoints = CommandBuffer->SyncPoints; + + // For each sync-point add associated L0 event to the return list. + for (size_t i = 0; i < NumSyncPointsInWaitList; i++) { + if (auto EventHandle = SyncPoints.find(SyncPointWaitList[i]); + EventHandle != SyncPoints.end()) { + ZeEventList.push_back(EventHandle->second->ZeEvent); + } else { + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + return UR_RESULT_SUCCESS; +} + +// Shared by all memory read/write/copy PI interfaces. +// Helper function for common code when enqueuing memory operations to a command +// buffer. +static ur_result_t enqueueCommandBufferMemCopyHelper( + ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer, + void *Dst, const void *Src, size_t Size, uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + std::vector ZeEventList; + UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, ZeEventList)); + + ur_event_handle_t LaunchEvent; + UR_CALL(EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent)); + LaunchEvent->CommandType = CommandType; + + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (CommandBuffer->ZeCommandList, Dst, Src, Size, + LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + + urPrint("calling zeCommandListAppendMemoryCopy() with" + " ZeEvent %#lx\n", + ur_cast(LaunchEvent->ZeEvent)); + + // Get sync point and register the event with it. + *SyncPoint = CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + return UR_RESULT_SUCCESS; +} + +// Helper function for common code when enqueuing rectangular memory operations +// to a command buffer. +static ur_result_t enqueueCommandBufferMemCopyRectHelper( + ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer, + void *Dst, const void *Src, ur_rect_offset_t SrcOrigin, + ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch, + size_t DstRowPitch, size_t SrcSlicePitch, size_t DstSlicePitch, + uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + + uint32_t SrcOriginX = ur_cast(SrcOrigin.x); + uint32_t SrcOriginY = ur_cast(SrcOrigin.y); + uint32_t SrcOriginZ = ur_cast(SrcOrigin.z); + + uint32_t SrcPitch = SrcRowPitch; + if (SrcPitch == 0) + SrcPitch = ur_cast(Region.width); + + if (SrcSlicePitch == 0) + SrcSlicePitch = ur_cast(Region.height) * SrcPitch; + + uint32_t DstOriginX = ur_cast(DstOrigin.x); + uint32_t DstOriginY = ur_cast(DstOrigin.y); + uint32_t DstOriginZ = ur_cast(DstOrigin.z); + + uint32_t DstPitch = DstRowPitch; + if (DstPitch == 0) + DstPitch = ur_cast(Region.width); + + if (DstSlicePitch == 0) + DstSlicePitch = ur_cast(Region.height) * DstPitch; + + uint32_t Width = ur_cast(Region.width); + uint32_t Height = ur_cast(Region.height); + uint32_t Depth = ur_cast(Region.depth); + + const ze_copy_region_t ZeSrcRegion = {SrcOriginX, SrcOriginY, SrcOriginZ, + Width, Height, Depth}; + const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ, + Width, Height, Depth}; + + std::vector ZeEventList; + UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, ZeEventList)); + + ur_event_handle_t LaunchEvent; + UR_CALL(EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent)); + LaunchEvent->CommandType = CommandType; + + ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, + (CommandBuffer->ZeCommandList, Dst, &ZeDstRegion, DstPitch, + DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, SrcSlicePitch, + LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + + urPrint("calling zeCommandListAppendMemoryCopyRegion() with" + " ZeEvent %#lx\n", + ur_cast(LaunchEvent->ZeEvent)); + + // Get sync point and register the event with it. + *SyncPoint = CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, + const ur_exp_command_buffer_desc_t *CommandBufferDesc, + ur_exp_command_buffer_handle_t *CommandBuffer) { + // Force compute queue type for now. Copy engine types may be better suited + // for host to device copies. + uint32_t QueueGroupOrdinal = + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute] + .ZeOrdinal; + + ZeStruct ZeCommandListDesc; + ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + + ze_command_list_handle_t ZeCommandList; + // TODO We could optimize this by pooling both Level Zero command-lists and UR + // command-buffers, then reusing them. + ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, + &ZeCommandListDesc, &ZeCommandList)); + try { + *CommandBuffer = new ur_exp_command_buffer_handle_t_( + Context, Device, ZeCommandList, ZeCommandListDesc, CommandBufferDesc); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + // Create signal & wait events to be used in the command-list for sync + // on command-buffer enqueue. + auto RetCommandBuffer = *CommandBuffer; + UR_CALL(EventCreate(Context, nullptr, true, &RetCommandBuffer->SignalEvent)); + UR_CALL(EventCreate(Context, nullptr, false, &RetCommandBuffer->WaitEvent)); + + // Add prefix commands + ZE2UR_CALL(zeCommandListAppendEventReset, + (ZeCommandList, RetCommandBuffer->SignalEvent->ZeEvent)); + ZE2UR_CALL( + zeCommandListAppendBarrier, + (ZeCommandList, nullptr, 1, &RetCommandBuffer->WaitEvent->ZeEvent)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferRetainExp(ur_exp_command_buffer_handle_t CommandBuffer) { + CommandBuffer->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) { + if (!CommandBuffer->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + delete CommandBuffer; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { + // We need to append signal that will indicate that command-buffer has + // finished executing. + ZE2UR_CALL( + zeCommandListAppendSignalEvent, + (CommandBuffer->ZeCommandList, CommandBuffer->SignalEvent->ZeEvent)); + // Close the command list and have it ready for dispatch. + ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList)); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( + ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, + uint32_t WorkDim, const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + // Lock automatically releases when this goes out of scope. + std::scoped_lock Lock( + Kernel->Mutex, Kernel->Program->Mutex); + + if (GlobalWorkOffset != NULL) { + if (!CommandBuffer->Context->getPlatform() + ->ZeDriverGlobalOffsetExtensionFound) { + urPrint("No global offset extension found on this driver\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + ZE2UR_CALL(zeKernelSetGlobalOffsetExp, + (Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1], + GlobalWorkOffset[2])); + } + + // If there are any pending arguments set them now. + for (auto &Arg : Kernel->PendingArguments) { + // The ArgValue may be a NULL pointer in which case a NULL value is used for + // the kernel argument declared as a pointer to global or constant memory. + char **ZeHandlePtr = nullptr; + if (Arg.Value) { + // TODO: Not sure of the implication of not passing a device pointer here + UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode)); + } + ZE2UR_CALL(zeKernelSetArgumentValue, + (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); + } + Kernel->PendingArguments.clear(); + + ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; + uint32_t WG[3]; + + UR_CALL(calculateKernelWorkDimensions(Kernel, CommandBuffer->Device, + ZeThreadGroupDimensions, WG, WorkDim, + GlobalWorkSize, LocalWorkSize)); + + ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2])); + + std::vector ZeEventList; + UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, ZeEventList)); + ur_event_handle_t LaunchEvent; + UR_CALL(EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent)); + LaunchEvent->CommandType = UR_COMMAND_KERNEL_LAUNCH; + + LaunchEvent->CommandData = (void *)Kernel; + // Increment the reference count of the Kernel and indicate that the Kernel + // is in use. Once the event has been signalled, the code in + // CleanupCompletedEvent(Event) will do a piReleaseKernel to update the + // reference count on the kernel, using the kernel saved in CommandData. + UR_CALL(urKernelRetain(Kernel)); + + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (CommandBuffer->ZeCommandList, Kernel->ZeKernel, + &ZeThreadGroupDimensions, LaunchEvent->ZeEvent, + ZeEventList.size(), ZeEventList.data())); + + urPrint("calling zeCommandListAppendLaunchKernel() with" + " ZeEvent %#lx\n", + ur_cast(LaunchEvent->ZeEvent)); + + // Get sync point and register the event with it. + *SyncPoint = CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp( + ur_exp_command_buffer_handle_t CommandBuffer, void *Dst, const void *Src, + size_t Size, uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + return enqueueCommandBufferMemCopyHelper( + UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp( + ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t SrcMem, + ur_mem_handle_t DstMem, size_t SrcOffset, size_t DstOffset, size_t Size, + uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + (void)SrcOffset; + (void)DstOffset; + + auto SrcBuffer = ur_cast(SrcMem); + auto DstBuffer = ur_cast(DstMem); + + std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex> LockAll( + SrcLock, DstBuffer->Mutex); + + char *ZeHandleSrc; + UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + CommandBuffer->Device)); + char *ZeHandleDst; + UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + CommandBuffer->Device)); + + return enqueueCommandBufferMemCopyHelper( + UR_COMMAND_MEM_BUFFER_COPY, CommandBuffer, ZeHandleDst, ZeHandleSrc, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( + ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t SrcMem, + ur_mem_handle_t DstMem, ur_rect_offset_t SrcOrigin, + ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch, + size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch, + uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + auto SrcBuffer = ur_cast(SrcMem); + auto DstBuffer = ur_cast(DstMem); + + std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex> LockAll( + SrcLock, DstBuffer->Mutex); + + char *ZeHandleSrc; + UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + CommandBuffer->Device)); + char *ZeHandleDst; + UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + CommandBuffer->Device)); + + return enqueueCommandBufferMemCopyRectHelper( + UR_COMMAND_MEM_BUFFER_COPY_RECT, CommandBuffer, ZeHandleDst, ZeHandleSrc, + SrcOrigin, DstOrigin, Region, SrcRowPitch, DstRowPitch, SrcSlicePitch, + DstSlicePitch, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( + ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t Buffer, + size_t Offset, size_t Size, const void *Src, + uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + std::scoped_lock Lock(Buffer->Mutex); + + char *ZeHandleDst = nullptr; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + CommandBuffer->Device)); + + return enqueueCommandBufferMemCopyHelper( + UR_COMMAND_MEM_BUFFER_WRITE, CommandBuffer, + ZeHandleDst + Offset, // dst + Src, // src + Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( + ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t Buffer, + ur_rect_offset_t BufferOffset, ur_rect_offset_t HostOffset, + ur_rect_region_t Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, void *Src, + uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + std::scoped_lock Lock(Buffer->Mutex); + + char *ZeHandleDst = nullptr; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + CommandBuffer->Device)); + return enqueueCommandBufferMemCopyRectHelper( + UR_COMMAND_MEM_BUFFER_WRITE_RECT, CommandBuffer, ZeHandleDst, + const_cast(static_cast(Src)), HostOffset, + BufferOffset, Region, HostRowPitch, BufferRowPitch, HostSlicePitch, + BufferSlicePitch, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( + ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t Buffer, + size_t Offset, size_t Size, void *Dst, uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + std::scoped_lock SrcLock(Buffer->Mutex); + + char *ZeHandleSrc = nullptr; + UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + CommandBuffer->Device)); + return enqueueCommandBufferMemCopyHelper( + UR_COMMAND_MEM_BUFFER_READ, CommandBuffer, Dst, ZeHandleSrc + Offset, + Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferReadRectExp( + ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t Buffer, + ur_rect_offset_t BufferOffset, ur_rect_offset_t HostOffset, + ur_rect_region_t Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, void *Dst, + uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint) { + std::scoped_lock SrcLock(Buffer->Mutex); + + char *ZeHandleSrc; + UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + CommandBuffer->Device)); + return enqueueCommandBufferMemCopyRectHelper( + UR_COMMAND_MEM_BUFFER_READ_RECT, CommandBuffer, Dst, ZeHandleSrc, + BufferOffset, HostOffset, Region, BufferRowPitch, HostRowPitch, + BufferSlicePitch, HostSlicePitch, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( + ur_exp_command_buffer_handle_t CommandBuffer, ur_queue_handle_t Queue, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_event_handle_t *Event) { + std::scoped_lock lock(Queue->Mutex); + // Use compute engine rather than copy engine + const auto UseCopyEngine = false; + auto &QGroup = Queue->getQueueGroup(UseCopyEngine); + uint32_t QueueGroupOrdinal; + auto &ZeCommandQueue = QGroup.getZeQueue(&QueueGroupOrdinal); + + ze_fence_handle_t ZeFence; + ZeStruct ZeFenceDesc; + ur_command_list_ptr_t CommandListPtr; + + ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + // TODO: Refactor so requiring a map iterator is not required here, currently + // required for executeCommandList though. + ZeStruct ZeQueueDesc; + ZeQueueDesc.ordinal = QueueGroupOrdinal; + CommandListPtr = CommandBuffer->CommandListMap.insert( + std::pair( + CommandBuffer->ZeCommandList, + {ZeFence, false, false, ZeCommandQueue, ZeQueueDesc})); + + // Previous execution will have closed the command list, we need to reopen + // it otherwise calling `executeCommandList` will return early. + CommandListPtr->second.IsClosed = false; + CommandListPtr->second.ZeFenceInUse = true; + + // Create command-list to execute before `CommandListPtr` and will signal + // when `EventWaitList` dependencies are complete. + ur_command_list_ptr_t WaitCommandList{}; + if (NumEventsInWaitList) { + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, + false, false)) + + // Update the WaitList of the Wait Event + // Events are appended to the WaitList if the WaitList is not empty + if (CommandBuffer->WaitEvent->WaitList.isEmpty()) + CommandBuffer->WaitEvent->WaitList = TmpWaitList; + else + CommandBuffer->WaitEvent->WaitList.insert(TmpWaitList); + + ZE2UR_CALL(zeCommandListAppendBarrier, + (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent, + CommandBuffer->WaitEvent->WaitList.Length, + CommandBuffer->WaitEvent->WaitList.ZeEventList)); + } else { + UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, + false, false)); + + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); + } + + // Execution event for this enqueue of the PI command-buffer + ur_event_handle_t RetEvent{}; + // Create a command-list to signal RetEvent on completion + ur_command_list_ptr_t SignalCommandList{}; + if (Event) { + UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, + false, false)); + + UR_CALL(createEventAndAssociateQueue(Queue, &RetEvent, + UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, + SignalCommandList, false)); + + ZE2UR_CALL(zeCommandListAppendBarrier, + (SignalCommandList->first, RetEvent->ZeEvent, 1, + &(CommandBuffer->SignalEvent->ZeEvent))); + } + + // Execution our command-lists asynchronously + // TODO Look using a single `zeCommandQueueExecuteCommandLists()` call + // passing all three command-lists, rather than individual calls which + // introduces latency. + UR_CALL(Queue->executeCommandList(WaitCommandList, false, false)); + UR_CALL(Queue->executeCommandList(CommandListPtr, false, false)); + UR_CALL(Queue->executeCommandList(SignalCommandList, false, false)); + + if (Event) { + *Event = RetEvent; + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_command_buffer.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_command_buffer.hpp new file mode 100644 index 0000000000000..ebf7271d95f9e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_command_buffer.hpp @@ -0,0 +1,69 @@ +//===--------- ur_level_zero_command_buffer.hpp - Level Zero Adapter -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include +#include +#include + +#include "ur_level_zero_common.hpp" + +#include "ur_level_zero_context.hpp" +#include "ur_level_zero_queue.hpp" + +struct ur_exp_command_buffer_handle_t_ : public _ur_object { + ur_exp_command_buffer_handle_t_(ur_context_handle_t Context, + ur_device_handle_t Device, + ze_command_list_handle_t CommandList, + ZeStruct ZeDesc, + const ur_exp_command_buffer_desc_t *Desc); + + ~ur_exp_command_buffer_handle_t_(); + + void RegisterSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, + ur_event_handle_t Event) { + SyncPoints[SyncPoint] = Event; + NextSyncPoint++; + } + + ur_exp_command_buffer_sync_point_t GetNextSyncPoint() const { + return NextSyncPoint; + } + + // UR context associated with this command-buffer + ur_context_handle_t Context; + // Device associated with this command buffer + ur_device_handle_t Device; + // Level Zero command list handle + ze_command_list_handle_t ZeCommandList; + // Level Zero command list descriptor + ZeStruct ZeCommandListDesc; + // Queue properties from command-buffer descriptor + // TODO: Do we need these? + ur_queue_properties_t QueueProperties; + // Map of sync_points to ur_events + std::unordered_map + SyncPoints; + // Next sync_point value (may need to consider ways to reuse values if 32-bits + // is not enough) + ur_exp_command_buffer_sync_point_t NextSyncPoint; + // Command list map so we can use queue::executeCommandList. + // Command list map is also used to release all the Fences retained by the + // command_buffer std::unordered_multimap CommandListMap; CommandListMap is redefined as a + // multimap to enable mutiple commands enqueing into the same command_buffer + std::unordered_multimap + CommandListMap; + // Event which will signals the most recent execution of the command-buffer + // has finished + ur_event_handle_t SignalEvent = nullptr; + // Event which a command-buffer waits on until the wait-list dependencies + // passed to a command-buffer enqueue have been satisfied. + ur_event_handle_t WaitEvent = nullptr; +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 2eaa671b21d07..5b410d2e103c6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -1181,6 +1181,44 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( return UR_RESULT_SUCCESS; } +ur_result_t _ur_ze_event_list_t::insert(_ur_ze_event_list_t &Other) { + if (this != &Other) { + // save of the previous object values + uint32_t PreLength = this->Length; + ze_event_handle_t *PreZeEventList = this->ZeEventList; + ur_event_handle_t *PreUrEventList = this->UrEventList; + + // allocate new memory + uint32_t Length = PreLength + Other.Length; + this->ZeEventList = new ze_event_handle_t[Length]; + this->UrEventList = new ur_event_handle_t[Length]; + + // copy elements + uint32_t TmpListLength = 0; + for (uint32_t I = 0; I < PreLength; I++) { + this->ZeEventList[TmpListLength] = std::move(PreZeEventList[I]); + this->UrEventList[TmpListLength] = std::move(PreUrEventList[I]); + TmpListLength += 1; + } + for (uint32_t I = 0; I < Other.Length; I++) { + this->ZeEventList[TmpListLength] = std::move(Other.ZeEventList[I]); + this->UrEventList[TmpListLength] = std::move(Other.UrEventList[I]); + TmpListLength += 1; + } + this->Length = TmpListLength; + + // Free previous allocated memory + delete[] PreZeEventList; + delete[] PreUrEventList; + delete[] Other.ZeEventList; + delete[] Other.UrEventList; + Other.ZeEventList = nullptr; + Other.UrEventList = nullptr; + Other.Length = 0; + } + return UR_RESULT_SUCCESS; +} + ur_result_t _ur_ze_event_list_t::collectEventsForReleaseAndDestroyPiZeEventList( std::list &EventsToBeReleased) { // acquire a lock before reading the length and list fields. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp index 9922742c7776d..76a07b435dd09 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp @@ -111,6 +111,15 @@ struct _ur_ze_event_list_t { } return *this; } + + // This function allows to merge two _ur_ze_event_lists + // The ur_ze_event_list "other" is added to the caller list. + // Note that new containers are allocated to contains the additional elements. + // Elements are moved to the new containers. + // other list can not be used after the call to this function. + ur_result_t insert(_ur_ze_event_list_t &Other); + + bool isEmpty() const { return (this->ZeEventList == nullptr); } }; void printZeEventList(const _ur_ze_event_list_t &PiZeEventList); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp index 280c9d025d702..b78742ada9658 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp @@ -300,3 +300,33 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( return retVal; } + +UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_command_buffer_exp_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + pDdiTable->pfnCreateExp = urCommandBufferCreateExp; + pDdiTable->pfnRetainExp = urCommandBufferRetainExp; + pDdiTable->pfnReleaseExp = urCommandBufferReleaseExp; + pDdiTable->pfnFinalizeExp = urCommandBufferFinalizeExp; + pDdiTable->pfnAppendKernelLaunchExp = urCommandBufferAppendKernelLaunchExp; + pDdiTable->pfnAppendMemcpyUSMExp = urCommandBufferAppendMemcpyUSMExp; + pDdiTable->pfnAppendMembufferCopyExp = urCommandBufferAppendMembufferCopyExp; + pDdiTable->pfnAppendMembufferCopyRectExp = + urCommandBufferAppendMembufferCopyRectExp; + pDdiTable->pfnAppendMembufferReadExp = urCommandBufferAppendMembufferReadExp; + pDdiTable->pfnAppendMembufferReadRectExp = + urCommandBufferAppendMembufferReadRectExp; + pDdiTable->pfnAppendMembufferWriteExp = + urCommandBufferAppendMembufferWriteExp; + pDdiTable->pfnAppendMembufferWriteRectExp = + urCommandBufferAppendMembufferWriteRectExp; + pDdiTable->pfnEnqueueExp = urCommandBufferEnqueueExp; + + return retVal; +} diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 5c536543cd057..dc977f5d51171 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -83,12 +83,25 @@ piSamplerRelease piSamplerRetain piTearDown piclProgramCreateWithSource +piextCommandBufferCreate +piextCommandBufferFinalize +piextCommandBufferMemBufferCopy +piextCommandBufferMemBufferCopyRect +piextCommandBufferMemBufferRead +piextCommandBufferMemBufferReadRect +piextCommandBufferMemBufferWrite +piextCommandBufferMemBufferWriteRect +piextCommandBufferMemcpyUSM +piextCommandBufferNDRangeKernel +piextCommandBufferRelease +piextCommandBufferRetain piextContextCreateWithNativeHandle piextContextGetNativeHandle piextContextSetExtendedDeleter piextDeviceCreateWithNativeHandle piextDeviceGetNativeHandle piextDeviceSelectBinary +piextEnqueueCommandBuffer piextEnqueueReadHostPipe piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 4f9f1e6d60218..beec58894863c 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -35,11 +35,24 @@ piQueueGetInfo piSamplerCreate piTearDown piclProgramCreateWithSource +piextCommandBufferCreate +piextCommandBufferFinalize +piextCommandBufferMemBufferCopy +piextCommandBufferMemBufferCopyRect +piextCommandBufferMemBufferRead +piextCommandBufferMemBufferReadRect +piextCommandBufferMemBufferWrite +piextCommandBufferMemBufferWriteRect +piextCommandBufferMemcpyUSM +piextCommandBufferNDRangeKernel +piextCommandBufferRelease +piextCommandBufferRetain piextContextCreateWithNativeHandle piextContextGetNativeHandle piextDeviceCreateWithNativeHandle piextDeviceGetNativeHandle piextDeviceSelectBinary +piextEnqueueCommandBuffer piextEnqueueReadHostPipe piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index 73d0f03dca4fd..fb62e3d4ffb67 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -1137,6 +1137,111 @@ inline pi_result mock_piextPluginGetOpaqueData(void *opaque_data_param, return PI_SUCCESS; } +inline pi_result +mock_piextCommandBufferCreate(pi_context context, pi_device device, + const pi_ext_command_buffer_desc *desc, + pi_ext_command_buffer *ret_command_buffer) { + + return PI_SUCCESS; +} + +inline pi_result +mock_piextCommandBufferRetain(pi_ext_command_buffer command_buffer) { + return PI_SUCCESS; +} + +inline pi_result +mock_piextCommandBufferRelease(pi_ext_command_buffer command_buffer) { + return PI_SUCCESS; +} + +inline pi_result +mock_piextCommandBufferFinalize(pi_ext_command_buffer command_buffer) { + return PI_SUCCESS; +} + +inline pi_result mock_piextCommandBufferNDRangeKernel( + pi_ext_command_buffer command_buffer, pi_kernel kernel, pi_uint32 work_dim, + const size_t *global_work_offset, const size_t *global_work_size, + const size_t *local_work_size, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + +inline pi_result mock_piextCommandBufferMemcpyUSM( + pi_ext_command_buffer command_buffer, void *dst_ptr, const void *src_ptr, + size_t size, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + +inline pi_result mock_piextCommandBufferMemBufferRead( + pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset, + size_t size, void *dst, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + +inline pi_result mock_piextCommandBufferMemBufferReadRect( + pi_ext_command_buffer command_buffer, pi_mem buffer, + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + void *ptr, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + +inline pi_result mock_piextCommandBufferMemBufferWrite( + pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset, + size_t size, const void *ptr, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + +inline pi_result mock_piextCommandBufferMemBufferWriteRect( + pi_ext_command_buffer command_buffer, pi_mem buffer, + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t BufferRowPitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + const void *ptr, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + +inline pi_result mock_piextEnqueueCommandBuffer( + pi_ext_command_buffer command_buffer, pi_queue queue, + pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, + pi_event *event) { + return PI_SUCCESS; +} + +inline pi_result mock_piextCommandBufferMemBufferCopy( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + size_t src_offset, size_t dst_offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + +inline pi_result mock_piextCommandBufferMemBufferCopyRect( + pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + inline pi_result mock_piTearDown(void *PluginParameter) { return PI_SUCCESS; } inline pi_result mock_piPluginGetLastError(char **message) {