diff --git a/dpctl/_backend.pxd b/dpctl/_backend.pxd index 3f7ba63a55..57da77eb7d 100644 --- a/dpctl/_backend.pxd +++ b/dpctl/_backend.pxd @@ -403,6 +403,13 @@ cdef extern from "syclinterface/dpctl_sycl_queue_interface.h": void *Dest, const void *Src, size_t Count) + cdef DPCTLSyclEventRef DPCTLQueue_MemcpyWithEvents( + const DPCTLSyclQueueRef Q, + void *Dest, + const void *Src, + size_t Count, + const DPCTLSyclEventRef *depEvents, + size_t depEventsCount) cdef DPCTLSyclEventRef DPCTLQueue_Memset( const DPCTLSyclQueueRef Q, void *Dest, diff --git a/dpctl/_host_task_util.hpp b/dpctl/_host_task_util.hpp index 8db17594fd..cb3828a54f 100644 --- a/dpctl/_host_task_util.hpp +++ b/dpctl/_host_task_util.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -29,30 +29,30 @@ /// //===----------------------------------------------------------------------===// +#pragma once #include "Python.h" #include "syclinterface/dpctl_data_types.h" +#include "syclinterface/dpctl_sycl_type_casters.hpp" #include -int async_dec_ref(DPCTLSyclQueueRef QRef, - PyObject **obj_array, - size_t obj_array_size, - DPCTLSyclEventRef *ERefs, - size_t nERefs) +DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef QRef, + PyObject **obj_array, + size_t obj_array_size, + DPCTLSyclEventRef *depERefs, + size_t nDepERefs, + int *status) { + using dpctl::syclinterface::unwrap; + using dpctl::syclinterface::wrap; - sycl::queue *q = reinterpret_cast(QRef); + sycl::queue *q = unwrap(QRef); - std::vector obj_vec; - obj_vec.reserve(obj_array_size); - for (size_t obj_id = 0; obj_id < obj_array_size; ++obj_id) { - obj_vec.push_back(obj_array[obj_id]); - } + std::vector obj_vec(obj_array, obj_array + obj_array_size); try { - q->submit([&](sycl::handler &cgh) { - for (size_t ev_id = 0; ev_id < nERefs; ++ev_id) { - cgh.depends_on( - *(reinterpret_cast(ERefs[ev_id]))); + sycl::event ht_ev = q->submit([&](sycl::handler &cgh) { + for (size_t ev_id = 0; ev_id < nDepERefs; ++ev_id) { + cgh.depends_on(*(unwrap(depERefs[ev_id]))); } cgh.host_task([obj_array_size, obj_vec]() { // if the main thread has not finilized the interpreter yet @@ -66,9 +66,21 @@ int async_dec_ref(DPCTLSyclQueueRef QRef, } }); }); + + constexpr int result_ok = 0; + + *status = result_ok; + auto e_ptr = new sycl::event(ht_ev); + return wrap(e_ptr); } catch (const std::exception &e) { - return 1; + constexpr int result_std_exception = 1; + + *status = result_std_exception; + return nullptr; } - return 0; + constexpr int result_other_abnormal = 2; + + *status = result_other_abnormal; + return nullptr; } diff --git a/dpctl/_sycl_queue.pxd b/dpctl/_sycl_queue.pxd index 8f9028fabf..0269cc4aae 100644 --- a/dpctl/_sycl_queue.pxd +++ b/dpctl/_sycl_queue.pxd @@ -70,6 +70,19 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef SyclContext get_sycl_context(self) cpdef SyclDevice get_sycl_device(self) cdef DPCTLSyclQueueRef get_queue_ref(self) + cpdef SyclEvent _submit_keep_args_alive( + self, + object args, + list dEvents + ) + cpdef SyclEvent submit_async( + self, + SyclKernel kernel, + list args, + list gS, + list lS=*, + list dEvents=* + ) cpdef SyclEvent submit( self, SyclKernel kernel, @@ -81,6 +94,7 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef void wait(self) cdef DPCTLSyclQueueRef get_queue_ref(self) cpdef memcpy(self, dest, src, size_t count) + cpdef SyclEvent memcpy_async(self, dest, src, size_t count, list dEvents=*) cpdef prefetch(self, ptr, size_t count=*) cpdef mem_advise(self, ptr, size_t count, int mem) cpdef SyclEvent submit_barrier(self, dependent_events=*) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 361b9d5924..a27e6f940f 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -45,6 +45,7 @@ from ._backend cimport ( # noqa: E211 DPCTLQueue_IsInOrder, DPCTLQueue_MemAdvise, DPCTLQueue_Memcpy, + DPCTLQueue_MemcpyWithEvents, DPCTLQueue_Prefetch, DPCTLQueue_SubmitBarrierForEvents, DPCTLQueue_SubmitNDRange, @@ -64,6 +65,7 @@ import ctypes from .enum_types import backend_type from cpython cimport pycapsule +from cpython.buffer cimport PyObject_CheckBuffer from cpython.ref cimport Py_DECREF, Py_INCREF, PyObject from libc.stdlib cimport free, malloc @@ -72,7 +74,7 @@ import logging cdef extern from "_host_task_util.hpp": - int async_dec_ref(DPCTLSyclQueueRef, PyObject **, size_t, DPCTLSyclEventRef *, size_t) nogil + DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef, PyObject **, size_t, DPCTLSyclEventRef *, size_t, int *) nogil __all__ = [ @@ -160,6 +162,62 @@ cdef void _queue_capsule_deleter(object o) noexcept: DPCTLQueue_Delete(QRef) +cdef bint _is_buffer(object o): + return PyObject_CheckBuffer(o) + + +cdef DPCTLSyclEventRef _memcpy_impl( + SyclQueue q, + object dst, + object src, + size_t byte_count, + DPCTLSyclEventRef *dep_events, + size_t dep_events_count +) except *: + cdef void *c_dst_ptr = NULL + cdef void *c_src_ptr = NULL + cdef DPCTLSyclEventRef ERef = NULL + cdef const unsigned char[::1] src_host_buf = None + cdef unsigned char[::1] dst_host_buf = None + + if isinstance(src, _Memory): + c_src_ptr = (<_Memory>src).memory_ptr + elif _is_buffer(src): + src_host_buf = src + c_src_ptr = &src_host_buf[0] + else: + raise TypeError( + "Parameter `src` should have either type " + "`dpctl.memory._Memory` or a type that " + "supports Python buffer protocol" + ) + + if isinstance(dst, _Memory): + c_dst_ptr = (<_Memory>dst).memory_ptr + elif _is_buffer(dst): + dst_host_buf = dst + c_dst_ptr = &dst_host_buf[0] + else: + raise TypeError( + "Parameter `dst` should have either type " + "`dpctl.memory._Memory` or a type that " + "supports Python buffer protocol" + ) + + if dep_events_count == 0 or dep_events is NULL: + ERef = DPCTLQueue_Memcpy(q._queue_ref, c_dst_ptr, c_src_ptr, byte_count) + else: + ERef = DPCTLQueue_MemcpyWithEvents( + q._queue_ref, + c_dst_ptr, + c_src_ptr, + byte_count, + dep_events, + dep_events_count + ) + return ERef + + cdef class _SyclQueue: """ Barebone data owner class used by SyclQueue. """ @@ -703,7 +761,81 @@ cdef class SyclQueue(_SyclQueue): """ return self._queue_ref - cpdef SyclEvent submit( + + cpdef SyclEvent _submit_keep_args_alive( + self, + object args, + list dEvents + ): + """ SyclQueue._submit_keep_args_alive(args, events) + + Keeps objects in `args` alive until tasks associated with events + complete. + + Args: + args(object): Python object to keep alive. + Typically a tuple with arguments to offloaded tasks + events(Tuple[dpctl.SyclEvent]): Gating events + The list or tuple of events associated with tasks + working on Python objects collected in `args`. + Returns: + dpctl.SyclEvent + The event associated with the submission of host task. + + Increments reference count of `args` and schedules asynchronous + ``host_task`` to decrement the count once dependent events are + complete. + + N.B.: The `host_task` attempts to acquire Python GIL, and it is + known to be unsafe during interpreter shudown sequence. It is + thus strongly advised to ensure that all submitted `host_task` + complete before the end of the Python script. + """ + cdef size_t nDE = len(dEvents) + cdef DPCTLSyclEventRef *depEvents = NULL + cdef PyObject *args_raw = NULL + cdef DPCTLSyclEventRef htERef = NULL + cdef int status = -1 + + # Create the array of dependent events if any + if nDE > 0: + depEvents = ( + malloc(nDE*sizeof(DPCTLSyclEventRef)) + ) + if not depEvents: + raise MemoryError() + else: + for idx, de in enumerate(dEvents): + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) + + # increment reference counts to list of arguments + Py_INCREF(args) + + # schedule decrement + args_raw = args + + htERef = async_dec_ref( + self.get_queue_ref(), + &args_raw, 1, + depEvents, nDE, &status + ) + + free(depEvents) + if (status != 0): + with nogil: DPCTLEvent_Wait(htERef) + DPCTLEvent_Delete(htERef) + raise RuntimeError("Could not submit keep_args_alive host_task") + + return SyclEvent._create(htERef) + + + cpdef SyclEvent submit_async( self, SyclKernel kernel, list args, @@ -715,13 +847,14 @@ cdef class SyclQueue(_SyclQueue): cdef _arg_data_type *kargty = NULL cdef DPCTLSyclEventRef *depEvents = NULL cdef DPCTLSyclEventRef Eref = NULL + cdef DPCTLSyclEventRef htEref = NULL cdef int ret = 0 cdef size_t gRange[3] cdef size_t lRange[3] cdef size_t nGS = len(gS) cdef size_t nLS = len(lS) if lS is not None else 0 cdef size_t nDE = len(dEvents) if dEvents is not None else 0 - cdef PyObject **arg_objects = NULL + cdef PyObject *args_raw = NULL cdef ssize_t i = 0 # Allocate the arrays to be sent to DPCTLQueue_Submit @@ -745,7 +878,15 @@ cdef class SyclQueue(_SyclQueue): raise MemoryError() else: for idx, de in enumerate(dEvents): - depEvents[idx] = (de).get_event_ref() + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(kargs) + free(kargty) + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) # populate the args and argstype arrays ret = self._populate_args(args, kargs, kargty) @@ -823,50 +964,69 @@ cdef class SyclQueue(_SyclQueue): raise SyclKernelSubmitError( "Kernel submission to Sycl queue failed." ) - # increment reference counts to each argument - arg_objects = malloc(len(args) * sizeof(PyObject *)) - for i in range(len(args)): - arg_objects[i] = (args[i]) - Py_INCREF( arg_objects[i]) - - # schedule decrement - if async_dec_ref(self.get_queue_ref(), arg_objects, len(args), &Eref, 1): - # async task submission failed, decrement ref counts and wait - for i in range(len(args)): - arg_objects[i] = (args[i]) - Py_DECREF( arg_objects[i]) - with nogil: DPCTLEvent_Wait(Eref) - - # free memory - free(arg_objects) return SyclEvent._create(Eref) + cpdef SyclEvent submit( + self, + SyclKernel kernel, + list args, + list gS, + list lS=None, + list dEvents=None + ): + cdef SyclEvent e = self.submit_async(kernel, args, gS, lS, dEvents) + e.wait() + return e + cpdef void wait(self): with nogil: DPCTLQueue_Wait(self._queue_ref) cpdef memcpy(self, dest, src, size_t count): - cdef void *c_dest - cdef void *c_src + """Copy memory from `src` to `dst`""" cdef DPCTLSyclEventRef ERef = NULL - if isinstance(dest, _Memory): - c_dest = (<_Memory>dest).memory_ptr - else: - raise TypeError("Parameter `dest` should have type _Memory.") + ERef = _memcpy_impl(self, dest, src, count, NULL, 0) + if (ERef is NULL): + raise RuntimeError( + "SyclQueue.memcpy operation encountered an error" + ) + with nogil: DPCTLEvent_Wait(ERef) + DPCTLEvent_Delete(ERef) - if isinstance(src, _Memory): - c_src = (<_Memory>src).memory_ptr + cpdef SyclEvent memcpy_async(self, dest, src, size_t count, list dEvents=None): + """Copy memory from `src` to `dst`""" + cdef DPCTLSyclEventRef ERef = NULL + cdef DPCTLSyclEventRef *depEvents = NULL + cdef size_t nDE = 0 + + if dEvents is None: + ERef = _memcpy_impl(self, dest, src, count, NULL, 0) else: - raise TypeError("Parameter `src` should have type _Memory.") + nDE = len(dEvents) + depEvents = ( + malloc(nDE*sizeof(DPCTLSyclEventRef)) + ) + if depEvents is NULL: + raise MemoryError() + else: + for idx, de in enumerate(dEvents): + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) + ERef = _memcpy_impl(self, dest, src, count, depEvents, nDE) + free(depEvents) - ERef = DPCTLQueue_Memcpy(self._queue_ref, c_dest, c_src, count) if (ERef is NULL): raise RuntimeError( "SyclQueue.memcpy operation encountered an error" ) - with nogil: DPCTLEvent_Wait(ERef) - DPCTLEvent_Delete(ERef) + + return SyclEvent._create(ERef) cpdef prefetch(self, mem, size_t count=0): cdef void *ptr diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py index 322272df2d..66dd4f9340 100644 --- a/dpctl/_sycl_timer.py +++ b/dpctl/_sycl_timer.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import timeit from . import SyclQueue @@ -22,6 +21,29 @@ __doc__ = "This module implements :class:`dpctl.SyclTimer`." +class HostDeviceDuration: + def __init__(self, host_dt, device_dt): + self._host_dt = host_dt + self._device_dt = device_dt + + def __repr__(self): + return f"(host_dt={self._host_dt}, device_dt={self._device_dt})" + + def __str__(self): + return f"(host_dt={self._host_dt}, device_dt={self._device_dt})" + + def __iter__(self): + yield from [self._host_dt, self._device_dt] + + @property + def host_dt(self): + return self._host_dt + + @property + def device_dt(self): + return self._device_dt + + class SyclTimer: """ SyclTimer(host_timer=timeit.default_timer, time_scale=1) @@ -45,7 +67,7 @@ class SyclTimer: code_block # retrieve elapsed times in milliseconds - sycl_dt, wall_dt = timer.dt + wall_dt, device_dt = timer.dt Remark: The timer submits barriers to the queue at the entrance and the @@ -67,10 +89,8 @@ def __init__(self, host_timer=timeit.default_timer, time_scale=1): self.timer = host_timer self.time_scale = time_scale self.queue = None - self.host_start = None - self.host_finish = None - self.event_start = None - self.event_finish = None + self.host_times = [] + self.bracketing_events = [] def __call__(self, queue=None): if isinstance(queue, SyclQueue): @@ -89,27 +109,31 @@ def __call__(self, queue=None): return self def __enter__(self): - self.event_start = self.queue.submit_barrier() - self.host_start = self.timer() + self._event_start = self.queue.submit_barrier() + self._host_start = self.timer() return self def __exit__(self, *args): - self.event_finish = self.queue.submit_barrier() - self.host_finish = self.timer() + self.host_times.append((self._host_start, self.timer())) + self.bracketing_events.append( + (self._event_start, self.queue.submit_barrier()) + ) + del self._event_start + del self._host_start @property def dt(self): - """Returns a tuple of elapsed times where first - element is the duration as measured by the host timer, - while the second element is the duration as measured by - the device timer and encoded in profiling events""" - self.event_start.wait() - self.event_finish.wait() - return ( - (self.host_finish - self.host_start) * self.time_scale, - ( - self.event_finish.profiling_info_start - - self.event_start.profiling_info_end - ) - * (1e-9 * self.time_scale), - ) + """Returns a pair of elapsed times (host_dt, device_dt). + + The host_dt is the duration as measured by the host + timer, while the device_dt is the duration as measured by + the device timer and encoded in profiling events.""" + for es, ef in self.bracketing_events: + es.wait() + ef.wait() + host_dt = sum(tf - ts for ts, tf in self.host_times) * self.time_scale + dev_dt = sum( + ef.profiling_info_start - es.profiling_info_end + for es, ef in self.bracketing_events + ) * (1e-9 * self.time_scale) + return HostDeviceDuration(host_dt, dev_dt) diff --git a/dpctl/tests/test_sycl_event.py b/dpctl/tests/test_sycl_event.py index fa496d1bb8..7f0db07539 100644 --- a/dpctl/tests/test_sycl_event.py +++ b/dpctl/tests/test_sycl_event.py @@ -202,7 +202,12 @@ def test_sycl_timer(): m1.copy_from_device(m2) # host operation [x**2 for x in range(128 * 1024)] - host_dt, device_dt = timer.dt + elapsed = timer.dt + host_dt, device_dt = elapsed + assert isinstance(repr(elapsed), str) + assert isinstance(str(elapsed), str) + assert host_dt == elapsed.host_dt + assert device_dt == elapsed.device_dt assert host_dt > device_dt or (host_dt > 0 and device_dt >= 0) q_no_profiling = dpctl.SyclQueue() assert q_no_profiling.has_enable_profiling is False diff --git a/dpctl/tests/test_sycl_kernel_submit.py b/dpctl/tests/test_sycl_kernel_submit.py index d15f5c8e2b..697af32f5c 100644 --- a/dpctl/tests/test_sycl_kernel_submit.py +++ b/dpctl/tests/test_sycl_kernel_submit.py @@ -114,7 +114,7 @@ def test_create_program_from_source(ctype_str, dtype, ctypes_ctor): ) -def test_async_submit(): +def test_submit_async(): try: q = dpctl.SyclQueue("opencl") except dpctl.SyclQueueCreationError: @@ -182,7 +182,7 @@ def test_async_submit(): async_detected = False for attempt in range(5): - e1 = q.submit( + e1 = q.submit_async( kern1Kernel, [ first_row, @@ -192,7 +192,7 @@ def test_async_submit(): n, ], ) - e2 = q.submit( + e2 = q.submit_async( kern2Kernel, [ second_row, @@ -202,7 +202,7 @@ def test_async_submit(): n, ], ) - e3 = q.submit( + e3 = q.submit_async( kern3Kernel, [third_row, first_row, second_row], [ @@ -214,6 +214,9 @@ def test_async_submit(): e3_st = e3.execution_status e2_st = e2.execution_status e1_st = e1.execution_status + ht_e = q._submit_keep_args_alive( + [first_row, second_row, third_row], [e1, e2, e3] + ) are_complete = [ e == status_complete for e in ( @@ -223,6 +226,7 @@ def test_async_submit(): ) ] e3.wait() + ht_e.wait() if not all(are_complete): async_detected = True break diff --git a/dpctl/tests/test_sycl_queue_memcpy.py b/dpctl/tests/test_sycl_queue_memcpy.py index 45c8e41f61..e678b73f03 100644 --- a/dpctl/tests/test_sycl_queue_memcpy.py +++ b/dpctl/tests/test_sycl_queue_memcpy.py @@ -44,7 +44,77 @@ def test_memcpy_copy_usm_to_usm(): q.memcpy(mobj2, mobj1, 3) - assert mv2[:3], b"123" + assert mv2[:3] == b"123" + + +def test_memcpy_copy_host_to_usm(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + usm_obj = _create_memory(q) + + canary = bytearray(b"123456789") + host_obj = memoryview(canary) + + q.memcpy(usm_obj, host_obj, len(canary)) + + mv2 = memoryview(usm_obj) + + assert mv2[: len(canary)] == canary + + +def test_memcpy_copy_usm_to_host(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + usm_obj = _create_memory(q) + mv2 = memoryview(usm_obj) + + n = 9 + for id in range(n): + mv2[id] = ord("a") + id + + host_obj = bytearray(b" " * n) + + q.memcpy(host_obj, usm_obj, n) + + assert host_obj == b"abcdefghi" + + +def test_memcpy_copy_host_to_host(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + + src_buf = b"abcdefghijklmnopqrstuvwxyz" + dst_buf = bytearray(len(src_buf)) + + q.memcpy(dst_buf, src_buf, len(src_buf)) + + assert dst_buf == src_buf + + +def test_memcpy_async(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + + src_buf = b"abcdefghijklmnopqrstuvwxyz" + n = len(src_buf) + dst_buf = bytearray(n) + dst_buf2 = bytearray(n) + + e = q.memcpy_async(dst_buf, src_buf, n) + e2 = q.memcpy_async(dst_buf2, src_buf, n, [e]) + + e.wait() + e2.wait() + assert dst_buf == src_buf + assert dst_buf2 == src_buf def test_memcpy_type_error(): @@ -56,8 +126,8 @@ def test_memcpy_type_error(): with pytest.raises(TypeError) as cm: q.memcpy(None, mobj, 3) - assert "`dest`" in str(cm.value) + assert "_Memory" in str(cm.value) with pytest.raises(TypeError) as cm: q.memcpy(mobj, None, 3) - assert "`src`" in str(cm.value) + assert "_Memory" in str(cm.value) diff --git a/libsyclinterface/include/dpctl_sycl_queue_interface.h b/libsyclinterface/include/dpctl_sycl_queue_interface.h index 1c5e53a395..cc466fce17 100644 --- a/libsyclinterface/include/dpctl_sycl_queue_interface.h +++ b/libsyclinterface/include/dpctl_sycl_queue_interface.h @@ -294,6 +294,29 @@ DPCTLQueue_Memcpy(__dpctl_keep const DPCTLSyclQueueRef QRef, const void *Src, size_t Count); +/*! + * @brief C-API wrapper for ``sycl::queue::memcpy``. + * + * @param QRef An opaque pointer to the ``sycl::queue``. + * @param Dest An USM pointer to the destination memory. + * @param Src An USM pointer to the source memory. + * @param Count A number of bytes to copy. + * @param DepEvents A pointer to array of DPCTLSyclEventRef opaque + * pointers to dependent events. + * @param DepEventsCount A number of dependent events. + * @return An opaque pointer to the ``sycl::event`` returned by the + * ``sycl::queue::memcpy`` function. + * @ingroup QueueInterface + */ +DPCTL_API +__dpctl_give DPCTLSyclEventRef +DPCTLQueue_MemcpyWithEvents(__dpctl_keep const DPCTLSyclQueueRef QRef, + void *Dest, + const void *Src, + size_t Count, + __dpctl_keep const DPCTLSyclEventRef *DepEvents, + size_t DepEventsCount); + /*! * @brief C-API wrapper for ``sycl::queue::prefetch``. * diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp index 4903b888ff..60098ae933 100644 --- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp @@ -410,9 +410,12 @@ DPCTLQueue_SubmitNDRange(__dpctl_keep const DPCTLSyclKernelRef KRef, try { e = Queue->submit([&](handler &cgh) { // Depend on any event that was specified by the caller. - if (NDepEvents) - for (auto i = 0ul; i < NDepEvents; ++i) - cgh.depends_on(*unwrap(DepEvents[i])); + if (DepEvents) + for (auto i = 0ul; i < NDepEvents; ++i) { + auto ei = unwrap(DepEvents[i]); + if (ei) + cgh.depends_on(*ei); + } for (auto i = 0ul; i < NArgs; ++i) { // \todo add support for Sycl buffers @@ -485,6 +488,42 @@ DPCTLQueue_Memcpy(__dpctl_keep const DPCTLSyclQueueRef QRef, } } +__dpctl_give DPCTLSyclEventRef +DPCTLQueue_MemcpyWithEvents(__dpctl_keep const DPCTLSyclQueueRef QRef, + void *Dest, + const void *Src, + size_t Count, + const DPCTLSyclEventRef *DepEvents, + size_t DepEventsCount) +{ + event ev; + auto Q = unwrap(QRef); + if (Q) { + try { + ev = Q->submit([&](handler &cgh) { + if (DepEvents) + for (size_t i = 0; i < DepEventsCount; ++i) { + event *ei = unwrap(DepEvents[i]); + if (ei) + cgh.depends_on(*ei); + } + + cgh.memcpy(Dest, Src, Count); + }); + } catch (const std::exception &ex) { + error_handler(ex, __FILE__, __func__, __LINE__); + return nullptr; + } + } + else { + error_handler("QRef passed to memcpy was NULL.", __FILE__, __func__, + __LINE__); + return nullptr; + } + + return wrap(new event(ev)); +} + __dpctl_give DPCTLSyclEventRef DPCTLQueue_Prefetch(__dpctl_keep DPCTLSyclQueueRef QRef, const void *Ptr, diff --git a/libsyclinterface/tests/test_sycl_queue_interface.cpp b/libsyclinterface/tests/test_sycl_queue_interface.cpp index 8d23929d39..836a87379b 100644 --- a/libsyclinterface/tests/test_sycl_queue_interface.cpp +++ b/libsyclinterface/tests/test_sycl_queue_interface.cpp @@ -340,6 +340,10 @@ TEST(TestDPCTLSyclQueueInterface, CheckMemOpsZeroQRef) ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes)); ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE( + ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0)); + ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes)); ASSERT_FALSE(bool(ERef)); @@ -391,6 +395,10 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemOpsNullPtr) ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes)); ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE( + ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0)); + ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes)); if (ERef) { ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); @@ -450,6 +458,38 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset) delete[] host_arr; } +TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset2) +{ + DPCTLSyclUSMRef p = nullptr; + DPCTLSyclEventRef Memset_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; + uint8_t val = 42; + size_t nbytes = 256; + uint8_t *host_arr = new uint8_t[nbytes]; + + ASSERT_FALSE(host_arr == nullptr); + + ASSERT_NO_FATAL_FAILURE(p = DPCTLmalloc_device(nbytes, QRef)); + ASSERT_FALSE(p == nullptr); + + ASSERT_NO_FATAL_FAILURE( + Memset_ERef = DPCTLQueue_Memset(QRef, (void *)p, val, nbytes)); + + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Memset_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); + + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memset_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); + + ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); + + for (size_t i = 0; i < nbytes; ++i) { + ASSERT_TRUE(host_arr[i] == val); + } + delete[] host_arr; +} + TEST(TestDPCTLSyclQueueInterface, CheckFillNullQRef) { DPCTLSyclQueueRef QRef = nullptr; @@ -481,7 +521,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill8) { using T = uint8_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill8_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val = static_cast(0xB); size_t nelems = 256; T *host_arr = new T[nelems]; @@ -492,17 +533,15 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill8) ASSERT_NO_FATAL_FAILURE(p = DPCTLmalloc_device(nbytes, QRef)); ASSERT_FALSE(p == nullptr); - ASSERT_NO_FATAL_FAILURE(ERef = + ASSERT_NO_FATAL_FAILURE(Fill8_ERef = DPCTLQueue_Fill8(QRef, (void *)p, val, nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill8_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill8_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); @@ -517,7 +556,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill16) using T = uint16_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill16_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val = static_cast(0xAB); size_t nelems = 256; T *host_arr = new T[nelems]; @@ -529,16 +569,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill16) ASSERT_FALSE(p == nullptr); ASSERT_NO_FATAL_FAILURE( - ERef = DPCTLQueue_Fill16(QRef, (void *)p, val, nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + Fill16_ERef = DPCTLQueue_Fill16(QRef, (void *)p, val, nelems)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill16_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill16_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); @@ -553,7 +591,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill32) using T = uint32_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill32_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val = static_cast(0xABCD); size_t nelems = 256; T *host_arr = new T[nelems]; @@ -565,16 +604,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill32) ASSERT_FALSE(p == nullptr); ASSERT_NO_FATAL_FAILURE( - ERef = DPCTLQueue_Fill32(QRef, (void *)p, val, nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + Fill32_ERef = DPCTLQueue_Fill32(QRef, (void *)p, val, nelems)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill32_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill32_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); @@ -589,7 +626,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill64) using T = uint64_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill64_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val = static_cast(0xABCDEF73); size_t nelems = 256; T *host_arr = new T[nelems]; @@ -601,16 +639,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill64) ASSERT_FALSE(p == nullptr); ASSERT_NO_FATAL_FAILURE( - ERef = DPCTLQueue_Fill64(QRef, (void *)p, val, nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + Fill64_ERef = DPCTLQueue_Fill64(QRef, (void *)p, val, nelems)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill64_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill64_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); @@ -639,7 +675,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill128) using T = value128_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill128_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val{static_cast(0xABCDEF73), static_cast(0x3746AF05)}; size_t nelems = 256; T *host_arr = new T[nelems]; @@ -651,17 +688,15 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill128) ASSERT_FALSE(p == nullptr); ASSERT_NO_FATAL_FAILURE( - ERef = DPCTLQueue_Fill128(QRef, (void *)p, - reinterpret_cast(&val), nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + Fill128_ERef = DPCTLQueue_Fill128( + QRef, (void *)p, reinterpret_cast(&val), nelems)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill128_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill128_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));