diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 52d87d6818756..1f174bcf80049 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -149,6 +149,8 @@ pi_result _pi_event::start() { } isStarted_ = true; + // let observers know that the event is "submitted" + trigger_callback(get_execution_status()); return result; } @@ -195,6 +197,22 @@ pi_result _pi_event::record() { try { result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream)); + + result = cuda_piEventRetain(this); + try { + result = PI_CHECK_ERROR(cuLaunchHostFunc( + cuStream, + [](void *userData) { + pi_event event = reinterpret_cast(userData); + event->set_event_complete(); + cuda_piEventRelease(event); + }, + this)); + } catch (...) { + // If host function fails to enqueue we must release the event here + result = cuda_piEventRelease(this); + throw; + } } catch (pi_result error) { result = error; } @@ -215,6 +233,7 @@ pi_result _pi_event::wait() { if (is_native_event()) { try { retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_)); + isCompleted_ = true; } catch (pi_result error) { retErr = error; } @@ -226,30 +245,12 @@ pi_result _pi_event::wait() { retErr = PI_SUCCESS; } - return retErr; -} - -pi_event_status _pi_event::get_execution_status() const noexcept { + auto is_success = retErr == PI_SUCCESS; + auto status = is_success ? get_execution_status() : pi_int32(retErr); - if (!is_recorded()) { - return PI_EVENT_SUBMITTED; - } - - if (is_native_event()) { - // native event status - - auto status = cuEventQuery(get()); - if (status == CUDA_ERROR_NOT_READY) { - return PI_EVENT_RUNNING; - } else if (status != CUDA_SUCCESS) { - cl::sycl::detail::pi::die("Invalid CUDA event status"); - } - return PI_EVENT_COMPLETE; - } else { - // user event status + trigger_callback(status); - return is_completed() ? PI_EVENT_COMPLETE : PI_EVENT_RUNNING; - } + return retErr; } // iterates over the event wait list, returns correct pi_result error codes. @@ -2516,24 +2517,21 @@ pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name, switch (param_name) { case PI_EVENT_INFO_COMMAND_QUEUE: - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_queue()); + return getInfo(param_value_size, param_value, param_value_size_ret, + event->get_queue()); case PI_EVENT_INFO_COMMAND_TYPE: - return getInfo(param_value_size, param_value, - param_value_size_ret, - event->get_command_type()); + return getInfo(param_value_size, param_value, param_value_size_ret, + event->get_command_type()); case PI_EVENT_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, - param_value_size_ret, - event->get_reference_count()); + return getInfo(param_value_size, param_value, param_value_size_ret, + event->get_reference_count()); case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: { - return getInfo(param_value_size, param_value, - param_value_size_ret, - event->get_execution_status()); + return getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(event->get_execution_status())); } case PI_EVENT_INFO_CONTEXT: - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_context()); + return getInfo(param_value_size, param_value, param_value_size_ret, + event->get_context()); default: PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); } @@ -2568,13 +2566,21 @@ pi_result cuda_piEventGetProfilingInfo( return {}; } -pi_result cuda_piEventSetCallback( - pi_event event, pi_int32 command_exec_callback_type, - void (*pfn_notify)(pi_event event, pi_int32 event_command_status, - void *user_data), - void *user_data) { - cl::sycl::detail::pi::die("cuda_piEventSetCallback not implemented"); - return {}; +pi_result cuda_piEventSetCallback(pi_event event, + pi_int32 command_exec_callback_type, + pfn_notify notify, void *user_data) { + + assert(event); + assert(notify); + assert(command_exec_callback_type == PI_EVENT_SUBMITTED || + command_exec_callback_type == PI_EVENT_RUNNING || + command_exec_callback_type == PI_EVENT_COMPLETE); + event_callback callback(pi_event_status(command_exec_callback_type), notify, + user_data); + + event->set_event_callback(callback); + + return PI_SUCCESS; } pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) { @@ -2587,7 +2593,7 @@ pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) { } if (execution_status == PI_EVENT_COMPLETE) { - return event->set_user_event_complete(); + return event->set_event_complete(); } else if (execution_status < 0) { // TODO: A negative integer value causes all enqueued commands that wait // on this user event to be terminated. diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index fe0fda38bd3c7..ba70aa6aaee95 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -235,6 +235,39 @@ struct _pi_queue { pi_uint32 get_reference_count() const noexcept { return refCount_; } }; +typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus, + void *userData); + +class event_callback { +public: + void trigger_callback(pi_event event, pi_int32 currentEventStatus) const { + + auto validParameters = callback_ && event; + + // As a pi_event_status value approaches 0, it gets closer to completion. + // If the calling pi_event's status is less than or equal to the event + // status the user is interested in, invoke the callback anyway. The event + // will have passed through that state anyway. + auto validStatus = currentEventStatus <= observedEventStatus_; + + if (validParameters && validStatus) { + + callback_(event, currentEventStatus, userData_); + } + } + + event_callback(pi_event_status status, pfn_notify callback, void *userData) + : observedEventStatus_{status}, callback_{callback}, userData_{userData} { + } + + pi_event_status get_status() const noexcept { return observedEventStatus_; } + +private: + pi_event_status observedEventStatus_; + pfn_notify callback_; + void *userData_; +}; + class _pi_event { public: using native_type = CUevent; @@ -247,18 +280,39 @@ class _pi_event { native_type get() const noexcept { return evEnd_; }; - pi_result set_user_event_complete() noexcept { + pi_result set_event_complete() noexcept { if (isCompleted_) { return PI_INVALID_OPERATION; } - if (is_user_event()) { - isRecorded_ = true; - isCompleted_ = true; - return PI_SUCCESS; + isRecorded_ = true; + isCompleted_ = true; + + trigger_callback(get_execution_status()); + + return PI_SUCCESS; + } + + void trigger_callback(pi_int32 status) { + + std::vector callbacks; + + // Here we move all callbacks into local variable before we call them. + // This is a defensive maneuver; if any of the callbacks attempt to + // add additional callbacks, we will end up in a bad spot. Our mutex + // will be locked twice and the vector will be modified as it is being + // iterated over! By moving everything locally, we can call all of these + // callbacks and let them modify the original vector without much worry. + + { + std::lock_guard lock(mutex_); + event_callbacks_.swap(callbacks); + } + + for (auto &event_callback : callbacks) { + event_callback.trigger_callback(this, status); } - return PI_INVALID_EVENT; } pi_queue get_queue() const noexcept { return queue_; } @@ -273,7 +327,27 @@ class _pi_event { bool is_started() const noexcept { return isStarted_; } - pi_event_status get_execution_status() const noexcept; + pi_int32 get_execution_status() const noexcept { + + if (!is_recorded()) { + return PI_EVENT_SUBMITTED; + } + + if (!is_completed()) { + return PI_EVENT_RUNNING; + } + return PI_EVENT_COMPLETE; + } + + void set_event_callback(const event_callback &callback) { + auto current_status = get_execution_status(); + if (current_status <= callback.get_status()) { + callback.trigger_callback(this, current_status); + } else { + std::lock_guard lock(mutex_); + event_callbacks_.emplace_back(callback); + } + } pi_context get_context() const noexcept { return context_; }; @@ -343,6 +417,12 @@ class _pi_event { pi_context context_; // pi_context associated with the event. If this is a // native event, this will be the same context associated // with the queue_ member. + + std::mutex mutex_; // Protect access to event_callbacks_. TODO: There might be + // a lock-free data structure we can use here. + std::vector + event_callbacks_; // Callbacks that can be triggered when an event's state + // changes. }; struct _pi_program { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 965285ce75ddc..a60de37570689 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -161,7 +161,7 @@ void EventCompletionClbk(RT::PiEvent, pi_int32, void *data) { EventImplPtr *Event = (reinterpret_cast(data)); RT::PiEvent &EventHandle = (*Event)->getHandleRef(); const detail::plugin &Plugin = (*Event)->getPlugin(); - Plugin.call(EventHandle, CL_COMPLETE); + Plugin.call(EventHandle, PI_EVENT_COMPLETE); delete (Event); } @@ -169,37 +169,42 @@ void EventCompletionClbk(RT::PiEvent, pi_int32, void *data) { std::vector Command::prepareEvents(ContextImplPtr Context) { std::vector Result; std::vector GlueEvents; - for (EventImplPtr &Event : MDepsEvents) { + for (EventImplPtr &DepEvent : MDepsEvents) { // Async work is not supported for host device. - if (Event->is_host()) { - Event->waitInternal(); + if (DepEvent->is_host()) { + DepEvent->waitInternal(); continue; } // The event handle can be null in case of, for example, alloca command, // which is currently synchrounious, so don't generate OpenCL event. - if (Event->getHandleRef() == nullptr) { + if (DepEvent->getHandleRef() == nullptr) { continue; } - ContextImplPtr EventContext = Event->getContextImpl(); - const detail::plugin &Plugin = Event->getPlugin(); - // If contexts don't match - connect them using user event - if (EventContext != Context && !Context->is_host()) { + ContextImplPtr DepEventContext = DepEvent->getContextImpl(); + // If contexts don't match - connect them using user event + if (DepEventContext != Context && !Context->is_host()) { EventImplPtr GlueEvent(new detail::event_impl()); GlueEvent->setContextImpl(Context); + EventImplPtr *GlueEventCopy = + new EventImplPtr(GlueEvent); // To increase the reference count by 1. + RT::PiEvent &GlueEventHandle = GlueEvent->getHandleRef(); + auto Plugin = Context->getPlugin(); + auto DepPlugin = DepEventContext->getPlugin(); + // Add an event on the current context that + // is triggered when the DepEvent is complete Plugin.call(Context->getHandleRef(), &GlueEventHandle); - EventImplPtr *GlueEventCopy = - new EventImplPtr(GlueEvent); // To increase the reference count by 1. - Plugin.call( - Event->getHandleRef(), CL_COMPLETE, EventCompletionClbk, + + DepPlugin.call( + DepEvent->getHandleRef(), PI_EVENT_COMPLETE, EventCompletionClbk, /*void *data=*/(GlueEventCopy)); GlueEvents.push_back(GlueEvent); Result.push_back(std::move(GlueEvent)); continue; } - Result.push_back(Event); + Result.push_back(DepEvent); } MDepsEvents.insert(MDepsEvents.end(), GlueEvents.begin(), GlueEvents.end()); return Result; diff --git a/sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp b/sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp index f7caf5d070d2d..c84a06bbbace4 100644 --- a/sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp +++ b/sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp @@ -4,9 +4,6 @@ // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -// TODO: pi_die: cuda_piEventSetCallback not implemented -// XFAIL: cuda - //==---------- buffer_dev_to_dev.cpp - SYCL buffer basic test --------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/sycl/test/scheduler/DataMovement.cpp b/sycl/test/scheduler/DataMovement.cpp index 1de310571c824..f1812bf4dbf29 100644 --- a/sycl/test/scheduler/DataMovement.cpp +++ b/sycl/test/scheduler/DataMovement.cpp @@ -1,7 +1,6 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -I %sycl_source_dir %s -o %t.out // RUN: %t.out // -// XFAIL: cuda //==-------------------------- DataMovement.cpp ----------------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/sycl/test/scheduler/MultipleDevices.cpp b/sycl/test/scheduler/MultipleDevices.cpp index d27923929871a..c031f3c8375c1 100644 --- a/sycl/test/scheduler/MultipleDevices.cpp +++ b/sycl/test/scheduler/MultipleDevices.cpp @@ -1,9 +1,6 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -I %sycl_source_dir %s -o %t.out // RUN: %t.out -// TODO: pi_die: cuda_piEventSetCallback not implemented -// XFAIL: cuda - //===- MultipleDevices.cpp - Test checking multi-device execution --------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/sycl/unittests/pi/EventTest.cpp b/sycl/unittests/pi/EventTest.cpp index 4f48cc688a74b..d3c88b9db97a7 100644 --- a/sycl/unittests/pi/EventTest.cpp +++ b/sycl/unittests/pi/EventTest.cpp @@ -89,6 +89,88 @@ TEST_F(DISABLED_EventTest, PICreateEvent) { PI_SUCCESS); } +constexpr size_t event_type_count = 3; +static bool triggered_flag[event_type_count] = {false, false, false}; + +struct callback_user_data { + pi_int32 event_type; + int index; +}; + +void EventCallback(pi_event event, pi_int32 status, void *data) { + ASSERT_NE(data, nullptr); + + callback_user_data *pdata = static_cast(data); + +#ifndef NDEBUG + printf("\tEvent callback %d of type %d triggered\n", pdata->index, + pdata->event_type); +#endif + + triggered_flag[pdata->index] = true; +} + +TEST_F(DISABLED_EventTest, piEventSetCallback) { + + pi_int32 event_callback_types[event_type_count] = { + PI_EVENT_SUBMITTED, PI_EVENT_RUNNING, PI_EVENT_COMPLETE}; + + callback_user_data user_data[event_type_count]; + + // gate event lets us register callbacks before letting the enqueued work be + // executed. + pi_event gateEvent; + ASSERT_EQ((Plugins[0].call_nocheck( + _context, &gateEvent)), + PI_SUCCESS); + + constexpr const size_t dataCount = 1000u; + std::vector data(dataCount); + auto size_in_bytes = data.size() * sizeof(int); + + pi_mem memObj; + ASSERT_EQ( + (Plugins[0].call_nocheck( + _context, PI_MEM_FLAGS_ACCESS_RW, size_in_bytes, nullptr, &memObj)), + PI_SUCCESS); + + pi_event syncEvent; + ASSERT_EQ( + (Plugins[0].call_nocheck( + _queue, memObj, false, 0, size_in_bytes, data.data(), 1, &gateEvent, + &syncEvent)), + PI_SUCCESS); + + for (size_t i = 0; i < event_type_count; i++) { + user_data[i].event_type = event_callback_types[i]; + user_data[i].index = i; + ASSERT_EQ( + (Plugins[0].call_nocheck( + syncEvent, event_callback_types[i], EventCallback, user_data + i)), + PI_SUCCESS); + } + + ASSERT_EQ((Plugins[0].call_nocheck( + gateEvent, PI_EVENT_COMPLETE)), + PI_SUCCESS); + ASSERT_EQ( + (Plugins[0].call_nocheck(1, &syncEvent)), + PI_SUCCESS); + ASSERT_EQ((Plugins[0].call_nocheck(_queue)), + PI_SUCCESS); + + for (size_t k = 0; k < event_type_count; ++k) { + EXPECT_TRUE(triggered_flag[k]); + } + + ASSERT_EQ( + (Plugins[0].call_nocheck(gateEvent)), + PI_SUCCESS); + ASSERT_EQ( + (Plugins[0].call_nocheck(syncEvent)), + PI_SUCCESS); +} + TEST_F(DISABLED_EventTest, piEventGetInfo) { pi_event foo;