diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 52d87d6818756..1f174bcf80049 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -149,6 +149,8 @@ pi_result _pi_event::start() {
   }
 
   isStarted_ = true;
+  // let observers know that the event is "submitted"
+  trigger_callback(get_execution_status());
   return result;
 }
 
@@ -195,6 +197,22 @@ pi_result _pi_event::record() {
 
     try {
       result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
+
+      result = cuda_piEventRetain(this);
+      try {
+        result = PI_CHECK_ERROR(cuLaunchHostFunc(
+            cuStream,
+            [](void *userData) {
+              pi_event event = reinterpret_cast<pi_event>(userData);
+              event->set_event_complete();
+              cuda_piEventRelease(event);
+            },
+            this));
+      } catch (...) {
+        // If host function fails to enqueue we must release the event here
+        result = cuda_piEventRelease(this);
+        throw;
+      }
     } catch (pi_result error) {
       result = error;
     }
@@ -215,6 +233,7 @@ pi_result _pi_event::wait() {
   if (is_native_event()) {
     try {
       retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
+      isCompleted_ = true;
     } catch (pi_result error) {
       retErr = error;
     }
@@ -226,30 +245,12 @@ pi_result _pi_event::wait() {
     retErr = PI_SUCCESS;
   }
 
-  return retErr;
-}
-
-pi_event_status _pi_event::get_execution_status() const noexcept {
+  auto is_success = retErr == PI_SUCCESS;
+  auto status = is_success ? get_execution_status() : pi_int32(retErr);
 
-  if (!is_recorded()) {
-    return PI_EVENT_SUBMITTED;
-  }
-
-  if (is_native_event()) {
-    // native event status
-
-    auto status = cuEventQuery(get());
-    if (status == CUDA_ERROR_NOT_READY) {
-      return PI_EVENT_RUNNING;
-    } else if (status != CUDA_SUCCESS) {
-      cl::sycl::detail::pi::die("Invalid CUDA event status");
-    }
-    return PI_EVENT_COMPLETE;
-  } else {
-    // user event status
+  trigger_callback(status);
 
-    return is_completed() ? PI_EVENT_COMPLETE : PI_EVENT_RUNNING;
-  }
+  return retErr;
 }
 
 // iterates over the event wait list, returns correct pi_result error codes.
@@ -2516,24 +2517,21 @@ pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name,
 
   switch (param_name) {
   case PI_EVENT_INFO_COMMAND_QUEUE:
-    return getInfo<pi_queue>(param_value_size, param_value,
-                             param_value_size_ret, event->get_queue());
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   event->get_queue());
   case PI_EVENT_INFO_COMMAND_TYPE:
-    return getInfo<pi_command_type>(param_value_size, param_value,
-                                    param_value_size_ret,
-                                    event->get_command_type());
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   event->get_command_type());
   case PI_EVENT_INFO_REFERENCE_COUNT:
-    return getInfo<pi_uint32>(param_value_size, param_value,
-                              param_value_size_ret,
-                              event->get_reference_count());
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   event->get_reference_count());
   case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: {
-    return getInfo<pi_event_status>(param_value_size, param_value,
-                                    param_value_size_ret,
-                                    event->get_execution_status());
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   static_cast<pi_event_status>(event->get_execution_status()));
   }
   case PI_EVENT_INFO_CONTEXT:
-    return getInfo<pi_context>(param_value_size, param_value,
-                               param_value_size_ret, event->get_context());
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   event->get_context());
   default:
     PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
   }
@@ -2568,13 +2566,21 @@ pi_result cuda_piEventGetProfilingInfo(
   return {};
 }
 
-pi_result cuda_piEventSetCallback(
-    pi_event event, pi_int32 command_exec_callback_type,
-    void (*pfn_notify)(pi_event event, pi_int32 event_command_status,
-                       void *user_data),
-    void *user_data) {
-  cl::sycl::detail::pi::die("cuda_piEventSetCallback not implemented");
-  return {};
+pi_result cuda_piEventSetCallback(pi_event event,
+                                  pi_int32 command_exec_callback_type,
+                                  pfn_notify notify, void *user_data) {
+
+  assert(event);
+  assert(notify);
+  assert(command_exec_callback_type == PI_EVENT_SUBMITTED ||
+         command_exec_callback_type == PI_EVENT_RUNNING ||
+         command_exec_callback_type == PI_EVENT_COMPLETE);
+  event_callback callback(pi_event_status(command_exec_callback_type), notify,
+                          user_data);
+
+  event->set_event_callback(callback);
+
+  return PI_SUCCESS;
 }
 
 pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) {
@@ -2587,7 +2593,7 @@ pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) {
   }
 
   if (execution_status == PI_EVENT_COMPLETE) {
-    return event->set_user_event_complete();
+    return event->set_event_complete();
   } else if (execution_status < 0) {
     // TODO: A negative integer value causes all enqueued commands that wait
     // on this user event to be terminated.
diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
index fe0fda38bd3c7..ba70aa6aaee95 100644
--- a/sycl/plugins/cuda/pi_cuda.hpp
+++ b/sycl/plugins/cuda/pi_cuda.hpp
@@ -235,6 +235,39 @@ struct _pi_queue {
   pi_uint32 get_reference_count() const noexcept { return refCount_; }
 };
 
+typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
+                           void *userData);
+
+class event_callback {
+public:
+  void trigger_callback(pi_event event, pi_int32 currentEventStatus) const {
+
+    auto validParameters = callback_ && event;
+
+    // As a pi_event_status value approaches 0, it gets closer to completion.
+    // If the calling pi_event's status is less than or equal to the event
+    // status the user is interested in, invoke the callback anyway. The event
+    // will have passed through that state anyway.
+    auto validStatus = currentEventStatus <= observedEventStatus_;
+
+    if (validParameters && validStatus) {
+
+      callback_(event, currentEventStatus, userData_);
+    }
+  }
+
+  event_callback(pi_event_status status, pfn_notify callback, void *userData)
+      : observedEventStatus_{status}, callback_{callback}, userData_{userData} {
+  }
+
+  pi_event_status get_status() const noexcept { return observedEventStatus_; }
+
+private:
+  pi_event_status observedEventStatus_;
+  pfn_notify callback_;
+  void *userData_;
+};
+
 class _pi_event {
 public:
   using native_type = CUevent;
@@ -247,18 +280,39 @@ class _pi_event {
 
   native_type get() const noexcept { return evEnd_; };
 
-  pi_result set_user_event_complete() noexcept {
+  pi_result set_event_complete() noexcept {
 
     if (isCompleted_) {
       return PI_INVALID_OPERATION;
     }
 
-    if (is_user_event()) {
-      isRecorded_ = true;
-      isCompleted_ = true;
-      return PI_SUCCESS;
+    isRecorded_ = true;
+    isCompleted_ = true;
+
+    trigger_callback(get_execution_status());
+
+    return PI_SUCCESS;
+  }
+
+  void trigger_callback(pi_int32 status) {
+
+    std::vector<event_callback> callbacks;
+
+    // Here we move all callbacks into local variable before we call them.
+    // This is a defensive maneuver; if any of the callbacks attempt to
+    // add additional callbacks, we will end up in a bad spot. Our mutex
+    // will be locked twice and the vector will be modified as it is being
+    // iterated over! By moving everything locally, we can call all of these
+    // callbacks and let them modify the original vector without much worry.
+
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      event_callbacks_.swap(callbacks);
+    }
+
+    for (auto &event_callback : callbacks) {
+      event_callback.trigger_callback(this, status);
     }
-    return PI_INVALID_EVENT;
   }
 
   pi_queue get_queue() const noexcept { return queue_; }
@@ -273,7 +327,27 @@ class _pi_event {
 
   bool is_started() const noexcept { return isStarted_; }
 
-  pi_event_status get_execution_status() const noexcept;
+  pi_int32 get_execution_status() const noexcept {
+
+    if (!is_recorded()) {
+      return PI_EVENT_SUBMITTED;
+    }
+
+    if (!is_completed()) {
+      return PI_EVENT_RUNNING;
+    }
+    return PI_EVENT_COMPLETE;
+  }
+
+  void set_event_callback(const event_callback &callback) {
+    auto current_status = get_execution_status();
+    if (current_status <= callback.get_status()) {
+      callback.trigger_callback(this, current_status);
+    } else {
+      std::lock_guard<std::mutex> lock(mutex_);
+      event_callbacks_.emplace_back(callback);
+    }
+  }
 
   pi_context get_context() const noexcept { return context_; };
 
@@ -343,6 +417,12 @@ class _pi_event {
   pi_context context_; // pi_context associated with the event. If this is a
                        // native event, this will be the same context associated
                        // with the queue_ member.
+
+  std::mutex mutex_; // Protect access to event_callbacks_. TODO: There might be
+                     // a lock-free data structure we can use here.
+  std::vector<event_callback>
+      event_callbacks_; // Callbacks that can be triggered when an event's state
+                        // changes.
 };
 
 struct _pi_program {
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 965285ce75ddc..a60de37570689 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -161,7 +161,7 @@ void EventCompletionClbk(RT::PiEvent, pi_int32, void *data) {
   EventImplPtr *Event = (reinterpret_cast<EventImplPtr *>(data));
   RT::PiEvent &EventHandle = (*Event)->getHandleRef();
   const detail::plugin &Plugin = (*Event)->getPlugin();
-  Plugin.call<PiApiKind::piEventSetStatus>(EventHandle, CL_COMPLETE);
+  Plugin.call<PiApiKind::piEventSetStatus>(EventHandle, PI_EVENT_COMPLETE);
   delete (Event);
 }
 
@@ -169,37 +169,42 @@ void EventCompletionClbk(RT::PiEvent, pi_int32, void *data) {
 std::vector<EventImplPtr> Command::prepareEvents(ContextImplPtr Context) {
   std::vector<EventImplPtr> Result;
   std::vector<EventImplPtr> GlueEvents;
-  for (EventImplPtr &Event : MDepsEvents) {
+  for (EventImplPtr &DepEvent : MDepsEvents) {
     // Async work is not supported for host device.
-    if (Event->is_host()) {
-      Event->waitInternal();
+    if (DepEvent->is_host()) {
+      DepEvent->waitInternal();
       continue;
     }
     // The event handle can be null in case of, for example, alloca command,
     // which is currently synchrounious, so don't generate OpenCL event.
-    if (Event->getHandleRef() == nullptr) {
+    if (DepEvent->getHandleRef() == nullptr) {
       continue;
     }
-    ContextImplPtr EventContext = Event->getContextImpl();
-    const detail::plugin &Plugin = Event->getPlugin();
-    // If contexts don't match - connect them using user event
-    if (EventContext != Context && !Context->is_host()) {
+    ContextImplPtr DepEventContext = DepEvent->getContextImpl();
 
+    // If contexts don't match - connect them using user event
+    if (DepEventContext != Context && !Context->is_host()) {
       EventImplPtr GlueEvent(new detail::event_impl());
       GlueEvent->setContextImpl(Context);
+      EventImplPtr *GlueEventCopy =
+          new EventImplPtr(GlueEvent); // To increase the reference count by 1.
+
       RT::PiEvent &GlueEventHandle = GlueEvent->getHandleRef();
+      auto Plugin = Context->getPlugin();
+      auto DepPlugin = DepEventContext->getPlugin();
+      // Add an event on the current context that
+      // is triggered when the DepEvent is complete
       Plugin.call<PiApiKind::piEventCreate>(Context->getHandleRef(),
                                             &GlueEventHandle);
-      EventImplPtr *GlueEventCopy =
-          new EventImplPtr(GlueEvent); // To increase the reference count by 1.
-      Plugin.call<PiApiKind::piEventSetCallback>(
-          Event->getHandleRef(), CL_COMPLETE, EventCompletionClbk,
+
+      DepPlugin.call<PiApiKind::piEventSetCallback>(
+          DepEvent->getHandleRef(), PI_EVENT_COMPLETE, EventCompletionClbk,
           /*void *data=*/(GlueEventCopy));
       GlueEvents.push_back(GlueEvent);
       Result.push_back(std::move(GlueEvent));
       continue;
     }
-    Result.push_back(Event);
+    Result.push_back(DepEvent);
   }
   MDepsEvents.insert(MDepsEvents.end(), GlueEvents.begin(), GlueEvents.end());
   return Result;
diff --git a/sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp b/sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp
index f7caf5d070d2d..c84a06bbbace4 100644
--- a/sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp
+++ b/sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp
@@ -4,9 +4,6 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
-// TODO: pi_die: cuda_piEventSetCallback not implemented
-// XFAIL: cuda
-
 //==---------- buffer_dev_to_dev.cpp - SYCL buffer basic test --------------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/test/scheduler/DataMovement.cpp b/sycl/test/scheduler/DataMovement.cpp
index 1de310571c824..f1812bf4dbf29 100644
--- a/sycl/test/scheduler/DataMovement.cpp
+++ b/sycl/test/scheduler/DataMovement.cpp
@@ -1,7 +1,6 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -I %sycl_source_dir %s -o %t.out
 // RUN: %t.out
 //
-// XFAIL: cuda
 //==-------------------------- DataMovement.cpp ----------------------------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/test/scheduler/MultipleDevices.cpp b/sycl/test/scheduler/MultipleDevices.cpp
index d27923929871a..c031f3c8375c1 100644
--- a/sycl/test/scheduler/MultipleDevices.cpp
+++ b/sycl/test/scheduler/MultipleDevices.cpp
@@ -1,9 +1,6 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -I %sycl_source_dir %s -o %t.out
 // RUN: %t.out
 
-// TODO: pi_die: cuda_piEventSetCallback not implemented
-// XFAIL: cuda
-
 //===- MultipleDevices.cpp - Test checking multi-device execution --------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/unittests/pi/EventTest.cpp b/sycl/unittests/pi/EventTest.cpp
index 4f48cc688a74b..d3c88b9db97a7 100644
--- a/sycl/unittests/pi/EventTest.cpp
+++ b/sycl/unittests/pi/EventTest.cpp
@@ -89,6 +89,88 @@ TEST_F(DISABLED_EventTest, PICreateEvent) {
             PI_SUCCESS);
 }
 
+constexpr size_t event_type_count = 3;
+static bool triggered_flag[event_type_count] = {false, false, false};
+
+struct callback_user_data {
+  pi_int32 event_type;
+  int index;
+};
+
+void EventCallback(pi_event event, pi_int32 status, void *data) {
+  ASSERT_NE(data, nullptr);
+
+  callback_user_data *pdata = static_cast<callback_user_data *>(data);
+
+#ifndef NDEBUG
+  printf("\tEvent callback %d of type %d triggered\n", pdata->index,
+         pdata->event_type);
+#endif
+
+  triggered_flag[pdata->index] = true;
+}
+
+TEST_F(DISABLED_EventTest, piEventSetCallback) {
+
+  pi_int32 event_callback_types[event_type_count] = {
+      PI_EVENT_SUBMITTED, PI_EVENT_RUNNING, PI_EVENT_COMPLETE};
+
+  callback_user_data user_data[event_type_count];
+
+  // gate event lets us register callbacks before letting the enqueued work be
+  // executed.
+  pi_event gateEvent;
+  ASSERT_EQ((Plugins[0].call_nocheck<detail::PiApiKind::piEventCreate>(
+                _context, &gateEvent)),
+            PI_SUCCESS);
+
+  constexpr const size_t dataCount = 1000u;
+  std::vector<int> data(dataCount);
+  auto size_in_bytes = data.size() * sizeof(int);
+
+  pi_mem memObj;
+  ASSERT_EQ(
+      (Plugins[0].call_nocheck<detail::PiApiKind::piMemBufferCreate>(
+          _context, PI_MEM_FLAGS_ACCESS_RW, size_in_bytes, nullptr, &memObj)),
+      PI_SUCCESS);
+
+  pi_event syncEvent;
+  ASSERT_EQ(
+      (Plugins[0].call_nocheck<detail::PiApiKind::piEnqueueMemBufferWrite>(
+          _queue, memObj, false, 0, size_in_bytes, data.data(), 1, &gateEvent,
+          &syncEvent)),
+      PI_SUCCESS);
+
+  for (size_t i = 0; i < event_type_count; i++) {
+    user_data[i].event_type = event_callback_types[i];
+    user_data[i].index = i;
+    ASSERT_EQ(
+        (Plugins[0].call_nocheck<detail::PiApiKind::piEventSetCallback>(
+            syncEvent, event_callback_types[i], EventCallback, user_data + i)),
+        PI_SUCCESS);
+  }
+
+  ASSERT_EQ((Plugins[0].call_nocheck<detail::PiApiKind::piEventSetStatus>(
+                gateEvent, PI_EVENT_COMPLETE)),
+            PI_SUCCESS);
+  ASSERT_EQ(
+      (Plugins[0].call_nocheck<detail::PiApiKind::piEventsWait>(1, &syncEvent)),
+      PI_SUCCESS);
+  ASSERT_EQ((Plugins[0].call_nocheck<detail::PiApiKind::piQueueFinish>(_queue)),
+            PI_SUCCESS);
+
+  for (size_t k = 0; k < event_type_count; ++k) {
+    EXPECT_TRUE(triggered_flag[k]);
+  }
+
+  ASSERT_EQ(
+      (Plugins[0].call_nocheck<detail::PiApiKind::piEventRelease>(gateEvent)),
+      PI_SUCCESS);
+  ASSERT_EQ(
+      (Plugins[0].call_nocheck<detail::PiApiKind::piEventRelease>(syncEvent)),
+      PI_SUCCESS);
+}
+
 TEST_F(DISABLED_EventTest, piEventGetInfo) {
 
   pi_event foo;