From 1ef4cd682f3a6f0ad8e5a6cf8958db30454eb2ba Mon Sep 17 00:00:00 2001
From: Ruyman Reyes <ruyman@codeplay.com>
Date: Wed, 20 May 2020 17:43:57 +0000
Subject: [PATCH 1/2] [SYCL][CUDA] Remove pi Event Callback implementation

Since introduction of host tasks in #1471,
`piEventCallback` and related functionality is not required
by the SYCL-RT.
Removing the implementation of this behaviour from the CUDA
backend simplifies the submission of operations to streams and
overall increases performance.

Signed-off-by: Ruyman Reyes <ruyman@codeplay.com>
---
 sycl/plugins/cuda/pi_cuda.cpp | 154 +++++++---------------------------
 sycl/plugins/cuda/pi_cuda.hpp | 101 ++--------------------
 2 files changed, 36 insertions(+), 219 deletions(-)

diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 8a4d9540334a4..d48d8ed6501b9 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -276,13 +276,15 @@ _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue)
       isStarted_{false}, evEnd_{nullptr}, evStart_{nullptr}, evQueued_{nullptr},
       queue_{queue}, context_{context} {
 
-  if (is_native_event()) {
+  if (type != PI_COMMAND_TYPE_USER) {
     PI_CHECK_ERROR(cuEventCreate(&evEnd_, CU_EVENT_DEFAULT));
 
     if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
       PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
       PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
     }
+  } else {
+    cl::sycl::detail::pi::die("User-defined events not implemented");
   }
 
   if (queue_ != nullptr) {
@@ -303,7 +305,7 @@ pi_result _pi_event::start() {
   pi_result result;
 
   try {
-    if (is_native_event() && queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
+    if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
       // NOTE: This relies on the default stream to be unused.
       result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0));
       result = PI_CHECK_ERROR(cuEventRecord(evStart_, queue_->get()));
@@ -313,8 +315,6 @@ pi_result _pi_event::start() {
   }
 
   isStarted_ = true;
-  // let observers know that the event is "submitted"
-  trigger_callback(get_execution_status());
   return result;
 }
 
@@ -351,37 +351,16 @@ pi_result _pi_event::record() {
 
   pi_result result = PI_INVALID_OPERATION;
 
-  if (is_native_event()) {
-
-    if (!queue_) {
-      return PI_INVALID_QUEUE;
-    }
+  if (!queue_) {
+    return PI_INVALID_QUEUE;
+  }
 
-    CUstream cuStream = queue_->get();
+  CUstream cuStream = queue_->get();
 
-    try {
-      result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
-
-      result = cuda_piEventRetain(this);
-      try {
-        result = PI_CHECK_ERROR(cuLaunchHostFunc(
-            cuStream,
-            [](void *userData) {
-              pi_event event = reinterpret_cast<pi_event>(userData);
-              event->set_event_complete();
-              cuda_piEventRelease(event);
-            },
-            this));
-      } catch (...) {
-        // If host function fails to enqueue we must release the event here
-        result = cuda_piEventRelease(this);
-        throw;
-      }
-    } catch (pi_result error) {
-      result = error;
-    }
-  } else {
-    result = PI_SUCCESS;
+  try {
+    result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
+  } catch (pi_result error) {
+    result = error;
   }
 
   if (result == PI_SUCCESS) {
@@ -392,65 +371,23 @@ pi_result _pi_event::record() {
 }
 
 pi_result _pi_event::wait() {
-
   pi_result retErr;
-  if (is_native_event()) {
-    try {
-      retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
-      isCompleted_ = true;
-    } catch (pi_result error) {
-      retErr = error;
-    }
-  } else {
-
-    while (!is_completed()) {
-      // wait for user event to complete
-    }
-    retErr = PI_SUCCESS;
+  try {
+    retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
+    isCompleted_ = true;
+  } catch (pi_result error) {
+    retErr = error;
   }
 
-  auto is_success = retErr == PI_SUCCESS;
-  auto status = is_success ? get_execution_status() : pi_int32(retErr);
-
-  trigger_callback(status);
-
   return retErr;
 }
 
 // makes all future work submitted to queue wait for all work captured in event.
 pi_result enqueueEventWait(pi_queue queue, pi_event event) {
-  if (event->is_native_event()) {
-
-    // for native events, the cuStreamWaitEvent call is used.
-    // This makes all future work submitted to stream wait for all
-    // work captured in event.
-
-    return PI_CHECK_ERROR(cuStreamWaitEvent(queue->get(), event->get(), 0));
-
-  } else {
-
-    // for user events, we enqueue a callback. When invoked, the
-    // callback will block until the user event is marked as
-    // completed.
-
-    static auto user_wait_func = [](void *user_data) {
-      // The host function must not make any CUDA API calls.
-      auto event = static_cast<pi_event>(user_data);
-
-      // busy wait for user event to complete
-      event->wait();
-
-      // this function does not need the event to be kept alive
-      // anymore
-      cuda_piEventRelease(event);
-    };
-
-    // retain event to ensure it is still alive when the
-    // user_wait_func callback is invoked
-    cuda_piEventRetain(event);
-
-    return PI_CHECK_ERROR(cuLaunchHostFunc(queue->get(), user_wait_func, event));
-  }
+  // for native events, the cuStreamWaitEvent call is used.
+  // This makes all future work submitted to stream wait for all
+  // work captured in event.
+  return PI_CHECK_ERROR(cuStreamWaitEvent(queue->get(), event->get(), 0));
 }
 
 _pi_program::_pi_program(pi_context ctxt)
@@ -2763,37 +2700,13 @@ pi_result cuda_piEventSetCallback(pi_event event,
                                   pi_int32 command_exec_callback_type,
                                   pfn_notify notify, void *user_data) {
 
-  assert(event);
-  assert(notify);
-  assert(command_exec_callback_type == PI_EVENT_SUBMITTED ||
-         command_exec_callback_type == PI_EVENT_RUNNING ||
-         command_exec_callback_type == PI_EVENT_COMPLETE);
-  event_callback callback(pi_event_status(command_exec_callback_type), notify,
-                          user_data);
-
-  event->set_event_callback(callback);
-
+  cl::sycl::detail::pi::die("Event Callback not implemented");
   return PI_SUCCESS;
 }
 
 pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) {
 
-  assert(execution_status >= PI_EVENT_COMPLETE &&
-         execution_status <= PI_EVENT_QUEUED);
-
-  if (!event || event->is_native_event()) {
-    return PI_INVALID_EVENT;
-  }
-
-  if (execution_status == PI_EVENT_COMPLETE) {
-    return event->set_event_complete();
-  } else if (execution_status < 0) {
-    // TODO: A negative integer value causes all enqueued commands that wait
-    // on this user event to be terminated.
-    cl::sycl::detail::pi::die("cuda_piEventSetStatus support for negative execution_status not "
-                              "implemented.");
-  }
-
+  cl::sycl::detail::pi::die("Event Set Status not implemented");
   return PI_INVALID_VALUE;
 }
 
@@ -2821,19 +2734,13 @@ pi_result cuda_piEventRelease(pi_event event) {
   if (event->decrement_reference_count() == 0) {
     std::unique_ptr<_pi_event> event_ptr{event};
     pi_result result = PI_INVALID_EVENT;
-
-    if (event->is_native_event()) {
-      try {
-        ScopedContext active(event->get_context());
-        auto cuEvent = event->get();
-        result = PI_CHECK_ERROR(cuEventDestroy(cuEvent));
-      } catch (...) {
-        result = PI_OUT_OF_RESOURCES;
-      }
-    } else {
-      result = PI_SUCCESS;
+    try {
+      ScopedContext active(event->get_context());
+      auto cuEvent = event->get();
+      result = PI_CHECK_ERROR(cuEventDestroy(cuEvent));
+    } catch (...) {
+      result = PI_OUT_OF_RESOURCES;
     }
-
     return result;
   }
 
@@ -2888,9 +2795,6 @@ pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
 /// \return PI_SUCCESS on success. PI_INVALID_EVENT if given a user event.
 pi_result cuda_piextEventGetNativeHandle(pi_event event,
                                          pi_native_handle *nativeHandle) {
-  if (event->is_user_event()) {
-    return PI_INVALID_EVENT;
-  }
   *nativeHandle = reinterpret_cast<pi_native_handle>(event->get());
   return PI_SUCCESS;
 }
diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
index 382ef5ed85c3e..fa05dd2e174d0 100644
--- a/sycl/plugins/cuda/pi_cuda.hpp
+++ b/sycl/plugins/cuda/pi_cuda.hpp
@@ -302,37 +302,6 @@ struct _pi_queue {
 
 typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
                            void *userData);
-
-class event_callback {
-public:
-  void trigger_callback(pi_event event, pi_int32 currentEventStatus) const {
-
-    auto validParameters = callback_ && event;
-
-    // As a pi_event_status value approaches 0, it gets closer to completion.
-    // If the calling pi_event's status is less than or equal to the event
-    // status the user is interested in, invoke the callback anyway. The event
-    // will have passed through that state anyway.
-    auto validStatus = currentEventStatus <= observedEventStatus_;
-
-    if (validParameters && validStatus) {
-
-      callback_(event, currentEventStatus, userData_);
-    }
-  }
-
-  event_callback(pi_event_status status, pfn_notify callback, void *userData)
-      : observedEventStatus_{status}, callback_{callback}, userData_{userData} {
-  }
-
-  pi_event_status get_status() const noexcept { return observedEventStatus_; }
-
-private:
-  pi_event_status observedEventStatus_;
-  pfn_notify callback_;
-  void *userData_;
-};
-
 /// PI Event mapping to CUevent
 ///
 class _pi_event {
@@ -347,41 +316,6 @@ class _pi_event {
 
   native_type get() const noexcept { return evEnd_; };
 
-  pi_result set_event_complete() noexcept {
-
-    if (isCompleted_) {
-      return PI_INVALID_OPERATION;
-    }
-
-    isRecorded_ = true;
-    isCompleted_ = true;
-
-    trigger_callback(get_execution_status());
-
-    return PI_SUCCESS;
-  }
-
-  void trigger_callback(pi_int32 status) {
-
-    std::vector<event_callback> callbacks;
-
-    // Here we move all callbacks into local variable before we call them.
-    // This is a defensive maneuver; if any of the callbacks attempt to
-    // add additional callbacks, we will end up in a bad spot. Our mutex
-    // will be locked twice and the vector will be modified as it is being
-    // iterated over! By moving everything locally, we can call all of these
-    // callbacks and let them modify the original vector without much worry.
-
-    {
-      std::lock_guard<std::mutex> lock(mutex_);
-      event_callbacks_.swap(callbacks);
-    }
-
-    for (auto &event_callback : callbacks) {
-      event_callback.trigger_callback(this, status);
-    }
-  }
-
   pi_queue get_queue() const noexcept { return queue_; }
 
   pi_command_type get_command_type() const noexcept { return commandType_; }
@@ -390,10 +324,10 @@ class _pi_event {
 
   bool is_recorded() const noexcept { return isRecorded_; }
 
-  bool is_completed() const noexcept { return isCompleted_; }
-
   bool is_started() const noexcept { return isStarted_; }
 
+  bool is_completed() const noexcept { return isCompleted_; };
+
   pi_int32 get_execution_status() const noexcept {
 
     if (!is_recorded()) {
@@ -406,24 +340,8 @@ class _pi_event {
     return PI_EVENT_COMPLETE;
   }
 
-  void set_event_callback(const event_callback &callback) {
-    auto current_status = get_execution_status();
-    if (current_status <= callback.get_status()) {
-      callback.trigger_callback(this, current_status);
-    } else {
-      std::lock_guard<std::mutex> lock(mutex_);
-      event_callbacks_.emplace_back(callback);
-    }
-  }
-
   pi_context get_context() const noexcept { return context_; };
 
-  bool is_user_event() const noexcept {
-    return get_command_type() == PI_COMMAND_TYPE_USER;
-  }
-
-  bool is_native_event() const noexcept { return !is_user_event(); }
-
   pi_uint32 increment_reference_count() { return ++refCount_; }
 
   pi_uint32 decrement_reference_count() { return --refCount_; }
@@ -462,13 +380,14 @@ class _pi_event {
 
   std::atomic_uint32_t refCount_; // Event reference count.
 
-  std::atomic_bool isCompleted_; // Atomic bool used by user events. Can be
-                                 // used to wait for a user event's completion.
+  bool isCompleted_; // Signifies whether the operations have completed
+                     //
 
   bool isRecorded_; // Signifies wether a native CUDA event has been recorded
                     // yet.
-  bool isStarted_; // Signifies wether the operation associated with the
-                   // PI event has started or not
+  bool isStarted_;  // Signifies wether the operation associated with the
+                    // PI event has started or not
+                    //
 
   native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
                       // event, this will be nullptr.
@@ -484,12 +403,6 @@ class _pi_event {
   pi_context context_; // pi_context associated with the event. If this is a
                        // native event, this will be the same context associated
                        // with the queue_ member.
-
-  std::mutex mutex_; // Protect access to event_callbacks_. TODO: There might be
-                     // a lock-free data structure we can use here.
-  std::vector<event_callback>
-      event_callbacks_; // Callbacks that can be triggered when an event's state
-                        // changes.
 };
 
 /// Implementation of PI Program on CUDA Module object

From adcf8978b525aef45a5ffc888a42722017987c45 Mon Sep 17 00:00:00 2001
From: Ruyman Reyes <ruyman@codeplay.com>
Date: Thu, 21 May 2020 17:40:49 +0000
Subject: [PATCH 2/2] Addressing feedback from review

---
 sycl/plugins/cuda/pi_cuda.cpp                 |  37 +-
 sycl/plugins/cuda/pi_cuda.hpp                 |   6 -
 sycl/unittests/pi/CMakeLists.txt              |   1 -
 sycl/unittests/pi/EventTest.cpp               | 348 ------------------
 sycl/unittests/pi/cuda/CMakeLists.txt         |   1 -
 sycl/unittests/pi/cuda/test_events.cpp        | 105 ------
 .../pi/cuda/test_primary_context.cpp          |   2 +-
 7 files changed, 10 insertions(+), 490 deletions(-)
 delete mode 100644 sycl/unittests/pi/EventTest.cpp
 delete mode 100644 sycl/unittests/pi/cuda/test_events.cpp

diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 9afd7610f3ae2..c514d4ad87896 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -276,15 +276,13 @@ _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue)
       isStarted_{false}, evEnd_{nullptr}, evStart_{nullptr}, evQueued_{nullptr},
       queue_{queue}, context_{context} {
 
-  if (type != PI_COMMAND_TYPE_USER) {
-    PI_CHECK_ERROR(cuEventCreate(&evEnd_, CU_EVENT_DEFAULT));
+  assert(type != PI_COMMAND_TYPE_USER);
 
-    if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
-      PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
-      PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
-    }
-  } else {
-    cl::sycl::detail::pi::die("User-defined events not implemented");
+  PI_CHECK_ERROR(cuEventCreate(&evEnd_, CU_EVENT_DEFAULT));
+
+  if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
+    PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
+    PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
   }
 
   if (queue_ != nullptr) {
@@ -2622,24 +2620,7 @@ pi_result cuda_piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index,
 // Events
 //
 pi_result cuda_piEventCreate(pi_context context, pi_event *event) {
-  assert(context != nullptr);
-  assert(event != nullptr);
-  pi_result retErr = PI_SUCCESS;
-  pi_event retEvent = nullptr;
-
-  try {
-    retEvent = _pi_event::make_user(context);
-    if (retEvent == nullptr) {
-      retErr = PI_OUT_OF_HOST_MEMORY;
-    }
-  } catch (pi_result err) {
-    retErr = err;
-  } catch (...) {
-    retErr = PI_OUT_OF_RESOURCES;
-  }
-
-  *event = retEvent;
-  return retErr;
+  cl::sycl::detail::pi::die("PI Event Create not implemented in CUDA backend");
 }
 
 pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name,
@@ -2703,13 +2684,13 @@ pi_result cuda_piEventSetCallback(pi_event event,
                                   pi_int32 command_exec_callback_type,
                                   pfn_notify notify, void *user_data) {
 
-  cl::sycl::detail::pi::die("Event Callback not implemented");
+  cl::sycl::detail::pi::die("Event Callback not implemented in CUDA backend");
   return PI_SUCCESS;
 }
 
 pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) {
 
-  cl::sycl::detail::pi::die("Event Set Status not implemented");
+  cl::sycl::detail::pi::die("Event Set Status not implemented in CUDA backend");
   return PI_INVALID_VALUE;
 }
 
diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
index fa05dd2e174d0..03220fac1ac47 100644
--- a/sycl/plugins/cuda/pi_cuda.hpp
+++ b/sycl/plugins/cuda/pi_cuda.hpp
@@ -358,12 +358,6 @@ class _pi_event {
   //
   pi_uint64 get_end_time() const;
 
-  // make a user event. CUDA has no concept of user events, so this
-  // functionality is implemented by the CUDA PI implementation.
-  static pi_event make_user(pi_context context) {
-    return new _pi_event(PI_COMMAND_TYPE_USER, context, nullptr);
-  }
-
   // construct a native CUDA. This maps closely to the underlying CUDA event.
   static pi_event make_native(pi_command_type type, pi_queue queue) {
     return new _pi_event(type, queue->get_context(), queue);
diff --git a/sycl/unittests/pi/CMakeLists.txt b/sycl/unittests/pi/CMakeLists.txt
index fbaf61fea440f..d1b14617c5c56 100644
--- a/sycl/unittests/pi/CMakeLists.txt
+++ b/sycl/unittests/pi/CMakeLists.txt
@@ -6,7 +6,6 @@ add_sycl_unittest(PiTests OBJECT
   EnqueueMemTest.cpp
   PiMock.cpp
   PlatformTest.cpp
-  EventTest.cpp
 )
 
 add_dependencies(PiTests sycl)
diff --git a/sycl/unittests/pi/EventTest.cpp b/sycl/unittests/pi/EventTest.cpp
deleted file mode 100644
index 08b4fdcc95aa7..0000000000000
--- a/sycl/unittests/pi/EventTest.cpp
+++ /dev/null
@@ -1,348 +0,0 @@
-//==---- EventTest.cpp --- PI unit tests --------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "CL/sycl/detail/pi.hpp"
-#include "TestGetPlugin.hpp"
-#include <atomic>
-#include <detail/plugin.hpp>
-#include <gtest/gtest.h>
-#include <thread>
-
-using namespace cl::sycl;
-
-namespace pi {
-class EventTest : public testing::TestWithParam<detail::plugin> {
-protected:
-  pi_platform _platform;
-  pi_context _context;
-  pi_queue _queue;
-  pi_device _device;
-  pi_result _result;
-
-  EventTest()
-      : _context{nullptr}, _queue{nullptr}, _device{nullptr},
-        _result{PI_INVALID_VALUE} {}
-
-  ~EventTest() override = default;
-
-  void SetUp() override {
-    pi_uint32 numPlatforms = 0;
-
-    detail::plugin plugin = GetParam();
-
-    RecordProperty("PiBackend", GetBackendString(plugin.getBackend()));
-
-    ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS);
-
-    ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &_platform, nullptr)),
-              PI_SUCCESS);
-    (void)numPlatforms; // Deal with unused variable warning
-
-    ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  _platform, PI_DEVICE_TYPE_DEFAULT, 1, &_device, nullptr)),
-              PI_SUCCESS);
-
-    ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piContextCreate>(
-                  nullptr, 1, &_device, nullptr, nullptr, &_context)),
-              PI_SUCCESS);
-
-    ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piQueueCreate>(
-                  _context, _device, 0, &_queue)),
-              PI_SUCCESS);
-
-    _result = PI_INVALID_VALUE;
-  }
-
-  void TearDown() override {
-
-    detail::plugin plugin = GetParam();
-
-    ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piQueueRelease>(_queue)),
-              PI_SUCCESS);
-
-    ASSERT_EQ(
-        (plugin.call_nocheck<detail::PiApiKind::piContextRelease>(_context)),
-        PI_SUCCESS);
-  }
-};
-
-static std::vector<detail::plugin> Plugins = pi::initializeAndRemoveInvalid();
-
-INSTANTIATE_TEST_CASE_P(
-    EventTestImpl, EventTest, testing::ValuesIn(Plugins),
-    [](const testing::TestParamInfo<EventTest::ParamType> &info) {
-      return pi::GetBackendString(info.param.getBackend());
-    });
-
-// TODO: need more negative tests to show errors being reported when expected
-// (invalid arguments etc).
-
-TEST_P(EventTest, PICreateEvent) {
-  pi_event foo;
-
-  detail::plugin plugin = GetParam();
-
-  ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piEventCreate>(_context, &foo)),
-      PI_SUCCESS);
-  ASSERT_NE(foo, nullptr);
-
-  EXPECT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventRelease>(foo)),
-            PI_SUCCESS);
-}
-
-constexpr size_t event_type_count = 3;
-static bool triggered_flag[event_type_count] = {false, false, false};
-
-struct callback_user_data {
-  pi_int32 event_type;
-  int index;
-};
-
-void EventCallback(pi_event event, pi_int32 status, void *data) {
-  ASSERT_NE(data, nullptr);
-
-  callback_user_data *pdata = static_cast<callback_user_data *>(data);
-
-#ifndef NDEBUG
-  printf("\tEvent callback %d of type %d triggered\n", pdata->index,
-         pdata->event_type);
-#endif
-
-  triggered_flag[pdata->index] = true;
-}
-
-TEST_P(EventTest, piEventSetCallback) {
-
-  detail::plugin plugin = GetParam();
-
-  pi_int32 event_callback_types[event_type_count] = {
-      PI_EVENT_SUBMITTED, PI_EVENT_RUNNING, PI_EVENT_COMPLETE};
-
-  callback_user_data user_data[event_type_count];
-
-  // gate event lets us register callbacks before letting the enqueued work be
-  // executed.
-  pi_event gateEvent;
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventCreate>(_context,
-                                                                   &gateEvent)),
-            PI_SUCCESS);
-
-  constexpr const size_t dataCount = 1000u;
-  std::vector<int> data(dataCount);
-  auto size_in_bytes = data.size() * sizeof(int);
-
-  pi_mem memObj;
-  ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-          _context, PI_MEM_FLAGS_ACCESS_RW, size_in_bytes, nullptr, &memObj)),
-      PI_SUCCESS);
-
-  pi_event syncEvent;
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEnqueueMemBufferWrite>(
-                _queue, memObj, false, 0, size_in_bytes, data.data(), 1,
-                &gateEvent, &syncEvent)),
-            PI_SUCCESS);
-
-  for (size_t i = 0; i < event_type_count; i++) {
-    user_data[i].event_type = event_callback_types[i];
-    user_data[i].index = i;
-    ASSERT_EQ(
-        (plugin.call_nocheck<detail::PiApiKind::piEventSetCallback>(
-            syncEvent, event_callback_types[i], EventCallback, user_data + i)),
-        PI_SUCCESS);
-  }
-
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventSetStatus>(
-                gateEvent, PI_EVENT_COMPLETE)),
-            PI_SUCCESS);
-  ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piEventsWait>(1, &syncEvent)),
-      PI_SUCCESS);
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piQueueFinish>(_queue)),
-            PI_SUCCESS);
-
-  for (size_t k = 0; k < event_type_count; ++k) {
-    EXPECT_TRUE(triggered_flag[k]);
-  }
-
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventRelease>(gateEvent)),
-            PI_SUCCESS);
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventRelease>(syncEvent)),
-            PI_SUCCESS);
-}
-
-TEST_P(EventTest, piEventGetInfo) {
-
-  detail::plugin plugin = GetParam();
-
-  pi_event foo;
-  ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piEventCreate>(_context, &foo)),
-      PI_SUCCESS);
-  ASSERT_NE(foo, nullptr);
-
-  pi_uint64 paramValue = 0;
-  pi_uint64 retSize = 0;
-  EXPECT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventGetInfo>(
-                foo, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS, sizeof(paramValue),
-                &paramValue, &retSize)),
-            PI_SUCCESS);
-
-  EXPECT_EQ(retSize, sizeof(pi_int32));
-  EXPECT_EQ(paramValue, PI_EVENT_SUBMITTED);
-
-  EXPECT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventRelease>(foo)),
-            PI_SUCCESS);
-}
-
-TEST_P(EventTest, piEventSetStatus) {
-
-  detail::plugin plugin = GetParam();
-
-  pi_event foo;
-  ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piEventCreate>(_context, &foo)),
-      PI_SUCCESS);
-  ASSERT_NE(foo, nullptr);
-
-  pi_event_status paramValue = PI_EVENT_QUEUED;
-  size_t retSize = 0u;
-  plugin.call_nocheck<detail::PiApiKind::piEventGetInfo>(
-      foo, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS, sizeof(paramValue),
-      &paramValue, &retSize);
-
-  EXPECT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventSetStatus>(
-                foo, PI_EVENT_COMPLETE)),
-            PI_SUCCESS);
-
-  paramValue = {};
-  retSize = 0u;
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventGetInfo>(
-                foo, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS, sizeof(paramValue),
-                &paramValue, &retSize)),
-            PI_SUCCESS);
-  ASSERT_EQ(paramValue, PI_EVENT_COMPLETE);
-
-  EXPECT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventRelease>(foo)),
-            PI_SUCCESS);
-}
-
-TEST_P(EventTest, WaitForManualEventOnOtherThread) {
-
-  detail::plugin plugin = GetParam();
-
-  pi_event foo;
-  ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piEventCreate>(_context, &foo)),
-      PI_SUCCESS);
-  ASSERT_NE(foo, nullptr);
-
-  pi_event_status paramValue = {};
-  size_t retSize = 0u;
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventGetInfo>(
-                foo, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS, sizeof(paramValue),
-                &paramValue, &retSize)),
-            PI_SUCCESS);
-  ASSERT_EQ(paramValue, PI_EVENT_SUBMITTED);
-
-  std::atomic<bool> started{false};
-
-  auto tWaiter = std::thread([&]() {
-    started = true;
-    ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventsWait>(1, &foo)),
-              PI_SUCCESS);
-  });
-
-  while (!started) {
-  };
-
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventSetStatus>(
-                foo, PI_EVENT_COMPLETE)),
-            PI_SUCCESS);
-
-  tWaiter.join();
-
-  paramValue = {};
-  retSize = 0u;
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventGetInfo>(
-                foo, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS, sizeof(paramValue),
-                &paramValue, &retSize)),
-            PI_SUCCESS);
-  ASSERT_EQ(paramValue, PI_EVENT_COMPLETE);
-
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventRelease>(foo)),
-            PI_SUCCESS);
-}
-
-TEST_P(EventTest, piEnqueueEventsWait) {
-
-  detail::plugin plugin = GetParam();
-
-  constexpr const size_t dataCount = 10u;
-  int output[dataCount] = {};
-  const int data[dataCount] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-  constexpr const size_t bytes = sizeof(data);
-
-  pi_mem memObj;
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piMemBufferCreate>(
-                _context, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr, &memObj)),
-            PI_SUCCESS);
-
-  pi_event events[4] = {nullptr, nullptr, nullptr, nullptr};
-
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEnqueueMemBufferWrite>(
-                _queue, memObj, true, 0, bytes, data, 0, nullptr, &events[0])),
-            PI_SUCCESS);
-  ASSERT_NE(events[0], nullptr);
-
-  ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piEnqueueMemBufferRead>(
-          _queue, memObj, true, 0, bytes, output, 0, nullptr, &events[1])),
-      PI_SUCCESS);
-  ASSERT_NE(events[1], nullptr);
-
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventCreate>(_context,
-                                                                   &events[2])),
-            PI_SUCCESS);
-  ASSERT_NE(events[2], nullptr);
-
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEnqueueEventsWait>(
-                _queue, 3, events, &events[3])),
-            PI_SUCCESS);
-  ASSERT_NE(events[3], nullptr);
-
-  pi_event_status paramValue = {};
-  size_t retSize = 0u;
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventGetInfo>(
-                events[3], PI_EVENT_INFO_COMMAND_EXECUTION_STATUS,
-                sizeof(paramValue), &paramValue, &retSize)),
-            PI_SUCCESS);
-  ASSERT_NE(paramValue, PI_EVENT_COMPLETE);
-
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventSetStatus>(
-                events[2], PI_EVENT_COMPLETE)),
-            PI_SUCCESS);
-
-  ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piEventsWait>(1, &events[3])),
-      PI_SUCCESS);
-
-  paramValue = {};
-  retSize = 0u;
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventGetInfo>(
-                events[3], PI_EVENT_INFO_COMMAND_EXECUTION_STATUS,
-                sizeof(paramValue), &paramValue, &retSize)),
-            PI_SUCCESS);
-  ASSERT_EQ(paramValue, PI_EVENT_COMPLETE);
-}
-
-} // namespace pi
diff --git a/sycl/unittests/pi/cuda/CMakeLists.txt b/sycl/unittests/pi/cuda/CMakeLists.txt
index 8c1efe21467a3..13ce941adefff 100644
--- a/sycl/unittests/pi/cuda/CMakeLists.txt
+++ b/sycl/unittests/pi/cuda/CMakeLists.txt
@@ -8,7 +8,6 @@ add_sycl_unittest(PiCudaTests OBJECT
   test_mem_obj.cpp
   test_primary_context.cpp
   test_queue.cpp
-  test_events.cpp
 )
 
 add_dependencies(PiCudaTests sycl)
diff --git a/sycl/unittests/pi/cuda/test_events.cpp b/sycl/unittests/pi/cuda/test_events.cpp
deleted file mode 100644
index 6853b04c63305..0000000000000
--- a/sycl/unittests/pi/cuda/test_events.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-//==---- test_events.cpp --- PI unit tests ---------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-#include <cuda.h>
-
-#include "TestGetPlugin.hpp"
-#include <CL/sycl.hpp>
-#include <CL/sycl/detail/pi.hpp>
-#include <detail/plugin.hpp>
-#include <pi_cuda.hpp>
-#include <thread>
-
-using namespace cl::sycl;
-
-namespace pi {
-class CudaEventTests : public ::testing::Test {
-protected:
-  detail::plugin plugin = pi::initializeAndGet(backend::cuda);
-
-  pi_platform _platform;
-  pi_context _context;
-  pi_queue _queue;
-  pi_device _device;
-
-  CudaEventTests() : _context{nullptr}, _queue{nullptr}, _device{nullptr} {}
-
-  ~CudaEventTests() override = default;
-
-  void SetUp() override {
-    pi_uint32 numPlatforms = 0;
-    ASSERT_EQ(plugin.getBackend(), backend::cuda);
-
-    ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  0, nullptr, &numPlatforms)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piPlatformsGet>(
-                  numPlatforms, &_platform, nullptr)),
-              PI_SUCCESS)
-        << "piPlatformsGet failed.\n";
-
-    ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piDevicesGet>(
-                  _platform, PI_DEVICE_TYPE_GPU, 1, &_device, nullptr)),
-              PI_SUCCESS);
-
-    ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piContextCreate>(
-                  nullptr, 1, &_device, nullptr, nullptr, &_context)),
-              PI_SUCCESS);
-
-    ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piQueueCreate>(
-                  _context, _device, 0, &_queue)),
-              PI_SUCCESS);
-  }
-
-  void TearDown() override {
-    plugin.call<detail::PiApiKind::piQueueRelease>(_queue);
-    plugin.call<detail::PiApiKind::piContextRelease>(_context);
-  }
-};
-
-TEST_F(CudaEventTests, PICreateEvent) {
-
-  pi_event foo;
-  ASSERT_EQ(
-      (plugin.call_nocheck<detail::PiApiKind::piEventCreate>(_context, &foo)),
-      PI_SUCCESS);
-  ASSERT_NE(foo, nullptr);
-  // There is no CUDA interop event for user events
-  EXPECT_EQ(foo->get(), nullptr);
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventRelease>(foo)),
-            PI_SUCCESS);
-}
-
-TEST_F(CudaEventTests, piGetInfoNativeEvent) {
-
-  auto foo = _pi_event::make_native(PI_COMMAND_TYPE_NDRANGE_KERNEL, _queue);
-  ASSERT_NE(foo, nullptr);
-
-  pi_event_status paramValue = {};
-  size_t retSize = 0u;
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventGetInfo>(
-                foo, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS, sizeof(paramValue),
-                &paramValue, &retSize)),
-            PI_SUCCESS);
-  EXPECT_EQ(retSize, sizeof(pi_int32));
-  EXPECT_EQ(paramValue, PI_EVENT_SUBMITTED);
-
-  auto cuEvent = foo->get();
-  ASSERT_NE(cuEvent, nullptr);
-
-  auto errCode = cuEventQuery(cuEvent);
-  ASSERT_EQ(errCode, CUDA_SUCCESS);
-
-  ASSERT_EQ((plugin.call_nocheck<detail::PiApiKind::piEventRelease>(foo)),
-            PI_SUCCESS);
-}
-} // namespace pi
diff --git a/sycl/unittests/pi/cuda/test_primary_context.cpp b/sycl/unittests/pi/cuda/test_primary_context.cpp
index 50882ab2731aa..244033757fa1c 100644
--- a/sycl/unittests/pi/cuda/test_primary_context.cpp
+++ b/sycl/unittests/pi/cuda/test_primary_context.cpp
@@ -32,7 +32,7 @@ struct CudaPrimaryContextTests : public ::testing::Test {
     const std::string platformVersion =
         platform.get_info<info::platform::version>();
     // If using PI_CUDA, don't accept a non-CUDA device
-    return platformVersion.find("CUDA") != std::string::npos;
+    return platformVersion.find("CUDA BACKEND") != std::string::npos;
   }
 
   class cuda_device_selector : public device_selector {