NVIDIA · leofang · Apr 27, 2025 · Apr 14, 2025 · Apr 15, 2025 · Apr 15, 2025
diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py
@@ -8,7 +8,15 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
-from cuda.core.experimental._utils.cuda_utils import CUDAError, check_or_create_options, driver, handle_return
+from cuda.core.experimental._utils.cuda_utils import (
+    CUDAError,
+    check_or_create_options,
+    driver,
+    handle_return,
+)
+from cuda.core.experimental._utils.cuda_utils import (
+    _check_driver_error as raise_if_driver_error,
+)
 
 if TYPE_CHECKING:
     import cuda.bindings
@@ -117,13 +125,31 @@ def __rsub__(self, other):
 
     def __sub__(self, other):
         # return self - other (in milliseconds)
+        err, timing = driver.cuEventElapsedTime(other.handle, self.handle)
         try:
-            timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle))
+            raise_if_driver_error(err)
+            return timing
         except CUDAError as e:
-            raise RuntimeError(
-                "Timing capability must be enabled in order to subtract two Events; timing is disabled by default."
-            ) from e
-        return timing
+            if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE:
+                if self.is_timing_disabled or other.is_timing_disabled:
+                    explanation = (
+                        "Both Events must be created with timing enabled in order to subtract them; "
+                        "use EventOptions(enable_timing=True) when creating both events."
+                    )
+                else:
+                    explanation = (
+                        "Both Events must be recorded before they can be subtracted; "
+                        "use Stream.record() to record both events to a stream."
+                    )
+            elif err == driver.CUresult.CUDA_ERROR_NOT_READY:
+                explanation = (
+                    "One or both events have not completed; "
+                    "use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete "
+                    "before subtracting them."
+                )
+            else:
+                raise e
+            raise RuntimeError(explanation) from e
 
     @property
     def is_timing_disabled(self) -> bool:

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
@@ -20,37 +20,27 @@ def test_event_init_disabled():
         cuda.core.experimental._event.Event()  # Ensure back door is locked.
 
 
-@pytest.mark.parametrize("enable_timing", [True, False, None])
-def test_timing(init_cuda, enable_timing):
-    options = EventOptions(enable_timing=enable_timing)
+def test_timing_success(init_cuda):
+    options = EventOptions(enable_timing=True)
     stream = Device().create_stream()
     delay_seconds = 0.5
     e1 = stream.record(options=options)
     time.sleep(delay_seconds)
     e2 = stream.record(options=options)
     e2.sync()
-    for e in (e1, e2):
-        assert e.is_timing_disabled == (True if enable_timing is None else not enable_timing)
-    if enable_timing:
-        elapsed_time_ms = e2 - e1
-        assert isinstance(elapsed_time_ms, float)
-        # Using a generous tolerance, to avoid flaky tests:
-        # We only want to exercise the __sub__ method, this test is not meant
-        # to stress-test the CUDA driver or time.sleep().
-        delay_ms = delay_seconds * 1000
-        if os.name == "nt":  # noqa: SIM108
-            # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
-            generous_tolerance = 100
-        else:
-            # Most modern Linux kernels have a default timer resolution of 1 ms.
-            generous_tolerance = 20
-        assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
+    elapsed_time_ms = e2 - e1
+    assert isinstance(elapsed_time_ms, float)
+    # Using a generous tolerance, to avoid flaky tests:
+    # We only want to exercise the __sub__ method, this test is not meant
+    # to stress-test the CUDA driver or time.sleep().
+    delay_ms = delay_seconds * 1000
+    if os.name == "nt":  # noqa: SIM108
+        # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
+        generous_tolerance = 100
     else:
-        with pytest.raises(RuntimeError) as e:
-            elapsed_time_ms = e2 - e1
-            msg = str(e)
-            assert "disabled by default" in msg
-            assert "CUDA_ERROR_INVALID_HANDLE" in msg
+        # Most modern Linux kernels have a default timer resolution of 1 ms.
+        generous_tolerance = 20
+    assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
 
 
 def test_is_sync_busy_waited(init_cuda):
@@ -80,3 +70,60 @@ def test_is_done(init_cuda):
     # Without a sync, the captured work might not have yet completed
     # Therefore this check should never raise an exception
     assert event.is_done in (True, False)
+
+
+def test_error_timing_disabled():
+    device = Device()
+    device.set_current()
+    enabled = EventOptions(enable_timing=True)
+    disabled = EventOptions(enable_timing=False)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    event2 = stream.record(options=disabled)
+    assert not event1.is_timing_disabled
+    assert event2.is_timing_disabled
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
+        event2 - event1
+
+    event1 = stream.record(options=disabled)
+    event2 = stream.record(options=disabled)
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
+        event2 - event1
+
+
+def test_error_timing_recorded():
+    device = Device()
+    device.set_current()
+    enabled = EventOptions(enable_timing=True)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    event2 = device.create_event(options=enabled)
+    event3 = device.create_event(options=enabled)
+
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event2 - event1
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event1 - event2
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event3 - event2
+
+
+def test_error_timing_incomplete():
+    device = Device()
+    device.set_current()
+    enabled = EventOptions(enable_timing=True)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    event2 = device.create_event(options=enabled)
+    stream.wait(event2)
+    event3 = stream.record(options=enabled)
+
+    # event3 will never complete because the stream is waiting on event2 which is never recorded
+    with pytest.raises(RuntimeError, match="^One or both events have not completed."):
+        event3 - event1