Skip to content

NEW: Make event timing error messages more specific and actionable #559

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Apr 27, 2025
Merged
38 changes: 32 additions & 6 deletions cuda_core/cuda/core/experimental/_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,15 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional

from cuda.core.experimental._utils.cuda_utils import CUDAError, check_or_create_options, driver, handle_return
from cuda.core.experimental._utils.cuda_utils import (
CUDAError,
check_or_create_options,
driver,
handle_return,
)
from cuda.core.experimental._utils.cuda_utils import (
_check_driver_error as raise_if_driver_error,
)

if TYPE_CHECKING:
import cuda.bindings
Expand Down Expand Up @@ -117,13 +125,31 @@ def __rsub__(self, other):

def __sub__(self, other):
# return self - other (in milliseconds)
err, timing = driver.cuEventElapsedTime(other.handle, self.handle)
try:
timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle))
raise_if_driver_error(err)
return timing
except CUDAError as e:
raise RuntimeError(
"Timing capability must be enabled in order to subtract two Events; timing is disabled by default."
) from e
return timing
if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE:
if self.is_timing_disabled or other.is_timing_disabled:
explanation = (
"Both Events must be created with timing enabled in order to subtract them; "
"use EventOptions(enable_timing=True) when creating both events."
)
else:
explanation = (
"Both Events must be recorded before they can be subtracted; "
"use Stream.record() to record both events to a stream."
)
elif err == driver.CUresult.CUDA_ERROR_NOT_READY:
explanation = (
"One or both events have not completed; "
"use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete "
"before subtracting them."
)
else:
raise e
raise RuntimeError(explanation) from e

@property
def is_timing_disabled(self) -> bool:
Expand Down
95 changes: 71 additions & 24 deletions cuda_core/tests/test_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,37 +20,27 @@ def test_event_init_disabled():
cuda.core.experimental._event.Event() # Ensure back door is locked.


@pytest.mark.parametrize("enable_timing", [True, False, None])
def test_timing(init_cuda, enable_timing):
options = EventOptions(enable_timing=enable_timing)
def test_timing_success(init_cuda):
options = EventOptions(enable_timing=True)
stream = Device().create_stream()
delay_seconds = 0.5
e1 = stream.record(options=options)
time.sleep(delay_seconds)
e2 = stream.record(options=options)
e2.sync()
for e in (e1, e2):
assert e.is_timing_disabled == (True if enable_timing is None else not enable_timing)
if enable_timing:
elapsed_time_ms = e2 - e1
assert isinstance(elapsed_time_ms, float)
# Using a generous tolerance, to avoid flaky tests:
# We only want to exercise the __sub__ method, this test is not meant
# to stress-test the CUDA driver or time.sleep().
delay_ms = delay_seconds * 1000
if os.name == "nt": # noqa: SIM108
# For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
generous_tolerance = 100
else:
# Most modern Linux kernels have a default timer resolution of 1 ms.
generous_tolerance = 20
assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
elapsed_time_ms = e2 - e1
assert isinstance(elapsed_time_ms, float)
# Using a generous tolerance, to avoid flaky tests:
# We only want to exercise the __sub__ method, this test is not meant
# to stress-test the CUDA driver or time.sleep().
delay_ms = delay_seconds * 1000
if os.name == "nt": # noqa: SIM108
# For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
generous_tolerance = 100
else:
with pytest.raises(RuntimeError) as e:
elapsed_time_ms = e2 - e1
msg = str(e)
assert "disabled by default" in msg
assert "CUDA_ERROR_INVALID_HANDLE" in msg
# Most modern Linux kernels have a default timer resolution of 1 ms.
generous_tolerance = 20
assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance


def test_is_sync_busy_waited(init_cuda):
Expand Down Expand Up @@ -80,3 +70,60 @@ def test_is_done(init_cuda):
# Without a sync, the captured work might not have yet completed
# Therefore this check should never raise an exception
assert event.is_done in (True, False)


def test_error_timing_disabled():
device = Device()
device.set_current()
enabled = EventOptions(enable_timing=True)
disabled = EventOptions(enable_timing=False)
stream = device.create_stream()

event1 = stream.record(options=enabled)
event2 = stream.record(options=disabled)
assert not event1.is_timing_disabled
assert event2.is_timing_disabled
stream.sync()
with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
event2 - event1

event1 = stream.record(options=disabled)
event2 = stream.record(options=disabled)
stream.sync()
with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
event2 - event1


def test_error_timing_recorded():
device = Device()
device.set_current()
enabled = EventOptions(enable_timing=True)
stream = device.create_stream()

event1 = stream.record(options=enabled)
event2 = device.create_event(options=enabled)
event3 = device.create_event(options=enabled)

stream.sync()
with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
event2 - event1
with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
event1 - event2
with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
event3 - event2


def test_error_timing_incomplete():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow, this is an amazingly sophisticated unit test setup.

device = Device()
device.set_current()
enabled = EventOptions(enable_timing=True)
stream = device.create_stream()

event1 = stream.record(options=enabled)
event2 = device.create_event(options=enabled)
stream.wait(event2)
event3 = stream.record(options=enabled)

# event3 will never complete because the stream is waiting on event2 which is never recorded
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't there be no work recorded in event2 here so the stream.wait(event2) wouldn't actually wait on anything? Then event3 recording on the stream there again wouldn't be any actual work recorded so it very well could be finished?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is interesting, I would think that because event2 has no work recorded, stream.wait(event2) should raise (maybe cudaErrorInvalidResourceHandle?) instead of no-op.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Took a quick look at the actual implementation, and Keith was right here. It's a no-op and cudaSuccess is returned if event2 is not recorded.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So what's the solution for testing this case? Launching a kernel that takes a long time to complete or never returns? I believe cuda.core now has all the necessary parts to compile and launch such a kernel from this test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@carterbox It is nerve wrecking to leave the kernel spinning and not clean it up after the event test is completed. I pushed commit add0ba1 to allow a signal passed from host to the busy kernel.

with pytest.raises(RuntimeError, match="^One or both events have not completed."):
event3 - event1