diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 4e48590a3..9cb5f41eb 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -55,9 +55,9 @@ jobs: if ('${{ inputs.local-ctk }}' -eq '1') { if ($TEST_CUDA_MAJOR -eq '12') { - $MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink"]' + $MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink", "thrust"]' } else { - $MINI_CTK_DEPS = '["nvcc", "nvrtc"]' + $MINI_CTK_DEPS = '["nvcc", "nvrtc", "thrust"]' } } diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index 9710af2c0..60654ec6c 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -8,7 +8,15 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Optional -from cuda.core.experimental._utils.cuda_utils import CUDAError, check_or_create_options, driver, handle_return +from cuda.core.experimental._utils.cuda_utils import ( + CUDAError, + check_or_create_options, + driver, + handle_return, +) +from cuda.core.experimental._utils.cuda_utils import ( + _check_driver_error as raise_if_driver_error, +) if TYPE_CHECKING: import cuda.bindings @@ -117,13 +125,31 @@ def __rsub__(self, other): def __sub__(self, other): # return self - other (in milliseconds) + err, timing = driver.cuEventElapsedTime(other.handle, self.handle) try: - timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle)) + raise_if_driver_error(err) + return timing except CUDAError as e: - raise RuntimeError( - "Timing capability must be enabled in order to subtract two Events; timing is disabled by default." - ) from e - return timing + if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE: + if self.is_timing_disabled or other.is_timing_disabled: + explanation = ( + "Both Events must be created with timing enabled in order to subtract them; " + "use EventOptions(enable_timing=True) when creating both events." + ) + else: + explanation = ( + "Both Events must be recorded before they can be subtracted; " + "use Stream.record() to record both events to a stream." + ) + elif err == driver.CUresult.CUDA_ERROR_NOT_READY: + explanation = ( + "One or both events have not completed; " + "use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete " + "before subtracting them." + ) + else: + raise e + raise RuntimeError(explanation) from e @property def is_timing_disabled(self) -> bool: diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 108b8b140..4895c0a67 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -7,12 +7,15 @@ # is strictly prohibited. import os +import pathlib import time +import numpy as np import pytest import cuda.core.experimental -from cuda.core.experimental import Device, EventOptions +from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch +from cuda.core.experimental._memory import _DefaultPinnedMemorySource def test_event_init_disabled(): @@ -20,37 +23,27 @@ def test_event_init_disabled(): cuda.core.experimental._event.Event() # Ensure back door is locked. -@pytest.mark.parametrize("enable_timing", [True, False, None]) -def test_timing(init_cuda, enable_timing): - options = EventOptions(enable_timing=enable_timing) +def test_timing_success(init_cuda): + options = EventOptions(enable_timing=True) stream = Device().create_stream() delay_seconds = 0.5 e1 = stream.record(options=options) time.sleep(delay_seconds) e2 = stream.record(options=options) e2.sync() - for e in (e1, e2): - assert e.is_timing_disabled == (True if enable_timing is None else not enable_timing) - if enable_timing: - elapsed_time_ms = e2 - e1 - assert isinstance(elapsed_time_ms, float) - # Using a generous tolerance, to avoid flaky tests: - # We only want to exercise the __sub__ method, this test is not meant - # to stress-test the CUDA driver or time.sleep(). - delay_ms = delay_seconds * 1000 - if os.name == "nt": # noqa: SIM108 - # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default. - generous_tolerance = 100 - else: - # Most modern Linux kernels have a default timer resolution of 1 ms. - generous_tolerance = 20 - assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance + elapsed_time_ms = e2 - e1 + assert isinstance(elapsed_time_ms, float) + # Using a generous tolerance, to avoid flaky tests: + # We only want to exercise the __sub__ method, this test is not meant + # to stress-test the CUDA driver or time.sleep(). + delay_ms = delay_seconds * 1000 + if os.name == "nt": # noqa: SIM108 + # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default. + generous_tolerance = 100 else: - with pytest.raises(RuntimeError) as e: - elapsed_time_ms = e2 - e1 - msg = str(e) - assert "disabled by default" in msg - assert "CUDA_ERROR_INVALID_HANDLE" in msg + # Most modern Linux kernels have a default timer resolution of 1 ms. + generous_tolerance = 20 + assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance def test_is_sync_busy_waited(init_cuda): @@ -80,3 +73,100 @@ def test_is_done(init_cuda): # Without a sync, the captured work might not have yet completed # Therefore this check should never raise an exception assert event.is_done in (True, False) + + +def test_error_timing_disabled(): + device = Device() + device.set_current() + enabled = EventOptions(enable_timing=True) + disabled = EventOptions(enable_timing=False) + stream = device.create_stream() + + event1 = stream.record(options=enabled) + event2 = stream.record(options=disabled) + assert not event1.is_timing_disabled + assert event2.is_timing_disabled + stream.sync() + with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"): + event2 - event1 + + event1 = stream.record(options=disabled) + event2 = stream.record(options=disabled) + stream.sync() + with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"): + event2 - event1 + + +def test_error_timing_recorded(): + device = Device() + device.set_current() + enabled = EventOptions(enable_timing=True) + stream = device.create_stream() + + event1 = stream.record(options=enabled) + event2 = device.create_event(options=enabled) + event3 = device.create_event(options=enabled) + + stream.sync() + with pytest.raises(RuntimeError, match="^Both Events must be recorded"): + event2 - event1 + with pytest.raises(RuntimeError, match="^Both Events must be recorded"): + event1 - event2 + with pytest.raises(RuntimeError, match="^Both Events must be recorded"): + event3 - event2 + + +# TODO: improve this once path finder can find headers +@pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need libcu++ header") +@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+") +def test_error_timing_incomplete(): + device = Device() + device.set_current() + + # This kernel is designed to busy loop until a signal is received + code = """ +#include + +extern "C" +__global__ void wait(int* val) { + cuda::atomic_ref signal{*val}; + while (true) { + if (signal.load(cuda::memory_order_relaxed)) { + break; + } + } +} +""" + + arch = "".join(f"{i}" for i in device.compute_capability) + program_options = ProgramOptions( + std="c++17", + arch=f"sm_{arch}", + include_path=str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")), + ) + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile(target_type="cubin") + ker = mod.get_kernel("wait") + + mr = _DefaultPinnedMemorySource() + b = mr.allocate(4) + arr = np.from_dlpack(b).view(np.int32) + arr[0] = 0 + + config = LaunchConfig(grid=1, block=1) + ker_args = (arr.ctypes.data,) + + enabled = EventOptions(enable_timing=True) + stream = device.create_stream() + + event1 = stream.record(options=enabled) + launch(stream, config, ker, *ker_args) + event3 = stream.record(options=enabled) + + # event3 will never complete because the stream is waiting on wait() to complete + with pytest.raises(RuntimeError, match="^One or both events have not completed."): + event3 - event1 + + arr[0] = 1 + event3.sync() + event3 - event1 # this should work