diff --git a/cuda_core/cuda/core/experimental/_context.py b/cuda_core/cuda/core/experimental/_context.pyx
similarity index 56%
rename from cuda_core/cuda/core/experimental/_context.py
rename to cuda_core/cuda/core/experimental/_context.pyx
index 24e06d69c..205f6c983 100644
--- a/cuda_core/cuda/core/experimental/_context.py
+++ b/cuda_core/cuda/core/experimental/_context.pyx
@@ -13,16 +13,21 @@ class ContextOptions:
     pass  # TODO
 
 
-class Context:
-    __slots__ = ("_handle", "_id")
+cdef class Context:
 
-    def __new__(self, *args, **kwargs):
+    cdef:
+        object _handle
+        int _device_id
+
+    def __init__(self, *args, **kwargs):
         raise RuntimeError("Context objects cannot be instantiated directly. Please use Device or Stream APIs.")
 
     @classmethod
-    def _from_ctx(cls, obj, dev_id):
-        assert_type(obj, driver.CUcontext)
-        ctx = super().__new__(cls)
-        ctx._handle = obj
-        ctx._id = dev_id
+    def _from_ctx(cls, handle: driver.CUcontext, int device_id):
+        cdef Context ctx = Context.__new__(Context)
+        ctx._handle = handle
+        ctx._device_id = device_id
         return ctx
+
+    def __eq__(self, other):
+        return int(self._handle) == int(other._handle)
diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index c9a786070..c89f659a9 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -1237,7 +1237,6 @@ def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions
         """
         return Stream._init(obj=obj, options=options)
 
-    @precondition(_check_context_initialized)
     def create_event(self, options: Optional[EventOptions] = None) -> Event:
         """Create an Event object without recording it to a Stream.
 
@@ -1256,7 +1255,10 @@ def create_event(self, options: Optional[EventOptions] = None) -> Event:
             Newly created event object.
 
         """
-        return Event._init(self._id, self.context._handle, options)
+        ctx = driver.cuCtxGetCurrent()[1]
+        if int(ctx) == 0:
+            raise CUDAError("No context is bound to the calling CPU thread.")
+        return Event._init(self._id, ctx, options)
 
     @precondition(_check_context_initialized)
     def allocate(self, size, stream: Optional[Stream] = None) -> Buffer:
diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.pyx
similarity index 82%
rename from cuda_core/cuda/core/experimental/_event.py
rename to cuda_core/cuda/core/experimental/_event.pyx
index 800f34c9a..1c1302a9b 100644
--- a/cuda_core/cuda/core/experimental/_event.py
+++ b/cuda_core/cuda/core/experimental/_event.pyx
@@ -4,14 +4,12 @@
 
 from __future__ import annotations
 
-import weakref
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
 from cuda.core.experimental._context import Context
 from cuda.core.experimental._utils.cuda_utils import (
     CUDAError,
-    check_or_create_options,
     driver,
     handle_return,
 )
@@ -25,7 +23,7 @@
 
 
 @dataclass
-class EventOptions:
+cdef class EventOptions:
     """Customizable :obj:`~_event.Event` options.
 
     Attributes
@@ -49,7 +47,27 @@ class EventOptions:
     support_ipc: Optional[bool] = False
 
 
-class Event:
+cdef inline EventOptions check_or_create_options(options, str options_description):
+    """Create the specified options dataclass from a dictionary of options or None."""
+    cdef EventOptions opts
+    if options is None:
+        opts = EventOptions()
+    elif isinstance(options, dict):
+        opts = EventOptions(**options)
+    elif isinstance(options, EventOptions):
+        opts = options
+    else:
+        raise TypeError(
+            f"The {options_description} must be provided as an object "
+            f"of type {EventOptions.__name__} or as a dict with valid {options_description}. "
+            f"The provided object is '{options}'."
+        )
+
+    return opts
+
+
+
+cdef class Event:
     """Represent a record at a specific point of execution within a CUDA stream.
 
     Applications can asynchronously record events at any point in
@@ -77,30 +95,20 @@ class Event:
     and they should instead be created through a :obj:`~_stream.Stream` object.
 
     """
-
-    class _MembersNeededForFinalize:
-        __slots__ = ("handle",)
-
-        def __init__(self, event_obj, handle):
-            self.handle = handle
-            weakref.finalize(event_obj, self.close)
-
-        def close(self):
-            if self.handle is not None:
-                handle_return(driver.cuEventDestroy(self.handle))
-                self.handle = None
-
-    def __new__(self, *args, **kwargs):
+    cdef:
+        object _handle
+        bint _timing_disabled
+        bint _busy_waited
+        int _device_id
+        object _ctx_handle
+
+    def __init__(self, *args, **kwargs):
         raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).")
 
-    __slots__ = ("__weakref__", "_mnff", "_timing_disabled", "_busy_waited", "_device_id", "_ctx_handle")
-
     @classmethod
-    def _init(cls, device_id: int, ctx_handle: Context, options: Optional[EventOptions] = None):
-        self = super().__new__(cls)
-        self._mnff = Event._MembersNeededForFinalize(self, None)
-
-        options = check_or_create_options(EventOptions, options, "Event options")
+    def _init(cls, device_id: int, ctx_handle: Context, opts=None):
+        cdef Event self = Event.__new__(Event)
+        cdef EventOptions options = check_or_create_options(opts, "Event options")
         flags = 0x0
         self._timing_disabled = False
         self._busy_waited = False
@@ -112,14 +120,22 @@ def _init(cls, device_id: int, ctx_handle: Context, options: Optional[EventOptio
             self._busy_waited = True
         if options.support_ipc:
             raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")
-        self._mnff.handle = handle_return(driver.cuEventCreate(flags))
+        self._handle = handle_return(driver.cuEventCreate(flags))
         self._device_id = device_id
         self._ctx_handle = ctx_handle
         return self
 
+    cdef _close(self):
+        if self._handle is not None:
+            _ = driver.cuEventDestroy(self._handle)
+            self._handle = None
+
     def close(self):
         """Destroy the event."""
-        self._mnff.close()
+        self._close()
+
+    def __dealloc__(self):
+        self._close()
 
     def __isub__(self, other):
         return NotImplemented
@@ -129,7 +145,7 @@ def __rsub__(self, other):
 
     def __sub__(self, other):
         # return self - other (in milliseconds)
-        err, timing = driver.cuEventElapsedTime(other.handle, self.handle)
+        err, timing = driver.cuEventElapsedTime(other.handle, self._handle)
         try:
             raise_if_driver_error(err)
             return timing
@@ -180,12 +196,12 @@ def sync(self):
         has been completed.
         """
-        handle_return(driver.cuEventSynchronize(self._mnff.handle))
+        handle_return(driver.cuEventSynchronize(self._handle))
 
     @property
     def is_done(self) -> bool:
         """Return True if all captured works have been completed, otherwise False."""
-        (result,) = driver.cuEventQuery(self._mnff.handle)
+        (result,) = driver.cuEventQuery(self._handle)
         if result == driver.CUresult.CUDA_SUCCESS:
             return True
         if result == driver.CUresult.CUDA_ERROR_NOT_READY:
             return False
@@ -201,7 +217,7 @@ def handle(self) -> cuda.bindings.driver.CUevent:
             This handle is a Python object.
             To get the memory address of the underlying C handle, call ``int(Event.handle)``.
         """
-        return self._mnff.handle
+        return self._handle
 
     @property
     def device(self) -> Device:
diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.py b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx
similarity index 96%
rename from cuda_core/cuda/core/experimental/_utils/cuda_utils.py
rename to cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx
index 48b48d2fb..77ce533e6 100644
--- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.py
+++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx
@@ -52,7 +52,7 @@ def _reduce_3_tuple(t: tuple):
     return t[0] * t[1] * t[2]
 
 
-def _check_driver_error(error):
+cpdef inline void _check_driver_error(error) except*:
     if error == driver.CUresult.CUDA_SUCCESS:
         return
     name_err, name = driver.cuGetErrorName(error)
@@ -69,7 +69,7 @@ def _check_driver_error(error):
     raise CUDAError(f"{name}: {desc}")
 
 
-def _check_runtime_error(error):
+cpdef inline void _check_runtime_error(error) except*:
     if error == runtime.cudaError_t.cudaSuccess:
         return
     name_err, name = runtime.cudaGetErrorName(error)
@@ -86,7 +86,7 @@ def _check_runtime_error(error):
     raise CUDAError(f"{name}: {desc}")
 
 
-def _check_error(error, handle=None):
+cdef inline void _check_error(error, handle=None) except*:
     if isinstance(error, driver.CUresult):
         _check_driver_error(error)
     elif isinstance(error, runtime.cudaError_t):
@@ -105,7 +105,7 @@ def _check_error(error, handle=None):
     raise RuntimeError(f"Unknown error type: {error}")
 
 
-def handle_return(result, handle=None):
+def handle_return(tuple result, handle=None):
     _check_error(result[0], handle=handle)
     if len(result) == 1:
         return
diff --git a/cuda_core/setup.py b/cuda_core/setup.py
index f2005c3dd..f2b84bfaf 100644
--- a/cuda_core/setup.py
+++ b/cuda_core/setup.py
@@ -2,28 +2,28 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import glob
 import os
 
 from Cython.Build import cythonize
 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext as _build_ext
 
-ext_modules = (
-    Extension(
-        "cuda.core.experimental._dlpack",
-        sources=["cuda/core/experimental/_dlpack.pyx"],
-        language="c++",
-    ),
-    Extension(
-        "cuda.core.experimental._memoryview",
-        sources=["cuda/core/experimental/_memoryview.pyx"],
-        language="c++",
-    ),
+
+# It seems setuptools' wildcard support has problems for namespace packages,
+# so we explicitly spell out all Extension instances.
+root_module = "cuda.core.experimental"
+root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep
+ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True)
+
+
+def strip_prefix_suffix(filename):
+    return filename[len(root_path):-4]
+
+
+module_names = (strip_prefix_suffix(f) for f in ext_files)
+ext_modules = tuple(
     Extension(
-        "cuda.core.experimental._kernel_arg_handler",
-        sources=["cuda/core/experimental/_kernel_arg_handler.pyx"],
+        f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}",
+        sources=[f"cuda/core/experimental/{mod}.pyx"],
         language="c++",
-    ),
+    )
+    for mod in module_names
 )
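
Usage sketch (illustrative, not part of the patch): the cythonized Event keeps the same Python-facing behavior, so event timing through the public cuda.core.experimental API should still read as below. The device/stream setup and the elided GPU workload are assumptions made for the example, not code from this diff.

    from cuda.core.experimental import Device, EventOptions

    dev = Device()
    dev.set_current()
    stream = dev.create_stream()

    # Timing must be enabled; otherwise Event.__sub__ (which wraps
    # cuEventElapsedTime, see the _event.pyx hunk above) cannot report a result.
    opts = EventOptions(enable_timing=True)
    e_start = stream.record(options=opts)
    # ... enqueue GPU work on `stream` here ...
    e_stop = stream.record(options=opts)
    e_stop.sync()
    print(f"elapsed: {e_stop - e_start:.3f} ms")  # __sub__ returns milliseconds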