@@ -2,37 +2,37 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from dataclasses import dataclass
-from typing import Union
-
 from cuda.core.experimental._device import Device
 from cuda.core.experimental._utils.cuda_utils import (
     CUDAError,
     cast_to_3_tuple,
     driver,
     get_binding_version,
     handle_return,
 )
 
 # TODO: revisit this treatment for py313t builds
-_inited = False
+cdef bint _inited = False
+cdef bint _use_ex = False
 
 
-def _lazy_init():
-    global _inited
+cdef void _lazy_init() except *:
+    """Initialize module-level globals for driver version checks."""
+    global _inited, _use_ex
     if _inited:
         return
 
-    global _use_ex
+    cdef tuple _py_major_minor
+    cdef int _driver_ver
+
     # binding availability depends on cuda-python version
     _py_major_minor = get_binding_version()
     _driver_ver = handle_return(driver.cuDriverGetVersion())
     _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
     _inited = True
 
 
-@dataclass
-class LaunchConfig:
+cdef class LaunchConfig:
     """Customizable launch options.
 
     Note
@@ -65,21 +65,36 @@ class LaunchConfig:
     """
 
     # TODO: expand LaunchConfig to include other attributes
-    grid: Union[tuple, int] = None
-    cluster: Union[tuple, int] = None
-    block: Union[tuple, int] = None
-    shmem_size: int | None = None
-    cooperative_launch: bool | None = False
-
-    def __post_init__(self):
+    # Note: attributes are declared in _launch_config.pxd
+
+    def __init__(self, grid=None, cluster=None, block=None,
+                 shmem_size=None, cooperative_launch=False):
+        """Initialize LaunchConfig with validation.
+
+        Parameters
+        ----------
+        grid : Union[tuple, int], optional
+            Grid dimensions (number of blocks, or of clusters if cluster is specified)
+        cluster : Union[tuple, int], optional
+            Cluster dimensions (thread block cluster)
+        block : Union[tuple, int], optional
+            Block dimensions (threads per block)
+        shmem_size : int, optional
+            Dynamic shared memory size in bytes (default: 0)
+        cooperative_launch : bool, optional
+            Whether to launch as a cooperative kernel (default: False)
+        """
         _lazy_init()
-        self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
-        self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
+
+        # Convert and validate grid and block dimensions
+        self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
+        self.block = cast_to_3_tuple("LaunchConfig.block", block)
+
         # FIXME: Calling Device() strictly speaking is not quite right; we should instead
         # look up the device from stream. We probably need to defer the checks related to
         # device compute capability or attributes.
         # thread block clusters are supported starting H100
-        if self.cluster is not None:
+        if cluster is not None:
             if not _use_ex:
                 err, drvers = driver.cuDriverGetVersion()
                 drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
@@ -89,19 +104,53 @@ def __post_init__(self):
                 raise CUDAError(
                     f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
                 )
-            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
-        if self.shmem_size is None:
+            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
+        else:
+            self.cluster = None
+
+        # Handle the shmem_size default
+        if shmem_size is None:
             self.shmem_size = 0
+        else:
+            self.shmem_size = shmem_size
+
+        # Handle cooperative_launch
+        self.cooperative_launch = cooperative_launch
+
+        # Validate cooperative launch support
         if self.cooperative_launch and not Device().properties.cooperative_launch:
             raise CUDAError("cooperative kernels are not supported on this device")
 
+    def __repr__(self):
+        """Return the string representation of this LaunchConfig."""
+        return (f"LaunchConfig(grid={self.grid}, cluster={self.cluster}, "
+                f"block={self.block}, shmem_size={self.shmem_size}, "
+                f"cooperative_launch={self.cooperative_launch})")
+
 
-def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
+cpdef object _to_native_launch_config(LaunchConfig config):
+    """Convert a LaunchConfig to a native driver CUlaunchConfig.
+
+    Parameters
+    ----------
+    config : LaunchConfig
+        High-level launch configuration
+
+    Returns
+    -------
+    driver.CUlaunchConfig
+        Native CUDA driver launch configuration
+    """
     _lazy_init()
-    drv_cfg = driver.CUlaunchConfig()
+
+    cdef object drv_cfg = driver.CUlaunchConfig()
+    cdef list attrs
+    cdef object attr
+    cdef object dim
+    cdef tuple grid_blocks
 
     # Handle grid dimensions and cluster configuration
-    if config.cluster:
+    if config.cluster is not None:
         # Convert grid from cluster units to block units
         grid_blocks = (
             config.grid[0] * config.cluster[0],
@@ -122,11 +171,14 @@ def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
 
     drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
     drv_cfg.sharedMemBytes = config.shmem_size
+
     if config.cooperative_launch:
         attr = driver.CUlaunchAttribute()
         attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
         attr.value.cooperative = 1
         attrs.append(attr)
+
     drv_cfg.numAttrs = len(attrs)
     drv_cfg.attrs = attrs
+
     return drv_cfg
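
Two short sketches follow for context. First, the `# Note: attributes are declared in _launch_config.pxd` comment in the diff refers to a companion declaration file. A hypothetical sketch of what those declarations could look like is below; the attribute names mirror the assignments in `__init__`, but the exact types in the real `_launch_config.pxd` may differ.

```cython
# Hypothetical sketch of _launch_config.pxd (assumed, not the actual file).
# Attributes of a cdef class must be declared once, here, so the .pyx body
# can assign them without redeclaration; `public` exposes them to Python.
cdef class LaunchConfig:
    cdef public tuple grid                 # normalized to a 3-tuple in __init__
    cdef public tuple cluster              # 3-tuple, or None when no cluster is used
    cdef public tuple block                # normalized to a 3-tuple in __init__
    cdef public int shmem_size             # dynamic shared memory, in bytes
    cdef public bint cooperative_launch    # cooperative kernel flag

# Declaring the cpdef converter here would let other Cython modules cimport it.
cpdef object _to_native_launch_config(LaunchConfig config)
```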
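Second, a minimal usage sketch of the behavior this commit preserves. It assumes a CUDA-capable GPU with the `cuda.core` package installed; `_to_native_launch_config` is a private helper, imported here only for illustration.

```python
from cuda.core.experimental import Device
from cuda.core.experimental._launch_config import (
    LaunchConfig,
    _to_native_launch_config,
)

Device().set_current()  # the Device() checks in __init__ need an initialized device

# 4 blocks of 256 threads with 1 KiB of dynamic shared memory; grid and block
# are normalized to 3-tuples by cast_to_3_tuple
cfg = LaunchConfig(grid=4, block=256, shmem_size=1024)
print(cfg)  # LaunchConfig(grid=(4, 1, 1), cluster=None, block=(256, 1, 1), ...)

# The conversion performed internally before a cuLaunchKernelEx-style launch
drv_cfg = _to_native_launch_config(cfg)
```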