From 15ebef3c4fe2702302664ce84a12ec863261a293 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 16 Nov 2024 04:19:02 +0000 Subject: [PATCH 01/17] add tests for viewable --- cuda_core/cuda/core/experimental/__init__.py | 1 + .../cuda/core/experimental/_memoryview.pyx | 2 +- cuda_core/tests/test_utils.py | 68 +++++++++++++++++++ 3 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 cuda_core/tests/test_utils.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 9b978398d..a62f7db20 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -7,3 +7,4 @@ from cuda.core.experimental._launcher import LaunchConfig, launch from cuda.core.experimental._program import Program from cuda.core.experimental._stream import Stream, StreamOptions +from cuda.core.experimental import utils diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index af6b3adff..b214b0e98 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -26,7 +26,7 @@ cdef class StridedMemoryView: shape: tuple = None strides: tuple = None # in counts, not bytes dtype: numpy.dtype = None - device_id: int = None # -1 for CPU + device_id: int = None # 0 for CPU device_accessible: bool = None readonly: bool = None exporting_obj: Any = None diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py new file mode 100644 index 000000000..b52df1f42 --- /dev/null +++ b/cuda_core/tests/test_utils.py @@ -0,0 +1,68 @@ +import numpy as np +try: + import cupy as cp +except ImportError: + cp = None +import pytest + +from cuda.core.experimental import Device +from cuda.core.experimental.utils import StridedMemoryView, viewable + + +@pytest.mark.parametrize( + "in_arr,", ( + np.empty(3, dtype=np.int32), + np.empty((6, 6), dtype=np.float64)[::2, ::2], + np.empty((3, 4), order='F'), + ) +) +def test_viewable_cpu(in_arr): + + @viewable((0,)) + def my_func(arr): + view = arr.view(-1) + assert view.ptr == in_arr.ctypes.data + assert view.shape == in_arr.shape + if in_arr.flags.c_contiguous: + assert view.strides is None + else: + assert view.strides == tuple(s // in_arr.dtype.itemsize for s in in_arr.strides) + assert view.dtype == in_arr.dtype + assert view.device_id == 0 + assert view.device_accessible == False + assert view.exporting_obj is in_arr + + my_func(in_arr) + + +if cp is not None: + + @pytest.mark.parametrize( + "in_arr,stream", ( + (cp.empty(3, dtype=cp.complex64), None), + (cp.empty((6, 6), dtype=cp.float64)[::2, ::2], True), + (cp.empty((3, 4), order='F'), True), + ) + ) + def test_viewable_gpu(in_arr, stream): + # TODO: use the device fixture? 
+ dev = Device() + dev.set_current() + s = dev.create_stream() if stream else None + + @viewable((0,)) + def my_func(arr): + view = arr.view(s.handle if s else -1) + assert view.ptr == in_arr.data.ptr + assert view.shape == in_arr.shape + strides_in_counts = tuple(s // in_arr.dtype.itemsize for s in in_arr.strides) + if in_arr.flags.c_contiguous: + assert view.strides in (None, strides_in_counts) + else: + assert view.strides == strides_in_counts + assert view.dtype == in_arr.dtype + assert view.device_id == dev.device_id + assert view.device_accessible == True + assert view.exporting_obj is in_arr + + my_func(in_arr) From 80b2556ce4fd045babad8614ef75d06721cdb605 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 16 Nov 2024 04:50:01 +0000 Subject: [PATCH 02/17] use numba to test CAI & make test a bit cleaner --- cuda_core/tests/test_utils.py | 92 ++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 28 deletions(-) diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index b52df1f42..9646cafd6 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -1,14 +1,22 @@ -import numpy as np try: import cupy as cp except ImportError: cp = None +try: + from numba import cuda as numba_cuda +except ImportError: + numba_cuda = None +import numpy as np import pytest from cuda.core.experimental import Device from cuda.core.experimental.utils import StridedMemoryView, viewable +def convert_strides_to_counts(strides, itemsize): + return tuple(s // itemsize for s in strides) + + @pytest.mark.parametrize( "in_arr,", ( np.empty(3, dtype=np.int32), @@ -21,12 +29,15 @@ def test_viewable_cpu(in_arr): @viewable((0,)) def my_func(arr): view = arr.view(-1) + assert isinstance(view, StridedMemoryView) assert view.ptr == in_arr.ctypes.data assert view.shape == in_arr.shape + strides_in_counts = convert_strides_to_counts( + in_arr.strides, in_arr.dtype.itemsize) if in_arr.flags.c_contiguous: assert view.strides is None else: - assert view.strides == tuple(s // in_arr.dtype.itemsize for s in in_arr.strides) + assert view.strides == strides_in_counts assert view.dtype == in_arr.dtype assert view.device_id == 0 assert view.device_accessible == False @@ -35,34 +46,59 @@ def my_func(arr): my_func(in_arr) -if cp is not None: - - @pytest.mark.parametrize( - "in_arr,stream", ( +def gpu_array_samples(): + # TODO: this function would initialize the device at test collection time + samples = [] + if cp is not None: + samples += [ (cp.empty(3, dtype=cp.complex64), None), (cp.empty((6, 6), dtype=cp.float64)[::2, ::2], True), (cp.empty((3, 4), order='F'), True), - ) + ] + # Numba's device_array is the only known array container that does not + # support DLPack (so that we get to test the CAI coverage). + if numba_cuda is not None: + samples += [ + (numba_cuda.device_array((2,), dtype=np.int8), None), + (numba_cuda.device_array((4, 2), dtype=np.float32), True), + ] + return samples + + +def gpu_array_ptr(arr): + if cp is not None and isinstance(arr, cp.ndarray): + return arr.data.ptr + if numba_cuda is not None and isinstance(arr, numba_cuda.cudadrv.devicearray.DeviceNDArray): + return arr.device_ctypes_pointer.value + assert False, f"{arr=}" + + +@pytest.mark.parametrize( + "in_arr,stream", ( + *gpu_array_samples(), ) - def test_viewable_gpu(in_arr, stream): - # TODO: use the device fixture? - dev = Device() - dev.set_current() - s = dev.create_stream() if stream else None +) +def test_viewable_gpu(in_arr, stream): + # TODO: use the device fixture? 
+ dev = Device() + dev.set_current() + s = dev.create_stream() if stream else None + + @viewable((0,)) + def my_func(arr): + view = arr.view(s.handle if s else -1) + assert isinstance(view, StridedMemoryView) + assert view.ptr == gpu_array_ptr(in_arr) + assert view.shape == in_arr.shape + strides_in_counts = convert_strides_to_counts( + in_arr.strides, in_arr.dtype.itemsize) + if in_arr.flags["C_CONTIGUOUS"]: + assert view.strides in (None, strides_in_counts) + else: + assert view.strides == strides_in_counts + assert view.dtype == in_arr.dtype + assert view.device_id == dev.device_id + assert view.device_accessible == True + assert view.exporting_obj is in_arr - @viewable((0,)) - def my_func(arr): - view = arr.view(s.handle if s else -1) - assert view.ptr == in_arr.data.ptr - assert view.shape == in_arr.shape - strides_in_counts = tuple(s // in_arr.dtype.itemsize for s in in_arr.strides) - if in_arr.flags.c_contiguous: - assert view.strides in (None, strides_in_counts) - else: - assert view.strides == strides_in_counts - assert view.dtype == in_arr.dtype - assert view.device_id == dev.device_id - assert view.device_accessible == True - assert view.exporting_obj is in_arr - - my_func(in_arr) + my_func(in_arr) From 8ce0aa67314f1a75952165f595e4b845cb760c44 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 16 Nov 2024 05:06:09 +0000 Subject: [PATCH 03/17] add tests for creating views directly --- cuda_core/tests/test_utils.py | 59 +++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 9646cafd6..6700a9c58 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -24,11 +24,22 @@ def convert_strides_to_counts(strides, itemsize): np.empty((3, 4), order='F'), ) ) -def test_viewable_cpu(in_arr): +class TestViewCPU: - @viewable((0,)) - def my_func(arr): - view = arr.view(-1) + def test_viewable_cpu(self, in_arr): + + @viewable((0,)) + def my_func(arr): + view = arr.view(-1) + self._check_view(view, in_arr) + + my_func(in_arr) + + def test_strided_memory_view_cpu(self, in_arr): + view = StridedMemoryView(in_arr, stream_ptr=-1) + self._check_view(view, in_arr) + + def _check_view(self, view, in_arr): assert isinstance(view, StridedMemoryView) assert view.ptr == in_arr.ctypes.data assert view.shape == in_arr.shape @@ -43,8 +54,6 @@ def my_func(arr): assert view.device_accessible == False assert view.exporting_obj is in_arr - my_func(in_arr) - def gpu_array_samples(): # TODO: this function would initialize the device at test collection time @@ -78,15 +87,33 @@ def gpu_array_ptr(arr): *gpu_array_samples(), ) ) -def test_viewable_gpu(in_arr, stream): - # TODO: use the device fixture? - dev = Device() - dev.set_current() - s = dev.create_stream() if stream else None - - @viewable((0,)) - def my_func(arr): - view = arr.view(s.handle if s else -1) +class TestViewGPU: + + def test_viewable_gpu(self, in_arr, stream): + # TODO: use the device fixture? + dev = Device() + dev.set_current() + s = dev.create_stream() if stream else None + + @viewable((0,)) + def my_func(arr): + view = arr.view(s.handle if s else -1) + self._check_view(view, in_arr, dev) + + my_func(in_arr) + + def test_strided_memory_view_cpu(self, in_arr, stream): + # TODO: use the device fixture? 
+        dev = Device()
+        dev.set_current()
+        s = dev.create_stream() if stream else None
+
+        view = StridedMemoryView(
+            in_arr,
+            stream_ptr=s.handle if s else -1)
+        self._check_view(view, in_arr, dev)
+
+    def _check_view(self, view, in_arr, dev):
         assert isinstance(view, StridedMemoryView)
         assert view.ptr == gpu_array_ptr(in_arr)
         assert view.shape == in_arr.shape
         strides_in_counts = convert_strides_to_counts(
             in_arr.strides, in_arr.dtype.itemsize)
         if in_arr.flags["C_CONTIGUOUS"]:
             assert view.strides in (None, strides_in_counts)
         else:
             assert view.strides == strides_in_counts
         assert view.dtype == in_arr.dtype
         assert view.device_id == dev.device_id
         assert view.device_accessible == True
         assert view.exporting_obj is in_arr

From b55b8baae41c9b7a1a10d1b54f788f08c9e74bb8 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Sat, 16 Nov 2024 06:29:32 +0000
Subject: [PATCH 04/17] add docs

---
 .../cuda/core/experimental/_memoryview.pyx    | 81 ++++++++++++++++++-
 cuda_core/docs/source/api.rst                 | 15 ++++
 cuda_core/docs/source/conf.py                 |  9 +++
 cuda_core/tests/test_utils.py                 |  8 ++
 4 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx
index b214b0e98..4b12463e4 100644
--- a/cuda_core/cuda/core/experimental/_memoryview.pyx
+++ b/cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -20,7 +20,59 @@ from cuda.core.experimental._utils import handle_return
 
 @cython.dataclasses.dataclass
 cdef class StridedMemoryView:
-
+    """A dataclass holding metadata of a strided dense array/tensor.
+
+    A :obj:`StridedMemoryView` instance can be created in two ways:
+
+    1. Using the :obj:`viewable` decorator (recommended)
+    2. Explicit construction, see below
+
+    This object supports both DLPack (up to v1.0) and CUDA Array Interface
+    (CAI) v3. When wrapping an arbitrary object it will try the DLPack protocol
+    first, then the CAI protocol. A :obj:`BufferError` is raised if neither is
+    supported.
+
+    Both creation paths take a consumer stream: for DLPack it is passed to
+    ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI,
+    stream ordering is established between the consumer stream and the
+    producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if
+    ``cudaStreamWaitEvent`` were called by this method.
+
+    To opt out of the stream ordering operation in either DLPack or CAI,
+    please pass ``stream_ptr=-1``. Note that this deviates (on purpose)
+    from the semantics of ``obj.__dlpack__(stream=None, ...)`` since ``cuda.core``
+    does not encourage using the (legacy) default/null stream, but is
+    consistent with the CAI's semantics. For DLPack, ``stream=-1`` will be
+    internally passed to ``obj.__dlpack__()`` instead.
+
+    Attributes
+    ----------
+    ptr : int
+        Pointer to the tensor buffer (as a Python `int`).
+    shape: tuple
+        Shape of the tensor.
+    strides: tuple
+        Strides of the tensor (in **counts**, not bytes).
+    dtype: numpy.dtype
+        Data type of the tensor.
+    device_id: int
+        The device ID for where the tensor is located. It is 0 for CPU tensors.
+    device_accessible: bool
+        Whether the tensor data can be accessed on the GPU.
+    readonly: bool
+        Whether the tensor data can be modified in place.
+    exporting_obj: Any
+        A reference to the original tensor object that is being viewed.
+
+    Parameters
+    ----------
+    obj : Any
+        Any object that supports either DLPack (up to v1.0) or CUDA Array
+        Interface (v3).
+    stream_ptr: int
+        The pointer address (as Python `int`) to the **consumer** stream.
+        Stream ordering will be properly established unless ``-1`` is passed.
+    """
     # TODO: switch to use Cython's cdef typing?
ptr: int = None shape: tuple = None @@ -285,6 +337,33 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): def viewable(tuple arg_indices): + """Decorator to create proxy objects to :obj:`StridedMemoryView` for the + specified positional arguments. + + Inside the decorated function, the specified arguments becomes instances + of an (undocumented) proxy type, regardless of its original source. A + :obj:`StridedMemoryView` instance can be obtained by passing the (consumer) + stream pointer (as a Python `int`) to the proxies's ``view()`` method. For + example: + + .. code-block:: python + + @viewable((1,)) + def my_func(arg0, arg1, arg2, stream: Stream): + # arg1 can be any object supporting DLPack or CUDA Array Interface + view = arg1.view(stream.handle) + assert isinstance(view, StridedMemoryView) + ... + + This allows array/tensor attributes to be accessed inside the function + implementation, while keeping the function body array-library-agnostic (if + desired). + + Parameters + ---------- + arg_indices : tuple + The indices of the target positional arguments. + """ def wrapped_func_with_indices(func): @functools.wraps(func) def wrapped_func(*args, **kwargs): diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 1cb9811b4..84720536a 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -31,3 +31,18 @@ CUDA compilation toolchain :toctree: generated/ Program + + +.. module:: cuda.core.experimental.utils + +Utility functions +----------------- + +.. autosummary:: + :toctree: generated/ + + viewable + + :template: dataclass.rst + + StridedMemoryView diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index 5b28d331f..5ea1d19fb 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -33,6 +33,7 @@ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.napoleon', + 'sphinx.ext.intersphinx', 'myst_nb', 'enum_tools.autoenum', 'sphinx_copybutton', @@ -81,3 +82,11 @@ # skip cmdline prompts copybutton_exclude = '.linenos, .gp' + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3/', None), + 'numpy': ('https://numpy.org/doc/stable/', None), +} + +napoleon_google_docstring = False +napoleon_numpy_docstring = True diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 6700a9c58..e33aff18a 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + try: import cupy as cp except ImportError: @@ -30,12 +34,14 @@ def test_viewable_cpu(self, in_arr): @viewable((0,)) def my_func(arr): + # stream_ptr=-1 means "the consumer does not care" view = arr.view(-1) self._check_view(view, in_arr) my_func(in_arr) def test_strided_memory_view_cpu(self, in_arr): + # stream_ptr=-1 means "the consumer does not care" view = StridedMemoryView(in_arr, stream_ptr=-1) self._check_view(view, in_arr) @@ -93,6 +99,7 @@ def test_viewable_gpu(self, in_arr, stream): # TODO: use the device fixture? dev = Device() dev.set_current() + # This is the consumer stream s = dev.create_stream() if stream else None @viewable((0,)) @@ -106,6 +113,7 @@ def test_strided_memory_view_cpu(self, in_arr, stream): # TODO: use the device fixture? 
dev = Device() dev.set_current() + # This is the consumer stream s = dev.create_stream() if stream else None view = StridedMemoryView( From f1239a278377dc0c9cea4f07de78392812ac0106 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 30 Nov 2024 02:45:15 +0000 Subject: [PATCH 05/17] fix formatting --- cuda_core/cuda/core/experimental/__init__.py | 2 +- cuda_core/tests/test_utils.py | 28 +++++++------------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index a62f7db20..a45f4d770 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +from cuda.core.experimental import utils from cuda.core.experimental._device import Device from cuda.core.experimental._event import EventOptions from cuda.core.experimental._launcher import LaunchConfig, launch from cuda.core.experimental._program import Program from cuda.core.experimental._stream import Stream, StreamOptions -from cuda.core.experimental import utils diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index e33aff18a..c171742a2 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -22,16 +22,15 @@ def convert_strides_to_counts(strides, itemsize): @pytest.mark.parametrize( - "in_arr,", ( + "in_arr,", + ( np.empty(3, dtype=np.int32), np.empty((6, 6), dtype=np.float64)[::2, ::2], - np.empty((3, 4), order='F'), - ) + np.empty((3, 4), order="F"), + ), ) class TestViewCPU: - def test_viewable_cpu(self, in_arr): - @viewable((0,)) def my_func(arr): # stream_ptr=-1 means "the consumer does not care" @@ -49,8 +48,7 @@ def _check_view(self, view, in_arr): assert isinstance(view, StridedMemoryView) assert view.ptr == in_arr.ctypes.data assert view.shape == in_arr.shape - strides_in_counts = convert_strides_to_counts( - in_arr.strides, in_arr.dtype.itemsize) + strides_in_counts = convert_strides_to_counts(in_arr.strides, in_arr.dtype.itemsize) if in_arr.flags.c_contiguous: assert view.strides is None else: @@ -68,7 +66,7 @@ def gpu_array_samples(): samples += [ (cp.empty(3, dtype=cp.complex64), None), (cp.empty((6, 6), dtype=cp.float64)[::2, ::2], True), - (cp.empty((3, 4), order='F'), True), + (cp.empty((3, 4), order="F"), True), ] # Numba's device_array is the only known array container that does not # support DLPack (so that we get to test the CAI coverage). @@ -88,13 +86,8 @@ def gpu_array_ptr(arr): assert False, f"{arr=}" -@pytest.mark.parametrize( - "in_arr,stream", ( - *gpu_array_samples(), - ) -) +@pytest.mark.parametrize("in_arr,stream", (*gpu_array_samples(),)) class TestViewGPU: - def test_viewable_gpu(self, in_arr, stream): # TODO: use the device fixture? 
dev = Device() @@ -116,17 +109,14 @@ def test_strided_memory_view_cpu(self, in_arr, stream): # This is the consumer stream s = dev.create_stream() if stream else None - view = StridedMemoryView( - in_arr, - stream_ptr=s.handle if s else -1) + view = StridedMemoryView(in_arr, stream_ptr=s.handle if s else -1) self._check_view(view, in_arr, dev) def _check_view(self, view, in_arr, dev): assert isinstance(view, StridedMemoryView) assert view.ptr == gpu_array_ptr(in_arr) assert view.shape == in_arr.shape - strides_in_counts = convert_strides_to_counts( - in_arr.strides, in_arr.dtype.itemsize) + strides_in_counts = convert_strides_to_counts(in_arr.strides, in_arr.dtype.itemsize) if in_arr.flags["C_CONTIGUOUS"]: assert view.strides in (None, strides_in_counts) else: From bcf3add300fd3209713e418c111ff686fe4111f1 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 30 Nov 2024 03:06:19 +0000 Subject: [PATCH 06/17] address comments on the docstring --- cuda_core/cuda/core/experimental/_memoryview.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 4b12463e4..d820c46d6 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -340,7 +340,11 @@ def viewable(tuple arg_indices): """Decorator to create proxy objects to :obj:`StridedMemoryView` for the specified positional arguments. - Inside the decorated function, the specified arguments becomes instances + This allows array/tensor attributes to be accessed inside the function + implementation, while keeping the function body array-library-agnostic (if + desired). + + Inside the decorated function, the specified arguments become instances of an (undocumented) proxy type, regardless of its original source. A :obj:`StridedMemoryView` instance can be obtained by passing the (consumer) stream pointer (as a Python `int`) to the proxies's ``view()`` method. For @@ -355,10 +359,6 @@ def viewable(tuple arg_indices): assert isinstance(view, StridedMemoryView) ... - This allows array/tensor attributes to be accessed inside the function - implementation, while keeping the function body array-library-agnostic (if - desired). 
- Parameters ---------- arg_indices : tuple From b11e1ae8a3f8d49e12ffdb77a5eaa66db029bde1 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 30 Nov 2024 03:39:53 +0000 Subject: [PATCH 07/17] fix import accidentally removed by ruff --- cuda_core/cuda/core/experimental/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/experimental/utils.py index 0717b41aa..a305d03bf 100644 --- a/cuda_core/cuda/core/experimental/utils.py +++ b/cuda_core/cuda/core/experimental/utils.py @@ -2,3 +2,4 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +from cuda.core.experimental._memoryview import StridedMemoryView, viewable # noqa: F401 From ede50769f0c2e2c4bc44ccd2f82a2849facc3127 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 30 Nov 2024 03:40:21 +0000 Subject: [PATCH 08/17] fix device_id convention for CPU --- .../cuda/core/experimental/_memoryview.pyx | 18 ++++++++---------- cuda_core/tests/test_utils.py | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index d820c46d6..ccb87c906 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -56,7 +56,8 @@ cdef class StridedMemoryView: dtype: numpy.dtype Data type of the tensor. device_id: int - The device ID for where the tensor is located. It is 0 for CPU tensors. + The device ID for where the tensor is located. It is -1 for CPU tensors + (meaning those only accessible from the host). device_accessible: bool Whether the tensor data can be accessed on the GPU. readonly: bool @@ -78,7 +79,7 @@ cdef class StridedMemoryView: shape: tuple = None strides: tuple = None # in counts, not bytes dtype: numpy.dtype = None - device_id: int = None # 0 for CPU + device_id: int = None # -1 for CPU device_accessible: bool = None readonly: bool = None exporting_obj: Any = None @@ -152,27 +153,24 @@ cdef class _StridedMemoryViewProxy: cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): cdef int dldevice, device_id, i cdef bint device_accessible, versioned, is_readonly + device_accessible = False dldevice, device_id = obj.__dlpack_device__() if dldevice == _kDLCPU: - device_accessible = False assert device_id == 0 + device_id = -1 if stream_ptr is None: raise BufferError("stream=None is ambiguous with view()") elif stream_ptr == -1: stream_ptr = None elif dldevice == _kDLCUDA: + assert device_id >= 0 device_accessible = True # no need to check other stream values, it's a pass-through if stream_ptr is None: raise BufferError("stream=None is ambiguous with view()") - elif dldevice == _kDLCUDAHost: + elif dldevice in (_kDLCUDAHost, _kDLCUDAManaged): device_accessible = True - assert device_id == 0 - # just do a pass-through without any checks, as pinned memory can be - # accessed on both host and device - elif dldevice == _kDLCUDAManaged: - device_accessible = True - # just do a pass-through without any checks, as managed memory can be + # just do a pass-through without any checks, as pinned/managed memory can be # accessed on both host and device else: raise BufferError("device not supported") diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index c171742a2..3df624855 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -54,7 +54,7 @@ def _check_view(self, view, in_arr): else: assert view.strides == strides_in_counts assert view.dtype == 
in_arr.dtype
-        assert view.device_id == 0
+        assert view.device_id == -1
         assert view.device_accessible == False
         assert view.exporting_obj is in_arr

From 8027c78f3c467f1017e3aa51683fb7ac0254dc50 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Mon, 2 Dec 2024 00:04:13 +0000
Subject: [PATCH 09/17] rename viewable to args_viewable_as_strided_memory

---
 cuda_core/cuda/core/experimental/_memoryview.pyx |  6 +++---
 cuda_core/cuda/core/experimental/utils.py        |  3 ++-
 cuda_core/docs/source/api.rst                    |  2 +-
 cuda_core/docs/source/release.md                 |  1 +
 cuda_core/docs/source/release/0.1.1-notes.md     | 13 +++++++++++++
 cuda_core/docs/versions.json                     |  1 +
 cuda_core/tests/test_utils.py                    | 10 +++++-----
 7 files changed, 26 insertions(+), 10 deletions(-)
 create mode 100644 cuda_core/docs/source/release/0.1.1-notes.md

diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx
index ccb87c906..246644fa2 100644
--- a/cuda_core/cuda/core/experimental/_memoryview.pyx
+++ b/cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -24,7 +24,7 @@ cdef class StridedMemoryView:
 
     A :obj:`StridedMemoryView` instance can be created in two ways:
 
-    1. Using the :obj:`viewable` decorator (recommended)
+    1. Using the :obj:`args_viewable_as_strided_memory` decorator (recommended)
     2. Explicit construction, see below
 
     This object supports both DLPack (up to v1.0) and CUDA Array Interface
@@ -334,7 +334,7 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
     return buf
 
 
-def viewable(tuple arg_indices):
+def args_viewable_as_strided_memory(tuple arg_indices):
     """Decorator to create proxy objects to :obj:`StridedMemoryView` for the
     specified positional arguments.
 
@@ -350,7 +350,7 @@ def viewable(tuple arg_indices):
 
     .. code-block:: python
 
-        @viewable((1,))
+        @args_viewable_as_strided_memory((1,))
         def my_func(arg0, arg1, arg2, stream: Stream):
             # arg1 can be any object supporting DLPack or CUDA Array Interface
             view = arg1.view(stream.handle)
diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/experimental/utils.py
index a305d03bf..7b86160a4 100644
--- a/cuda_core/cuda/core/experimental/utils.py
+++ b/cuda_core/cuda/core/experimental/utils.py
@@ -2,4 +2,5 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-from cuda.core.experimental._memoryview import StridedMemoryView, viewable  # noqa: F401
+from cuda.core.experimental._memoryview import args_viewable_as_strided_memory  # noqa: F401
+from cuda.core.experimental._memoryview import StridedMemoryView  # noqa: F401
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 84720536a..558c3ec8c 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -41,7 +41,7 @@ Utility functions
 .. autosummary::
    :toctree: generated/
 
-   viewable
+   args_viewable_as_strided_memory
 
    :template: dataclass.rst
 
    StridedMemoryView
diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md
index 48e247863..55090b0b3 100644
--- a/cuda_core/docs/source/release.md
+++ b/cuda_core/docs/source/release.md
@@ -5,5 +5,6 @@ maxdepth: 3
 ---
 
+   0.1.1
    0.1.0
 ```
diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md
new file mode 100644
index 000000000..473352a47
--- /dev/null
+++ b/cuda_core/docs/source/release/0.1.1-notes.md
@@ -0,0 +1,13 @@
+# `cuda.core` Release notes
+
+Released on Dec XX, 2024
+
+## Highlights
+- Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete
+  implementation of DLPack & CUDA Array Interface support.
+
+
+## Limitations
+
+- All APIs are currently *experimental* and subject to change without deprecation notice.
+  Please share your feedback with us so that we can make `cuda.core` better!
diff --git a/cuda_core/docs/versions.json b/cuda_core/docs/versions.json
index 4163fd316..41664534b 100644
--- a/cuda_core/docs/versions.json
+++ b/cuda_core/docs/versions.json
@@ -1,4 +1,5 @@
 {
     "latest" : "latest",
+    "0.1.1" : "0.1.1",
     "0.1.0" : "0.1.0"
 }
diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
index 3df624855..6438c5628 100644
--- a/cuda_core/tests/test_utils.py
+++ b/cuda_core/tests/test_utils.py
@@ -14,7 +14,7 @@
 import pytest
 
 from cuda.core.experimental import Device
-from cuda.core.experimental.utils import StridedMemoryView, viewable
+from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory
 
 
 def convert_strides_to_counts(strides, itemsize):
@@ -30,8 +30,8 @@ def convert_strides_to_counts(strides, itemsize):
     ),
 )
 class TestViewCPU:
-    def test_viewable_cpu(self, in_arr):
-        @viewable((0,))
+    def test_args_viewable_as_strided_memory_cpu(self, in_arr):
+        @args_viewable_as_strided_memory((0,))
         def my_func(arr):
             # stream_ptr=-1 means "the consumer does not care"
             view = arr.view(-1)
@@ -88,14 +88,14 @@ def gpu_array_ptr(arr):
 
 
 @pytest.mark.parametrize("in_arr,stream", (*gpu_array_samples(),))
 class TestViewGPU:
-    def test_viewable_gpu(self, in_arr, stream):
+    def test_args_viewable_as_strided_memory_gpu(self, in_arr, stream):
         # TODO: use the device fixture?
         dev = Device()
         dev.set_current()
         # This is the consumer stream
         s = dev.create_stream() if stream else None
 
-        @viewable((0,))
+        @args_viewable_as_strided_memory((0,))
         def my_func(arr):
             view = arr.view(s.handle if s else -1)
             self._check_view(view, in_arr, dev)

From 66377d8796c0249a912522e0971a306a6fdb2dc0 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Mon, 2 Dec 2024 02:15:16 +0000
Subject: [PATCH 10/17] rename device_accessible to is_device_accessible for consistency

---
 .../cuda/core/experimental/_memoryview.pyx    | 18 +++++++++---------
 cuda_core/tests/test_utils.py                 |  4 ++--
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx
index 246644fa2..d8eba464a 100644
--- a/cuda_core/cuda/core/experimental/_memoryview.pyx
+++ b/cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -58,7 +58,7 @@ cdef class StridedMemoryView:
     device_id: int
         The device ID for where the tensor is located. It is -1 for CPU tensors
         (meaning those only accessible from the host).
-    device_accessible: bool
+    is_device_accessible: bool
         Whether the tensor data can be accessed on the GPU.
readonly: bool Whether the tensor data can be modified in place. @@ -80,7 +80,7 @@ cdef class StridedMemoryView: strides: tuple = None # in counts, not bytes dtype: numpy.dtype = None device_id: int = None # -1 for CPU - device_accessible: bool = None + is_device_accessible: bool = None readonly: bool = None exporting_obj: Any = None @@ -101,7 +101,7 @@ cdef class StridedMemoryView: + f" strides={self.strides},\n" + f" dtype={get_simple_repr(self.dtype)},\n" + f" device_id={self.device_id},\n" - + f" device_accessible={self.device_accessible},\n" + + f" is_device_accessible={self.is_device_accessible},\n" + f" readonly={self.readonly},\n" + f" exporting_obj={get_simple_repr(self.exporting_obj)})") @@ -152,8 +152,8 @@ cdef class _StridedMemoryViewProxy: cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): cdef int dldevice, device_id, i - cdef bint device_accessible, versioned, is_readonly - device_accessible = False + cdef bint is_device_accessible, versioned, is_readonly + is_device_accessible = False dldevice, device_id = obj.__dlpack_device__() if dldevice == _kDLCPU: assert device_id == 0 @@ -164,12 +164,12 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): stream_ptr = None elif dldevice == _kDLCUDA: assert device_id >= 0 - device_accessible = True + is_device_accessible = True # no need to check other stream values, it's a pass-through if stream_ptr is None: raise BufferError("stream=None is ambiguous with view()") elif dldevice in (_kDLCUDAHost, _kDLCUDAManaged): - device_accessible = True + is_device_accessible = True # just do a pass-through without any checks, as pinned/managed memory can be # accessed on both host and device else: @@ -221,7 +221,7 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): buf.strides = None buf.dtype = dtype_dlpack_to_numpy(&dl_tensor.dtype) buf.device_id = device_id - buf.device_accessible = device_accessible + buf.is_device_accessible = is_device_accessible buf.readonly = is_readonly buf.exporting_obj = obj @@ -311,7 +311,7 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): if buf.strides is not None: # convert to counts buf.strides = tuple(s // buf.dtype.itemsize for s in buf.strides) - buf.device_accessible = True + buf.is_device_accessible = True buf.device_id = handle_return( cuda.cuPointerGetAttribute( cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 6438c5628..83d890f23 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -55,7 +55,7 @@ def _check_view(self, view, in_arr): assert view.strides == strides_in_counts assert view.dtype == in_arr.dtype assert view.device_id == -1 - assert view.device_accessible == False + assert view.is_device_accessible == False assert view.exporting_obj is in_arr @@ -123,5 +123,5 @@ def _check_view(self, view, in_arr, dev): assert view.strides == strides_in_counts assert view.dtype == in_arr.dtype assert view.device_id == dev.device_id - assert view.device_accessible == True + assert view.is_device_accessible == True assert view.exporting_obj is in_arr From b5cfdce74db45324da17f404173024ed1ce56751 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Dec 2024 02:42:34 +0000 Subject: [PATCH 11/17] enforce line ending in the whole codebase --- .gitattributes | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitattributes b/.gitattributes index 8c8fc4274..00407cdc0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,7 
@@ cuda/_version.py export-subst + +* text eol=lf + +# we do not own any headers checked in, don't touch them +*.h binary +*.hpp binary From 9572f8a0e8a84f3e0d5cf813fe14603b65929469 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Dec 2024 02:42:51 +0000 Subject: [PATCH 12/17] add a code sample for strided memory view --- cuda_core/examples/strided_memory_view.py | 162 ++++++++++++++++++++++ cuda_core/tests/conftest.py | 92 +++++++----- 2 files changed, 218 insertions(+), 36 deletions(-) create mode 100644 cuda_core/examples/strided_memory_view.py diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py new file mode 100644 index 000000000..4fde4a62f --- /dev/null +++ b/cuda_core/examples/strided_memory_view.py @@ -0,0 +1,162 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +# ################################################################################ +# +# This demo aims to illustrate two takeaways: +# +# 1. The similarity between CPU and GPU JIT-compilation for C++ sources +# 2. How to use StridedMemoryView to interface with foreign CPU/GPU functions +# at low-level +# +# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU +# path, which can be easily installed from pip or conda following their instruction. +# We also use NumPy/CuPy as the CPU/GPU array container. +# +# ################################################################################ + +import string +import sys + +try: + from cffi import FFI +except ImportError: + print("cffi is not installed, the CPU example would be skipped", file=sys.stderr) + cffi = None +try: + import cupy as cp +except ImportError: + print("cupy is not installed, the GPU example would be skipped", file=sys.stderr) + cp = None +import numpy as np + +from cuda.core.experimental import Device, Program +from cuda.core.experimental import launch, LaunchConfig +from cuda.core.experimental.utils import args_viewable_as_strided_memory +from cuda.core.experimental.utils import StridedMemoryView + + +# ################################################################################ +# +# Usually this entire code block is in a separate file, built as a Python extension +# module that can be imported by users at run time. For illustrative purposes we +# use JIT compilation to make this demo self-contained. +# +# Here we assume an in-place operation, equivalent to the following NumPy code: +# +# >>> arr = ... +# >>> assert arr.dtype == np.int32 +# >>> assert arr.ndim == 1 +# >>> arr += np.arange(arr.size, dtype=arr.dtype) +# +# is implemented for both CPU and GPU at low-level, with the following C function +# signature: +func_name = "inplace_plus_arange_N" +func_sig = f"void {func_name}(int* data, size_t N)" + +# Here is a concrete (very naive!) implementation on CPU: +if FFI: + cpu_code = string.Template(r""" + extern "C" { + $func_sig { + for (size_t i = 0; i < N; i++) { + data[i] += i; + } + } + } + """).substitute(func_sig=func_sig) + cpu_prog = FFI() + cpu_prog.set_source("_cpu_obj", cpu_code, source_extension=".cpp") + cpu_prog.cdef(f"{func_sig};") + cpu_prog.compile() + # This is cffi's way of loading a CPU function. cffi builds an extension module + # that has the Python binding to the underlying C function. (For more details, + # please refer to cffi's documentation.) + from _cpu_obj.lib import inplace_plus_arange_N as cpu_func + +# Here is a concrete (again, very naive!) 
implementation on GPU: +if cp: + gpu_code = string.Template(r""" + extern "C" + __global__ $func_sig { + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t stride_size = gridDim.x * blockDim.x; + for (size_t i = tid; i < N; i += stride_size) { + data[i] += i; + } + } + """).substitute(func_sig=func_sig) + gpu_prog = Program(gpu_code, code_type="c++") + # To know the GPU's compute capability, we need to identify which GPU to use. + dev = Device(0) + arch = "".join(f"{i}" for i in dev.compute_capability) + mod = gpu_prog.compile( + target_type="cubin", + # TODO: update this after NVIDIA/cuda-python#237 is merged + options=(f"-arch=sm_{arch}", "-std=c++11")) + gpu_ker = mod.get_kernel(func_name) + +# Now we are prepared to run the code from the user's perspective! +# +# ################################################################################ + + +# Below, as a user we want to perform the said in-place operation on either CPU +# or GPU, by calling the corresponding function implemented "elsewhere" (done above). + +@args_viewable_as_strided_memory((0,)) +def my_func(arr, work_stream): + # create a memory view over arr, assumed to be a 1D array of int32 + view = arr.view(work_stream.handle if work_stream else -1) + assert isinstance(view, StridedMemoryView) + assert len(view.shape) == 1 + assert view.dtype == np.int32 + + size = view.shape[0] + if view.is_device_accessible: + block = 256 + grid = size // 256 + config = LaunchConfig(grid=grid, block=block, stream=work_stream) + launch(gpu_ker, config, view.ptr, np.uint64(size)) + # here we're being conservative and synchronize over our work stream, + # assuming we do not know the (producer/source) stream; if we know + # then we could just order the producer/consumer streams here, e.g. + # + # producer_stream.wait(work_stream) + # + # without an expansive synchronization. + work_stream.sync() + else: + cpu_func(cpu_prog.cast("int*", view.ptr), size) + + +# This takes the CPU path +if FFI: + # Create input array on CPU + arr_cpu = np.zeros(1024, dtype=np.int32) + print(f"before: {arr_cpu[:10]=}") + + # Run the workload + my_func(arr_cpu, None) + + # Check the result + print(f"after: {arr_cpu[:10]=}") + assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32)) + + +# This takes the GPU path +if cp: + dev.set_current() + s = dev.create_stream() + # Create input array on GPU + arr_gpu = cp.ones(1024, dtype=cp.int32) + print(f"before: {arr_gpu[:10]=}") + + # Run the workload + my_func(arr_gpu, s) + + # Check the result + print(f"after: {arr_gpu[:10]=}") + assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32)) + s.close() diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index bb99fb33c..e35f77355 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -1,36 +1,56 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
-try: - from cuda.bindings import driver -except ImportError: - from cuda import cuda as driver - -import pytest - -from cuda.core.experimental import Device, _device -from cuda.core.experimental._utils import handle_return - - -@pytest.fixture(scope="function") -def init_cuda(): - device = Device() - device.set_current() - yield - _device_unset_current() - - -def _device_unset_current(): - handle_return(driver.cuCtxPopCurrent()) - with _device._tls_lock: - del _device._tls.devices - - -@pytest.fixture(scope="function") -def deinit_cuda(): - yield - _device_unset_current() +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import glob +import os +import sys + +try: + from cuda.bindings import driver +except ImportError: + from cuda import cuda as driver + +import pytest + +from cuda.core.experimental import Device, _device +from cuda.core.experimental._utils import handle_return + + +@pytest.fixture(scope="function") +def init_cuda(): + device = Device() + device.set_current() + yield + _device_unset_current() + + +def _device_unset_current(): + handle_return(driver.cuCtxPopCurrent()) + with _device._tls_lock: + del _device._tls.devices + + +@pytest.fixture(scope="function") +def deinit_cuda(): + yield + _device_unset_current() + + +# samples relying on cffi could fail as the modules cannot be imported +sys.path.append(os.getcwd()) + + +@pytest.fixture(scope="session", autouse=True) +def clean_up_cffi_files(): + yield + files = glob.glob(os.path.join(os.getcwd(), "_cpu_obj*")) + for f in files: + try: + os.remove(f) + except FileNotFoundError: + pass From 069f057d5e16994409439ff8bca0907eab53d6a5 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Dec 2024 02:55:41 +0000 Subject: [PATCH 13/17] fix formatting again --- cuda_core/cuda/core/experimental/utils.py | 6 ++++-- cuda_core/examples/strided_memory_view.py | 13 ++++++------- cuda_core/tests/conftest.py | 4 ++-- cuda_core/tests/test_utils.py | 6 +++--- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/experimental/utils.py index 7b86160a4..cc9a437d7 100644 --- a/cuda_core/cuda/core/experimental/utils.py +++ b/cuda_core/cuda/core/experimental/utils.py @@ -2,5 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from cuda.core.experimental._memoryview import args_viewable_as_strided_memory # noqa: F401 -from cuda.core.experimental._memoryview import StridedMemoryView # noqa: F401 +from cuda.core.experimental._memoryview import ( + StridedMemoryView, # noqa: F401 + args_viewable_as_strided_memory, # noqa: F401 +) diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 4fde4a62f..0f77edabd 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -12,7 +12,7 @@ # # To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU # path, which can be easily installed from pip or conda following their instruction. -# We also use NumPy/CuPy as the CPU/GPU array container. +# We also use NumPy/CuPy as the CPU/GPU array container. 
# # ################################################################################ @@ -31,11 +31,8 @@ cp = None import numpy as np -from cuda.core.experimental import Device, Program -from cuda.core.experimental import launch, LaunchConfig -from cuda.core.experimental.utils import args_viewable_as_strided_memory -from cuda.core.experimental.utils import StridedMemoryView - +from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory # ################################################################################ # @@ -94,7 +91,8 @@ mod = gpu_prog.compile( target_type="cubin", # TODO: update this after NVIDIA/cuda-python#237 is merged - options=(f"-arch=sm_{arch}", "-std=c++11")) + options=(f"-arch=sm_{arch}", "-std=c++11"), + ) gpu_ker = mod.get_kernel(func_name) # Now we are prepared to run the code from the user's perspective! @@ -105,6 +103,7 @@ # Below, as a user we want to perform the said in-place operation on either CPU # or GPU, by calling the corresponding function implemented "elsewhere" (done above). + @args_viewable_as_strided_memory((0,)) def my_func(arr, work_stream): # create a memory view over arr, assumed to be a 1D array of int32 diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index e35f77355..59e5883f3 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -50,7 +50,7 @@ def clean_up_cffi_files(): yield files = glob.glob(os.path.join(os.getcwd(), "_cpu_obj*")) for f in files: - try: + try: # noqa: SIM105 os.remove(f) except FileNotFoundError: - pass + pass # noqa: SIM105 diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 83d890f23..ef37bcabb 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -55,7 +55,7 @@ def _check_view(self, view, in_arr): assert view.strides == strides_in_counts assert view.dtype == in_arr.dtype assert view.device_id == -1 - assert view.is_device_accessible == False + assert view.is_device_accessible is False assert view.exporting_obj is in_arr @@ -83,7 +83,7 @@ def gpu_array_ptr(arr): return arr.data.ptr if numba_cuda is not None and isinstance(arr, numba_cuda.cudadrv.devicearray.DeviceNDArray): return arr.device_ctypes_pointer.value - assert False, f"{arr=}" + raise NotImplementedError(f"{arr=}") @pytest.mark.parametrize("in_arr,stream", (*gpu_array_samples(),)) @@ -123,5 +123,5 @@ def _check_view(self, view, in_arr, dev): assert view.strides == strides_in_counts assert view.dtype == in_arr.dtype assert view.device_id == dev.device_id - assert view.is_device_accessible == True + assert view.is_device_accessible is True assert view.exporting_obj is in_arr From 5e393f82208a9a7e44430122cd0df865d903a0f8 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Dec 2024 15:34:35 +0000 Subject: [PATCH 14/17] address review comments --- cuda_core/examples/strided_memory_view.py | 24 +++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 0f77edabd..efcf26af3 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -6,12 +6,11 @@ # # This demo aims to illustrate two takeaways: # -# 1. The similarity between CPU and GPU JIT-compilation for C++ sources -# 2. How to use StridedMemoryView to interface with foreign CPU/GPU functions -# at low-level +# 1. 
The similarity between CPU and GPU JIT-compilation with C++ sources +# 2. How to use StridedMemoryView to interface with foreign C/C++ functions # # To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU -# path, which can be easily installed from pip or conda following their instruction. +# path, which can be easily installed from pip or conda following their instructions. # We also use NumPy/CuPy as the CPU/GPU array container. # # ################################################################################ @@ -22,12 +21,12 @@ try: from cffi import FFI except ImportError: - print("cffi is not installed, the CPU example would be skipped", file=sys.stderr) + print("cffi is not installed, the CPU example will be skipped", file=sys.stderr) cffi = None try: import cupy as cp except ImportError: - print("cupy is not installed, the GPU example would be skipped", file=sys.stderr) + print("cupy is not installed, the GPU example will be skipped", file=sys.stderr) cp = None import numpy as np @@ -55,11 +54,10 @@ # Here is a concrete (very naive!) implementation on CPU: if FFI: cpu_code = string.Template(r""" - extern "C" { - $func_sig { - for (size_t i = 0; i < N; i++) { - data[i] += i; - } + extern "C" + $func_sig { + for (size_t i = 0; i < N; i++) { + data[i] += i; } } """).substitute(func_sig=func_sig) @@ -115,7 +113,7 @@ def my_func(arr, work_stream): size = view.shape[0] if view.is_device_accessible: block = 256 - grid = size // 256 + grid = (size + block - 1) // block config = LaunchConfig(grid=grid, block=block, stream=work_stream) launch(gpu_ker, config, view.ptr, np.uint64(size)) # here we're being conservative and synchronize over our work stream, @@ -124,7 +122,7 @@ def my_func(arr, work_stream): # # producer_stream.wait(work_stream) # - # without an expansive synchronization. + # without an expensive synchronization. work_stream.sync() else: cpu_func(cpu_prog.cast("int*", view.ptr), size) From 8e209aac6ee1c13eeaaba10298b903745fa92eed Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Dec 2024 16:25:39 +0000 Subject: [PATCH 15/17] programmatically load cffi functions --- cuda_core/examples/strided_memory_view.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index efcf26af3..0d49d5aed 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -15,6 +15,7 @@ # # ################################################################################ +import importlib import string import sys @@ -61,14 +62,19 @@ } } """).substitute(func_sig=func_sig) + # This is cffi's way of JIT compiling & loading a CPU function. cffi builds an + # extension module that has the Python binding to the underlying C function. + # For more details, please refer to cffi's documentation. cpu_prog = FFI() - cpu_prog.set_source("_cpu_obj", cpu_code, source_extension=".cpp") cpu_prog.cdef(f"{func_sig};") + cpu_prog.set_source( + "_cpu_obj", + cpu_code, + source_extension=".cpp", + extra_compile_args=["-std=c++11"], + ) cpu_prog.compile() - # This is cffi's way of loading a CPU function. cffi builds an extension module - # that has the Python binding to the underlying C function. (For more details, - # please refer to cffi's documentation.) - from _cpu_obj.lib import inplace_plus_arange_N as cpu_func + cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name) # Here is a concrete (again, very naive!) 
implementation on GPU: if cp: From 6af4da317bc6e71ad0713af2d5b0ce80f849643b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Dec 2024 00:23:51 +0000 Subject: [PATCH 16/17] fix import fallback; address review comments --- cuda_core/examples/strided_memory_view.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 0d49d5aed..847d28b7b 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -23,7 +23,7 @@ from cffi import FFI except ImportError: print("cffi is not installed, the CPU example will be skipped", file=sys.stderr) - cffi = None + FFI = None try: import cupy as cp except ImportError: @@ -107,28 +107,33 @@ # Below, as a user we want to perform the said in-place operation on either CPU # or GPU, by calling the corresponding function implemented "elsewhere" (done above). - +# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both +# of which are supported by StridedMemoryView). @args_viewable_as_strided_memory((0,)) def my_func(arr, work_stream): - # create a memory view over arr, assumed to be a 1D array of int32 + # Create a memory view over arr (assumed to be a 1D array of int32). The stream + # ordering is taken care of, so that arr can be safely accessed on our work + # stream (ordered after a data stream on which arr is potentially prepared). view = arr.view(work_stream.handle if work_stream else -1) assert isinstance(view, StridedMemoryView) assert len(view.shape) == 1 assert view.dtype == np.int32 size = view.shape[0] + # DLPack also supports host arrays. We want to know if the array data is + # accessible from the GPU, and dispatch to the right routine accordingly. if view.is_device_accessible: block = 256 grid = (size + block - 1) // block config = LaunchConfig(grid=grid, block=block, stream=work_stream) launch(gpu_ker, config, view.ptr, np.uint64(size)) - # here we're being conservative and synchronize over our work stream, - # assuming we do not know the (producer/source) stream; if we know - # then we could just order the producer/consumer streams here, e.g. + # Here we're being conservative and synchronize over our work stream, + # assuming we do not know the data stream; if we know then we could + # just order the data stream after the work stream here, e.g. # - # producer_stream.wait(work_stream) + # data_stream.wait(work_stream) # - # without an expensive synchronization. + # without an expensive synchronization (with respect to the host). work_stream.sync() else: cpu_func(cpu_prog.cast("int*", view.ptr), size) From 16fc9f69037ac1ad86622d6daf58aa0fa8401edd Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 4 Dec 2024 02:33:45 +0000 Subject: [PATCH 17/17] test readonly with numpy; rename use_stream parameter --- cuda_core/examples/strided_memory_view.py | 1 + cuda_core/tests/test_utils.py | 24 ++++++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 847d28b7b..564d7fa01 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -107,6 +107,7 @@ # Below, as a user we want to perform the said in-place operation on either CPU # or GPU, by calling the corresponding function implemented "elsewhere" (done above). 
+ # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both # of which are supported by StridedMemoryView). @args_viewable_as_strided_memory((0,)) diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index ef37bcabb..0926a549d 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -27,6 +27,14 @@ def convert_strides_to_counts(strides, itemsize): np.empty(3, dtype=np.int32), np.empty((6, 6), dtype=np.float64)[::2, ::2], np.empty((3, 4), order="F"), + np.empty((), dtype=np.float16), + # readonly is fixed recently (numpy/numpy#26501) + pytest.param( + np.frombuffer(b""), + marks=pytest.mark.skipif( + tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+" + ), + ), ), ) class TestViewCPU: @@ -57,6 +65,7 @@ def _check_view(self, view, in_arr): assert view.device_id == -1 assert view.is_device_accessible is False assert view.exporting_obj is in_arr + assert view.readonly is not in_arr.flags.writeable def gpu_array_samples(): @@ -64,7 +73,7 @@ def gpu_array_samples(): samples = [] if cp is not None: samples += [ - (cp.empty(3, dtype=cp.complex64), None), + (cp.empty(3, dtype=cp.complex64), False), (cp.empty((6, 6), dtype=cp.float64)[::2, ::2], True), (cp.empty((3, 4), order="F"), True), ] @@ -72,7 +81,7 @@ def gpu_array_samples(): # support DLPack (so that we get to test the CAI coverage). if numba_cuda is not None: samples += [ - (numba_cuda.device_array((2,), dtype=np.int8), None), + (numba_cuda.device_array((2,), dtype=np.int8), False), (numba_cuda.device_array((4, 2), dtype=np.float32), True), ] return samples @@ -86,14 +95,14 @@ def gpu_array_ptr(arr): raise NotImplementedError(f"{arr=}") -@pytest.mark.parametrize("in_arr,stream", (*gpu_array_samples(),)) +@pytest.mark.parametrize("in_arr,use_stream", (*gpu_array_samples(),)) class TestViewGPU: - def test_args_viewable_as_strided_memory_gpu(self, in_arr, stream): + def test_args_viewable_as_strided_memory_gpu(self, in_arr, use_stream): # TODO: use the device fixture? dev = Device() dev.set_current() # This is the consumer stream - s = dev.create_stream() if stream else None + s = dev.create_stream() if use_stream else None @args_viewable_as_strided_memory((0,)) def my_func(arr): @@ -102,12 +111,12 @@ def my_func(arr): my_func(in_arr) - def test_strided_memory_view_cpu(self, in_arr, stream): + def test_strided_memory_view_cpu(self, in_arr, use_stream): # TODO: use the device fixture? dev = Device() dev.set_current() # This is the consumer stream - s = dev.create_stream() if stream else None + s = dev.create_stream() if use_stream else None view = StridedMemoryView(in_arr, stream_ptr=s.handle if s else -1) self._check_view(view, in_arr, dev) @@ -125,3 +134,4 @@ def _check_view(self, view, in_arr, dev): assert view.device_id == dev.device_id assert view.is_device_accessible is True assert view.exporting_obj is in_arr + # can't test view.readonly with CuPy or Numba...
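Taken together, the series leaves `cuda.core.experimental.utils` exposing two
public entry points: the `StridedMemoryView` dataclass and the
`@args_viewable_as_strided_memory` decorator. Below is a minimal usage sketch
consistent with the docstrings added in PATCH 04 and the renames in PATCH 09
and PATCH 10; the NumPy input and the function name `describe` are
illustrative placeholders rather than part of the patches, and any object
supporting DLPack or the CUDA Array Interface could be passed instead.

import numpy as np

from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory


@args_viewable_as_strided_memory((0,))
def describe(arr):
    # stream_ptr=-1 opts out of stream ordering ("the consumer does not care");
    # a GPU consumer would pass its stream handle (a Python int) instead.
    view = arr.view(-1)
    assert isinstance(view, StridedMemoryView)
    # The same normalized metadata is available whether the producer exported
    # DLPack or the CUDA Array Interface.
    print(view.shape, view.dtype, view.device_id, view.is_device_accessible)


describe(np.arange(6, dtype=np.int32).reshape(2, 3))

# Explicit construction of a view over the same array is equivalent:
view = StridedMemoryView(np.arange(6, dtype=np.int32).reshape(2, 3), stream_ptr=-1)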