From 15ebef3c4fe2702302664ce84a12ec863261a293 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 16 Nov 2024 04:19:02 +0000 Subject: [PATCH 01/17] add tests for viewable --- cuda_core/cuda/core/experimental/__init__.py | 1 + .../cuda/core/experimental/_memoryview.pyx | 2 +- cuda_core/tests/test_utils.py | 68 +++++++++++++++++++ 3 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 cuda_core/tests/test_utils.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 9b978398d..a62f7db20 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -7,3 +7,4 @@ from cuda.core.experimental._launcher import LaunchConfig, launch from cuda.core.experimental._program import Program from cuda.core.experimental._stream import Stream, StreamOptions +from cuda.core.experimental import utils diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index af6b3adff..b214b0e98 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -26,7 +26,7 @@ cdef class StridedMemoryView: shape: tuple = None strides: tuple = None # in counts, not bytes dtype: numpy.dtype = None - device_id: int = None # -1 for CPU + device_id: int = None # 0 for CPU device_accessible: bool = None readonly: bool = None exporting_obj: Any = None diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py new file mode 100644 index 000000000..b52df1f42 --- /dev/null +++ b/cuda_core/tests/test_utils.py @@ -0,0 +1,68 @@ +import numpy as np +try: + import cupy as cp +except ImportError: + cp = None +import pytest + +from cuda.core.experimental import Device +from cuda.core.experimental.utils import StridedMemoryView, viewable + + +@pytest.mark.parametrize( + "in_arr,", ( + np.empty(3, dtype=np.int32), + np.empty((6, 6), dtype=np.float64)[::2, ::2], + np.empty((3, 4), order='F'), + ) +) +def test_viewable_cpu(in_arr): + + @viewable((0,)) + def my_func(arr): + view = arr.view(-1) + assert view.ptr == in_arr.ctypes.data + assert view.shape == in_arr.shape + if in_arr.flags.c_contiguous: + assert view.strides is None + else: + assert view.strides == tuple(s // in_arr.dtype.itemsize for s in in_arr.strides) + assert view.dtype == in_arr.dtype + assert view.device_id == 0 + assert view.device_accessible == False + assert view.exporting_obj is in_arr + + my_func(in_arr) + + +if cp is not None: + + @pytest.mark.parametrize( + "in_arr,stream", ( + (cp.empty(3, dtype=cp.complex64), None), + (cp.empty((6, 6), dtype=cp.float64)[::2, ::2], True), + (cp.empty((3, 4), order='F'), True), + ) + ) + def test_viewable_gpu(in_arr, stream): + # TODO: use the device fixture? 
+ dev = Device() + dev.set_current() + s = dev.create_stream() if stream else None + + @viewable((0,)) + def my_func(arr): + view = arr.view(s.handle if s else -1) + assert view.ptr == in_arr.data.ptr + assert view.shape == in_arr.shape + strides_in_counts = tuple(s // in_arr.dtype.itemsize for s in in_arr.strides) + if in_arr.flags.c_contiguous: + assert view.strides in (None, strides_in_counts) + else: + assert view.strides == strides_in_counts + assert view.dtype == in_arr.dtype + assert view.device_id == dev.device_id + assert view.device_accessible == True + assert view.exporting_obj is in_arr + + my_func(in_arr) From 80b2556ce4fd045babad8614ef75d06721cdb605 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 16 Nov 2024 04:50:01 +0000 Subject: [PATCH 02/17] use numba to test CAI & make test a bit cleaner --- cuda_core/tests/test_utils.py | 92 ++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 28 deletions(-) diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index b52df1f42..9646cafd6 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -1,14 +1,22 @@ -import numpy as np try: import cupy as cp except ImportError: cp = None +try: + from numba import cuda as numba_cuda +except ImportError: + numba_cuda = None +import numpy as np import pytest from cuda.core.experimental import Device from cuda.core.experimental.utils import StridedMemoryView, viewable +def convert_strides_to_counts(strides, itemsize): + return tuple(s // itemsize for s in strides) + + @pytest.mark.parametrize( "in_arr,", ( np.empty(3, dtype=np.int32), @@ -21,12 +29,15 @@ def test_viewable_cpu(in_arr): @viewable((0,)) def my_func(arr): view = arr.view(-1) + assert isinstance(view, StridedMemoryView) assert view.ptr == in_arr.ctypes.data assert view.shape == in_arr.shape + strides_in_counts = convert_strides_to_counts( + in_arr.strides, in_arr.dtype.itemsize) if in_arr.flags.c_contiguous: assert view.strides is None else: - assert view.strides == tuple(s // in_arr.dtype.itemsize for s in in_arr.strides) + assert view.strides == strides_in_counts assert view.dtype == in_arr.dtype assert view.device_id == 0 assert view.device_accessible == False @@ -35,34 +46,59 @@ def my_func(arr): my_func(in_arr) -if cp is not None: - - @pytest.mark.parametrize( - "in_arr,stream", ( +def gpu_array_samples(): + # TODO: this function would initialize the device at test collection time + samples = [] + if cp is not None: + samples += [ (cp.empty(3, dtype=cp.complex64), None), (cp.empty((6, 6), dtype=cp.float64)[::2, ::2], True), (cp.empty((3, 4), order='F'), True), - ) + ] + # Numba's device_array is the only known array container that does not + # support DLPack (so that we get to test the CAI coverage). + if numba_cuda is not None: + samples += [ + (numba_cuda.device_array((2,), dtype=np.int8), None), + (numba_cuda.device_array((4, 2), dtype=np.float32), True), + ] + return samples + + +def gpu_array_ptr(arr): + if cp is not None and isinstance(arr, cp.ndarray): + return arr.data.ptr + if numba_cuda is not None and isinstance(arr, numba_cuda.cudadrv.devicearray.DeviceNDArray): + return arr.device_ctypes_pointer.value + assert False, f"{arr=}" + + +@pytest.mark.parametrize( + "in_arr,stream", ( + *gpu_array_samples(), ) - def test_viewable_gpu(in_arr, stream): - # TODO: use the device fixture? - dev = Device() - dev.set_current() - s = dev.create_stream() if stream else None +) +def test_viewable_gpu(in_arr, stream): + # TODO: use the device fixture? 
+ dev = Device() + dev.set_current() + s = dev.create_stream() if stream else None + + @viewable((0,)) + def my_func(arr): + view = arr.view(s.handle if s else -1) + assert isinstance(view, StridedMemoryView) + assert view.ptr == gpu_array_ptr(in_arr) + assert view.shape == in_arr.shape + strides_in_counts = convert_strides_to_counts( + in_arr.strides, in_arr.dtype.itemsize) + if in_arr.flags["C_CONTIGUOUS"]: + assert view.strides in (None, strides_in_counts) + else: + assert view.strides == strides_in_counts + assert view.dtype == in_arr.dtype + assert view.device_id == dev.device_id + assert view.device_accessible == True + assert view.exporting_obj is in_arr - @viewable((0,)) - def my_func(arr): - view = arr.view(s.handle if s else -1) - assert view.ptr == in_arr.data.ptr - assert view.shape == in_arr.shape - strides_in_counts = tuple(s // in_arr.dtype.itemsize for s in in_arr.strides) - if in_arr.flags.c_contiguous: - assert view.strides in (None, strides_in_counts) - else: - assert view.strides == strides_in_counts - assert view.dtype == in_arr.dtype - assert view.device_id == dev.device_id - assert view.device_accessible == True - assert view.exporting_obj is in_arr - - my_func(in_arr) + my_func(in_arr) From 8ce0aa67314f1a75952165f595e4b845cb760c44 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 16 Nov 2024 05:06:09 +0000 Subject: [PATCH 03/17] add tests for creating views directly --- cuda_core/tests/test_utils.py | 59 +++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 9646cafd6..6700a9c58 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -24,11 +24,22 @@ def convert_strides_to_counts(strides, itemsize): np.empty((3, 4), order='F'), ) ) -def test_viewable_cpu(in_arr): +class TestViewCPU: - @viewable((0,)) - def my_func(arr): - view = arr.view(-1) + def test_viewable_cpu(self, in_arr): + + @viewable((0,)) + def my_func(arr): + view = arr.view(-1) + self._check_view(view, in_arr) + + my_func(in_arr) + + def test_strided_memory_view_cpu(self, in_arr): + view = StridedMemoryView(in_arr, stream_ptr=-1) + self._check_view(view, in_arr) + + def _check_view(self, view, in_arr): assert isinstance(view, StridedMemoryView) assert view.ptr == in_arr.ctypes.data assert view.shape == in_arr.shape @@ -43,8 +54,6 @@ def my_func(arr): assert view.device_accessible == False assert view.exporting_obj is in_arr - my_func(in_arr) - def gpu_array_samples(): # TODO: this function would initialize the device at test collection time @@ -78,15 +87,33 @@ def gpu_array_ptr(arr): *gpu_array_samples(), ) ) -def test_viewable_gpu(in_arr, stream): - # TODO: use the device fixture? - dev = Device() - dev.set_current() - s = dev.create_stream() if stream else None - - @viewable((0,)) - def my_func(arr): - view = arr.view(s.handle if s else -1) +class TestViewGPU: + + def test_viewable_gpu(self, in_arr, stream): + # TODO: use the device fixture? + dev = Device() + dev.set_current() + s = dev.create_stream() if stream else None + + @viewable((0,)) + def my_func(arr): + view = arr.view(s.handle if s else -1) + self._check_view(view, in_arr, dev) + + my_func(in_arr) + + def test_strided_memory_view_cpu(self, in_arr, stream): + # TODO: use the device fixture? 
+        dev = Device()
+        dev.set_current()
+        s = dev.create_stream() if stream else None
+
+        view = StridedMemoryView(
+            in_arr,
+            stream_ptr=s.handle if s else -1)
+        self._check_view(view, in_arr, dev)
+
+    def _check_view(self, view, in_arr, dev):
         assert isinstance(view, StridedMemoryView)
         assert view.ptr == gpu_array_ptr(in_arr)
         assert view.shape == in_arr.shape
         strides_in_counts = convert_strides_to_counts(
             in_arr.strides, in_arr.dtype.itemsize)
         if in_arr.flags["C_CONTIGUOUS"]:
             assert view.strides in (None, strides_in_counts)
         else:
             assert view.strides == strides_in_counts
         assert view.dtype == in_arr.dtype
         assert view.device_id == dev.device_id
         assert view.device_accessible == True
         assert view.exporting_obj is in_arr

From b55b8baae41c9b7a1a10d1b54f788f08c9e74bb8 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Sat, 16 Nov 2024 06:29:32 +0000
Subject: [PATCH 04/17] add docs

---
 .../cuda/core/experimental/_memoryview.pyx    | 81 ++++++++++++++++++-
 cuda_core/docs/source/api.rst                 | 15 ++++
 cuda_core/docs/source/conf.py                 |  9 +++
 cuda_core/tests/test_utils.py                 |  8 ++
 4 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx
index b214b0e98..4b12463e4 100644
--- a/cuda_core/cuda/core/experimental/_memoryview.pyx
+++ b/cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -20,7 +20,59 @@ from cuda.core.experimental._utils import handle_return
 
 @cython.dataclasses.dataclass
 cdef class StridedMemoryView:
-
+    """A dataclass holding metadata of a strided dense array/tensor.
+
+    A :obj:`StridedMemoryView` instance can be created in two ways:
+
+    1. Using the :obj:`viewable` decorator (recommended)
+    2. Explicit construction, see below
+
+    This object supports both DLPack (up to v1.0) and CUDA Array Interface
+    (CAI) v3. When wrapping an arbitrary object it will try the DLPack protocol
+    first, then the CAI protocol. A :obj:`BufferError` is raised if neither is
+    supported.
+
+    Both creation paths take a consumer stream: for DLPack it is passed to
+    ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI,
+    stream ordering is established between the consumer stream and the
+    producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if
+    ``cudaStreamWaitEvent`` were called by this method.
+
+    To opt out of the stream ordering operation in either DLPack or CAI,
+    please pass ``stream_ptr=-1``. Note that this deviates (on purpose)
+    from the semantics of ``obj.__dlpack__(stream=None, ...)`` since ``cuda.core``
+    does not encourage using the (legacy) default/null stream, but is
+    consistent with the CAI's semantics. For DLPack, ``stream=-1`` will be
+    internally passed to ``obj.__dlpack__()`` instead.
+
+    Attributes
+    ----------
+    ptr : int
+        Pointer to the tensor buffer (as a Python `int`).
+    shape: tuple
+        Shape of the tensor.
+    strides: tuple
+        Strides of the tensor (in **counts**, not bytes).
+    dtype: numpy.dtype
+        Data type of the tensor.
+    device_id: int
+        The device ID for where the tensor is located. It is 0 for CPU tensors.
+    device_accessible: bool
+        Whether the tensor data can be accessed on the GPU.
+    readonly: bool
+        Whether the tensor data can be modified in place.
+    exporting_obj: Any
+        A reference to the original tensor object that is being viewed.
+
+    Parameters
+    ----------
+    obj : Any
+        Any object that supports either DLPack (up to v1.0) or CUDA Array
+        Interface (v3).
+    stream_ptr: int
+        The pointer address (as Python `int`) to the **consumer** stream.
+        Stream ordering will be properly established unless ``-1`` is passed.
+    """
     # TODO: switch to use Cython's cdef typing?
ptr: int = None shape: tuple = None @@ -285,6 +337,33 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): def viewable(tuple arg_indices): + """Decorator to create proxy objects to :obj:`StridedMemoryView` for the + specified positional arguments. + + Inside the decorated function, the specified arguments becomes instances + of an (undocumented) proxy type, regardless of its original source. A + :obj:`StridedMemoryView` instance can be obtained by passing the (consumer) + stream pointer (as a Python `int`) to the proxies's ``view()`` method. For + example: + + .. code-block:: python + + @viewable((1,)) + def my_func(arg0, arg1, arg2, stream: Stream): + # arg1 can be any object supporting DLPack or CUDA Array Interface + view = arg1.view(stream.handle) + assert isinstance(view, StridedMemoryView) + ... + + This allows array/tensor attributes to be accessed inside the function + implementation, while keeping the function body array-library-agnostic (if + desired). + + Parameters + ---------- + arg_indices : tuple + The indices of the target positional arguments. + """ def wrapped_func_with_indices(func): @functools.wraps(func) def wrapped_func(*args, **kwargs): diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 1cb9811b4..84720536a 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -31,3 +31,18 @@ CUDA compilation toolchain :toctree: generated/ Program + + +.. module:: cuda.core.experimental.utils + +Utility functions +----------------- + +.. autosummary:: + :toctree: generated/ + + viewable + + :template: dataclass.rst + + StridedMemoryView diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index 5b28d331f..5ea1d19fb 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -33,6 +33,7 @@ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.napoleon', + 'sphinx.ext.intersphinx', 'myst_nb', 'enum_tools.autoenum', 'sphinx_copybutton', @@ -81,3 +82,11 @@ # skip cmdline prompts copybutton_exclude = '.linenos, .gp' + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3/', None), + 'numpy': ('https://numpy.org/doc/stable/', None), +} + +napoleon_google_docstring = False +napoleon_numpy_docstring = True diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 6700a9c58..e33aff18a 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + try: import cupy as cp except ImportError: @@ -30,12 +34,14 @@ def test_viewable_cpu(self, in_arr): @viewable((0,)) def my_func(arr): + # stream_ptr=-1 means "the consumer does not care" view = arr.view(-1) self._check_view(view, in_arr) my_func(in_arr) def test_strided_memory_view_cpu(self, in_arr): + # stream_ptr=-1 means "the consumer does not care" view = StridedMemoryView(in_arr, stream_ptr=-1) self._check_view(view, in_arr) @@ -93,6 +99,7 @@ def test_viewable_gpu(self, in_arr, stream): # TODO: use the device fixture? dev = Device() dev.set_current() + # This is the consumer stream s = dev.create_stream() if stream else None @viewable((0,)) @@ -106,6 +113,7 @@ def test_strided_memory_view_cpu(self, in_arr, stream): # TODO: use the device fixture? 
dev = Device() dev.set_current() + # This is the consumer stream s = dev.create_stream() if stream else None view = StridedMemoryView( From f1239a278377dc0c9cea4f07de78392812ac0106 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 30 Nov 2024 02:45:15 +0000 Subject: [PATCH 05/17] fix formatting --- cuda_core/cuda/core/experimental/__init__.py | 2 +- cuda_core/tests/test_utils.py | 28 +++++++------------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index a62f7db20..a45f4d770 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +from cuda.core.experimental import utils from cuda.core.experimental._device import Device from cuda.core.experimental._event import EventOptions from cuda.core.experimental._launcher import LaunchConfig, launch from cuda.core.experimental._program import Program from cuda.core.experimental._stream import Stream, StreamOptions -from cuda.core.experimental import utils diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index e33aff18a..c171742a2 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -22,16 +22,15 @@ def convert_strides_to_counts(strides, itemsize): @pytest.mark.parametrize( - "in_arr,", ( + "in_arr,", + ( np.empty(3, dtype=np.int32), np.empty((6, 6), dtype=np.float64)[::2, ::2], - np.empty((3, 4), order='F'), - ) + np.empty((3, 4), order="F"), + ), ) class TestViewCPU: - def test_viewable_cpu(self, in_arr): - @viewable((0,)) def my_func(arr): # stream_ptr=-1 means "the consumer does not care" @@ -49,8 +48,7 @@ def _check_view(self, view, in_arr): assert isinstance(view, StridedMemoryView) assert view.ptr == in_arr.ctypes.data assert view.shape == in_arr.shape - strides_in_counts = convert_strides_to_counts( - in_arr.strides, in_arr.dtype.itemsize) + strides_in_counts = convert_strides_to_counts(in_arr.strides, in_arr.dtype.itemsize) if in_arr.flags.c_contiguous: assert view.strides is None else: @@ -68,7 +66,7 @@ def gpu_array_samples(): samples += [ (cp.empty(3, dtype=cp.complex64), None), (cp.empty((6, 6), dtype=cp.float64)[::2, ::2], True), - (cp.empty((3, 4), order='F'), True), + (cp.empty((3, 4), order="F"), True), ] # Numba's device_array is the only known array container that does not # support DLPack (so that we get to test the CAI coverage). @@ -88,13 +86,8 @@ def gpu_array_ptr(arr): assert False, f"{arr=}" -@pytest.mark.parametrize( - "in_arr,stream", ( - *gpu_array_samples(), - ) -) +@pytest.mark.parametrize("in_arr,stream", (*gpu_array_samples(),)) class TestViewGPU: - def test_viewable_gpu(self, in_arr, stream): # TODO: use the device fixture? 
dev = Device() @@ -116,17 +109,14 @@ def test_strided_memory_view_cpu(self, in_arr, stream): # This is the consumer stream s = dev.create_stream() if stream else None - view = StridedMemoryView( - in_arr, - stream_ptr=s.handle if s else -1) + view = StridedMemoryView(in_arr, stream_ptr=s.handle if s else -1) self._check_view(view, in_arr, dev) def _check_view(self, view, in_arr, dev): assert isinstance(view, StridedMemoryView) assert view.ptr == gpu_array_ptr(in_arr) assert view.shape == in_arr.shape - strides_in_counts = convert_strides_to_counts( - in_arr.strides, in_arr.dtype.itemsize) + strides_in_counts = convert_strides_to_counts(in_arr.strides, in_arr.dtype.itemsize) if in_arr.flags["C_CONTIGUOUS"]: assert view.strides in (None, strides_in_counts) else: From bcf3add300fd3209713e418c111ff686fe4111f1 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 30 Nov 2024 03:06:19 +0000 Subject: [PATCH 06/17] address comments on the docstring --- cuda_core/cuda/core/experimental/_memoryview.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 4b12463e4..d820c46d6 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -340,7 +340,11 @@ def viewable(tuple arg_indices): """Decorator to create proxy objects to :obj:`StridedMemoryView` for the specified positional arguments. - Inside the decorated function, the specified arguments becomes instances + This allows array/tensor attributes to be accessed inside the function + implementation, while keeping the function body array-library-agnostic (if + desired). + + Inside the decorated function, the specified arguments become instances of an (undocumented) proxy type, regardless of its original source. A :obj:`StridedMemoryView` instance can be obtained by passing the (consumer) stream pointer (as a Python `int`) to the proxies's ``view()`` method. For @@ -355,10 +359,6 @@ def viewable(tuple arg_indices): assert isinstance(view, StridedMemoryView) ... - This allows array/tensor attributes to be accessed inside the function - implementation, while keeping the function body array-library-agnostic (if - desired). 
- Parameters ---------- arg_indices : tuple From b11e1ae8a3f8d49e12ffdb77a5eaa66db029bde1 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 30 Nov 2024 03:39:53 +0000 Subject: [PATCH 07/17] fix import accidentally removed by ruff --- cuda_core/cuda/core/experimental/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/experimental/utils.py index 0717b41aa..a305d03bf 100644 --- a/cuda_core/cuda/core/experimental/utils.py +++ b/cuda_core/cuda/core/experimental/utils.py @@ -2,3 +2,4 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +from cuda.core.experimental._memoryview import StridedMemoryView, viewable # noqa: F401 From ede50769f0c2e2c4bc44ccd2f82a2849facc3127 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 30 Nov 2024 03:40:21 +0000 Subject: [PATCH 08/17] fix device_id convention for CPU --- .../cuda/core/experimental/_memoryview.pyx | 18 ++++++++---------- cuda_core/tests/test_utils.py | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index d820c46d6..ccb87c906 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -56,7 +56,8 @@ cdef class StridedMemoryView: dtype: numpy.dtype Data type of the tensor. device_id: int - The device ID for where the tensor is located. It is 0 for CPU tensors. + The device ID for where the tensor is located. It is -1 for CPU tensors + (meaning those only accessible from the host). device_accessible: bool Whether the tensor data can be accessed on the GPU. readonly: bool @@ -78,7 +79,7 @@ cdef class StridedMemoryView: shape: tuple = None strides: tuple = None # in counts, not bytes dtype: numpy.dtype = None - device_id: int = None # 0 for CPU + device_id: int = None # -1 for CPU device_accessible: bool = None readonly: bool = None exporting_obj: Any = None @@ -152,27 +153,24 @@ cdef class _StridedMemoryViewProxy: cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): cdef int dldevice, device_id, i cdef bint device_accessible, versioned, is_readonly + device_accessible = False dldevice, device_id = obj.__dlpack_device__() if dldevice == _kDLCPU: - device_accessible = False assert device_id == 0 + device_id = -1 if stream_ptr is None: raise BufferError("stream=None is ambiguous with view()") elif stream_ptr == -1: stream_ptr = None elif dldevice == _kDLCUDA: + assert device_id >= 0 device_accessible = True # no need to check other stream values, it's a pass-through if stream_ptr is None: raise BufferError("stream=None is ambiguous with view()") - elif dldevice == _kDLCUDAHost: + elif dldevice in (_kDLCUDAHost, _kDLCUDAManaged): device_accessible = True - assert device_id == 0 - # just do a pass-through without any checks, as pinned memory can be - # accessed on both host and device - elif dldevice == _kDLCUDAManaged: - device_accessible = True - # just do a pass-through without any checks, as managed memory can be + # just do a pass-through without any checks, as pinned/managed memory can be # accessed on both host and device else: raise BufferError("device not supported") diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index c171742a2..3df624855 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -54,7 +54,7 @@ def _check_view(self, view, in_arr): else: assert view.strides == strides_in_counts assert view.dtype == 
in_arr.dtype
-        assert view.device_id == 0
+        assert view.device_id == -1
         assert view.device_accessible == False
         assert view.exporting_obj is in_arr

From 8027c78f3c467f1017e3aa51683fb7ac0254dc50 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Mon, 2 Dec 2024 00:04:13 +0000
Subject: [PATCH 09/17] rename viewable to args_viewable_as_strided_memory

---
 cuda_core/cuda/core/experimental/_memoryview.pyx |  6 +++---
 cuda_core/cuda/core/experimental/utils.py        |  3 ++-
 cuda_core/docs/source/api.rst                    |  2 +-
 cuda_core/docs/source/release.md                 |  1 +
 cuda_core/docs/source/release/0.1.1-notes.md     | 13 +++++++++++++
 cuda_core/docs/versions.json                     |  1 +
 cuda_core/tests/test_utils.py                    | 10 +++++-----
 7 files changed, 26 insertions(+), 10 deletions(-)
 create mode 100644 cuda_core/docs/source/release/0.1.1-notes.md

diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx
index ccb87c906..246644fa2 100644
--- a/cuda_core/cuda/core/experimental/_memoryview.pyx
+++ b/cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -24,7 +24,7 @@ cdef class StridedMemoryView:
 
     A :obj:`StridedMemoryView` instance can be created in two ways:
 
-    1. Using the :obj:`viewable` decorator (recommended)
+    1. Using the :obj:`args_viewable_as_strided_memory` decorator (recommended)
     2. Explicit construction, see below
 
     This object supports both DLPack (up to v1.0) and CUDA Array Interface
@@ -334,7 +334,7 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
     return buf
 
 
-def viewable(tuple arg_indices):
+def args_viewable_as_strided_memory(tuple arg_indices):
     """Decorator to create proxy objects to :obj:`StridedMemoryView` for the
     specified positional arguments.
 
@@ -350,7 +350,7 @@ def viewable(tuple arg_indices):
 
     .. code-block:: python
 
-        @viewable((1,))
+        @args_viewable_as_strided_memory((1,))
         def my_func(arg0, arg1, arg2, stream: Stream):
             # arg1 can be any object supporting DLPack or CUDA Array Interface
             view = arg1.view(stream.handle)
diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/experimental/utils.py
index a305d03bf..7b86160a4 100644
--- a/cuda_core/cuda/core/experimental/utils.py
+++ b/cuda_core/cuda/core/experimental/utils.py
@@ -2,4 +2,5 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-from cuda.core.experimental._memoryview import StridedMemoryView, viewable  # noqa: F401
+from cuda.core.experimental._memoryview import args_viewable_as_strided_memory  # noqa: F401
+from cuda.core.experimental._memoryview import StridedMemoryView  # noqa: F401
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 84720536a..558c3ec8c 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -41,7 +41,7 @@ Utility functions
 .. autosummary::
    :toctree: generated/
 
-   viewable
+   args_viewable_as_strided_memory
 
    :template: dataclass.rst
 
    StridedMemoryView
diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md
index 48e247863..55090b0b3 100644
--- a/cuda_core/docs/source/release.md
+++ b/cuda_core/docs/source/release.md
@@ -5,5 +5,6 @@ maxdepth: 3
 ---
 
+   0.1.1
    0.1.0
 ```
diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md
new file mode 100644
index 000000000..473352a47
--- /dev/null
+++ b/cuda_core/docs/source/release/0.1.1-notes.md
@@ -0,0 +1,13 @@
+# `cuda.core` Release notes
+
+Released on Dec XX, 2024
+
+## Highlights
+- Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete
+  implementation of DLPack & CUDA Array Interface support.
+
+
+## Limitations
+
+- All APIs are currently *experimental* and subject to change without deprecation notice.
+  Please share your feedback with us so that we can make `cuda.core` better!
diff --git a/cuda_core/docs/versions.json b/cuda_core/docs/versions.json
index 4163fd316..41664534b 100644
--- a/cuda_core/docs/versions.json
+++ b/cuda_core/docs/versions.json
@@ -1,4 +1,5 @@
 {
     "latest" : "latest",
+    "0.1.1" : "0.1.1",
     "0.1.0" : "0.1.0"
 }
diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
index 3df624855..6438c5628 100644
--- a/cuda_core/tests/test_utils.py
+++ b/cuda_core/tests/test_utils.py
@@ -14,7 +14,7 @@
 import pytest
 
 from cuda.core.experimental import Device
-from cuda.core.experimental.utils import StridedMemoryView, viewable
+from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory
 
 
 def convert_strides_to_counts(strides, itemsize):
@@ -30,8 +30,8 @@ def convert_strides_to_counts(strides, itemsize):
     ),
 )
 class TestViewCPU:
-    def test_viewable_cpu(self, in_arr):
-        @viewable((0,))
+    def test_args_viewable_as_strided_memory_cpu(self, in_arr):
+        @args_viewable_as_strided_memory((0,))
         def my_func(arr):
             # stream_ptr=-1 means "the consumer does not care"
             view = arr.view(-1)
@@ -88,14 +88,14 @@ def gpu_array_ptr(arr):
 
 
 @pytest.mark.parametrize("in_arr,stream", (*gpu_array_samples(),))
 class TestViewGPU:
-    def test_viewable_gpu(self, in_arr, stream):
+    def test_args_viewable_as_strided_memory_gpu(self, in_arr, stream):
         # TODO: use the device fixture?
         dev = Device()
         dev.set_current()
         # This is the consumer stream
         s = dev.create_stream() if stream else None
 
-        @viewable((0,))
+        @args_viewable_as_strided_memory((0,))
         def my_func(arr):
             view = arr.view(s.handle if s else -1)
             self._check_view(view, in_arr, dev)

From 66377d8796c0249a912522e0971a306a6fdb2dc0 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Mon, 2 Dec 2024 02:15:16 +0000
Subject: [PATCH 10/17] rename device_accessible to is_device_accessible for consistency

---
 .../cuda/core/experimental/_memoryview.pyx    | 18 +++++++++---------
 cuda_core/tests/test_utils.py                 |  4 ++--
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx
index 246644fa2..d8eba464a 100644
--- a/cuda_core/cuda/core/experimental/_memoryview.pyx
+++ b/cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -58,7 +58,7 @@ cdef class StridedMemoryView:
     device_id: int
         The device ID for where the tensor is located. It is -1 for CPU tensors
         (meaning those only accessible from the host).
-    device_accessible: bool
+    is_device_accessible: bool
         Whether the tensor data can be accessed on the GPU.
readonly: bool Whether the tensor data can be modified in place. @@ -80,7 +80,7 @@ cdef class StridedMemoryView: strides: tuple = None # in counts, not bytes dtype: numpy.dtype = None device_id: int = None # -1 for CPU - device_accessible: bool = None + is_device_accessible: bool = None readonly: bool = None exporting_obj: Any = None @@ -101,7 +101,7 @@ cdef class StridedMemoryView: + f" strides={self.strides},\n" + f" dtype={get_simple_repr(self.dtype)},\n" + f" device_id={self.device_id},\n" - + f" device_accessible={self.device_accessible},\n" + + f" is_device_accessible={self.is_device_accessible},\n" + f" readonly={self.readonly},\n" + f" exporting_obj={get_simple_repr(self.exporting_obj)})") @@ -152,8 +152,8 @@ cdef class _StridedMemoryViewProxy: cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): cdef int dldevice, device_id, i - cdef bint device_accessible, versioned, is_readonly - device_accessible = False + cdef bint is_device_accessible, versioned, is_readonly + is_device_accessible = False dldevice, device_id = obj.__dlpack_device__() if dldevice == _kDLCPU: assert device_id == 0 @@ -164,12 +164,12 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): stream_ptr = None elif dldevice == _kDLCUDA: assert device_id >= 0 - device_accessible = True + is_device_accessible = True # no need to check other stream values, it's a pass-through if stream_ptr is None: raise BufferError("stream=None is ambiguous with view()") elif dldevice in (_kDLCUDAHost, _kDLCUDAManaged): - device_accessible = True + is_device_accessible = True # just do a pass-through without any checks, as pinned/managed memory can be # accessed on both host and device else: @@ -221,7 +221,7 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): buf.strides = None buf.dtype = dtype_dlpack_to_numpy(&dl_tensor.dtype) buf.device_id = device_id - buf.device_accessible = device_accessible + buf.is_device_accessible = is_device_accessible buf.readonly = is_readonly buf.exporting_obj = obj @@ -311,7 +311,7 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): if buf.strides is not None: # convert to counts buf.strides = tuple(s // buf.dtype.itemsize for s in buf.strides) - buf.device_accessible = True + buf.is_device_accessible = True buf.device_id = handle_return( cuda.cuPointerGetAttribute( cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 6438c5628..83d890f23 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -55,7 +55,7 @@ def _check_view(self, view, in_arr): assert view.strides == strides_in_counts assert view.dtype == in_arr.dtype assert view.device_id == -1 - assert view.device_accessible == False + assert view.is_device_accessible == False assert view.exporting_obj is in_arr @@ -123,5 +123,5 @@ def _check_view(self, view, in_arr, dev): assert view.strides == strides_in_counts assert view.dtype == in_arr.dtype assert view.device_id == dev.device_id - assert view.device_accessible == True + assert view.is_device_accessible == True assert view.exporting_obj is in_arr From b5cfdce74db45324da17f404173024ed1ce56751 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Dec 2024 02:42:34 +0000 Subject: [PATCH 11/17] enforce line ending in the whole codebase --- .gitattributes | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitattributes b/.gitattributes index 8c8fc4274..00407cdc0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,7 
@@ cuda/_version.py export-subst + +* text eol=lf + +# we do not own any headers checked in, don't touch them +*.h binary +*.hpp binary From 9572f8a0e8a84f3e0d5cf813fe14603b65929469 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Dec 2024 02:42:51 +0000 Subject: [PATCH 12/17] add a code sample for strided memory view --- cuda_core/examples/strided_memory_view.py | 162 ++++++++++++++++++++++ cuda_core/tests/conftest.py | 92 +++++++----- 2 files changed, 218 insertions(+), 36 deletions(-) create mode 100644 cuda_core/examples/strided_memory_view.py diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py new file mode 100644 index 000000000..4fde4a62f --- /dev/null +++ b/cuda_core/examples/strided_memory_view.py @@ -0,0 +1,162 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +# ################################################################################ +# +# This demo aims to illustrate two takeaways: +# +# 1. The similarity between CPU and GPU JIT-compilation for C++ sources +# 2. How to use StridedMemoryView to interface with foreign CPU/GPU functions +# at low-level +# +# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU +# path, which can be easily installed from pip or conda following their instruction. +# We also use NumPy/CuPy as the CPU/GPU array container. +# +# ################################################################################ + +import string +import sys + +try: + from cffi import FFI +except ImportError: + print("cffi is not installed, the CPU example would be skipped", file=sys.stderr) + cffi = None +try: + import cupy as cp +except ImportError: + print("cupy is not installed, the GPU example would be skipped", file=sys.stderr) + cp = None +import numpy as np + +from cuda.core.experimental import Device, Program +from cuda.core.experimental import launch, LaunchConfig +from cuda.core.experimental.utils import args_viewable_as_strided_memory +from cuda.core.experimental.utils import StridedMemoryView + + +# ################################################################################ +# +# Usually this entire code block is in a separate file, built as a Python extension +# module that can be imported by users at run time. For illustrative purposes we +# use JIT compilation to make this demo self-contained. +# +# Here we assume an in-place operation, equivalent to the following NumPy code: +# +# >>> arr = ... +# >>> assert arr.dtype == np.int32 +# >>> assert arr.ndim == 1 +# >>> arr += np.arange(arr.size, dtype=arr.dtype) +# +# is implemented for both CPU and GPU at low-level, with the following C function +# signature: +func_name = "inplace_plus_arange_N" +func_sig = f"void {func_name}(int* data, size_t N)" + +# Here is a concrete (very naive!) implementation on CPU: +if FFI: + cpu_code = string.Template(r""" + extern "C" { + $func_sig { + for (size_t i = 0; i < N; i++) { + data[i] += i; + } + } + } + """).substitute(func_sig=func_sig) + cpu_prog = FFI() + cpu_prog.set_source("_cpu_obj", cpu_code, source_extension=".cpp") + cpu_prog.cdef(f"{func_sig};") + cpu_prog.compile() + # This is cffi's way of loading a CPU function. cffi builds an extension module + # that has the Python binding to the underlying C function. (For more details, + # please refer to cffi's documentation.) + from _cpu_obj.lib import inplace_plus_arange_N as cpu_func + +# Here is a concrete (again, very naive!) 
implementation on GPU: +if cp: + gpu_code = string.Template(r""" + extern "C" + __global__ $func_sig { + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t stride_size = gridDim.x * blockDim.x; + for (size_t i = tid; i < N; i += stride_size) { + data[i] += i; + } + } + """).substitute(func_sig=func_sig) + gpu_prog = Program(gpu_code, code_type="c++") + # To know the GPU's compute capability, we need to identify which GPU to use. + dev = Device(0) + arch = "".join(f"{i}" for i in dev.compute_capability) + mod = gpu_prog.compile( + target_type="cubin", + # TODO: update this after NVIDIA/cuda-python#237 is merged + options=(f"-arch=sm_{arch}", "-std=c++11")) + gpu_ker = mod.get_kernel(func_name) + +# Now we are prepared to run the code from the user's perspective! +# +# ################################################################################ + + +# Below, as a user we want to perform the said in-place operation on either CPU +# or GPU, by calling the corresponding function implemented "elsewhere" (done above). + +@args_viewable_as_strided_memory((0,)) +def my_func(arr, work_stream): + # create a memory view over arr, assumed to be a 1D array of int32 + view = arr.view(work_stream.handle if work_stream else -1) + assert isinstance(view, StridedMemoryView) + assert len(view.shape) == 1 + assert view.dtype == np.int32 + + size = view.shape[0] + if view.is_device_accessible: + block = 256 + grid = size // 256 + config = LaunchConfig(grid=grid, block=block, stream=work_stream) + launch(gpu_ker, config, view.ptr, np.uint64(size)) + # here we're being conservative and synchronize over our work stream, + # assuming we do not know the (producer/source) stream; if we know + # then we could just order the producer/consumer streams here, e.g. + # + # producer_stream.wait(work_stream) + # + # without an expansive synchronization. + work_stream.sync() + else: + cpu_func(cpu_prog.cast("int*", view.ptr), size) + + +# This takes the CPU path +if FFI: + # Create input array on CPU + arr_cpu = np.zeros(1024, dtype=np.int32) + print(f"before: {arr_cpu[:10]=}") + + # Run the workload + my_func(arr_cpu, None) + + # Check the result + print(f"after: {arr_cpu[:10]=}") + assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32)) + + +# This takes the GPU path +if cp: + dev.set_current() + s = dev.create_stream() + # Create input array on GPU + arr_gpu = cp.ones(1024, dtype=cp.int32) + print(f"before: {arr_gpu[:10]=}") + + # Run the workload + my_func(arr_gpu, s) + + # Check the result + print(f"after: {arr_gpu[:10]=}") + assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32)) + s.close() diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index bb99fb33c..e35f77355 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -1,36 +1,56 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
-try: - from cuda.bindings import driver -except ImportError: - from cuda import cuda as driver - -import pytest - -from cuda.core.experimental import Device, _device -from cuda.core.experimental._utils import handle_return - - -@pytest.fixture(scope="function") -def init_cuda(): - device = Device() - device.set_current() - yield - _device_unset_current() - - -def _device_unset_current(): - handle_return(driver.cuCtxPopCurrent()) - with _device._tls_lock: - del _device._tls.devices - - -@pytest.fixture(scope="function") -def deinit_cuda(): - yield - _device_unset_current() +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import glob +import os +import sys + +try: + from cuda.bindings import driver +except ImportError: + from cuda import cuda as driver + +import pytest + +from cuda.core.experimental import Device, _device +from cuda.core.experimental._utils import handle_return + + +@pytest.fixture(scope="function") +def init_cuda(): + device = Device() + device.set_current() + yield + _device_unset_current() + + +def _device_unset_current(): + handle_return(driver.cuCtxPopCurrent()) + with _device._tls_lock: + del _device._tls.devices + + +@pytest.fixture(scope="function") +def deinit_cuda(): + yield + _device_unset_current() + + +# samples relying on cffi could fail as the modules cannot be imported +sys.path.append(os.getcwd()) + + +@pytest.fixture(scope="session", autouse=True) +def clean_up_cffi_files(): + yield + files = glob.glob(os.path.join(os.getcwd(), "_cpu_obj*")) + for f in files: + try: + os.remove(f) + except FileNotFoundError: + pass From 069f057d5e16994409439ff8bca0907eab53d6a5 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Dec 2024 02:55:41 +0000 Subject: [PATCH 13/17] fix formatting again --- cuda_core/cuda/core/experimental/utils.py | 6 ++++-- cuda_core/examples/strided_memory_view.py | 13 ++++++------- cuda_core/tests/conftest.py | 4 ++-- cuda_core/tests/test_utils.py | 6 +++--- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/experimental/utils.py index 7b86160a4..cc9a437d7 100644 --- a/cuda_core/cuda/core/experimental/utils.py +++ b/cuda_core/cuda/core/experimental/utils.py @@ -2,5 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from cuda.core.experimental._memoryview import args_viewable_as_strided_memory # noqa: F401 -from cuda.core.experimental._memoryview import StridedMemoryView # noqa: F401 +from cuda.core.experimental._memoryview import ( + StridedMemoryView, # noqa: F401 + args_viewable_as_strided_memory, # noqa: F401 +) diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 4fde4a62f..0f77edabd 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -12,7 +12,7 @@ # # To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU # path, which can be easily installed from pip or conda following their instruction. -# We also use NumPy/CuPy as the CPU/GPU array container. +# We also use NumPy/CuPy as the CPU/GPU array container. 
# # ################################################################################ @@ -31,11 +31,8 @@ cp = None import numpy as np -from cuda.core.experimental import Device, Program -from cuda.core.experimental import launch, LaunchConfig -from cuda.core.experimental.utils import args_viewable_as_strided_memory -from cuda.core.experimental.utils import StridedMemoryView - +from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory # ################################################################################ # @@ -94,7 +91,8 @@ mod = gpu_prog.compile( target_type="cubin", # TODO: update this after NVIDIA/cuda-python#237 is merged - options=(f"-arch=sm_{arch}", "-std=c++11")) + options=(f"-arch=sm_{arch}", "-std=c++11"), + ) gpu_ker = mod.get_kernel(func_name) # Now we are prepared to run the code from the user's perspective! @@ -105,6 +103,7 @@ # Below, as a user we want to perform the said in-place operation on either CPU # or GPU, by calling the corresponding function implemented "elsewhere" (done above). + @args_viewable_as_strided_memory((0,)) def my_func(arr, work_stream): # create a memory view over arr, assumed to be a 1D array of int32 diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index e35f77355..59e5883f3 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -50,7 +50,7 @@ def clean_up_cffi_files(): yield files = glob.glob(os.path.join(os.getcwd(), "_cpu_obj*")) for f in files: - try: + try: # noqa: SIM105 os.remove(f) except FileNotFoundError: - pass + pass # noqa: SIM105 diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 83d890f23..ef37bcabb 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -55,7 +55,7 @@ def _check_view(self, view, in_arr): assert view.strides == strides_in_counts assert view.dtype == in_arr.dtype assert view.device_id == -1 - assert view.is_device_accessible == False + assert view.is_device_accessible is False assert view.exporting_obj is in_arr @@ -83,7 +83,7 @@ def gpu_array_ptr(arr): return arr.data.ptr if numba_cuda is not None and isinstance(arr, numba_cuda.cudadrv.devicearray.DeviceNDArray): return arr.device_ctypes_pointer.value - assert False, f"{arr=}" + raise NotImplementedError(f"{arr=}") @pytest.mark.parametrize("in_arr,stream", (*gpu_array_samples(),)) @@ -123,5 +123,5 @@ def _check_view(self, view, in_arr, dev): assert view.strides == strides_in_counts assert view.dtype == in_arr.dtype assert view.device_id == dev.device_id - assert view.is_device_accessible == True + assert view.is_device_accessible is True assert view.exporting_obj is in_arr From 5e393f82208a9a7e44430122cd0df865d903a0f8 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Dec 2024 15:34:35 +0000 Subject: [PATCH 14/17] address review comments --- cuda_core/examples/strided_memory_view.py | 24 +++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 0f77edabd..efcf26af3 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -6,12 +6,11 @@ # # This demo aims to illustrate two takeaways: # -# 1. The similarity between CPU and GPU JIT-compilation for C++ sources -# 2. How to use StridedMemoryView to interface with foreign CPU/GPU functions -# at low-level +# 1. 
The similarity between CPU and GPU JIT-compilation with C++ sources +# 2. How to use StridedMemoryView to interface with foreign C/C++ functions # # To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU -# path, which can be easily installed from pip or conda following their instruction. +# path, which can be easily installed from pip or conda following their instructions. # We also use NumPy/CuPy as the CPU/GPU array container. # # ################################################################################ @@ -22,12 +21,12 @@ try: from cffi import FFI except ImportError: - print("cffi is not installed, the CPU example would be skipped", file=sys.stderr) + print("cffi is not installed, the CPU example will be skipped", file=sys.stderr) cffi = None try: import cupy as cp except ImportError: - print("cupy is not installed, the GPU example would be skipped", file=sys.stderr) + print("cupy is not installed, the GPU example will be skipped", file=sys.stderr) cp = None import numpy as np @@ -55,11 +54,10 @@ # Here is a concrete (very naive!) implementation on CPU: if FFI: cpu_code = string.Template(r""" - extern "C" { - $func_sig { - for (size_t i = 0; i < N; i++) { - data[i] += i; - } + extern "C" + $func_sig { + for (size_t i = 0; i < N; i++) { + data[i] += i; } } """).substitute(func_sig=func_sig) @@ -115,7 +113,7 @@ def my_func(arr, work_stream): size = view.shape[0] if view.is_device_accessible: block = 256 - grid = size // 256 + grid = (size + block - 1) // block config = LaunchConfig(grid=grid, block=block, stream=work_stream) launch(gpu_ker, config, view.ptr, np.uint64(size)) # here we're being conservative and synchronize over our work stream, @@ -124,7 +122,7 @@ def my_func(arr, work_stream): # # producer_stream.wait(work_stream) # - # without an expansive synchronization. + # without an expensive synchronization. work_stream.sync() else: cpu_func(cpu_prog.cast("int*", view.ptr), size) From 8e209aac6ee1c13eeaaba10298b903745fa92eed Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 2 Dec 2024 16:25:39 +0000 Subject: [PATCH 15/17] programmatically load cffi functions --- cuda_core/examples/strided_memory_view.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index efcf26af3..0d49d5aed 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -15,6 +15,7 @@ # # ################################################################################ +import importlib import string import sys @@ -61,14 +62,19 @@ } } """).substitute(func_sig=func_sig) + # This is cffi's way of JIT compiling & loading a CPU function. cffi builds an + # extension module that has the Python binding to the underlying C function. + # For more details, please refer to cffi's documentation. cpu_prog = FFI() - cpu_prog.set_source("_cpu_obj", cpu_code, source_extension=".cpp") cpu_prog.cdef(f"{func_sig};") + cpu_prog.set_source( + "_cpu_obj", + cpu_code, + source_extension=".cpp", + extra_compile_args=["-std=c++11"], + ) cpu_prog.compile() - # This is cffi's way of loading a CPU function. cffi builds an extension module - # that has the Python binding to the underlying C function. (For more details, - # please refer to cffi's documentation.) - from _cpu_obj.lib import inplace_plus_arange_N as cpu_func + cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name) # Here is a concrete (again, very naive!) 
implementation on GPU: if cp: From 6af4da317bc6e71ad0713af2d5b0ce80f849643b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 3 Dec 2024 00:23:51 +0000 Subject: [PATCH 16/17] fix import fallback; address review comments --- cuda_core/examples/strided_memory_view.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 0d49d5aed..847d28b7b 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -23,7 +23,7 @@ from cffi import FFI except ImportError: print("cffi is not installed, the CPU example will be skipped", file=sys.stderr) - cffi = None + FFI = None try: import cupy as cp except ImportError: @@ -107,28 +107,33 @@ # Below, as a user we want to perform the said in-place operation on either CPU # or GPU, by calling the corresponding function implemented "elsewhere" (done above). - +# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both +# of which are supported by StridedMemoryView). @args_viewable_as_strided_memory((0,)) def my_func(arr, work_stream): - # create a memory view over arr, assumed to be a 1D array of int32 + # Create a memory view over arr (assumed to be a 1D array of int32). The stream + # ordering is taken care of, so that arr can be safely accessed on our work + # stream (ordered after a data stream on which arr is potentially prepared). view = arr.view(work_stream.handle if work_stream else -1) assert isinstance(view, StridedMemoryView) assert len(view.shape) == 1 assert view.dtype == np.int32 size = view.shape[0] + # DLPack also supports host arrays. We want to know if the array data is + # accessible from the GPU, and dispatch to the right routine accordingly. if view.is_device_accessible: block = 256 grid = (size + block - 1) // block config = LaunchConfig(grid=grid, block=block, stream=work_stream) launch(gpu_ker, config, view.ptr, np.uint64(size)) - # here we're being conservative and synchronize over our work stream, - # assuming we do not know the (producer/source) stream; if we know - # then we could just order the producer/consumer streams here, e.g. + # Here we're being conservative and synchronize over our work stream, + # assuming we do not know the data stream; if we know then we could + # just order the data stream after the work stream here, e.g. # - # producer_stream.wait(work_stream) + # data_stream.wait(work_stream) # - # without an expensive synchronization. + # without an expensive synchronization (with respect to the host). work_stream.sync() else: cpu_func(cpu_prog.cast("int*", view.ptr), size) From 16fc9f69037ac1ad86622d6daf58aa0fa8401edd Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 4 Dec 2024 02:33:45 +0000 Subject: [PATCH 17/17] test readonly with numpy; rename use_stream parameter --- cuda_core/examples/strided_memory_view.py | 1 + cuda_core/tests/test_utils.py | 24 ++++++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 847d28b7b..564d7fa01 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -107,6 +107,7 @@ # Below, as a user we want to perform the said in-place operation on either CPU # or GPU, by calling the corresponding function implemented "elsewhere" (done above). 
+ # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both # of which are supported by StridedMemoryView). @args_viewable_as_strided_memory((0,)) diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index ef37bcabb..0926a549d 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -27,6 +27,14 @@ def convert_strides_to_counts(strides, itemsize): np.empty(3, dtype=np.int32), np.empty((6, 6), dtype=np.float64)[::2, ::2], np.empty((3, 4), order="F"), + np.empty((), dtype=np.float16), + # readonly is fixed recently (numpy/numpy#26501) + pytest.param( + np.frombuffer(b""), + marks=pytest.mark.skipif( + tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+" + ), + ), ), ) class TestViewCPU: @@ -57,6 +65,7 @@ def _check_view(self, view, in_arr): assert view.device_id == -1 assert view.is_device_accessible is False assert view.exporting_obj is in_arr + assert view.readonly is not in_arr.flags.writeable def gpu_array_samples(): @@ -64,7 +73,7 @@ def gpu_array_samples(): samples = [] if cp is not None: samples += [ - (cp.empty(3, dtype=cp.complex64), None), + (cp.empty(3, dtype=cp.complex64), False), (cp.empty((6, 6), dtype=cp.float64)[::2, ::2], True), (cp.empty((3, 4), order="F"), True), ] @@ -72,7 +81,7 @@ def gpu_array_samples(): # support DLPack (so that we get to test the CAI coverage). if numba_cuda is not None: samples += [ - (numba_cuda.device_array((2,), dtype=np.int8), None), + (numba_cuda.device_array((2,), dtype=np.int8), False), (numba_cuda.device_array((4, 2), dtype=np.float32), True), ] return samples @@ -86,14 +95,14 @@ def gpu_array_ptr(arr): raise NotImplementedError(f"{arr=}") -@pytest.mark.parametrize("in_arr,stream", (*gpu_array_samples(),)) +@pytest.mark.parametrize("in_arr,use_stream", (*gpu_array_samples(),)) class TestViewGPU: - def test_args_viewable_as_strided_memory_gpu(self, in_arr, stream): + def test_args_viewable_as_strided_memory_gpu(self, in_arr, use_stream): # TODO: use the device fixture? dev = Device() dev.set_current() # This is the consumer stream - s = dev.create_stream() if stream else None + s = dev.create_stream() if use_stream else None @args_viewable_as_strided_memory((0,)) def my_func(arr): @@ -102,12 +111,12 @@ def my_func(arr): my_func(in_arr) - def test_strided_memory_view_cpu(self, in_arr, stream): + def test_strided_memory_view_cpu(self, in_arr, use_stream): # TODO: use the device fixture? dev = Device() dev.set_current() # This is the consumer stream - s = dev.create_stream() if stream else None + s = dev.create_stream() if use_stream else None view = StridedMemoryView(in_arr, stream_ptr=s.handle if s else -1) self._check_view(view, in_arr, dev) @@ -125,3 +134,4 @@ def _check_view(self, view, in_arr, dev): assert view.device_id == dev.device_id assert view.is_device_accessible is True assert view.exporting_obj is in_arr + # can't test view.readonly with CuPy or Numba...
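Taken together, the series leaves `cuda.core.experimental.utils` exposing two
public entry points: the `StridedMemoryView` dataclass and the
`@args_viewable_as_strided_memory` decorator. Below is a minimal usage sketch
consistent with the docstrings added in PATCH 04 and the renames in PATCH 09
and PATCH 10; the NumPy input and the function name `describe` are
illustrative placeholders rather than part of the patches, and any object
supporting DLPack or the CUDA Array Interface could be passed instead.

import numpy as np

from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory


@args_viewable_as_strided_memory((0,))
def describe(arr):
    # stream_ptr=-1 opts out of stream ordering ("the consumer does not care");
    # a GPU consumer would pass its stream handle (a Python int) instead.
    view = arr.view(-1)
    assert isinstance(view, StridedMemoryView)
    # The same normalized metadata is available whether the producer exported
    # DLPack or the CUDA Array Interface.
    print(view.shape, view.dtype, view.device_id, view.is_device_accessible)


describe(np.arange(6, dtype=np.int32).reshape(2, 3))

# Explicit construction of a view over the same array is equivalent:
view = StridedMemoryView(np.arange(6, dtype=np.int32).reshape(2, 3), stream_ptr=-1)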