Commit 6af4da3

fix import fallback; address review comments

1 parent: bd1d944

1 file changed: 13 additions, 8 deletions

cuda_core/examples/strided_memory_view.py

@@ -23,7 +23,7 @@
     from cffi import FFI
 except ImportError:
     print("cffi is not installed, the CPU example will be skipped", file=sys.stderr)
-    cffi = None
+    FFI = None
 try:
     import cupy as cp
 except ImportError:
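
The fix above follows the usual Python import-fallback pattern: the except branch must bind the same name the import would have bound (FFI, not cffi), presumably so that later code can test that name rather than hit a NameError when cffi is missing. A minimal standalone sketch of the pattern, with illustrative names not taken from the example:

try:
    from cffi import FFI
except ImportError:
    FFI = None  # bind the same name the import would have bound

def maybe_make_ffi():
    # Hypothetical helper: degrade gracefully when cffi is unavailable,
    # instead of raising NameError on an unbound FFI.
    if FFI is None:
        return None
    return FFI()
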

@@ -107,28 +107,33 @@
 # Below, as a user we want to perform the said in-place operation on either CPU
 # or GPU, by calling the corresponding function implemented "elsewhere" (done above).
 
-
+# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
+# of which are supported by StridedMemoryView).
 @args_viewable_as_strided_memory((0,))
 def my_func(arr, work_stream):
-    # create a memory view over arr, assumed to be a 1D array of int32
+    # Create a memory view over arr (assumed to be a 1D array of int32). The stream
+    # ordering is taken care of, so that arr can be safely accessed on our work
+    # stream (ordered after a data stream on which arr is potentially prepared).
     view = arr.view(work_stream.handle if work_stream else -1)
     assert isinstance(view, StridedMemoryView)
     assert len(view.shape) == 1
     assert view.dtype == np.int32
 
     size = view.shape[0]
+    # DLPack also supports host arrays. We want to know if the array data is
+    # accessible from the GPU, and dispatch to the right routine accordingly.
     if view.is_device_accessible:
         block = 256
         grid = (size + block - 1) // block
         config = LaunchConfig(grid=grid, block=block, stream=work_stream)
         launch(gpu_ker, config, view.ptr, np.uint64(size))
-        # here we're being conservative and synchronize over our work stream,
-        # assuming we do not know the (producer/source) stream; if we know
-        # then we could just order the producer/consumer streams here, e.g.
+        # Here we're being conservative and synchronize over our work stream,
+        # assuming we do not know the data stream; if we know then we could
+        # just order the data stream after the work stream here, e.g.
         #
-        # producer_stream.wait(work_stream)
+        # data_stream.wait(work_stream)
         #
-        # without an expensive synchronization.
+        # without an expensive synchronization (with respect to the host).
         work_stream.sync()
     else:
         cpu_func(cpu_prog.cast("int*", view.ptr), size)
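
For context, a caller-side sketch of how the decorated my_func above might be driven; this is not part of the commit. It assumes the surrounding example module has already defined my_func, cp (CuPy, or None if not installed) and FFI (from cffi, or None), and the work_stream naming on the caller side is only illustrative. The stream handling mirrors the data-stream/work-stream wording introduced by the new comments.

import numpy as np
from cuda.core.experimental import Device

if FFI is not None:
    # CPU path: DLPack also covers host memory, so is_device_accessible is
    # False inside my_func and the cffi-backed routine is dispatched.
    arr_cpu = np.arange(1024, dtype=np.int32)
    my_func(arr_cpu, None)  # no work stream is needed for host data

if cp is not None:
    # GPU path: my_func takes the view on the work stream and launches the
    # kernel on that same stream before synchronizing.
    dev = Device()
    dev.set_current()
    work_stream = dev.create_stream()
    arr_gpu = cp.arange(1024, dtype=cp.int32)
    my_func(arr_gpu, work_stream)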
