@@ -23,7 +23,7 @@
     from cffi import FFI
 except ImportError:
     print("cffi is not installed, the CPU example will be skipped", file=sys.stderr)
-    cffi = None
+    FFI = None
 try:
     import cupy as cp
 except ImportError:
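The one-line fix above matters because a failed import leaves the name `FFI` unbound; binding `cffi = None` instead would make any later check of `FFI` raise a NameError. A minimal sketch of the pattern, with a hypothetical downstream guard (the `ffi = FFI()` use is illustrative, not part of this diff):

```python
try:
    from cffi import FFI
except ImportError:
    # Bind the *imported* name, not the module name, so the guard
    # below can detect the missing dependency.
    FFI = None

if FFI is not None:
    ffi = FFI()  # only set up the CPU (cffi) path when available
```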
@@ -107,28 +107,33 @@
 # Below, as a user we want to perform the said in-place operation on either CPU
 # or GPU, by calling the corresponding function implemented "elsewhere" (done above).
 
-
+# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
+# of which are supported by StridedMemoryView).
 @args_viewable_as_strided_memory((0,))
 def my_func(arr, work_stream):
-    # create a memory view over arr, assumed to be a 1D array of int32
+    # Create a memory view over arr (assumed to be a 1D array of int32). The stream
+    # ordering is taken care of, so that arr can be safely accessed on our work
+    # stream (ordered after a data stream on which arr is potentially prepared).
     view = arr.view(work_stream.handle if work_stream else -1)
     assert isinstance(view, StridedMemoryView)
     assert len(view.shape) == 1
     assert view.dtype == np.int32
 
     size = view.shape[0]
+    # DLPack also supports host arrays. We want to know if the array data is
+    # accessible from the GPU, and dispatch to the right routine accordingly.
     if view.is_device_accessible:
         block = 256
         grid = (size + block - 1) // block
         config = LaunchConfig(grid=grid, block=block, stream=work_stream)
         launch(gpu_ker, config, view.ptr, np.uint64(size))
-        # here we're being conservative and synchronize over our work stream,
-        # assuming we do not know the (producer/source) stream; if we know
-        # then we could just order the producer/consumer streams here, e.g.
+        # Here we're being conservative and synchronize over our work stream,
+        # assuming we do not know the data stream; if we know then we could
+        # just order the data stream after the work stream here, e.g.
         #
-        #   producer_stream.wait(work_stream)
+        #   data_stream.wait(work_stream)
         #
-        # without an expensive synchronization.
+        # without an expensive synchronization (with respect to the host).
         work_stream.sync()
     else:
         cpu_func(cpu_prog.cast("int*", view.ptr), size)
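To make the alternative in the rewritten comment concrete: a minimal sketch of ordering two streams without a host-side sync, assuming the cuda.core.experimental Device/Stream API used elsewhere in this example (the stream names are illustrative):

```python
from cuda.core.experimental import Device

dev = Device()
dev.set_current()
work_stream = dev.create_stream()
data_stream = dev.create_stream()

# ... enqueue the kernel on work_stream, as my_func does ...

# Order data_stream after work_stream: work submitted to data_stream from
# here on waits for what is already enqueued on work_stream, without
# blocking the host the way work_stream.sync() would.
data_stream.wait(work_stream)
```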
|
|
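For context, a hypothetical driver for `my_func`, assuming `Device` is imported from cuda.core.experimental and `cp` is the optional CuPy module from the imports above; the array size and values are illustrative:

```python
import numpy as np

# CPU path: with no work stream, my_func creates the view with stream = -1.
arr = np.arange(1024, dtype=np.int32)
my_func(arr, None)

if cp is not None:
    dev = Device()
    dev.set_current()
    s = dev.create_stream()
    # Create the CuPy array on our stream so the ordering in my_func applies.
    with cp.cuda.ExternalStream(int(s.handle)):
        d_arr = cp.arange(1024, dtype=cp.int32)
    my_func(d_arr, s)
    s.sync()
```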