Skip to content

Commit 16ce636

Browse files
ax3lRemiLehe
andauthored
Array4: __cuda_array_interface__ v3 (#30)
* Array4: __cuda_array_interface__ v2 Start implementing the `__cuda_array_interface__` for zero-copy data exchange on Nvidia CUDA GPUs. * MultiFab: CuPy Test * `MFIter`: `Finalize()` on `StopIteration` Since `for` loops create no scope in Python, we need to trigger finalize logic, including stream syncs, before the destructor of `MultiFab` iterators are called. * Add numba test incl. 3D kernel launch * Add pytorch * CuPy Fuse: Avoid Extra Memset * MultiFab Device Test: Fixes * Update to v3 * Array4: TODO from CUDA A bit tricky to implement this caster as new constructor. Not currently needed, but adds comments where to do this. Co-authored-by: Remi Lehe <[email protected]>
1 parent e0cac44 commit 16ce636

File tree

5 files changed

+361
-71
lines changed

5 files changed

+361
-71
lines changed

src/Base/Array4.cpp

Lines changed: 92 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,61 @@ namespace py = pybind11;
1919
using namespace amrex;
2020

2121

22+
namespace
23+
{
24+
/** CPU: __array_interface__ v3
25+
*
26+
* https://numpy.org/doc/stable/reference/arrays.interface.html
27+
*/
28+
template<typename T>
29+
py::dict
30+
array_interface(Array4<T> const & a4)
31+
{
32+
auto d = py::dict();
33+
auto const len = length(a4);
34+
// F->C index conversion here
35+
// p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride];
36+
// Buffer dimensions: zero-size shall not skip dimension
37+
auto shape = py::make_tuple(
38+
a4.ncomp,
39+
len.z <= 0 ? 1 : len.z,
40+
len.y <= 0 ? 1 : len.y,
41+
len.x <= 0 ? 1 : len.x // fastest varying index
42+
);
43+
// buffer protocol strides are in bytes, AMReX strides are elements
44+
auto const strides = py::make_tuple(
45+
sizeof(T) * a4.nstride,
46+
sizeof(T) * a4.kstride,
47+
sizeof(T) * a4.jstride,
48+
sizeof(T) // fastest varying index
49+
);
50+
bool const read_only = false;
51+
d["data"] = py::make_tuple(std::intptr_t(a4.dataPtr()), read_only);
52+
// note: if we want to keep the same global indexing with non-zero
53+
// box small_end as in AMReX, then we can explore playing with
54+
// this offset as well
55+
//d["offset"] = 0; // default
56+
//d["mask"] = py::none(); // default
57+
58+
d["shape"] = shape;
59+
// we could also set this after checking the strides are C-style contiguous:
60+
//if (is_contiguous<T>(shape, strides))
61+
// d["strides"] = py::none(); // C-style contiguous
62+
//else
63+
d["strides"] = strides;
64+
65+
// type description
66+
// for more complicated types, e.g., tuples/structs
67+
//d["descr"] = ...;
68+
// we currently only need this
69+
d["typestr"] = py::format_descriptor<T>::format();
70+
71+
d["version"] = 3;
72+
return d;
73+
}
74+
}
75+
76+
2277
template< typename T >
2378
void make_Array4(py::module &m, std::string typestr)
2479
{
@@ -85,56 +140,55 @@ void make_Array4(py::module &m, std::string typestr)
85140
return a4;
86141
}))
87142

143+
/* init from __cuda_array_interface__: non-owning view
144+
* TODO
145+
*/
146+
147+
148+
// CPU: __array_interface__ v3
149+
// https://numpy.org/doc/stable/reference/arrays.interface.html
88150
.def_property_readonly("__array_interface__", [](Array4<T> const & a4) {
89-
auto d = py::dict();
90-
auto const len = length(a4);
91-
// F->C index conversion here
92-
// p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride];
93-
// Buffer dimensions: zero-size shall not skip dimension
94-
auto shape = py::make_tuple(
95-
a4.ncomp,
96-
len.z <= 0 ? 1 : len.z,
97-
len.y <= 0 ? 1 : len.y,
98-
len.x <= 0 ? 1 : len.x // fastest varying index
99-
);
100-
// buffer protocol strides are in bytes, AMReX strides are elements
101-
auto const strides = py::make_tuple(
102-
sizeof(T) * a4.nstride,
103-
sizeof(T) * a4.kstride,
104-
sizeof(T) * a4.jstride,
105-
sizeof(T) // fastest varying index
106-
);
107-
bool const read_only = false;
108-
d["data"] = py::make_tuple(std::intptr_t(a4.dataPtr()), read_only);
109-
// note: if we want to keep the same global indexing with non-zero
110-
// box small_end as in AMReX, then we can explore playing with
111-
// this offset as well
112-
//d["offset"] = 0; // default
113-
//d["mask"] = py::none(); // default
114-
115-
d["shape"] = shape;
116-
// we could also set this after checking the strides are C-style contiguous:
117-
//if (is_contiguous<T>(shape, strides))
118-
// d["strides"] = py::none(); // C-style contiguous
119-
//else
120-
d["strides"] = strides;
121-
122-
d["typestr"] = py::format_descriptor<T>::format();
123-
d["version"] = 3;
124-
return d;
151+
return array_interface(a4);
125152
})
126153

154+
// CPU: __array_function__ interface (TODO)
155+
//
156+
// NEP 18 — A dispatch mechanism for NumPy's high level array functions.
157+
// https://numpy.org/neps/nep-0018-array-function-protocol.html
158+
// This enables code using NumPy to be directly operated on Array4 arrays.
159+
// __array_function__ feature requires NumPy 1.16 or later.
160+
127161

128-
// TODO: __cuda_array_interface__
162+
// Nvidia GPUs: __cuda_array_interface__ v2
129163
// https://numba.readthedocs.io/en/latest/cuda/cuda_array_interface.html
164+
.def_property_readonly("__cuda_array_interface__", [](Array4<T> const & a4) {
165+
auto d = array_interface(a4);
166+
167+
// data:
168+
// Because the user of the interface may or may not be in the same context, the most common case is to use cuPointerGetAttribute with CU_POINTER_ATTRIBUTE_DEVICE_POINTER in the CUDA driver API (or the equivalent CUDA Runtime API) to retrieve a device pointer that is usable in the currently active context.
169+
// TODO For zero-size arrays, use 0 here.
170+
171+
// None or integer
172+
// An optional stream upon which synchronization must take place at the point of consumption, either by synchronizing on the stream or enqueuing operations on the data on the given stream. Integer values in this entry are as follows:
173+
// 0: This is disallowed as it would be ambiguous between None and the default stream, and also between the legacy and per-thread default streams. Any use case where 0 might be given should either use None, 1, or 2 instead for clarity.
174+
// 1: The legacy default stream.
175+
// 2: The per-thread default stream.
176+
// Any other integer: a cudaStream_t represented as a Python integer.
177+
// When None, no synchronization is required.
178+
d["stream"] = py::none();
179+
180+
d["version"] = 3;
181+
return d;
182+
})
130183

131184

132-
// TODO: __dlpack__
185+
// TODO: __dlpack__ __dlpack_device__
133186
// DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
134187
// https://dmlc.github.io/dlpack/latest/
135188
// https://data-apis.org/array-api/latest/design_topics/data_interchange.html
136189
// https://github.com/data-apis/consortium-feedback/issues/1
137190
// https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
191+
// https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
138192

139193

140194
.def("contains", &Array4<T>::contains)

src/Base/MultiFab.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ void init_MultiFab(py::module &m) {
9494
if( !mfi.isValid() )
9595
{
9696
first_or_done = true;
97+
mfi.Finalize();
9798
throw py::stop_iteration();
9899
}
99100
return mfi;

tests/conftest.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,3 +84,31 @@ def create():
8484
return mfab
8585

8686
return create
87+
88+
89+
@pytest.mark.skipif(
90+
amrex.Config.gpu_backend != "CUDA", reason="Requires AMReX_GPU_BACKEND=CUDA"
91+
)
92+
@pytest.fixture(scope="function", params=list(itertools.product([1, 3], [0, 1])))
93+
def make_mfab_device(boxarr, distmap, request):
94+
"""MultiFab that resides purely on the device:
95+
The MultiFab object itself is not a fixture because we want to avoid caching
96+
it between amrex.initialize/finalize calls of various tests.
97+
https://github.com/pytest-dev/pytest/discussions/10387
98+
https://github.com/pytest-dev/pytest/issues/5642#issuecomment-1279612764
99+
"""
100+
101+
def create():
102+
num_components = request.param[0]
103+
num_ghost = request.param[1]
104+
mfab = amrex.MultiFab(
105+
boxarr,
106+
distmap,
107+
num_components,
108+
num_ghost,
109+
amrex.MFInfo().set_arena(amrex.The_Device_Arena()),
110+
)
111+
mfab.set_val(0.0, 0, num_components)
112+
return mfab
113+
114+
return create

tests/test_array4.py

Lines changed: 78 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -69,36 +69,81 @@ def test_array4():
6969
x[1, 1, 1] = 44
7070
assert v_carr2np[0, 1, 1, 1] == 44
7171

72-
# from cupy
73-
74-
# to numpy
75-
76-
# to cupy
77-
78-
return
79-
80-
# Check indexing
81-
assert obj[0] == 1
82-
assert obj[1] == 2
83-
assert obj[2] == 3
84-
assert obj[-1] == 3
85-
assert obj[-2] == 2
86-
assert obj[-3] == 1
87-
with pytest.raises(IndexError):
88-
obj[-4]
89-
with pytest.raises(IndexError):
90-
obj[3]
91-
92-
# Check assignment
93-
obj[0] = 2
94-
obj[1] = 3
95-
obj[2] = 4
96-
assert obj[0] == 2
97-
assert obj[1] == 3
98-
assert obj[2] == 4
99-
100-
101-
# def test_iv_conversions():
102-
# obj = amrex.IntVect.max_vector().numpy()
103-
# assert(isinstance(obj, np.ndarray))
104-
# assert(obj.dtype == np.int32)
72+
73+
@pytest.mark.skipif(
74+
amrex.Config.gpu_backend != "CUDA", reason="Requires AMReX_GPU_BACKEND=CUDA"
75+
)
76+
def test_array4_numba():
77+
# https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
78+
from numba import cuda
79+
80+
# numba -> AMReX Array4
81+
x = np.ones(
82+
(
83+
2,
84+
3,
85+
4,
86+
)
87+
) # type: numpy.ndarray
88+
89+
# host-to-device copy
90+
x_numba = cuda.to_device(x) # type: numba.cuda.cudadrv.devicearray.DeviceNDArray
91+
# x_cupy = cupy.asarray(x_numba) # type: cupy.ndarray
92+
x_arr = amrex.Array4_double(x_numba) # type: amrex.Array4_double
93+
94+
assert (
95+
x_arr.__cuda_array_interface__["data"][0]
96+
== x_numba.__cuda_array_interface__["data"][0]
97+
)
98+
99+
# AMReX -> numba
100+
# arr_numba = cuda.as_cuda_array(arr4)
101+
# ... or as MultiFab test
102+
# TODO
103+
104+
105+
@pytest.mark.skipif(
106+
amrex.Config.gpu_backend != "CUDA", reason="Requires AMReX_GPU_BACKEND=CUDA"
107+
)
108+
def test_array4_cupy():
109+
# https://docs.cupy.dev/en/stable/user_guide/interoperability.html
110+
import cupy as cp
111+
112+
# cupy -> AMReX Array4
113+
x = np.ones(
114+
(
115+
2,
116+
3,
117+
4,
118+
)
119+
) # TODO: merge into next line and create on device?
120+
x_cupy = cp.asarray(x) # type: cupy.ndarray
121+
print(f"x_cupy={x_cupy}")
122+
print(x_cupy.__cuda_array_interface__)
123+
124+
# cupy -> AMReX array4
125+
x_arr = amrex.Array4_double(x_cupy) # type: amrex.Array4_double
126+
print(f"x_arr={x_arr}")
127+
print(x_arr.__cuda_array_interface__)
128+
129+
assert (
130+
x_arr.__cuda_array_interface__["data"][0]
131+
== x_cupy.__cuda_array_interface__["data"][0]
132+
)
133+
134+
# AMReX -> cupy
135+
# arr_numba = cuda.as_cuda_array(arr4)
136+
# ... or as MultiFab test
137+
# TODO
138+
139+
140+
@pytest.mark.skipif(
141+
amrex.Config.gpu_backend != "CUDA", reason="Requires AMReX_GPU_BACKEND=CUDA"
142+
)
143+
def test_array4_pytorch():
144+
# https://docs.cupy.dev/en/stable/user_guide/interoperability.html#pytorch
145+
# arr_torch = torch.as_tensor(arr, device='cuda')
146+
# assert(arr_torch.__cuda_array_interface__['data'][0] == arr.__cuda_array_interface__['data'][0])
147+
# TODO
148+
149+
pass

0 commit comments

Comments
 (0)