Skip to content

Commit 6ce0526

Browse files
[v3] Array.append (#2413)
* feature(array): implement Array.append; changes Array.resize to be an in-place operation
* better error message
* no more warn
* style: pre-commit fixes

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 8a33df7 commit 6ce0526

File tree

3 files changed

+302
-19
lines changed

3 files changed

+302
-19
lines changed

src/zarr/core/array.py

Lines changed: 112 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import json
44
from asyncio import gather
5-
from dataclasses import dataclass, field, replace
5+
from dataclasses import dataclass, field
66
from itertools import starmap
77
from logging import getLogger
88
from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload
@@ -1104,15 +1104,15 @@ async def setitem(
11041104
)
11051105
return await self._set_selection(indexer, value, prototype=prototype)
11061106

1107-
async def resize(self, new_shape: ChunkCoords, delete_outside_chunks: bool = True) -> Self:
1107+
async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None:
1108+
new_shape = parse_shapelike(new_shape)
11081109
assert len(new_shape) == len(self.metadata.shape)
11091110
new_metadata = self.metadata.update_shape(new_shape)
11101111

1111-
# Remove all chunks outside of the new shape
1112-
old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape))
1113-
new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape))
1114-
11151112
if delete_outside_chunks:
1113+
# Remove all chunks outside of the new shape
1114+
old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape))
1115+
new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape))
11161116

11171117
async def _delete_key(key: str) -> None:
11181118
await (self.store_path / key).delete()
@@ -1128,7 +1128,63 @@ async def _delete_key(key: str) -> None:
11281128

11291129
# Write new metadata
11301130
await self._save_metadata(new_metadata)
1131-
return replace(self, metadata=new_metadata)
1131+
1132+
# Update metadata (in place)
1133+
object.__setattr__(self, "metadata", new_metadata)
1134+
1135+
async def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords:
    """Append `data` to the array along `axis`.

    The array is first resized (via ``self.resize``) so that `axis` grows by
    ``data.shape[axis]``; `data` is then written into the newly added region
    (via ``self.setitem``).

    Parameters
    ----------
    data : array-like
        Data to be appended. Objects without a ``shape`` attribute are
        converted with ``np.asanyarray``.
    axis : int
        Axis along which to append. Negative values count from the last
        axis, as in NumPy.

    Returns
    -------
    new_shape : tuple
        Shape of the array after the append.

    Raises
    ------
    ValueError
        If `axis` is out of bounds for this array, or if `data` does not
        have the same number of dimensions and the same size as this array
        in every dimension other than `axis`.

    Notes
    -----
    The size of all dimensions other than `axis` must match between this
    array and `data`.
    """
    # ensure data is array-like
    if not hasattr(data, "shape"):
        data = np.asanyarray(data)

    # remember old shape; resize() replaces the metadata on this object
    old_shape = tuple(self.shape)
    data_shape = tuple(data.shape)
    ndim = len(old_shape)

    # Normalise the axis. Without this a negative (or too large) axis never
    # matches any dimension index, so nothing would be grown and the whole
    # array would be overwritten by `data` whenever the shapes are equal.
    if not -ndim <= axis < ndim:
        raise ValueError(f"axis {axis} is out of bounds for array of dimension {ndim}")
    axis %= ndim

    self_shape_preserved = old_shape[:axis] + old_shape[axis + 1 :]
    data_shape_preserved = data_shape[:axis] + data_shape[axis + 1 :]
    if len(data_shape) != ndim or self_shape_preserved != data_shape_preserved:
        raise ValueError(
            "shape of data to append is not compatible with the array. "
            f"The shape of the data is {data_shape} "
            f"and the shape of the array is {old_shape}. "
            "All dimensions must match except for the dimension being "
            "appended."
        )

    # determine new shape: only `axis` grows
    new_shape = (
        old_shape[:axis] + (old_shape[axis] + data_shape[axis],) + old_shape[axis + 1 :]
    )

    # resize
    await self.resize(new_shape)

    # store data in the region that was just added along `axis`
    append_selection = tuple(
        slice(old_shape[i], new_shape[i]) if i == axis else slice(None) for i in range(ndim)
    )
    await self.setitem(append_selection, data)

    return new_shape
11321188

11331189
async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self:
11341190
# metadata.attributes is "frozen" so we simply clear and update the dict
@@ -1147,7 +1203,8 @@ async def info(self) -> None:
11471203
raise NotImplementedError
11481204

11491205

1150-
@dataclass(frozen=True)
1206+
# TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed
1207+
@dataclass(frozen=False)
11511208
class Array:
11521209
"""Instantiate an array from an initialized store."""
11531210

@@ -1297,6 +1354,11 @@ def shape(self) -> ChunkCoords:
12971354
"""
12981355
return self._async_array.shape
12991356

1357+
@shape.setter
def shape(self, value: ChunkCoords) -> None:
    """Assign a new shape to the array; equivalent to ``self.resize(value)``."""
    requested_shape = value
    self.resize(requested_shape)
1361+
13001362
@property
13011363
def chunks(self) -> ChunkCoords:
13021364
"""Returns a tuple of integers describing the length of each dimension of a chunk of the array.
@@ -2754,18 +2816,18 @@ def blocks(self) -> BlockIndex:
27542816
:func:`set_block_selection` for documentation and examples."""
27552817
return BlockIndex(self)
27562818

2757-
def resize(self, new_shape: ChunkCoords) -> Array:
2819+
def resize(self, new_shape: ShapeLike) -> None:
27582820
"""
27592821
Change the shape of the array by growing or shrinking one or more
27602822
dimensions.
27612823
2762-
This method does not modify the original Array object. Instead, it returns a new Array
2763-
with the specified shape.
2824+
Parameters
2825+
----------
2826+
new_shape : tuple
2827+
New shape of the array.
27642828
27652829
Notes
27662830
-----
2767-
When resizing an array, the data are not rearranged in any way.
2768-
27692831
If one or more dimensions are shrunk, any chunks falling outside the
27702832
new array shape will be deleted from the underlying store.
27712833
However, it is noteworthy that the chunks partially falling inside the new array
@@ -2778,7 +2840,6 @@ def resize(self, new_shape: ChunkCoords) -> Array:
27782840
>>> import zarr
27792841
>>> z = zarr.zeros(shape=(10000, 10000),
27802842
>>> chunk_shape=(1000, 1000),
2781-
>>> store=StorePath(MemoryStore(mode="w")),
27822843
>>> dtype="i4",)
27832844
>>> z.shape
27842845
(10000, 10000)
@@ -2791,10 +2852,43 @@ def resize(self, new_shape: ChunkCoords) -> Array:
27912852
>>> z2.shape
27922853
(50, 50)
27932854
"""
2794-
resized = sync(self._async_array.resize(new_shape))
2795-
# TODO: remove this cast when type inference improves
2796-
_resized = cast(AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], resized)
2797-
return type(self)(_resized)
2855+
sync(self._async_array.resize(new_shape))
2856+
2857+
def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords:
    """Append `data` to the array along `axis`.

    Thin synchronous wrapper: the call is forwarded to the wrapped async
    array's ``append`` and run through ``sync``.

    Parameters
    ----------
    data : array-like
        Data to be appended.
    axis : int
        Axis along which to append.

    Returns
    -------
    new_shape : tuple
        Whatever the async array's ``append`` returns (the array's new shape).

    Notes
    -----
    The size of all dimensions other than `axis` must match between this
    array and `data`.

    Examples
    --------
    >>> import numpy as np
    >>> import zarr
    >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000)
    >>> z = zarr.array(a, chunks=(1000, 100))
    >>> z.shape
    (10000, 1000)
    >>> z.append(a)
    (20000, 1000)
    >>> z.append(np.vstack([a, a]), axis=1)
    (20000, 2000)
    >>> z.shape
    (20000, 2000)
    """
    pending = self._async_array.append(data, axis=axis)
    return sync(pending)
27982892

27992893
def update_attributes(self, new_attributes: dict[str, JSON]) -> Array:
28002894
# TODO: remove this cast when type inference improves

tests/test_array.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,194 @@ def test_update_attrs(zarr_format: int) -> None:
419419
assert arr2.attrs["foo"] == "bar"
420420

421421

422+
@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_resize_1d(store: MemoryStore, zarr_format: int) -> None:
    """Grow, shrink and re-grow (via the ``shape`` setter) a 1-D array in place."""
    int32 = np.dtype("i4")
    source = np.arange(105, dtype="i4")
    z = zarr.create(
        shape=105, chunks=10, dtype="i4", fill_value=0, store=store, zarr_format=zarr_format
    )
    z[:] = source

    def check_layout(expected_shape: tuple[int, ...]) -> None:
        # shape/dtype must agree between the array object and the data read
        # back from it, and resizing must never change the chunking
        assert expected_shape == z.shape
        assert expected_shape == z[:].shape
        assert int32 == z.dtype
        assert int32 == z[:].dtype
        assert (10,) == z.chunks

    check_layout((105,))
    np.testing.assert_array_equal(source, z[:])

    # growing keeps the existing data; the new tail reads back as fill_value
    z.resize(205)
    check_layout((205,))
    np.testing.assert_array_equal(source, z[:105])
    np.testing.assert_array_equal(np.zeros(100, dtype="i4"), z[105:])

    # shrinking keeps the leading part of the data
    z.resize(55)
    check_layout((55,))
    np.testing.assert_array_equal(source[:55], z[:])

    # via shape setter
    target = (105,)
    z.shape = target
    assert target == z.shape
    assert target == z[:].shape
459+
460+
461+
@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_resize_2d(store: MemoryStore, zarr_format: int) -> None:
    """Grow and shrink a 2-D array in place, then resize it via the ``shape`` setter."""
    int32 = np.dtype("i4")
    source = np.arange(105 * 105, dtype="i4").reshape((105, 105))
    z = zarr.create(
        shape=(105, 105),
        chunks=(10, 10),
        dtype="i4",
        fill_value=0,
        store=store,
        zarr_format=zarr_format,
    )
    z[:] = source

    def check_layout(expected_shape: tuple[int, ...]) -> None:
        # shape/dtype must agree between the array object and the data read
        # back from it, and resizing must never change the chunking
        assert expected_shape == z.shape
        assert expected_shape == z[:].shape
        assert int32 == z.dtype
        assert int32 == z[:].dtype
        assert (10, 10) == z.chunks

    check_layout((105, 105))
    np.testing.assert_array_equal(source, z[:])

    # grow both dimensions: old data stays put, new area reads as fill_value
    z.resize((205, 205))
    check_layout((205, 205))
    np.testing.assert_array_equal(source, z[:105, :105])
    np.testing.assert_array_equal(np.zeros((100, 205), dtype="i4"), z[105:, :])
    np.testing.assert_array_equal(np.zeros((205, 100), dtype="i4"), z[:, 105:])

    # shrink both dimensions
    z.resize((55, 55))
    check_layout((55, 55))
    np.testing.assert_array_equal(source[:55, :55], z[:])

    # shrink to a single column
    z.resize((55, 1))
    check_layout((55, 1))
    np.testing.assert_array_equal(source[:55, :1], z[:])

    # shrink rows, re-grow columns: only the first 10 columns (one chunk wide)
    # are expected to still hold data; the rest reads back as fill_value
    z.resize((1, 55))
    check_layout((1, 55))
    np.testing.assert_array_equal(source[:1, :10], z[:, :10])
    np.testing.assert_array_equal(np.zeros((1, 55 - 10), dtype="i4"), z[:, 10:55])

    # via shape setter
    target = (105, 105)
    z.shape = target
    assert target == z.shape
    assert target == z[:].shape
521+
522+
523+
@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_append_1d(store: MemoryStore, zarr_format: int) -> None:
    """Append an ndarray and then a plain list to a 1-D array."""
    base = np.arange(105)
    z = zarr.create(
        shape=base.shape, chunks=10, dtype=base.dtype, store=store, zarr_format=zarr_format
    )
    z[:] = base

    def check_matches(expected: np.ndarray) -> None:
        # the array must mirror `expected` while keeping its chunking
        assert expected.shape == z.shape
        assert expected.dtype == z.dtype
        assert (10,) == z.chunks
        np.testing.assert_array_equal(expected, z[:])

    check_matches(base)

    extra = np.arange(105, 205)
    after_first = np.append(base, extra)
    assert z.shape == (105,)
    z.append(extra)
    check_matches(after_first)

    # check append handles array-like
    plain_list = [1, 2, 3]
    after_second = np.append(after_first, plain_list)
    z.append(plain_list)
    check_matches(after_second)
551+
552+
553+
@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_append_2d(store: MemoryStore, zarr_format: int) -> None:
    """Append a 2-D block along the default axis (0)."""
    base = np.arange(105 * 105, dtype="i4").reshape((105, 105))
    z = zarr.create(
        shape=base.shape, chunks=(10, 10), dtype=base.dtype, store=store, zarr_format=zarr_format
    )
    z[:] = base

    def check_matches(expected: np.ndarray) -> None:
        # the array must mirror `expected` while keeping its chunking
        assert expected.shape == z.shape
        assert expected.dtype == z.dtype
        assert (10, 10) == z.chunks
        stored = z[:]
        np.testing.assert_array_equal(expected, stored)

    check_matches(base)

    extra = np.arange(105 * 105, 2 * 105 * 105, dtype="i4").reshape((105, 105))
    stacked = np.append(base, extra, axis=0)
    z.append(extra)
    check_matches(stacked)
575+
576+
577+
@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_append_2d_axis(store: MemoryStore, zarr_format: int) -> None:
    """Append a 2-D block along an explicitly chosen axis (1)."""
    base = np.arange(105 * 105, dtype="i4").reshape((105, 105))
    z = zarr.create(
        shape=base.shape, chunks=(10, 10), dtype=base.dtype, store=store, zarr_format=zarr_format
    )
    z[:] = base

    def check_matches(expected: np.ndarray) -> None:
        # the array must mirror `expected` while keeping its chunking
        assert expected.shape == z.shape
        assert expected.dtype == z.dtype
        assert (10, 10) == z.chunks
        np.testing.assert_array_equal(expected, z[:])

    check_matches(base)

    extra = np.arange(105 * 105, 2 * 105 * 105, dtype="i4").reshape((105, 105))
    widened = np.append(base, extra, axis=1)
    z.append(extra, axis=1)
    check_matches(widened)
597+
598+
599+
@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_append_bad_shape(store: MemoryStore, zarr_format: int) -> None:
    """Appending data with a different number of dimensions must raise ValueError."""
    flat = np.arange(100)
    z = zarr.create(
        shape=flat.shape, chunks=10, dtype=flat.dtype, store=store, zarr_format=zarr_format
    )
    z[:] = flat
    # same values, but 2-D: incompatible with the 1-D array
    square = flat.reshape(10, 10)
    with pytest.raises(ValueError):
        z.append(square)
608+
609+
422610
@pytest.mark.parametrize("order", ["C", "F", None])
423611
@pytest.mark.parametrize("zarr_format", [2, 3])
424612
@pytest.mark.parametrize("store", ["memory"], indirect=True)

0 commit comments

Comments
 (0)