Skip to content

Commit 385b5d3

Browse files
jstriebelgrlee77joshmoore
authored
add storage_transformers and get/set_partial_values (#1096)
* add storage_transformers and get/set_partial_values * formatting * add docs and release notes * add test_core testcase * Update zarr/creation.py Co-authored-by: Gregory Lee <[email protected]> * apply PR feedback * add comment that storage_transformers=None is the same as storage_transformers=[] * use empty tuple as default for storage_transformers * make mypy happy * better coverage, minor fix, adding rmdir * add missing rmdir to test * increase coverage * improve test coverage * fix TestArrayWithStorageTransformersV3 * Update zarr/creation.py Co-authored-by: Gregory Lee <[email protected]> * pick generic storage transformer changes from #1111 * increase coverage * fix order of storage transformers * retrigger CI * minor fixes * make flake8 happy * apply PR feedback Co-authored-by: Gregory Lee <[email protected]> Co-authored-by: Josh Moore <[email protected]>
1 parent df6e071 commit 385b5d3

File tree

9 files changed

+493
-18
lines changed

9 files changed

+493
-18
lines changed

docs/release.rst

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,20 @@ Release notes
66
# to document your changes. On releases it will be
77
# re-indented so that it does not show up in the notes.
88
9-
.. _unreleased:
9+
.. _unreleased:
1010

11-
Unreleased
12-
----------
11+
Unreleased
12+
----------
1313
..
1414
# .. warning::
1515
# Pre-release! Use :command:`pip install --pre zarr` to evaluate this release.
1616
17+
* Improve Zarr V3 support, adding partial store read/write and storage transformers.
18+
Add two features of the [v3 spec](https://zarr-specs.readthedocs.io/en/latest/core/v3.0.html):
19+
* storage transformers
20+
* `get_partial_values` and `set_partial_values`
21+
By :user:`Jonathan Striebel <jstriebel>`; :issue:`1096`.
22+
1723
.. _release_2.13.6:
1824

1925
2.13.6
@@ -44,7 +50,10 @@ Bug fixes
4450
Appreciation
4551
~~~~~~~~~~~~~
4652

47-
Special thanks to Outreachy participants for contributing to most of the maintenance PRs. Please read the blog post summarising the contribution phase and welcoming new Outreachy interns: https://zarr.dev/blog/welcoming-outreachy-2022-interns/
53+
Special thanks to Outreachy participants for contributing to most of the
54+
maintenance PRs. Please read the blog post summarising the contribution phase
55+
and welcoming new Outreachy interns:
56+
https://zarr.dev/blog/welcoming-outreachy-2022-interns/
4857

4958

5059
Enhancements

zarr/_storage/store.py

Lines changed: 224 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import abc
22
import os
3+
from collections import defaultdict
34
from collections.abc import MutableMapping
5+
from copy import copy
46
from string import ascii_letters, digits
5-
from typing import Any, List, Mapping, Optional, Union
7+
from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union
68

79
from zarr.meta import Metadata2, Metadata3
810
from zarr.util import normalize_storage_path
@@ -254,6 +256,82 @@ def __setitem__(self, key, value):
254256
def __getitem__(self, key):
255257
"""Get a value."""
256258

259+
@abc.abstractmethod
260+
def rmdir(self, path=None):
261+
"""Remove a data path and all its subkeys and related metadata.
262+
Expects a path without the data or meta root prefix."""
263+
264+
@property
265+
def supports_efficient_get_partial_values(self):
266+
return False
267+
268+
def get_partial_values(
269+
self,
270+
key_ranges: Sequence[Tuple[str, Tuple[int, Optional[int]]]]
271+
) -> List[Union[bytes, memoryview, bytearray]]:
272+
"""Get multiple partial values.
273+
key_ranges can be an iterable of key, range pairs,
274+
where a range specifies two integers range_start and range_length
275+
as a tuple, (range_start, range_length).
276+
range_length may be None to indicate to read until the end.
277+
range_start may be negative to start reading range_start bytes
278+
from the end of the file.
279+
A key may occur multiple times with different ranges.
280+
Inserts None for missing keys into the returned list."""
281+
results: List[Union[bytes, memoryview, bytearray]] = (
282+
[None] * len(key_ranges) # type: ignore[list-item]
283+
)
284+
indexed_ranges_by_key: Dict[str, List[Tuple[int, Tuple[int, Optional[int]]]]] = (
285+
defaultdict(list)
286+
)
287+
for i, (key, range_) in enumerate(key_ranges):
288+
indexed_ranges_by_key[key].append((i, range_))
289+
for key, indexed_ranges in indexed_ranges_by_key.items():
290+
try:
291+
value = self[key]
292+
except KeyError: # pragma: no cover
293+
continue
294+
for i, (range_from, range_length) in indexed_ranges:
295+
if range_length is None:
296+
results[i] = value[range_from:]
297+
else:
298+
results[i] = value[range_from:range_from + range_length]
299+
return results
300+
301+
def supports_efficient_set_partial_values(self):
302+
return False
303+
304+
def set_partial_values(self, key_start_values):
305+
"""Set multiple partial values.
306+
key_start_values can be an iterable of key, start and value triplets
307+
as tuples, (key, start, value), where start defines the offset in bytes.
308+
A key may occur multiple times with different starts and non-overlapping values.
309+
Also, start may only be beyond the current value if other values fill the gap.
310+
start may be negative to start writing start bytes from the current
311+
end of the file, ending the file with the new value."""
312+
unique_keys = set(next(zip(*key_start_values)))
313+
values = {}
314+
for key in unique_keys:
315+
old_value = self.get(key)
316+
values[key] = None if old_value is None else bytearray(old_value)
317+
for key, start, value in key_start_values:
318+
if values[key] is None:
319+
assert start == 0
320+
values[key] = value
321+
else:
322+
if start > len(values[key]): # pragma: no cover
323+
raise ValueError(
324+
f"Cannot set value at start {start}, "
325+
+ f"since it is beyond the data at key {key}, "
326+
+ f"having length {len(values[key])}."
327+
)
328+
if start < 0:
329+
values[key][start:] = value
330+
else:
331+
values[key][start:start + len(value)] = value
332+
for key, value in values.items():
333+
self[key] = value
334+
257335
def clear(self):
258336
"""Remove all items from store."""
259337
self.erase_prefix("/")
@@ -303,6 +381,151 @@ def _ensure_store(store):
303381
)
304382

305383

384+
class StorageTransformer(MutableMapping, abc.ABC):
385+
"""Base class for storage transformers. The methods simply pass on the data as-is
386+
and should be overwritten by sub-classes."""
387+
388+
_store_version = 3
389+
_metadata_class = Metadata3
390+
391+
def __init__(self, _type) -> None:
392+
if _type not in self.valid_types: # pragma: no cover
393+
raise ValueError(
394+
f"Storage transformer cannot be initialized with type {_type}, "
395+
+ f"must be one of {list(self.valid_types)}."
396+
)
397+
self.type = _type
398+
self._inner_store = None
399+
400+
def _copy_for_array(self, array, inner_store):
401+
transformer_copy = copy(self)
402+
transformer_copy._inner_store = inner_store
403+
return transformer_copy
404+
405+
@abc.abstractproperty
406+
def extension_uri(self):
407+
pass # pragma: no cover
408+
409+
@abc.abstractproperty
410+
def valid_types(self):
411+
pass # pragma: no cover
412+
413+
def get_config(self):
414+
"""Return a dictionary holding configuration parameters for this
415+
storage transformer. All values must be compatible with JSON encoding."""
416+
# Override in sub-class if need special encoding of config values.
417+
# By default, assume all non-private members are configuration
418+
# parameters except for type .
419+
return {
420+
k: v for k, v in self.__dict__.items()
421+
if not k.startswith('_') and k != "type"
422+
}
423+
424+
@classmethod
425+
def from_config(cls, _type, config):
426+
"""Instantiate storage transformer from a configuration object."""
427+
# override in sub-class if need special decoding of config values
428+
429+
# by default, assume constructor accepts configuration parameters as
430+
# keyword arguments without any special decoding
431+
return cls(_type, **config)
432+
433+
@property
434+
def inner_store(self) -> Union["StorageTransformer", StoreV3]:
435+
assert self._inner_store is not None, (
436+
"inner_store is not initialized, first get a copy via _copy_for_array."
437+
)
438+
return self._inner_store
439+
440+
# The following implementations are usually fine to keep as-is:
441+
442+
def __eq__(self, other):
443+
return (
444+
type(self) == type(other) and
445+
self._inner_store == other._inner_store and
446+
self.get_config() == other.get_config()
447+
)
448+
449+
def erase(self, key):
450+
self.__delitem__(key)
451+
452+
def list(self):
453+
return list(self.keys())
454+
455+
def list_dir(self, prefix):
456+
return StoreV3.list_dir(self, prefix)
457+
458+
def is_readable(self):
459+
return self.inner_store.is_readable()
460+
461+
def is_writeable(self):
462+
return self.inner_store.is_writeable()
463+
464+
def is_listable(self):
465+
return self.inner_store.is_listable()
466+
467+
def is_erasable(self):
468+
return self.inner_store.is_erasable()
469+
470+
def clear(self):
471+
return self.inner_store.clear()
472+
473+
def __enter__(self):
474+
return self.inner_store.__enter__()
475+
476+
def __exit__(self, exc_type, exc_value, traceback):
477+
return self.inner_store.__exit__(exc_type, exc_value, traceback)
478+
479+
def close(self) -> None:
480+
return self.inner_store.close()
481+
482+
# The following implementations might need to be re-implemented
483+
# by subclasses implementing storage transformers:
484+
485+
def rename(self, src_path: str, dst_path: str) -> None:
486+
return self.inner_store.rename(src_path, dst_path)
487+
488+
def list_prefix(self, prefix):
489+
return self.inner_store.list_prefix(prefix)
490+
491+
def erase_prefix(self, prefix):
492+
return self.inner_store.erase_prefix(prefix)
493+
494+
def rmdir(self, path=None):
495+
return self.inner_store.rmdir(path)
496+
497+
def __contains__(self, key):
498+
return self.inner_store.__contains__(key)
499+
500+
def __setitem__(self, key, value):
501+
return self.inner_store.__setitem__(key, value)
502+
503+
def __getitem__(self, key):
504+
return self.inner_store.__getitem__(key)
505+
506+
def __delitem__(self, key):
507+
return self.inner_store.__delitem__(key)
508+
509+
def __iter__(self):
510+
return self.inner_store.__iter__()
511+
512+
def __len__(self):
513+
return self.inner_store.__len__()
514+
515+
@property
516+
def supports_efficient_get_partial_values(self):
517+
return self.inner_store.supports_efficient_get_partial_values
518+
519+
def get_partial_values(self, key_ranges):
520+
return self.inner_store.get_partial_values(key_ranges)
521+
522+
def supports_efficient_set_partial_values(self):
523+
return self.inner_store.supports_efficient_set_partial_values()
524+
525+
def set_partial_values(self, key_start_values):
526+
return self.inner_store.set_partial_values(key_start_values)
527+
528+
306529
# allow MutableMapping for backwards compatibility
307530
StoreLike = Union[BaseStore, MutableMapping]
308531

zarr/core.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@ def __init__(
189189

190190
self._store = store
191191
self._chunk_store = chunk_store
192+
self._transformed_chunk_store = None
192193
self._path = normalize_storage_path(path)
193194
if self._path:
194195
self._key_prefix = self._path + '/'
@@ -292,6 +293,16 @@ def _load_metadata_nosync(self):
292293
filters = [get_codec(config) for config in filters]
293294
self._filters = filters
294295

296+
if self._version == 3:
297+
storage_transformers = meta.get('storage_transformers', [])
298+
if storage_transformers:
299+
transformed_store = self._chunk_store or self._store
300+
for storage_transformer in storage_transformers[::-1]:
301+
transformed_store = storage_transformer._copy_for_array(
302+
self, transformed_store
303+
)
304+
self._transformed_chunk_store = transformed_store
305+
295306
def _refresh_metadata(self):
296307
if not self._cache_metadata:
297308
self._load_metadata()
@@ -371,10 +382,12 @@ def read_only(self, value):
371382
@property
372383
def chunk_store(self):
373384
"""A MutableMapping providing the underlying storage for array chunks."""
374-
if self._chunk_store is None:
375-
return self._store
376-
else:
385+
if self._transformed_chunk_store is not None:
386+
return self._transformed_chunk_store
387+
elif self._chunk_store is not None:
377388
return self._chunk_store
389+
else:
390+
return self._store
378391

379392
@property
380393
def shape(self):
@@ -1800,7 +1813,7 @@ def _set_selection(self, indexer, value, fields=None):
18001813
check_array_shape('value', value, sel_shape)
18011814

18021815
# iterate over chunks in range
1803-
if not hasattr(self.store, "setitems") or self._synchronizer is not None \
1816+
if not hasattr(self.chunk_store, "setitems") or self._synchronizer is not None \
18041817
or any(map(lambda x: x == 0, self.shape)):
18051818
# iterative approach
18061819
for chunk_coords, chunk_selection, out_selection in indexer:
@@ -2229,7 +2242,10 @@ def _encode_chunk(self, chunk):
22292242
cdata = chunk
22302243

22312244
# ensure in-memory data is immutable and easy to compare
2232-
if isinstance(self.chunk_store, KVStore):
2245+
if (
2246+
isinstance(self.chunk_store, KVStore)
2247+
or isinstance(self._chunk_store, KVStore)
2248+
):
22332249
cdata = ensure_bytes(cdata)
22342250

22352251
return cdata

zarr/creation.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def create(shape, chunks=True, dtype=None, compressor='default',
2222
overwrite=False, path=None, chunk_store=None, filters=None,
2323
cache_metadata=True, cache_attrs=True, read_only=False,
2424
object_codec=None, dimension_separator=None, write_empty_chunks=True,
25-
*, zarr_version=None, meta_array=None, **kwargs):
25+
*, zarr_version=None, meta_array=None, storage_transformers=(), **kwargs):
2626
"""Create an array.
2727
2828
Parameters
@@ -85,6 +85,14 @@ def create(shape, chunks=True, dtype=None, compressor='default',
8585
8686
.. versionadded:: 2.11
8787
88+
storage_transformers : sequence of StorageTransformers, optional
89+
Setting storage transformers, changes the storage structure and behaviour
90+
of data coming from the underlying store. The transformers are applied in the
91+
order of the given sequence. Supplying an empty sequence is the same as omitting
92+
the argument or setting it to None. May only be set when using zarr_version 3.
93+
94+
.. versionadded:: 2.13
95+
8896
zarr_version : {None, 2, 3}, optional
8997
The zarr protocol version of the created array. If None, it will be
9098
inferred from ``store`` or ``chunk_store`` if they are provided,
@@ -170,7 +178,7 @@ def create(shape, chunks=True, dtype=None, compressor='default',
170178
init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor,
171179
fill_value=fill_value, order=order, overwrite=overwrite, path=path,
172180
chunk_store=chunk_store, filters=filters, object_codec=object_codec,
173-
dimension_separator=dimension_separator)
181+
dimension_separator=dimension_separator, storage_transformers=storage_transformers)
174182

175183
# instantiate array
176184
z = Array(store, path=path, chunk_store=chunk_store, synchronizer=synchronizer,

0 commit comments

Comments
 (0)