Skip to content
16 changes: 16 additions & 0 deletions changes/3228.removal.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Removes default chunk encoding settings (filters, serializer, compressors) from the global
configuration object.

This removal is justified on the basis that storing chunk encoding settings in the config required
a brittle, confusing, and inaccurate categorization of array data types, which was particularly
unsuitable after the recent addition of new data types that didn't fit naturally into the
pre-existing categories.

The default chunk encoding is the same (Zstandard compression, and the required object codecs for
variable length data types), but the chunk encoding is now generated by functions that cannot be
reconfigured at runtime. Users who relied on setting the default chunk encoding via the global configuration object should
instead specify the desired chunk encoding explicitly when creating an array.

This change also adds an extra validation step to the creation of Zarr V2 arrays, which ensures that
arrays with a ``VariableLengthUTF8`` or ``VariableLengthBytes`` data type cannot be created without the
correct "object codec".
10 changes: 0 additions & 10 deletions docs/user-guide/arrays.rst
Original file line number Diff line number Diff line change
Expand Up @@ -246,16 +246,6 @@ built-in delta filter::
>>> z.compressors
(LZMA(codec_name='numcodecs.lzma', codec_config={'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),)

The default compressor can be changed by setting the value of the
``array.v2_default_compressor.default`` key using Zarr's :ref:`user-guide-config`, e.g.::

>>> with zarr.config.set({'array.v2_default_compressor.default': {'id': 'blosc'}}):
... z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2)
>>> z.filters
()
>>> z.compressors
(Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),)

To disable compression, set ``compressors=None`` when creating an array, e.g.::

>>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None)
Expand Down
20 changes: 1 addition & 19 deletions docs/user-guide/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,25 +43,7 @@ This is the current default configuration::

>>> zarr.config.pprint()
{'array': {'order': 'C',
'v2_default_compressor': {'default': {'checksum': False,
'id': 'zstd',
'level': 0},
'variable-length-string': {'checksum': False,
'id': 'zstd',
'level': 0}},
'v2_default_filters': {'default': None,
'variable-length-string': [{'id': 'vlen-utf8'}]},
'v3_default_compressors': {'default': [{'configuration': {'checksum': False,
'level': 0},
'name': 'zstd'}],
'variable-length-string': [{'configuration': {'checksum': False,
'level': 0},
'name': 'zstd'}]},
'v3_default_filters': {'default': [], 'variable-length-string': []},
'v3_default_serializer': {'default': {'configuration': {'endian': 'little'},
'name': 'bytes'},
'variable-length-string': {'name': 'vlen-utf8'}},
'write_empty_chunks': False},
'write_empty_chunks': False},
'async': {'concurrency': 10, 'timeout': None},
'buffer': 'zarr.buffer.cpu.Buffer',
'codec_pipeline': {'batch_size': 1,
Expand Down
150 changes: 117 additions & 33 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
from zarr.abc.store import Store, set_or_delete
from zarr.codecs._v2 import V2Codec
from zarr.codecs.bytes import BytesCodec
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
from zarr.codecs.zstd import ZstdCodec
from zarr.core._info import ArrayInfo
from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config
from zarr.core.attributes import Attributes
Expand Down Expand Up @@ -68,11 +70,13 @@
from zarr.core.config import categorize_data_type
from zarr.core.config import config as zarr_config
from zarr.core.dtype import (
VariableLengthBytes,
VariableLengthUTF8,
ZDType,
ZDTypeLike,
parse_data_type,
)
from zarr.core.dtype.common import HasEndianness, HasItemSize
from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec
from zarr.core.indexing import (
BasicIndexer,
BasicSelection,
Expand Down Expand Up @@ -109,6 +113,7 @@
)
from zarr.core.metadata.v2 import (
CompressorLikev2,
get_object_codec_id,
parse_compressor,
parse_filters,
)
Expand Down Expand Up @@ -710,7 +715,10 @@ def _create_metadata_v3(

shape = parse_shapelike(shape)
if codecs is None:
filters, serializer, compressors = _get_default_chunk_encoding_v3(dtype)
filters = default_filters_v3(dtype)
serializer = default_serializer_v3(dtype)
compressors = default_compressors_v3(dtype)

codecs_parsed = (*filters, serializer, *compressors)
else:
codecs_parsed = tuple(codecs)
Expand Down Expand Up @@ -850,10 +858,9 @@ async def _create_v2(
else:
await ensure_no_existing_node(store_path, zarr_format=2)

default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype)
compressor_parsed: CompressorLikev2
if compressor == "auto":
compressor_parsed = default_compressor
compressor_parsed = default_compressor_v2(dtype)
elif isinstance(compressor, BytesBytesCodec):
raise ValueError(
"Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. "
Expand All @@ -863,7 +870,7 @@ async def _create_v2(
compressor_parsed = compressor

if filters is None:
filters = default_filters
filters = default_filters_v2(dtype)

metadata = cls._create_metadata_v2(
shape=shape,
Expand Down Expand Up @@ -4654,19 +4661,80 @@ def _get_default_chunk_encoding_v3(
)


def _get_default_chunk_encoding_v2(
dtype: ZDType[TBaseDType, TBaseScalar],
) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]:
def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]:
    """
    Given a data type, return the default filters for that data type.

    This is an empty tuple. No data types have default filters.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. It is not consulted: the result is the same for
        every data type.

    Returns
    -------
    tuple[ArrayArrayCodec, ...]
        Always ``()``.
    """
    # The default is hard-coded on purpose: nothing is looked up in the global
    # configuration object, so this value cannot be reconfigured at runtime.
    return ()


def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ...]:
    """
    Given a data type, return the default compressors for that data type.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. It is not consulted: every data type gets the
        same compressors.

    Returns
    -------
    tuple[BytesBytesCodec, ...]
        A one-element tuple holding a ``ZstdCodec`` built with its default arguments.
    """
    zstd = ZstdCodec()
    return (zstd,)


def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec:
    """
    Given a data type, return the default serializer for that data type.

    Data types with endianness get a little-endian ``BytesCodec``. Data types that
    require an object codec get the matching variable-length codec: ``VLenBytesCodec``
    for ``"vlen-bytes"`` and ``VLenUTF8Codec`` for ``"vlen-utf8"``. Every other data
    type gets a ``BytesCodec`` with no endianness.

    Parameters
    ----------
    dtype : ZDType
        The data type to pick a serializer for.

    Returns
    -------
    ArrayBytesCodec
        The default serializer for ``dtype``.

    Raises
    ------
    ValueError
        If ``dtype`` requires an object codec whose id is neither ``"vlen-bytes"``
        nor ``"vlen-utf8"``.
    """
    # The endianness check comes first, so it wins over the object-codec check.
    if isinstance(dtype, HasEndianness):
        return BytesCodec(endian="little")

    if not isinstance(dtype, HasObjectCodec):
        return BytesCodec(endian=None)

    codec_id = dtype.object_codec_id
    if codec_id == "vlen-bytes":
        return VLenBytesCodec()
    if codec_id == "vlen-utf8":
        return VLenUTF8Codec()

    msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}."
    raise ValueError(msg)


def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None:
    """
    Given a data type, return the default filters for that data type.

    For data types that require an object codec, namely variable length data types,
    this is a tuple containing the object codec. Otherwise it's ``None``.

    Parameters
    ----------
    dtype : ZDType
        The data type to pick default filters for.

    Returns
    -------
    tuple[numcodecs.abc.Codec] | None
        ``(VLenBytes(),)`` or ``(VLenUTF8(),)`` for data types whose object codec id is
        ``"vlen-bytes"`` or ``"vlen-utf8"`` respectively; ``None`` for data types that
        do not require an object codec.

    Raises
    ------
    ValueError
        If ``dtype`` requires an object codec with any other id.
    """
    if not isinstance(dtype, HasObjectCodec):
        return None

    # numcodecs classes are imported inside the branch that needs them.
    if dtype.object_codec_id == "vlen-bytes":
        from numcodecs import VLenBytes

        return (VLenBytes(),)
    if dtype.object_codec_id == "vlen-utf8":
        from numcodecs import VLenUTF8

        return (VLenUTF8(),)

    msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}."
    raise ValueError(msg)


def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec:
    """
    Given a data type, return the default compressor for that data type.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. It is not consulted: every data type gets the
        same compressor.

    Returns
    -------
    numcodecs.abc.Codec
        A numcodecs ``Zstd`` codec created with ``level=0`` and ``checksum=False``.
    """
    from numcodecs import Zstd

    compressor = Zstd(level=0, checksum=False)
    return compressor


def _parse_chunk_encoding_v2(
Expand All @@ -4678,14 +4746,13 @@ def _parse_chunk_encoding_v2(
"""
Generate chunk encoding classes for Zarr format 2 arrays with optional defaults.
"""
default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype)
_filters: tuple[numcodecs.abc.Codec, ...] | None
_compressor: numcodecs.abc.Codec | None

if compressor is None or compressor == ():
_compressor = None
elif compressor == "auto":
_compressor = default_compressor
_compressor = default_compressor_v2(dtype)
elif isinstance(compressor, tuple | list) and len(compressor) == 1:
_compressor = parse_compressor(compressor[0])
else:
Expand All @@ -4697,7 +4764,7 @@ def _parse_chunk_encoding_v2(
if filters is None:
_filters = None
elif filters == "auto":
_filters = default_filters
_filters = default_filters_v2(dtype)
else:
if isinstance(filters, Iterable):
for idx, f in enumerate(filters):
Expand All @@ -4708,7 +4775,33 @@ def _parse_chunk_encoding_v2(
)
raise TypeError(msg)
_filters = parse_filters(filters)

if isinstance(dtype, HasObjectCodec):
# check the filters and the compressor for the object codec required for this data type
if _filters is None:
if _compressor is None:
object_codec_id = None
else:
object_codec_id = get_object_codec_id((_compressor.get_config(),))
else:
object_codec_id = get_object_codec_id(
(
*[f.get_config() for f in _filters],
_compressor.get_config() if _compressor is not None else None,
)
)
if object_codec_id is None:
if isinstance(dtype, VariableLengthUTF8): # type: ignore[unreachable]
codec_name = "the numcodecs.VLenUTF8 codec" # type: ignore[unreachable]
elif isinstance(dtype, VariableLengthBytes): # type: ignore[unreachable]
codec_name = "the numcodecs.VLenBytes codec" # type: ignore[unreachable]
else:
codec_name = f"an unknown object codec with id {dtype.object_codec_id!r}"
msg = (
f"Data type {dtype} requires {codec_name}, "
"but no such codec was specified in the filters or compressor parameters for "
"this array. "
)
raise ValueError(msg)
return _filters, _compressor


Expand All @@ -4722,14 +4815,11 @@ def _parse_chunk_encoding_v3(
"""
Generate chunk encoding classes for v3 arrays with optional defaults.
"""
default_array_array, default_array_bytes, default_bytes_bytes = _get_default_chunk_encoding_v3(
dtype
)

if filters is None:
out_array_array: tuple[ArrayArrayCodec, ...] = ()
elif filters == "auto":
out_array_array = default_array_array
out_array_array = default_filters_v3(dtype)
else:
maybe_array_array: Iterable[Codec | dict[str, JSON]]
if isinstance(filters, dict | Codec):
Expand All @@ -4739,7 +4829,7 @@ def _parse_chunk_encoding_v3(
out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array)

if serializer == "auto":
out_array_bytes = default_array_bytes
out_array_bytes = default_serializer_v3(dtype)
else:
# TODO: ensure that the serializer is compatible with the ndarray produced by the
# array-array codecs. For example, if a sequence of array-array codecs produces an
Expand All @@ -4749,7 +4839,7 @@ def _parse_chunk_encoding_v3(
if compressors is None:
out_bytes_bytes: tuple[BytesBytesCodec, ...] = ()
elif compressors == "auto":
out_bytes_bytes = default_bytes_bytes
out_bytes_bytes = default_compressors_v3(dtype)
else:
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
if isinstance(compressors, dict | Codec):
Expand All @@ -4759,17 +4849,11 @@ def _parse_chunk_encoding_v3(

out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)

# specialize codecs as needed given the dtype

# TODO: refactor so that the config only contains the name of the codec, and we use the dtype
# to create the codec instance, instead of storing a dict representation of a full codec.

# TODO: ensure that the serializer is compatible with the ndarray produced by the
# array-array codecs. For example, if a sequence of array-array codecs produces an
# array with a single-byte data type, then the serializer should not specify endianness.
if isinstance(out_array_bytes, BytesCodec) and not isinstance(dtype, HasEndianness):
# The default endianness in the bytescodec might not be None, so we need to replace it
out_array_bytes = replace(out_array_bytes, endian=None)

# TODO: add checks to ensure that the right serializer is used for vlen data types
return out_array_array, out_array_bytes, out_bytes_bytes


Expand Down
41 changes: 20 additions & 21 deletions src/zarr/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,25 @@ def enable_gpu(self) -> ConfigSet:
)


# Configuration keys that were removed from the config as part of the 3.1.0 release.
# Every key maps to ``None``; the mapping is handed to ``Config`` through its
# ``deprecations`` argument (presumably ``None`` marks a key as removed with no
# replacement -- confirm against the donfig documentation).
# These deprecations should themselves be removed in 3.1.1 or thereabouts.
# NOTE(review): the sub-keys listed here are ``numeric`` / ``string`` / ``bytes`` / ``raw``;
# confirm that this list covers every key users may still have set.
deprecations = dict.fromkeys(
    (
        "array.v2_default_compressor.numeric",
        "array.v2_default_compressor.string",
        "array.v2_default_compressor.bytes",
        "array.v2_default_filters.string",
        "array.v2_default_filters.bytes",
        "array.v3_default_filters.numeric",
        "array.v3_default_filters.raw",
        "array.v3_default_filters.bytes",
        "array.v3_default_serializer.numeric",
        "array.v3_default_serializer.string",
        "array.v3_default_serializer.bytes",
        "array.v3_default_compressors.string",
        "array.v3_default_compressors.bytes",
        "array.v3_default_compressors",
    ),
    None,
)

# The default configuration for zarr
config = Config(
"zarr",
Expand All @@ -87,27 +106,6 @@ def enable_gpu(self) -> ConfigSet:
"array": {
"order": "C",
"write_empty_chunks": False,
"v2_default_compressor": {
"default": {"id": "zstd", "level": 0, "checksum": False},
"variable-length-string": {"id": "zstd", "level": 0, "checksum": False},
},
"v2_default_filters": {
"default": None,
"variable-length-string": [{"id": "vlen-utf8"}],
},
"v3_default_filters": {"default": [], "variable-length-string": []},
"v3_default_serializer": {
"default": {"name": "bytes", "configuration": {"endian": "little"}},
"variable-length-string": {"name": "vlen-utf8"},
},
"v3_default_compressors": {
"default": [
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
],
"variable-length-string": [
{"name": "zstd", "configuration": {"level": 0, "checksum": False}}
],
},
},
"async": {"concurrency": 10, "timeout": None},
"threading": {"max_workers": None},
Expand All @@ -132,6 +130,7 @@ def enable_gpu(self) -> ConfigSet:
"ndbuffer": "zarr.buffer.cpu.NDBuffer",
}
],
deprecations=deprecations,
)


Expand Down
Loading