Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ Breaking changes
files to disk (:pull:`10624`).
By `Stephan Hoyer <https://github.com/shoyer>`_.

- Zarr stores written with Xarray now consistently use a default Zarr fill value
of ``NaN`` for float variables, for both Zarr v2 and v3 (:issue:`10646`). All
other dtypes still use the Zarr default ``fill_value`` of zero. To customize,
explicitly set encoding in :py:meth:`~Dataset.to_zarr`, e.g.,
``encoding=dict.fromkeys(ds.data_vars, {'fill_value': 0})``.
By `Stephan Hoyer <https://github.com/shoyer>`_.

Deprecations
~~~~~~~~~~~~

Expand Down
5 changes: 5 additions & 0 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1178,6 +1178,11 @@ def set_variables(
fill_value = attrs.pop("_FillValue", None)
else:
fill_value = v.encoding.pop("fill_value", None)
if fill_value is None and v.dtype.kind == "f":
# For floating point data, Xarray defaults to a fill_value
# of NaN (unlike Zarr, which uses zero):
# https://github.com/pydata/xarray/issues/10646
fill_value = v.dtype.type(np.nan)
if "_FillValue" in attrs:
# replace with encoded fill value
fv = attrs.pop("_FillValue")
Expand Down
13 changes: 9 additions & 4 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2349,10 +2349,15 @@ def to_zarr(
used. Override any existing encodings by providing the ``encoding`` kwarg.

``fill_value`` handling:
There exists a subtlety in interpreting zarr's ``fill_value`` property. For zarr v2 format
arrays, ``fill_value`` is *always* interpreted as an invalid value similar to the ``_FillValue`` attribute
in CF/netCDF. For Zarr v3 format arrays, only an explicit ``_FillValue`` attribute will be used
to mask the data if requested using ``mask_and_scale=True``. See this `GitHub issue <https://github.com/pydata/xarray/issues/5475>`_
There exists a subtlety in interpreting zarr's ``fill_value`` property.
For Zarr v2 format arrays, ``fill_value`` is *always* interpreted as an
invalid value similar to the ``_FillValue`` attribute in CF/netCDF.
For Zarr v3 format arrays, only an explicit ``_FillValue`` attribute
will be used to mask the data if requested using ``mask_and_scale=True``.
To customize the fill value Zarr uses as a default for unwritten
chunks on disk, set ``_FillValue`` in encoding for Zarr v2 or
``fill_value`` for Zarr v3.
See this `GitHub issue <https://github.com/pydata/xarray/issues/5475>`_
for more.

See Also
Expand Down
24 changes: 20 additions & 4 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -4382,6 +4382,23 @@ def roundtrip_dir(
) as ds:
yield ds

@requires_dask
def test_default_zarr_fill_value(self):
    """Check the default fill values seen when reading back a Zarr store.

    A dataset with one float and one integer variable is written with
    ``compute=False`` and reopened. The test asserts that the float
    variable reports a NaN ``_FillValue`` in its encoding, that the integer
    variable reports no ``_FillValue`` encoding, and that the data read
    back is NaN for floats and 0 for ints.
    """
    # Chunked input; the values written (1.0 and 1) differ from the values
    # expected on read (NaN and 0).
    inputs = xr.Dataset({"floats": ("x", [1.0]), "ints": ("x", [1])}).chunk()
    expected = xr.Dataset({"floats": ("x", [np.nan]), "ints": ("x", [0])})
    with self.temp_dir() as (d, store):
        # NOTE(review): presumably compute=False writes only metadata, so the
        # chunks stay unwritten and reads return the store's fill value --
        # confirm against the to_zarr documentation.
        inputs.to_zarr(store, compute=False)
        with open_dataset(store) as on_disk:
            assert np.isnan(on_disk.variables["floats"].encoding["_FillValue"])
            assert (
                "_FillValue" not in on_disk.variables["ints"].encoding
            )  # use default
            if not has_zarr_v3:
                # zarr-python v2 interprets fill_value=None inconsistently
                # so the integer variable is dropped from both sides before
                # the comparison.
                del on_disk["ints"]
                del expected["ints"]
            assert_identical(expected, on_disk)

@pytest.mark.parametrize("consolidated", [True, False, None])
@pytest.mark.parametrize("write_empty", [True, False, None])
def test_write_empty(
Expand Down Expand Up @@ -4420,14 +4437,13 @@ def assert_expected_files(expected: list[str], store: str) -> None:
"0.1.1",
]

# use nan for default fill_value behaviour
data = np.array([np.nan, np.nan, 1.0, np.nan]).reshape((1, 2, 2))

if zarr_format_3:
data = np.array([0.0, 0, 1.0, 0]).reshape((1, 2, 2))
# transform to the path style of zarr 3
# e.g. 0/0/1
expected = [e.replace(".", "/") for e in expected]
else:
# use nan for default fill_value behaviour
data = np.array([np.nan, np.nan, 1.0, np.nan]).reshape((1, 2, 2))

ds = xr.Dataset(data_vars={"test": (("Z", "Y", "X"), data)})

Expand Down
Loading