Skip to content

Commit a04900d

Browse files
markelggarciampreddcherianpre-commit-ci[bot]rabernat
authored
Support for the new compression arguments. (#7551)
* Support for the new compression arguments. Use a dict for the arguments and update it with the encoding, so all variables are passed. * significant_digit and other missing keys added Should close #7634 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * test for the new compression argument * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * move the new test to TestNetCDF4Data * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * simplify this line (code review) * Added entry to whats-new Also removed an unnecesary call to monkeypatch fixture. * bump netcdf4 to 1.6.2 in min-all-deps.yml * parametrize compression in test * Revert "bump netcdf4 to 1.6.2 in min-all-deps.yml" This reverts commit c2ce8d5. * check netCDF4 version and skip test if netcdf4 version <1.6.2 * fix typing * Larger chunks to avoid random blosc errors With smaller chunks it raises "Blosc_FIlter Error: blosc_filter: Buffer is uncompressible." one out of three times. * use decorator to skip old netCDF4 versions * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove stale version-property * fix whats-new.rst * fix requires-decorator * fix for asserts of other tests that use test data * Apply suggestions from code review * Update xarray/tests/__init__.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update xarray/tests/test_backends.py --------- Co-authored-by: garciam <[email protected]> Co-authored-by: Deepak Cherian <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ryan Abernathey <[email protected]> Co-authored-by: Kai Mühlbauer <[email protected]> Co-authored-by: Kai Mühlbauer <[email protected]>
1 parent c35d6b6 commit a04900d

File tree

4 files changed

+117
-17
lines changed

4 files changed

+117
-17
lines changed

doc/whats-new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ New Features
2626

2727
- :py:meth:`xr.cov` and :py:meth:`xr.corr` now support using weights (:issue:`8527`, :pull:`7392`).
2828
By `Llorenç Lledó <https://github.com/lluritu>`_.
29+
- Accept the compression arguments new in netCDF4 1.6.0 in the netCDF4 backend.
30+
See `netCDF4 documentation <https://unidata.github.io/netcdf4-python/#efficient-compression-of-netcdf-variables>`_ for details.
31+
By `Markel García-Díez <https://github.com/markelg>`_. (:issue:`6929`, :pull:`7551`) Note that some
32+
new compression filters need plugins to be installed which may not be available in all netCDF distributions.
2933

3034
Breaking changes
3135
~~~~~~~~~~~~~~~~

xarray/backends/netCDF4_.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,12 @@ def _extract_nc4_variable_encoding(
257257
"_FillValue",
258258
"dtype",
259259
"compression",
260+
"significant_digits",
261+
"quantize_mode",
262+
"blosc_shuffle",
263+
"szip_coding",
264+
"szip_pixels_per_block",
265+
"endian",
260266
}
261267
if lsd_okay:
262268
valid_encodings.add("least_significant_digit")
@@ -497,20 +503,23 @@ def prepare_variable(
497503
if name in self.ds.variables:
498504
nc4_var = self.ds.variables[name]
499505
else:
500-
nc4_var = self.ds.createVariable(
506+
default_args = dict(
501507
varname=name,
502508
datatype=datatype,
503509
dimensions=variable.dims,
504-
zlib=encoding.get("zlib", False),
505-
complevel=encoding.get("complevel", 4),
506-
shuffle=encoding.get("shuffle", True),
507-
fletcher32=encoding.get("fletcher32", False),
508-
contiguous=encoding.get("contiguous", False),
509-
chunksizes=encoding.get("chunksizes"),
510+
zlib=False,
511+
complevel=4,
512+
shuffle=True,
513+
fletcher32=False,
514+
contiguous=False,
515+
chunksizes=None,
510516
endian="native",
511-
least_significant_digit=encoding.get("least_significant_digit"),
517+
least_significant_digit=None,
512518
fill_value=fill_value,
513519
)
520+
default_args.update(encoding)
521+
default_args.pop("_FillValue", None)
522+
nc4_var = self.ds.createVariable(**default_args)
514523

515524
nc4_var.setncatts(attrs)
516525

xarray/tests/__init__.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import importlib
44
import platform
5+
import string
56
import warnings
67
from contextlib import contextmanager, nullcontext
78
from unittest import mock # noqa: F401
@@ -112,6 +113,10 @@ def _importorskip(
112113
not has_h5netcdf_ros3[0], reason="requires h5netcdf 1.3.0"
113114
)
114115

116+
has_netCDF4_1_6_2_or_above, requires_netCDF4_1_6_2_or_above = _importorskip(
117+
"netCDF4", "1.6.2"
118+
)
119+
115120
# change some global options for tests
116121
set_options(warn_for_unclosed_files=True)
117122

@@ -262,28 +267,41 @@ def assert_allclose(a, b, check_default_indexes=True, **kwargs):
262267
xarray.testing._assert_internal_invariants(b, check_default_indexes)
263268

264269

265-
def create_test_data(seed: int | None = None, add_attrs: bool = True) -> Dataset:
270+
_DEFAULT_TEST_DIM_SIZES = (8, 9, 10)
271+
272+
273+
def create_test_data(
274+
seed: int | None = None,
275+
add_attrs: bool = True,
276+
dim_sizes: tuple[int, int, int] = _DEFAULT_TEST_DIM_SIZES,
277+
) -> Dataset:
266278
rs = np.random.RandomState(seed)
267279
_vars = {
268280
"var1": ["dim1", "dim2"],
269281
"var2": ["dim1", "dim2"],
270282
"var3": ["dim3", "dim1"],
271283
}
272-
_dims = {"dim1": 8, "dim2": 9, "dim3": 10}
284+
_dims = {"dim1": dim_sizes[0], "dim2": dim_sizes[1], "dim3": dim_sizes[2]}
273285

274286
obj = Dataset()
275287
obj["dim2"] = ("dim2", 0.5 * np.arange(_dims["dim2"]))
276-
obj["dim3"] = ("dim3", list("abcdefghij"))
288+
if _dims["dim3"] > 26:
289+
raise RuntimeError(
290+
f'Not enough letters for filling this dimension size ({_dims["dim3"]})'
291+
)
292+
obj["dim3"] = ("dim3", list(string.ascii_lowercase[0 : _dims["dim3"]]))
277293
obj["time"] = ("time", pd.date_range("2000-01-01", periods=20))
278294
for v, dims in sorted(_vars.items()):
279295
data = rs.normal(size=tuple(_dims[d] for d in dims))
280296
obj[v] = (dims, data)
281297
if add_attrs:
282298
obj[v].attrs = {"foo": "variable"}
283-
obj.coords["numbers"] = (
284-
"dim3",
285-
np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3], dtype="int64"),
286-
)
299+
300+
if dim_sizes == _DEFAULT_TEST_DIM_SIZES:
301+
numbers_values = np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3], dtype="int64")
302+
else:
303+
numbers_values = np.random.randint(0, 3, _dims["dim3"], dtype="int64")
304+
obj.coords["numbers"] = ("dim3", numbers_values)
287305
obj.encoding = {"foo": "bar"}
288306
assert all(obj.data.flags.writeable for obj in obj.variables.values())
289307
return obj

xarray/tests/test_backends.py

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
requires_h5netcdf_ros3,
7373
requires_iris,
7474
requires_netCDF4,
75+
requires_netCDF4_1_6_2_or_above,
7576
requires_pydap,
7677
requires_pynio,
7778
requires_scipy,
@@ -1486,7 +1487,7 @@ def test_dump_and_open_encodings(self) -> None:
14861487
assert ds.variables["time"].getncattr("units") == units
14871488
assert_array_equal(ds.variables["time"], np.arange(10) + 4)
14881489

1489-
def test_compression_encoding(self) -> None:
1490+
def test_compression_encoding_legacy(self) -> None:
14901491
data = create_test_data()
14911492
data["var2"].encoding.update(
14921493
{
@@ -1767,6 +1768,74 @@ def test_setncattr_string(self) -> None:
17671768
assert_array_equal(one_element_list_of_strings, totest.attrs["bar"])
17681769
assert one_string == totest.attrs["baz"]
17691770

1771+
@pytest.mark.parametrize(
1772+
"compression",
1773+
[
1774+
None,
1775+
"zlib",
1776+
"szip",
1777+
"zstd",
1778+
"blosc_lz",
1779+
"blosc_lz4",
1780+
"blosc_lz4hc",
1781+
"blosc_zlib",
1782+
"blosc_zstd",
1783+
],
1784+
)
1785+
@requires_netCDF4_1_6_2_or_above
1786+
@pytest.mark.xfail(ON_WINDOWS, reason="new compression not yet implemented")
1787+
def test_compression_encoding(self, compression: str | None) -> None:
1788+
data = create_test_data(dim_sizes=(20, 80, 10))
1789+
encoding_params: dict[str, Any] = dict(compression=compression, blosc_shuffle=1)
1790+
data["var2"].encoding.update(encoding_params)
1791+
data["var2"].encoding.update(
1792+
{
1793+
"chunksizes": (20, 40),
1794+
"original_shape": data.var2.shape,
1795+
"blosc_shuffle": 1,
1796+
"fletcher32": False,
1797+
}
1798+
)
1799+
with self.roundtrip(data) as actual:
1800+
expected_encoding = data["var2"].encoding.copy()
1801+
# compression does not appear in the retrieved encoding, which differs
1802+
# from the input encoding. shuffle also changes. Here we modify the
1803+
# expected encoding to account for this
1804+
compression = expected_encoding.pop("compression")
1805+
blosc_shuffle = expected_encoding.pop("blosc_shuffle")
1806+
if compression is not None:
1807+
if "blosc" in compression and blosc_shuffle:
1808+
expected_encoding["blosc"] = {
1809+
"compressor": compression,
1810+
"shuffle": blosc_shuffle,
1811+
}
1812+
expected_encoding["shuffle"] = False
1813+
elif compression == "szip":
1814+
expected_encoding["szip"] = {
1815+
"coding": "nn",
1816+
"pixels_per_block": 8,
1817+
}
1818+
expected_encoding["shuffle"] = False
1819+
else:
1820+
# This will set a key like zlib=true which is what appears in
1821+
# the encoding when we read it.
1822+
expected_encoding[compression] = True
1823+
if compression == "zstd":
1824+
expected_encoding["shuffle"] = False
1825+
else:
1826+
expected_encoding["shuffle"] = False
1827+
1828+
actual_encoding = actual["var2"].encoding
1829+
assert expected_encoding.items() <= actual_encoding.items()
1830+
if (
1831+
encoding_params["compression"] is not None
1832+
and "blosc" not in encoding_params["compression"]
1833+
):
1834+
# regression test for #156
1835+
expected = data.isel(dim1=0)
1836+
with self.roundtrip(expected) as actual:
1837+
assert_equal(expected, actual)
1838+
17701839
@pytest.mark.skip(reason="https://github.com/Unidata/netcdf4-python/issues/1195")
17711840
def test_refresh_from_disk(self) -> None:
17721841
super().test_refresh_from_disk()
@@ -4518,7 +4587,7 @@ def test_extract_nc4_variable_encoding(self) -> None:
45184587
assert {} == encoding
45194588

45204589
@requires_netCDF4
4521-
def test_extract_nc4_variable_encoding_netcdf4(self, monkeypatch):
4590+
def test_extract_nc4_variable_encoding_netcdf4(self):
45224591
# New netCDF4 1.6.0 compression argument.
45234592
var = xr.Variable(("x",), [1, 2, 3], {}, {"compression": "szlib"})
45244593
_extract_nc4_variable_encoding(var, backend="netCDF4", raise_on_invalid=True)

0 commit comments

Comments
 (0)