Skip to content

Commit a04900d

Browse files
markelggarciampreddcherianpre-commit-ci[bot]rabernat
authored
Support for the new compression arguments. (#7551)
* Support for the new compression arguments. Use a dict for the arguments and update it with the encoding, so all variables are passed. * significant_digit and other missing keys added Should close #7634 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * test for the new compression argument * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * move the new test to TestNetCDF4Data * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * simplify this line (code review) * Added entry to whats-new Also removed an unnecesary call to monkeypatch fixture. * bump netcdf4 to 1.6.2 in min-all-deps.yml * parametrize compression in test * Revert "bump netcdf4 to 1.6.2 in min-all-deps.yml" This reverts commit c2ce8d5. * check netCDF4 version and skip test if netcdf4 version <1.6.2 * fix typing * Larger chunks to avoid random blosc errors With smaller chunks it raises "Blosc_FIlter Error: blosc_filter: Buffer is uncompressible." one out of three times. * use decorator to skip old netCDF4 versions * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove stale version-property * fix whats-new.rst * fix requires-decorator * fix for asserts of other tests that use test data * Apply suggestions from code review * Update xarray/tests/__init__.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update xarray/tests/test_backends.py --------- Co-authored-by: garciam <[email protected]> Co-authored-by: Deepak Cherian <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ryan Abernathey <[email protected]> Co-authored-by: Kai Mühlbauer <[email protected]> Co-authored-by: Kai Mühlbauer <[email protected]>
1 parent c35d6b6 commit a04900d

File tree

4 files changed

+117
-17
lines changed

4 files changed

+117
-17
lines changed

doc/whats-new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ New Features
2626

2727
- :py:meth:`xr.cov` and :py:meth:`xr.corr` now support using weights (:issue:`8527`, :pull:`7392`).
2828
By `Llorenç Lledó <https://github.com/lluritu>`_.
29+
- Accept the compression arguments new in netCDF4 1.6.0 in the netCDF4 backend.
30+
See `netCDF4 documentation <https://unidata.github.io/netcdf4-python/#efficient-compression-of-netcdf-variables>`_ for details.
31+
By `Markel García-Díez <https://github.com/markelg>`_. (:issue:`6929`, :pull:`7551`) Note that some
32+
new compression filters need plugins to be installed which may not be available in all netCDF distributions.
2933

3034
Breaking changes
3135
~~~~~~~~~~~~~~~~

xarray/backends/netCDF4_.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,12 @@ def _extract_nc4_variable_encoding(
257257
"_FillValue",
258258
"dtype",
259259
"compression",
260+
"significant_digits",
261+
"quantize_mode",
262+
"blosc_shuffle",
263+
"szip_coding",
264+
"szip_pixels_per_block",
265+
"endian",
260266
}
261267
if lsd_okay:
262268
valid_encodings.add("least_significant_digit")
@@ -497,20 +503,23 @@ def prepare_variable(
497503
if name in self.ds.variables:
498504
nc4_var = self.ds.variables[name]
499505
else:
500-
nc4_var = self.ds.createVariable(
506+
default_args = dict(
501507
varname=name,
502508
datatype=datatype,
503509
dimensions=variable.dims,
504-
zlib=encoding.get("zlib", False),
505-
complevel=encoding.get("complevel", 4),
506-
shuffle=encoding.get("shuffle", True),
507-
fletcher32=encoding.get("fletcher32", False),
508-
contiguous=encoding.get("contiguous", False),
509-
chunksizes=encoding.get("chunksizes"),
510+
zlib=False,
511+
complevel=4,
512+
shuffle=True,
513+
fletcher32=False,
514+
contiguous=False,
515+
chunksizes=None,
510516
endian="native",
511-
least_significant_digit=encoding.get("least_significant_digit"),
517+
least_significant_digit=None,
512518
fill_value=fill_value,
513519
)
520+
default_args.update(encoding)
521+
default_args.pop("_FillValue", None)
522+
nc4_var = self.ds.createVariable(**default_args)
514523

515524
nc4_var.setncatts(attrs)
516525

xarray/tests/__init__.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import importlib
44
import platform
5+
import string
56
import warnings
67
from contextlib import contextmanager, nullcontext
78
from unittest import mock # noqa: F401
@@ -112,6 +113,10 @@ def _importorskip(
112113
not has_h5netcdf_ros3[0], reason="requires h5netcdf 1.3.0"
113114
)
114115

116+
has_netCDF4_1_6_2_or_above, requires_netCDF4_1_6_2_or_above = _importorskip(
117+
"netCDF4", "1.6.2"
118+
)
119+
115120
# change some global options for tests
116121
set_options(warn_for_unclosed_files=True)
117122

@@ -262,28 +267,41 @@ def assert_allclose(a, b, check_default_indexes=True, **kwargs):
262267
xarray.testing._assert_internal_invariants(b, check_default_indexes)
263268

264269

265-
def create_test_data(seed: int | None = None, add_attrs: bool = True) -> Dataset:
270+
_DEFAULT_TEST_DIM_SIZES = (8, 9, 10)
271+
272+
273+
def create_test_data(
274+
seed: int | None = None,
275+
add_attrs: bool = True,
276+
dim_sizes: tuple[int, int, int] = _DEFAULT_TEST_DIM_SIZES,
277+
) -> Dataset:
266278
rs = np.random.RandomState(seed)
267279
_vars = {
268280
"var1": ["dim1", "dim2"],
269281
"var2": ["dim1", "dim2"],
270282
"var3": ["dim3", "dim1"],
271283
}
272-
_dims = {"dim1": 8, "dim2": 9, "dim3": 10}
284+
_dims = {"dim1": dim_sizes[0], "dim2": dim_sizes[1], "dim3": dim_sizes[2]}
273285

274286
obj = Dataset()
275287
obj["dim2"] = ("dim2", 0.5 * np.arange(_dims["dim2"]))
276-
obj["dim3"] = ("dim3", list("abcdefghij"))
288+
if _dims["dim3"] > 26:
289+
raise RuntimeError(
290+
f'Not enough letters for filling this dimension size ({_dims["dim3"]})'
291+
)
292+
obj["dim3"] = ("dim3", list(string.ascii_lowercase[0 : _dims["dim3"]]))
277293
obj["time"] = ("time", pd.date_range("2000-01-01", periods=20))
278294
for v, dims in sorted(_vars.items()):
279295
data = rs.normal(size=tuple(_dims[d] for d in dims))
280296
obj[v] = (dims, data)
281297
if add_attrs:
282298
obj[v].attrs = {"foo": "variable"}
283-
obj.coords["numbers"] = (
284-
"dim3",
285-
np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3], dtype="int64"),
286-
)
299+
300+
if dim_sizes == _DEFAULT_TEST_DIM_SIZES:
301+
numbers_values = np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3], dtype="int64")
302+
else:
303+
numbers_values = np.random.randint(0, 3, _dims["dim3"], dtype="int64")
304+
obj.coords["numbers"] = ("dim3", numbers_values)
287305
obj.encoding = {"foo": "bar"}
288306
assert all(obj.data.flags.writeable for obj in obj.variables.values())
289307
return obj

xarray/tests/test_backends.py

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
requires_h5netcdf_ros3,
7373
requires_iris,
7474
requires_netCDF4,
75+
requires_netCDF4_1_6_2_or_above,
7576
requires_pydap,
7677
requires_pynio,
7778
requires_scipy,
@@ -1486,7 +1487,7 @@ def test_dump_and_open_encodings(self) -> None:
14861487
assert ds.variables["time"].getncattr("units") == units
14871488
assert_array_equal(ds.variables["time"], np.arange(10) + 4)
14881489

1489-
def test_compression_encoding(self) -> None:
1490+
def test_compression_encoding_legacy(self) -> None:
14901491
data = create_test_data()
14911492
data["var2"].encoding.update(
14921493
{
@@ -1767,6 +1768,74 @@ def test_setncattr_string(self) -> None:
17671768
assert_array_equal(one_element_list_of_strings, totest.attrs["bar"])
17681769
assert one_string == totest.attrs["baz"]
17691770

1771+
@pytest.mark.parametrize(
1772+
"compression",
1773+
[
1774+
None,
1775+
"zlib",
1776+
"szip",
1777+
"zstd",
1778+
"blosc_lz",
1779+
"blosc_lz4",
1780+
"blosc_lz4hc",
1781+
"blosc_zlib",
1782+
"blosc_zstd",
1783+
],
1784+
)
1785+
@requires_netCDF4_1_6_2_or_above
1786+
@pytest.mark.xfail(ON_WINDOWS, reason="new compression not yet implemented")
1787+
def test_compression_encoding(self, compression: str | None) -> None:
1788+
data = create_test_data(dim_sizes=(20, 80, 10))
1789+
encoding_params: dict[str, Any] = dict(compression=compression, blosc_shuffle=1)
1790+
data["var2"].encoding.update(encoding_params)
1791+
data["var2"].encoding.update(
1792+
{
1793+
"chunksizes": (20, 40),
1794+
"original_shape": data.var2.shape,
1795+
"blosc_shuffle": 1,
1796+
"fletcher32": False,
1797+
}
1798+
)
1799+
with self.roundtrip(data) as actual:
1800+
expected_encoding = data["var2"].encoding.copy()
1801+
# compression does not appear in the retrieved encoding, which differs
1802+
# from the input encoding. shuffle also changes. Here we modify the
1803+
# expected encoding to account for this
1804+
compression = expected_encoding.pop("compression")
1805+
blosc_shuffle = expected_encoding.pop("blosc_shuffle")
1806+
if compression is not None:
1807+
if "blosc" in compression and blosc_shuffle:
1808+
expected_encoding["blosc"] = {
1809+
"compressor": compression,
1810+
"shuffle": blosc_shuffle,
1811+
}
1812+
expected_encoding["shuffle"] = False
1813+
elif compression == "szip":
1814+
expected_encoding["szip"] = {
1815+
"coding": "nn",
1816+
"pixels_per_block": 8,
1817+
}
1818+
expected_encoding["shuffle"] = False
1819+
else:
1820+
# This will set a key like zlib=true which is what appears in
1821+
# the encoding when we read it.
1822+
expected_encoding[compression] = True
1823+
if compression == "zstd":
1824+
expected_encoding["shuffle"] = False
1825+
else:
1826+
expected_encoding["shuffle"] = False
1827+
1828+
actual_encoding = actual["var2"].encoding
1829+
assert expected_encoding.items() <= actual_encoding.items()
1830+
if (
1831+
encoding_params["compression"] is not None
1832+
and "blosc" not in encoding_params["compression"]
1833+
):
1834+
# regression test for #156
1835+
expected = data.isel(dim1=0)
1836+
with self.roundtrip(expected) as actual:
1837+
assert_equal(expected, actual)
1838+
17701839
@pytest.mark.skip(reason="https://github.com/Unidata/netcdf4-python/issues/1195")
17711840
def test_refresh_from_disk(self) -> None:
17721841
super().test_refresh_from_disk()
@@ -4518,7 +4587,7 @@ def test_extract_nc4_variable_encoding(self) -> None:
45184587
assert {} == encoding
45194588

45204589
@requires_netCDF4
4521-
def test_extract_nc4_variable_encoding_netcdf4(self, monkeypatch):
4590+
def test_extract_nc4_variable_encoding_netcdf4(self):
45224591
# New netCDF4 1.6.0 compression argument.
45234592
var = xr.Variable(("x",), [1, 2, 3], {}, {"compression": "szlib"})
45244593
_extract_nc4_variable_encoding(var, backend="netCDF4", raise_on_invalid=True)

0 commit comments

Comments
 (0)