Refactoring/fixing zarr-python v3 incompatibilities in xarray datatrees #10020
@@ -655,10 +655,18 @@ def open_store(
             use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask,
             zarr_format=zarr_format,
         )
-        group_paths = list(_iter_zarr_groups(zarr_group, parent=group))
+        from zarr.core.group import Group
+
+        group_members: dict = dict(zarr_group.members(max_depth=1000))
+        group_members = {
+            (f"{group}/{path}" if group != "/" else path): group_store
+            for path, group_store in group_members.items()
+            if isinstance(group_store, Group)
+        }
+        group_members[group] = zarr_group
         return {
             group: cls(
-                zarr_group.get(group),
+                group_store,
                 mode,
                 consolidate_on_close,
                 append_dim,
@@ -669,7 +677,7 @@ def open_store(
                 use_zarr_fill_value_as_mask,
                 cache_members=cache_members,
             )
-            for group in group_paths
+            for group, group_store in group_members.items()
         }

     @classmethod

Review comments on this hunk:

> what do you think about moving this if/else into […]

> This will make the code way much cleaner!
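For context on the refactor above: zarr-python v3's `Group.members(max_depth=...)` walks the whole hierarchy in a single call and yields `(relative_path, node)` pairs for arrays and groups alike, which is what lets the recursive `_iter_zarr_groups` helper (removed further down) be dropped. A minimal sketch of the idea, using a hypothetical store path that is not part of the PR:

```python
# Illustrative sketch only, assuming zarr-python >= 3 is installed.
import zarr

root = zarr.open_group("example.zarr", mode="r")  # hypothetical store path

# members(max_depth=...) returns (relative_path, node) pairs for every node
# below the root; keeping only Group instances drops the arrays.
subgroups = {
    path: node
    for path, node in dict(root.members(max_depth=1000)).items()
    if isinstance(node, zarr.Group)
}

# members() does not include the root itself, so the PR adds it back
# explicitly (the `group_members[group] = zarr_group` line above).
subgroups["/"] = root
```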
@@ -1651,8 +1659,6 @@ def open_groups_as_dict(
         zarr_version=None,
         zarr_format=None,
     ) -> dict[str, Dataset]:
-        from xarray.core.treenode import NodePath
-
         filename_or_obj = _normalize_path(filename_or_obj)

         # Check for a group and make it a parent if it exists
@@ -1698,14 +1704,6 @@ def open_groups_as_dict(
         return groups_dict


-def _iter_zarr_groups(root: ZarrGroup, parent: str = "/") -> Iterable[str]:
-    parent_nodepath = NodePath(parent)
-    yield str(parent_nodepath)
-    for path, group in root.groups():
-        gpath = parent_nodepath / path
-        yield from _iter_zarr_groups(group, parent=str(gpath))
-
-
 def _get_open_params(
     store,
     mode,
@@ -1751,7 +1749,7 @@ def _get_open_params(
         consolidated = False

     if _zarr_v3():
-        missing_exc = ValueError
+        missing_exc = AssertionError
     else:
         missing_exc = zarr.errors.GroupNotFoundError

@@ -191,7 +191,7 @@ def create_test_datatree():
     """

     def _create_test_datatree(modify=lambda ds: ds):
-        set1_data = modify(xr.Dataset({"a": 0, "b": 1}))
+        set1_data = modify(xr.Dataset({"a": 1, "b": 2}))
         set2_data = modify(xr.Dataset({"a": ("x", [2, 3]), "b": ("x", [0.1, 0.2])}))
         root_data = modify(xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])}))
@@ -15,7 +15,7 @@
     requires_dask,
     requires_h5netcdf,
     requires_netCDF4,
-    requires_zarr,
+    requires_zarr_v3,
 )

 if TYPE_CHECKING:
@@ -141,14 +141,16 @@ def unaligned_datatree_zarr(tmp_path_factory):
         a        (y) int64 16B ...
         b        (x) float64 16B ...
     """
+    from zarr import consolidate_metadata
+
     filepath = tmp_path_factory.mktemp("data") / "unaligned_simple_datatree.zarr"
     root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})
     set1_data = xr.Dataset({"a": 0, "b": 1})
     set2_data = xr.Dataset({"a": ("y", [2, 3]), "b": ("x", [0.1, 0.2])})
     root_data.to_zarr(filepath)
     set1_data.to_zarr(filepath, group="/Group1", mode="a")
     set2_data.to_zarr(filepath, group="/Group2", mode="a")
     set1_data.to_zarr(filepath, group="/Group1/subgroup1", mode="a")
+    consolidate_metadata(filepath)

     yield filepath
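The new `consolidate_metadata` call matters because each `to_zarr` call above writes one group at a time; re-consolidating from the root afterwards lets `open_datatree`/`open_groups` read the whole tree without the "Failed to open Zarr store with consolidated metadata" warning that several tests below filter out. A rough standalone sketch of the same pattern, using a hypothetical path not taken from the PR:

```python
# Sketch only; "unaligned.zarr" is a hypothetical location.
import xarray as xr
import zarr

path = "unaligned.zarr"
xr.Dataset({"a": ("y", [6, 7, 8])}).to_zarr(path, mode="w")
xr.Dataset({"a": 0, "b": 1}).to_zarr(path, group="/Group1", mode="a")
xr.Dataset({"b": ("x", [0.1, 0.2])}).to_zarr(path, group="/Group2", mode="a")

# Writing groups one at a time can leave the store-level consolidated
# metadata incomplete; re-consolidating from the root fixes that.
zarr.consolidate_metadata(path)
```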
@@ -373,15 +375,12 @@ class TestH5NetCDFDatatreeIO(DatatreeIOBase):
     engine: T_DataTreeNetcdfEngine | None = "h5netcdf"


-@pytest.mark.skipif(
-    have_zarr_v3, reason="datatree support for zarr 3 is not implemented yet"
-)
-@requires_zarr
+@requires_zarr_v3
 class TestZarrDatatreeIO:
     engine = "zarr"

     def test_to_zarr(self, tmpdir, simple_datatree):
-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt = simple_datatree
         original_dt.to_zarr(filepath)
@@ -391,16 +390,31 @@ def test_to_zarr(self, tmpdir, simple_datatree):
     def test_zarr_encoding(self, tmpdir, simple_datatree):
         from numcodecs.blosc import Blosc

-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt = simple_datatree

-        comp = {"compressor": Blosc(cname="zstd", clevel=3, shuffle=2)}
+        blosc = Blosc(cname="zstd", clevel=3, shuffle="shuffle").get_config()
+        comp = {"compressor": {"name": blosc.pop("id"), "configuration": blosc}}
         enc = {"/set2": {var: comp for var in original_dt["/set2"].dataset.data_vars}}
         original_dt.to_zarr(filepath, encoding=enc)

         with open_datatree(filepath, engine="zarr") as roundtrip_dt:
-            print(roundtrip_dt["/set2/a"].encoding)
-            assert roundtrip_dt["/set2/a"].encoding["compressor"] == comp["compressor"]
+            retrieved_compressor = roundtrip_dt["/set2/a"].encoding["compressors"][
+                0
+            ]  # Get the BloscCodec object
+            assert (
+                retrieved_compressor.cname.name
+                == comp["compressor"]["configuration"]["cname"]
+            )
+            assert (
+                retrieved_compressor.clevel
+                == comp["compressor"]["configuration"]["clevel"]
+            )
+            assert (
+                retrieved_compressor.shuffle.name
+                == comp["compressor"]["configuration"]["shuffle"]
+            )

             enc["/not/a/group"] = {"foo": "bar"}  # type: ignore[dict-item]
             with pytest.raises(ValueError, match="unexpected encoding group.*"):
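For readers unfamiliar with the new assertions: under zarr-python v3 the compressor round-trips as a `zarr.codecs.BloscCodec` object (exposed through the plural `compressors` encoding key) rather than a numcodecs `Blosc` instance, and its `cname`/`shuffle` attributes are enums, hence the `.name` accesses. A small sketch of the attribute access the test relies on, assuming zarr-python >= 3:

```python
# Sketch only: how the round-tripped codec object compares to the dict-style
# encoding written above (assumes zarr-python >= 3).
from zarr.codecs import BloscCodec

codec = BloscCodec(cname="zstd", clevel=3, shuffle="shuffle")

print(codec.cname.name)    # "zstd"    -> compared against configuration["cname"]
print(codec.clevel)        # 3         -> compared against configuration["clevel"]
print(codec.shuffle.name)  # "shuffle" -> compared against configuration["shuffle"]
```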
@@ -409,9 +423,9 @@ def test_zarr_encoding(self, tmpdir, simple_datatree):
     def test_to_zarr_zip_store(self, tmpdir, simple_datatree):
         from zarr.storage import ZipStore

-        filepath = tmpdir / "test.zarr.zip"
+        filepath = str(tmpdir / "test.zarr.zip")
         original_dt = simple_datatree
-        store = ZipStore(filepath)
+        store = ZipStore(filepath, mode="w")
         original_dt.to_zarr(store)

         with open_datatree(store, engine="zarr") as roundtrip_dt:  # type: ignore[arg-type, unused-ignore]
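The behavioural note behind this change: zarr-python v3's `ZipStore` needs an explicit write mode before anything can be written into it, hence `mode="w"`. A minimal roundtrip sketch under that assumption, with a hypothetical file name:

```python
# Sketch only; assumes zarr-python >= 3 and uses a hypothetical file name.
import xarray as xr
from zarr.storage import ZipStore

dt = xr.DataTree.from_dict({"/": xr.Dataset({"a": ("x", [1, 2])})})

store = ZipStore("tree.zarr.zip", mode="w")  # explicit write mode for v3
dt.to_zarr(store)

# Mirroring the test above: the still-open store can be handed straight
# back to open_datatree for the roundtrip check.
with xr.open_datatree(store, engine="zarr") as roundtrip:
    print(roundtrip["a"].values)
```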
@@ -432,32 +446,29 @@ def test_to_zarr_not_consolidated(self, tmpdir, simple_datatree):
             assert_equal(original_dt, roundtrip_dt)

     def test_to_zarr_default_write_mode(self, tmpdir, simple_datatree):
-        import zarr
-
-        simple_datatree.to_zarr(tmpdir)
+        simple_datatree.to_zarr(str(tmpdir))

         # with default settings, to_zarr should not overwrite an existing dir
-        with pytest.raises(zarr.errors.ContainsGroupError):
-            simple_datatree.to_zarr(tmpdir)
+        with pytest.raises(FileExistsError):
+            simple_datatree.to_zarr(str(tmpdir))

     @requires_dask
     def test_to_zarr_compute_false(self, tmpdir, simple_datatree):
         import dask.array as da

         filepath = tmpdir / "test.zarr"
         original_dt = simple_datatree.chunk()
-        original_dt.to_zarr(filepath, compute=False)
+        original_dt.to_zarr(str(filepath), compute=False)

         for node in original_dt.subtree:
             for name, variable in node.dataset.variables.items():
                 var_dir = filepath / node.path / name
                 var_files = var_dir.listdir()
-                assert var_dir / ".zarray" in var_files
-                assert var_dir / ".zattrs" in var_files
+                assert var_dir / "zarr.json" in var_files
                 if isinstance(variable.data, da.Array):
-                    assert var_dir / "0" not in var_files
+                    assert var_dir / "zarr.json" in var_files
                 else:
-                    assert var_dir / "0" in var_files
+                    assert var_dir / "c" in var_files

     def test_to_zarr_inherited_coords(self, tmpdir):
         original_dt = DataTree.from_dict(
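The rewritten assertions reflect the on-disk layout change between formats: zarr format 2 stores per-array `.zarray`/`.zattrs` files with flat chunk keys like `0`, while format 3 keeps a single `zarr.json` per node and nests written chunks under a `c/` prefix. A sketch of what the test is checking, assuming zarr-python >= 3 writes format 3 by default and using a hypothetical output path:

```python
# Sketch only; assumes zarr-python >= 3 (so format 3 is written) and a
# hypothetical output path.
import os
import xarray as xr

xr.Dataset({"a": ("x", [1, 2, 3])}).to_zarr("layout-demo.zarr", mode="w")

print(sorted(os.listdir("layout-demo.zarr/a")))
# format 2 would show ['.zarray', '.zattrs', '0']; format 3 shows ['c', 'zarr.json']
```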
@@ -466,7 +477,7 @@ def test_to_zarr_inherited_coords(self, tmpdir):
                 "/sub": xr.Dataset({"b": (("x",), [5, 6])}),
             }
         )
-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt.to_zarr(filepath)

         with open_datatree(filepath, engine="zarr") as roundtrip_dt:
@@ -476,7 +487,7 @@ def test_to_zarr_inherited_coords(self, tmpdir):

     def test_open_groups_round_trip(self, tmpdir, simple_datatree) -> None:
         """Test `open_groups` opens a zarr store with the `simple_datatree` structure."""
-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt = simple_datatree
         original_dt.to_zarr(filepath)

@@ -501,7 +512,7 @@ def test_open_datatree(self, unaligned_datatree_zarr) -> None:

     @requires_dask
     def test_open_datatree_chunks(self, tmpdir, simple_datatree) -> None:
-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")

         chunks = {"x": 2, "y": 1}
@@ -527,9 +538,8 @@ def test_open_groups(self, unaligned_datatree_zarr) -> None:
         unaligned_dict_of_datasets = open_groups(unaligned_datatree_zarr, engine="zarr")

         assert "/" in unaligned_dict_of_datasets.keys()
-        assert "/Group1" in unaligned_dict_of_datasets.keys()
-        assert "/Group1/subgroup1" in unaligned_dict_of_datasets.keys()
-        assert "/Group2" in unaligned_dict_of_datasets.keys()
+        assert "Group1" in unaligned_dict_of_datasets.keys()
+        assert "Group2" in unaligned_dict_of_datasets.keys()
         # Check that group name returns the correct datasets
         with xr.open_dataset(
             unaligned_datatree_zarr, group="/", engine="zarr"
@@ -538,22 +548,20 @@ def test_open_groups(self, unaligned_datatree_zarr) -> None:
         with xr.open_dataset(
             unaligned_datatree_zarr, group="Group1", engine="zarr"
         ) as expected:
-            assert_identical(unaligned_dict_of_datasets["/Group1"], expected)
-        with xr.open_dataset(
-            unaligned_datatree_zarr, group="/Group1/subgroup1", engine="zarr"
-        ) as expected:
-            assert_identical(unaligned_dict_of_datasets["/Group1/subgroup1"], expected)
+            assert_identical(unaligned_dict_of_datasets["Group1"], expected)
         with xr.open_dataset(
-            unaligned_datatree_zarr, group="/Group2", engine="zarr"
+            unaligned_datatree_zarr, group="Group2", engine="zarr"
         ) as expected:
-            assert_identical(unaligned_dict_of_datasets["/Group2"], expected)
-
+            assert_identical(unaligned_dict_of_datasets["Group2"], expected)
         for ds in unaligned_dict_of_datasets.values():
             ds.close()

+    @pytest.mark.filterwarnings(
+        "ignore:Failed to open Zarr store with consolidated metadata:RuntimeWarning"
+    )
     def test_open_datatree_specific_group(self, tmpdir, simple_datatree) -> None:
         """Test opening a specific group within a Zarr store using `open_datatree`."""
-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         group = "/set2"
         original_dt = simple_datatree
         original_dt.to_zarr(filepath)
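The key changes above reflect that `xr.open_groups` returns a plain dict keyed by group path, where the root keeps its `"/"` key but nested groups no longer carry a leading slash. A short usage sketch, with a hypothetical store path:

```python
# Sketch only; the store path is hypothetical.
import xarray as xr

groups = xr.open_groups("unaligned_simple_datatree.zarr", engine="zarr")

print(sorted(groups))       # expected to include '/', 'Group1', 'Group2', ...
root = groups["/"]          # the root dataset keeps the "/" key
group1 = groups["Group1"]   # nested groups are keyed without a leading slash
```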
@@ -568,10 +576,7 @@ def test_open_groups_chunks(self, tmpdir) -> None:
         """Test `open_groups` with chunks on a zarr store."""

-        chunks = {"x": 2, "y": 1}
-        filepath = tmpdir / "test.zarr"
-
-        chunks = {"x": 2, "y": 1}
-
+        chunks = {"x": 2, "y": 1}
+        filepath = str(tmpdir / "test.zarr")
         root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})
         set1_data = xr.Dataset({"a": ("y", [-1, 0, 1]), "b": ("x", [-10, 6])})
         set2_data = xr.Dataset({"a": ("y", [1, 2, 3]), "b": ("x", [0.1, 0.2])})
@@ -605,13 +610,16 @@ def test_write_subgroup(self, tmpdir):
         expected_dt = original_dt.copy()
         expected_dt.name = None

-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt.to_zarr(filepath)

         with open_datatree(filepath, engine="zarr") as roundtrip_dt:
             assert_equal(original_dt, roundtrip_dt)
             assert_identical(expected_dt, roundtrip_dt)

+    @pytest.mark.filterwarnings(
+        "ignore:Failed to open Zarr store with consolidated metadata:RuntimeWarning"
+    )
     def test_write_inherited_coords_false(self, tmpdir):
         original_dt = DataTree.from_dict(
             {
@@ -620,7 +628,7 @@ def test_write_inherited_coords_false(self, tmpdir):
             }
         )

-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt.to_zarr(filepath, write_inherited_coords=False)

         with open_datatree(filepath, engine="zarr") as roundtrip_dt:
@@ -631,6 +639,9 @@ def test_write_inherited_coords_false(self, tmpdir):
         with open_datatree(filepath, group="child", engine="zarr") as roundtrip_child:
             assert_identical(expected_child, roundtrip_child)

+    @pytest.mark.filterwarnings(
+        "ignore:Failed to open Zarr store with consolidated metadata:RuntimeWarning"
+    )
     def test_write_inherited_coords_true(self, tmpdir):
         original_dt = DataTree.from_dict(
             {
@@ -639,7 +650,7 @@ def test_write_inherited_coords_true(self, tmpdir):
             }
         )

-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt.to_zarr(filepath, write_inherited_coords=True)

         with open_datatree(filepath, engine="zarr") as roundtrip_dt: