diff --git a/datatree/datatree.py b/datatree/datatree.py index 52049f47..ec7ffb4b 100644 --- a/datatree/datatree.py +++ b/datatree/datatree.py @@ -2,6 +2,7 @@ import copy import itertools +import re from collections import OrderedDict from html import escape from typing import ( @@ -79,6 +80,72 @@ T_Path = Union[str, NodePath] +_SYMBOLIC_NODE_NAME = r"\w+" +_SYMBOLIC_NODEPATH = rf"\/?{_SYMBOLIC_NODE_NAME}(\/{_SYMBOLIC_NODE_NAME})*\/?" +_SYMBOLIC_REORDERING = f"^{_SYMBOLIC_NODEPATH}->{_SYMBOLIC_NODEPATH}$" + + +def _parse_symbolic_ordering(ordering: str) -> Tuple[List[str], List[str]]: + """Parse a symbolic reordering string of the form 'a/b -> b/a'.""" + if not re.match(_SYMBOLIC_REORDERING, ordering): + raise ValueError(f"Invalid symbolic reordering: {ordering}") + + in_txt, out_txt = ordering.split("->") + old_symbolic_order = re.findall(_SYMBOLIC_NODE_NAME, in_txt) + new_symbolic_order = re.findall(_SYMBOLIC_NODE_NAME, out_txt) + + # Check number of symbols is the same on both sides + if len(old_symbolic_order) != len(new_symbolic_order): + raise ValueError( + "Invalid symbolic reordering. The depth of the symbolic path on each side must be equal, " + f"but the left has {len(old_symbolic_order)} parts and the right has {len(new_symbolic_order)}" + f" parts." + ) + + # Check every symbol appears on both sides + unmatched_symbols = set(old_symbolic_order).symmetric_difference(new_symbolic_order) + if unmatched_symbols: + raise ValueError( + "Invalid symbolic reordering. Every symbol must be present on both sides, but " + f"the symbols {unmatched_symbols} are only present on one side." + ) + + # Check each symbol appears only once on each side + repeated_symbols_in_old_order = set( + sym for sym in old_symbolic_order if old_symbolic_order.count(sym) > 1 + ) + if repeated_symbols_in_old_order: + raise ValueError( + "Invalid symbolic reordering. 
Each symbol must appear only once on each side, " + f"but the symbols {repeated_symbols_in_old_order} appear more than once in the left-hand side." + ) + repeated_symbols_in_new_order = set( + sym for sym in new_symbolic_order if new_symbolic_order.count(sym) > 1 + ) + if repeated_symbols_in_new_order: + raise ValueError( + "Invalid symbolic reordering. Each symbol must appear only once on each side, " + f"but the symbols {repeated_symbols_in_new_order} appear more than once in the right-hand side." + ) + + return old_symbolic_order, new_symbolic_order + + +def _reorder_path(path: str, old_order: List[str], new_order: List[str]) -> str: + """Re-orders the parts of the given path from old_order to match new_order.""" + + parts = NodePath(path).parts + if len(old_order) > len(parts): + raise ValueError( + f"Node {path} only has depth {len(parts)}, " + f"but the reordering requires depth >= {len(old_order)}" + ) + + new_order_indices = [new_order.index(el) for el in old_order] + reordered_parts = [parts[i] for i in new_order_indices] + return str(NodePath(*reordered_parts)) + + def _coerce_to_dataset(data: Dataset | DataArray | None) -> Dataset: if isinstance(data, DataArray): ds = data.to_dataset() @@ -1307,6 +1374,90 @@ def match(self, pattern: str) -> DataTree: } return DataTree.from_dict(matching_nodes, name=self.root.name) + def reorder(self, ordering: str) -> DataTree: + """ + Reorder levels of all leaf nodes in this subtree by rearranging the parts of each of their paths. + + Raises an error on non-hollow trees. + + In general this operation will preserve the depth of each leaf node (and hence depth of the whole subtree), + but will not preserve the width at any level. 
+ + Parameters + ---------- + ordering: str + String specifying symbolically how to reorder levels of each path, for example: + 'a/b/c -> b/c/a' + + Generally must be of the form: + '{OLD_ORDER} -> {NEW_ORDER}' + where OLD_ORDER = 'a/b/***/y/z', representing a symbolic ordering of the parts of the node path, + and NEW_ORDER = 'z/a/***/b/y', representing an arbitrary re-ordering of the same number of parts. + (Here the triple asterisk stands in for an arbitrary number of parts.) + + Symbols must be unique, and each symbol in the old order must have a corresponding entry in the new order, + so the number of symbols must be the same in the new order as in the old order. + + By default paths will be re-ordered starting at the root. To re-order at the leaves instead, an ellipsis can + be pre-prended, e.g. '.../a/b -> .../b/a'. The ellipsis can be present in the new order, old order, both, + or neither. (Ellipses will have no effect on a node which has a depth equal to the number of symbols.) + + Returns + ------- + reordered: DataTree + DataTree object where each node has the same depth as it did originally. + + Examples + -------- + >>> dt = DataTree.from_dict( + ... {"A/B1": xr.Dataset({"x": 1}), "A/B2": xr.Dataset({"x": 2})} + ... ) + >>> dt + DataTree('None', parent=None) + └── DataTree('A') + ├── DataTree('B1') + │ Dimensions: () + │ Data variables: + │ x int64 1 + └── DataTree('B2') + Dimensions: () + Data variables: + x int64 2 + >>> dt.reorder("a/b->b/a") + DataTree('None', parent=None) + ├── DataTree('B1') + │ └── DataTree('A') + │ Dimensions: () + │ Data variables: + │ x int64 1 + └── DataTree('B2') + └── DataTree('A') + Dimensions: () + Data variables: + x int64 2 + """ + if not self.is_hollow: + # TODO can we relax this restriction to only raising if a data-filled node would be moved? + raise ValueError("Only hollow trees can be unambiguously reordered.") + + # TODO do we require the root to have a name if we are to reorder from the root? 
+ + old_symbolic_order, new_symbolic_order = _parse_symbolic_ordering(ordering) + + # only re-order the subtree, and return a new copy, to avoid messing up parents of this node + reordered_dict = { + _reorder_path( + node.relative_to(self), old_symbolic_order, new_symbolic_order + ): node.ds + for node in self.leaves # hollow trees are defined entirely by their leaves + } + + if self.depth > len(new_symbolic_order): + # TODO implement this + raise NotImplementedError() + + return DataTree.from_dict(reordered_dict) + def map_over_subtree( self, func: Callable, @@ -1482,7 +1633,7 @@ def to_netcdf( Note that unlimited_dims may also be set via ``dataset.encoding["unlimited_dims"]``. kwargs : - Addional keyword arguments to be passed to ``xarray.Dataset.to_netcdf`` + Additional keyword arguments to be passed to ``xarray.Dataset.to_netcdf`` """ from .io import _datatree_to_netcdf diff --git a/datatree/tests/test_datatree.py b/datatree/tests/test_datatree.py index e9f373d7..1629a308 100644 --- a/datatree/tests/test_datatree.py +++ b/datatree/tests/test_datatree.py @@ -655,6 +655,58 @@ def test_assign(self): dtt.assert_equal(result, expected) +class TestReorder: + @pytest.mark.parametrize( + "in_dict, reordering, expected_dict", + [ + ({"A": xr.Dataset()}, "a->a", {"A": xr.Dataset()}), + ({"A/B": xr.Dataset()}, "a/b->b/a", {"B/A": xr.Dataset()}), + ({"A/B/C": xr.Dataset()}, "a/b/c->c/b/a", {"C/B/A": xr.Dataset()}), + ( + {"A/B1": xr.Dataset({"x": 1}), "A/B2": xr.Dataset({"x": 2})}, + "a/b->b/a", + {"B1/A": xr.Dataset({"x": 1}), "B2/A": xr.Dataset({"x": 2})}, + ), + ], + ) + def test_reorder(self, in_dict, reordering, expected_dict): + dt = DataTree.from_dict(in_dict) + result = dt.reorder(reordering) + expected = DataTree.from_dict(expected_dict) + dtt.assert_equal(result, expected) + + def test_invalid_order(self): + dt = DataTree.from_dict({"A/B/C": None}) + + with pytest.raises(ValueError, match="Invalid symbolic reordering"): + dt.reorder("a") + + with 
class TestReorder:
    """Tests for ``DataTree.reorder``: valid reorderings, malformed orderings, unsuitable trees."""

    @pytest.mark.parametrize(
        "in_dict, reordering, expected_dict",
        [
            # single level: nothing to move
            ({"A": xr.Dataset()}, "a->a", {"A": xr.Dataset()}),
            # swap two levels
            ({"A/B": xr.Dataset()}, "a/b->b/a", {"B/A": xr.Dataset()}),
            # reverse three levels
            ({"A/B/C": xr.Dataset()}, "a/b/c->c/b/a", {"C/B/A": xr.Dataset()}),
            # sibling leaves holding data end up under separate new parents
            (
                {"A/B1": xr.Dataset({"x": 1}), "A/B2": xr.Dataset({"x": 2})},
                "a/b->b/a",
                {"B1/A": xr.Dataset({"x": 1}), "B2/A": xr.Dataset({"x": 2})},
            ),
        ],
    )
    def test_reorder(self, in_dict, reordering, expected_dict):
        """Reordering a tree built from ``in_dict`` gives the tree built from ``expected_dict``."""
        expected = DataTree.from_dict(expected_dict)
        result = DataTree.from_dict(in_dict).reorder(reordering)
        dtt.assert_equal(result, expected)

    def test_invalid_order(self):
        """Malformed ordering strings are rejected with an informative ValueError."""
        dt = DataTree.from_dict({"A/B/C": None})

        # (ordering, pattern expected in the error message), checked in this order
        bad_orderings = [
            ("a", "Invalid symbolic reordering"),
            ("a->", "Invalid symbolic reordering"),
            ("a->a/b", "depth of the symbolic path on each side must be equal"),
            ("a->b", "only present on one side"),
            ("a/a/b->a/b/b", "symbols {'a'} appear more than once"),
        ]
        for ordering, message in bad_orderings:
            with pytest.raises(ValueError, match=message):
                dt.reorder(ordering)

    def test_invalid_tree(self):
        """Trees that are too shallow, or that are not hollow, cannot be reordered."""
        shallow = DataTree.from_dict({"A": None})
        with pytest.raises(ValueError, match="Node A only has depth 1"):
            shallow.reorder("a/b/c->c/b/a")

        # data stored on the non-leaf node "A" makes the tree non-hollow
        not_hollow = DataTree.from_dict({"A": xr.Dataset({"t": 1}), "A/B": None})
        with pytest.raises(ValueError, match="Only hollow trees"):
            not_hollow.reorder("a/b->b/a")
Examples of data which one might want organise in a grouped or hierarchical manner include: -- Simulation data at multiple resolutions, +- Simulation data at multiple resolutions, or using multiple models, - Observational data about the same system but from multiple different types of sensors, - Mixed experimental and theoretical data, - A systematic study recording the same experiment but with different parameters, -- Heterogenous data, such as demographic and metereological data, +- Heterogeneous data, such as demographic and meteorological data, or even any combination of the above. diff --git a/docs/source/whats-new.rst b/docs/source/whats-new.rst index 5163fdd6..ca50b27e 100644 --- a/docs/source/whats-new.rst +++ b/docs/source/whats-new.rst @@ -23,6 +23,9 @@ v0.0.13 (unreleased) New Features ~~~~~~~~~~~~ +- New :py:meth:`DataTree.reorder` method for re-ordering levels of all nodes in the tree according to a + symbolic pattern such as ``a/b->b/a``. (:pull:`271`) + By `Tom Nicholas <https://github.com/TomNicholas>`_. - New :py:meth:`DataTree.match` method for glob-like pattern matching of node paths. (:pull:`267`) By `Tom Nicholas <https://github.com/TomNicholas>`_. - New :py:meth:`DataTree.is_hollow` property for checking if data is only contained at the leaf nodes. (:pull:`272`)