Skip to content

Commit c129eb1

Browse files
committed
Merge branch 'main' into datatree_alignment_docs
2 parents 118e802 + 70a2a55 commit c129eb1

File tree

3 files changed

+290
-21
lines changed

3 files changed

+290
-21
lines changed

doc/api.rst

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -761,16 +761,17 @@ Compare one ``DataTree`` object to another.
761761
DataTree.equals
762762
DataTree.identical
763763

764-
.. Indexing
765-
.. --------
764+
Indexing
765+
--------
766766

767-
.. Index into all nodes in the subtree simultaneously.
767+
Index into all nodes in the subtree simultaneously.
768768

769-
.. .. autosummary::
770-
.. :toctree: generated/
769+
.. autosummary::
770+
:toctree: generated/
771+
772+
DataTree.isel
773+
DataTree.sel
771774

772-
.. DataTree.isel
773-
.. DataTree.sel
774775
.. DataTree.drop_sel
775776
.. DataTree.drop_isel
776777
.. DataTree.head

xarray/core/datatree.py

Lines changed: 189 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,13 @@
3232
from xarray.core.merge import dataset_update_method
3333
from xarray.core.options import OPTIONS as XR_OPTS
3434
from xarray.core.treenode import NamedNode, NodePath
35+
from xarray.core.types import Self
3536
from xarray.core.utils import (
3637
Default,
3738
FilteredMapping,
3839
Frozen,
3940
_default,
41+
drop_dims_from_indexers,
4042
either_dict_or_kwargs,
4143
maybe_wrap_array,
4244
)
@@ -54,7 +56,12 @@
5456

5557
from xarray.core.datatree_io import T_DataTreeNetcdfEngine, T_DataTreeNetcdfTypes
5658
from xarray.core.merge import CoercibleMapping, CoercibleValue
57-
from xarray.core.types import ErrorOptions, NetcdfWriteModes, ZarrWriteModes
59+
from xarray.core.types import (
60+
ErrorOptions,
61+
ErrorOptionsWithWarn,
62+
NetcdfWriteModes,
63+
ZarrWriteModes,
64+
)
5865

5966
# """
6067
# DEVELOPERS' NOTE
@@ -806,6 +813,9 @@ def _replace_node(
806813
if data is not _default:
807814
self._set_node_data(ds)
808815

816+
if self.parent is not None:
817+
_deduplicate_inherited_coordinates(self, self.parent)
818+
809819
self.children = children
810820

811821
def _copy_node(
@@ -1081,7 +1091,7 @@ def from_dict(
10811091
d: Mapping[str, Dataset | DataTree | None],
10821092
/,
10831093
name: str | None = None,
1084-
) -> DataTree:
1094+
) -> Self:
10851095
"""
10861096
Create a datatree from a dictionary of data objects, organised by paths into the tree.
10871097
@@ -1601,3 +1611,180 @@ def to_zarr(
16011611
compute=compute,
16021612
**kwargs,
16031613
)
1614+
1615+
def _selective_indexing(
    self,
    func: Callable[[Dataset, Mapping[Any, Any]], Dataset],
    indexers: Mapping[Any, Any],
    missing_dims: ErrorOptionsWithWarn = "raise",
) -> Self:
    """Index every node of the subtree with ``func``, one node at a time.

    Each node is only indexed along the dimensions it actually has, so an
    indexer naming a dimension that is absent from a particular node is
    simply not applied there. Coordinates that a node merely inherits from
    a parent are stripped from that node's result.

    Parameters
    ----------
    func : callable
        Called as ``func(dataset, node_indexers)`` for each node; must
        return the indexed dataset for that node.
    indexers : mapping
        Indexers keyed by dimension name.
    missing_dims : {"raise", "warn", "ignore"}, default: "raise"
        Forwarded to ``drop_dims_from_indexers`` along with the set of
        dimensions collected from all nodes of the subtree.

    Returns
    -------
    A new tree assembled with ``type(self).from_dict`` from the per-node
    results, carrying the name of this node.
    """
    # Dimensions are checked against the whole subtree rather than a single
    # node: an indexer is acceptable as long as some node defines its dim.
    subtree_dims = set().union(*(member._node_dims for member in self.subtree))
    indexers = drop_dims_from_indexers(indexers, subtree_dims, missing_dims)

    indexed_by_path = {}
    for member in self.subtree:
        applicable = {
            dim: indexer for dim, indexer in indexers.items() if dim in member.dims
        }
        indexed = func(member.dataset, applicable)
        # Indexing the dataset of each node produces redundant coordinates
        # whenever an index is inherited from a parent node. Avoiding their
        # creation altogether would require indexing at the Variable level
        # instead of the Dataset level, so they are removed afterwards.
        for dim in applicable:
            defined_on_node = dim in member._node_coord_variables
            if not defined_on_node and dim in indexed.coords:
                # Index-backed duplicates would be handled by
                # _deduplicate_inherited_coordinates(), but indexing (e.g.,
                # with a scalar) can also leave behind scalar coordinates,
                # which have to be deleted explicitly.
                del indexed.coords[dim]
        indexed_by_path[member.path] = indexed

    return type(self).from_dict(indexed_by_path, name=self.name)
1649+
1650+
def isel(
    self,
    indexers: Mapping[Any, Any] | None = None,
    drop: bool = False,
    missing_dims: ErrorOptionsWithWarn = "raise",
    **indexers_kwargs: Any,
) -> Self:
    """Return a new data tree with each array indexed by integer position
    along the specified dimension(s).

    Values are selected from each array through its ``__getitem__``
    method, but without the caller having to know the order of each
    array's dimensions.

    Parameters
    ----------
    indexers : dict, optional
        A dict whose keys are dimension names and whose values are
        integers, slice objects, array-likes or DataArrays.
        If DataArrays are passed as indexers, xarray-style indexing will be
        carried out. See :ref:`indexing` for the details.
        One of indexers or indexers_kwargs must be provided.
    drop : bool, default: False
        If ``drop=True``, drop coordinate variables indexed by integers
        instead of making them scalar.
    missing_dims : {"raise", "warn", "ignore"}, default: "raise"
        What to do if dimensions that should be selected from are not
        present in the tree:

        - "raise": raise an exception
        - "warn": raise a warning, and ignore the missing dimensions
        - "ignore": ignore the missing dimensions

    **indexers_kwargs : {dim: indexer, ...}, optional
        The keyword arguments form of ``indexers``.
        One of indexers or indexers_kwargs must be provided.

    Returns
    -------
    obj : DataTree
        A new DataTree with the same contents as this data tree, except each
        array and dimension is indexed by the appropriate indexers.
        If indexer DataArrays have coordinates that do not conflict with
        this object, then these coordinates will be attached.
        In general, each array's data will be a view of the array's data
        in this tree, unless vectorized indexing was triggered by using
        an array indexer, in which case the data will be a copy.

    See Also
    --------
    DataTree.sel
    Dataset.isel
    """
    merged_indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "isel")

    def _isel_node(dataset, node_indexers):
        # Per-node operation handed to _selective_indexing, which decides
        # which of the indexers apply to the node at hand.
        return dataset.isel(node_indexers, drop=drop)

    return self._selective_indexing(
        _isel_node, merged_indexers, missing_dims=missing_dims
    )
1711+
1712+
def sel(
    self,
    indexers: Mapping[Any, Any] | None = None,
    method: str | None = None,
    tolerance: int | float | Iterable[int | float] | None = None,
    drop: bool = False,
    **indexers_kwargs: Any,
) -> Self:
    """Return a new data tree with each array indexed by tick labels
    along the specified dimension(s).

    Unlike `DataTree.isel`, the indexers given to this method are labels
    rather than integer positions.

    Label look-ups are delegated to pandas Index objects, which makes
    label-based indexing essentially as fast as integer indexing.

    It also means that pandas's (well documented) indexing logic applies:
    string shortcuts can be used for datetime indexes (e.g., '2000-01' to
    select all values in January 2000), and slices include both the start
    and the stop value, unlike normal Python indexing.

    Parameters
    ----------
    indexers : dict, optional
        A dict with keys matching dimensions and values given
        by scalars, slices or arrays of tick labels. For dimensions with
        multi-index, the indexer may also be a dict-like object with keys
        matching index level names.
        If DataArrays are passed as indexers, xarray-style indexing will be
        carried out. See :ref:`indexing` for the details.
        One of indexers or indexers_kwargs must be provided.
    method : {None, "nearest", "pad", "ffill", "backfill", "bfill"}, optional
        Method to use for inexact matches:

        * None (default): only exact matches
        * pad / ffill: propagate last valid index value forward
        * backfill / bfill: propagate next valid index value backward
        * nearest: use nearest valid index value
    tolerance : optional
        Maximum distance between original and new labels for inexact
        matches. The values of the index at the matching locations must
        satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
    drop : bool, optional
        If ``drop=True``, drop coordinate variables in `indexers` instead
        of making them scalar.
    **indexers_kwargs : {dim: indexer, ...}, optional
        The keyword arguments form of ``indexers``.
        One of indexers or indexers_kwargs must be provided.

    Returns
    -------
    obj : DataTree
        A new DataTree with the same contents as this data tree, except each
        variable and dimension is indexed by the appropriate indexers.
        If indexer DataArrays have coordinates that do not conflict with
        this object, then these coordinates will be attached.
        In general, each array's data will be a view of the array's data
        in this tree, unless vectorized indexing was triggered by using
        an array indexer, in which case the data will be a copy.

    See Also
    --------
    DataTree.isel
    Dataset.sel
    """
    merged_indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "sel")
    sel_options = {"method": method, "tolerance": tolerance, "drop": drop}

    def _sel_node(dataset, node_indexers):
        # TODO: reimplement in terms of map_index_queries(), to avoid
        # redundant look-ups of integer positions from labels (via indexes)
        # on child nodes.
        return dataset.sel(node_indexers, **sel_options)

    return self._selective_indexing(_sel_node, merged_indexers)

xarray/tests/test_datatree.py

Lines changed: 93 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -971,7 +971,6 @@ def test_ipython_key_completions(self, create_test_datatree):
971971
var_keys = list(dt.variables.keys())
972972
assert all(var_key in key_completions for var_key in var_keys)
973973

974-
@pytest.mark.xfail(reason="sel not implemented yet")
975974
def test_operation_with_attrs_but_no_data(self):
976975
# tests bug from xarray-datatree GH262
977976
xs = xr.Dataset({"testvar": xr.DataArray(np.ones((2, 3)))})
@@ -1316,6 +1315,19 @@ def test_inherited_coords_with_index_are_deduplicated(self):
13161315
expected = xr.Dataset({"foo": ("x", [4, 5])})
13171316
assert_identical(child_dataset, expected)
13181317

1318+
def test_deduplicated_after_setitem(self):
    """Assigning a parent's coordinate onto a child must not duplicate it."""
    # regression test for GH #9601
    root_dataset = xr.Dataset(coords={"x": [1, 2]})
    dt = DataTree.from_dict({"/": root_dataset, "/b": None})

    # Store the root's "x" coordinate on the (initially empty) child node.
    dt["b/x"] = dt["x"]

    # With inherited variables excluded, the child is expected to hold
    # nothing of its own.
    node_local = dt.children["b"].to_dataset(inherited=False)
    assert_identical(node_local, xr.Dataset())
1330+
13191331
def test_inconsistent_dims(self):
13201332
expected_msg = _exact_match(
13211333
"""
@@ -1561,26 +1573,95 @@ def test_filter(self):
15611573
assert_identical(elders, expected)
15621574

15631575

1564-
class TestDSMethodInheritance:
1565-
@pytest.mark.xfail(reason="isel not implemented yet")
1566-
def test_dataset_method(self):
1567-
ds = xr.Dataset({"a": ("x", [1, 2, 3])})
1568-
dt = DataTree.from_dict(
1576+
class TestIndexing:
    """Tests for positional (``isel``) and label-based (``sel``) indexing of
    a whole DataTree."""

    def test_isel_siblings(self):
        """Sibling nodes sharing a dimension name but differing in length."""
        first = xr.Dataset({"a": ("x", [1, 2])})
        second = xr.Dataset({"b": ("x", [1, 2, 3])})
        tree = DataTree.from_dict({"/first": first, "/second": second})

        # A negative integer picks the last element of each sibling.
        last_of_each = DataTree.from_dict(
            {
                "/first": xr.Dataset({"a": 2}),
                "/second": xr.Dataset({"b": 3}),
            }
        )
        assert_equal(tree.isel(x=-1), last_of_each)

        # A length-one slice and a length-one list keep the dimension.
        head_of_each = DataTree.from_dict(
            {
                "/first": xr.Dataset({"a": ("x", [1])}),
                "/second": xr.Dataset({"b": ("x", [1])}),
            }
        )
        assert_equal(tree.isel(x=slice(1)), head_of_each)
        assert_equal(tree.isel(x=[0]), head_of_each)

        # The full slice leaves the tree unchanged.
        assert_equal(tree.isel(x=slice(None)), tree)

    def test_isel_inherited(self):
        """A child indexed along a dimension whose coordinate is on the root."""
        tree = DataTree.from_dict(
            {
                "/": xr.Dataset(coords={"x": [1, 2]}),
                "/child": xr.Dataset({"foo": ("x", [3, 4])}),
            }
        )

        # Scalar indexing: the coordinate turns scalar on the root.
        scalar_expected = DataTree.from_dict(
            {
                "/": xr.Dataset(coords={"x": 2}),
                "/child": xr.Dataset({"foo": 4}),
            }
        )
        assert_equal(tree.isel(x=-1), scalar_expected)

        # Scalar indexing with drop=True: the coordinate is gone.
        dropped_expected = DataTree.from_dict({"/child": xr.Dataset({"foo": 4})})
        assert_equal(tree.isel(x=-1, drop=True), dropped_expected)

        # List indexing keeps the dimension on both nodes.
        list_expected = DataTree.from_dict(
            {
                "/": xr.Dataset(coords={"x": [1]}),
                "/child": xr.Dataset({"foo": ("x", [3])}),
            }
        )
        assert_equal(tree.isel(x=[0]), list_expected)

        # The full slice leaves the tree unchanged.
        assert_equal(tree.isel(x=slice(None)), tree)

    def test_sel(self):
        """Label-based selection on siblings with different coordinate values."""
        first = xr.Dataset({"a": ("x", [1, 2, 3])}, coords={"x": [1, 2, 3]})
        second = xr.Dataset({"b": ("x", [4, 5])}, coords={"x": [2, 3]})
        tree = DataTree.from_dict({"/first": first, "/second": second})

        # The label 2 sits at a different position in each sibling.
        expected = DataTree.from_dict(
            {
                "/first": xr.Dataset({"a": 2}, coords={"x": 2}),
                "/second": xr.Dataset({"b": 4}, coords={"x": 2}),
            }
        )
        assert_equal(tree.sel(x=2), expected)
1662+
1663+
1664+
class TestDSMethodInheritance:
15841665

15851666
@pytest.mark.xfail(reason="reduce methods not implemented yet")
15861667
def test_reduce_method(self):

0 commit comments

Comments
 (0)