Skip to content

Commit 1fedfd8

Browse files
authored
Refactor update coordinates to better handle multi-coordinate indexes (#8094)
* generic warning implicitly wrap a pd.MultiIndex * refactor update_coords (assign) Fix more cases with multi-coordinate indexes: - do not try to align existing indexed coordinates with the new coordinates that will fully replace them - raise early if the new coordinates would corrupt the existing indexed coordinates - isolate PandasMultiIndex special cases so that it will be easier to drop support for it later (and warn now about deprecation) * fix alignment of updated coordinates when DataArray objects are passed as new coordinate objects * refactor Dataset.assign Need to update (replace) coordinates and data variables separately to ensure it goes through all (indexed) coordinate update checks. * fix and update tests * nit * fix mypy? * update what's new * fix import error * fix performance regression * nit * use warning util func and improve messages
1 parent 42d42ba commit 1fedfd8

File tree

5 files changed

+205
-102
lines changed

5 files changed

+205
-102
lines changed

doc/whats-new.rst

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ What's New
1414
1515
np.random.seed(123456)
1616
17-
1817
.. _whats-new.2023.08.1:
1918

2019
v2023.08.1 (unreleased)
@@ -31,10 +30,19 @@ Breaking changes
3130
Deprecations
3231
~~~~~~~~~~~~
3332

33+
- Deprecate passing a :py:class:`pandas.MultiIndex` object directly to the
34+
:py:class:`Dataset` and :py:class:`DataArray` constructors as well as to
35+
:py:meth:`Dataset.assign` and :py:meth:`Dataset.assign_coords`.
36+
A new Xarray :py:class:`Coordinates` object has to be created first using
37+
:py:meth:`Coordinates.from_pandas_multiindex` (:pull:`8094`).
38+
By `Benoît Bovy <https://github.com/benbovy>`_.
3439

3540
Bug fixes
3641
~~~~~~~~~
3742

43+
- Improved handling of multi-coordinate indexes when updating coordinates, including bug fixes
44+
(and improved warnings for deprecated features) for pandas multi-indexes (:pull:`8094`).
45+
By `Benoît Bovy <https://github.com/benbovy>`_.
3846

3947
Documentation
4048
~~~~~~~~~~~~~
@@ -120,7 +128,6 @@ Breaking changes
120128
numbagg 0.1 0.2.1
121129
===================== ========= ========
122130

123-
124131
Documentation
125132
~~~~~~~~~~~~~
126133

xarray/core/coordinates.py

Lines changed: 115 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
import warnings
43
from collections.abc import Hashable, Iterator, Mapping, Sequence
54
from contextlib import contextmanager
65
from typing import (
@@ -24,7 +23,7 @@
2423
)
2524
from xarray.core.merge import merge_coordinates_without_align, merge_coords
2625
from xarray.core.types import Self, T_DataArray
27-
from xarray.core.utils import Frozen, ReprObject
26+
from xarray.core.utils import Frozen, ReprObject, emit_user_level_warning
2827
from xarray.core.variable import Variable, as_variable, calculate_dimensions
2928

3029
if TYPE_CHECKING:
@@ -83,7 +82,7 @@ def variables(self):
8382
def _update_coords(self, coords, indexes):
8483
raise NotImplementedError()
8584

86-
def _maybe_drop_multiindex_coords(self, coords):
85+
def _drop_coords(self, coord_names):
8786
raise NotImplementedError()
8887

8988
def __iter__(self) -> Iterator[Hashable]:
@@ -379,9 +378,9 @@ def _update_coords(
379378
# redirect to DatasetCoordinates._update_coords
380379
self._data.coords._update_coords(coords, indexes)
381380

382-
def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None:
383-
# redirect to DatasetCoordinates._maybe_drop_multiindex_coords
384-
self._data.coords._maybe_drop_multiindex_coords(coords)
381+
def _drop_coords(self, coord_names):
382+
# redirect to DatasetCoordinates._drop_coords
383+
self._data.coords._drop_coords(coord_names)
385384

386385
def _merge_raw(self, other, reflexive):
387386
"""For use with binary arithmetic."""
@@ -454,22 +453,40 @@ def __setitem__(self, key: Hashable, value: Any) -> None:
454453

455454
def update(self, other: Mapping[Any, Any]) -> None:
456455
"""Update this Coordinates variables with other coordinate variables."""
457-
other_obj: Coordinates | Mapping[Hashable, Variable]
456+
457+
if not len(other):
458+
return
459+
460+
other_coords: Coordinates
458461

459462
if isinstance(other, Coordinates):
460-
# special case: default indexes won't be created
461-
other_obj = other
463+
# Coordinates object: just pass it (default indexes won't be created)
464+
other_coords = other
462465
else:
463-
other_obj = getattr(other, "variables", other)
466+
other_coords = create_coords_with_default_indexes(
467+
getattr(other, "variables", other)
468+
)
464469

465-
self._maybe_drop_multiindex_coords(set(other_obj))
470+
# Discard original indexed coordinates prior to merge allows to:
471+
# - fail early if the new coordinates don't preserve the integrity of existing
472+
# multi-coordinate indexes
473+
# - drop & replace coordinates without alignment (note: we must keep indexed
474+
# coordinates extracted from the DataArray objects passed as values to
475+
# `other` - if any - as those are still used for aligning the old/new coordinates)
476+
coords_to_align = drop_indexed_coords(set(other_coords) & set(other), self)
466477

467478
coords, indexes = merge_coords(
468-
[self.variables, other_obj],
479+
[coords_to_align, other_coords],
469480
priority_arg=1,
470-
indexes=self.xindexes,
481+
indexes=coords_to_align.xindexes,
471482
)
472483

484+
# special case for PandasMultiIndex: updating only its dimension coordinate
485+
# is still allowed but depreciated.
486+
# It is the only case where we need to actually drop coordinates here (multi-index levels)
487+
# TODO: remove when removing PandasMultiIndex's dimension coordinate.
488+
self._drop_coords(self._names - coords_to_align._names)
489+
473490
self._update_coords(coords, indexes)
474491

475492
def _overwrite_indexes(
@@ -610,15 +627,20 @@ def _update_coords(
610627
original_indexes.update(indexes)
611628
self._data._indexes = original_indexes
612629

613-
def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None:
614-
"""Drops variables in coords, and any associated variables as well."""
630+
def _drop_coords(self, coord_names):
631+
# should drop indexed coordinates only
632+
for name in coord_names:
633+
del self._data._variables[name]
634+
del self._data._indexes[name]
635+
self._data._coord_names.difference_update(coord_names)
636+
637+
def _drop_indexed_coords(self, coords_to_drop: set[Hashable]) -> None:
615638
assert self._data.xindexes is not None
616-
variables, indexes = drop_coords(
617-
coords, self._data._variables, self._data.xindexes
618-
)
619-
self._data._coord_names.intersection_update(variables)
620-
self._data._variables = variables
621-
self._data._indexes = indexes
639+
new_coords = drop_indexed_coords(coords_to_drop, self)
640+
for name in self._data._coord_names - new_coords._names:
641+
del self._data._variables[name]
642+
self._data._indexes = dict(new_coords.xindexes)
643+
self._data._coord_names.intersection_update(new_coords._names)
622644

623645
def __delitem__(self, key: Hashable) -> None:
624646
if key in self:
@@ -691,13 +713,11 @@ def _update_coords(
691713
original_indexes.update(indexes)
692714
self._data._indexes = original_indexes
693715

694-
def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None:
695-
"""Drops variables in coords, and any associated variables as well."""
696-
variables, indexes = drop_coords(
697-
coords, self._data._coords, self._data.xindexes
698-
)
699-
self._data._coords = variables
700-
self._data._indexes = indexes
716+
def _drop_coords(self, coord_names):
717+
# should drop indexed coordinates only
718+
for name in coord_names:
719+
del self._data._coords[name]
720+
del self._data._indexes[name]
701721

702722
@property
703723
def variables(self):
@@ -724,35 +744,48 @@ def _ipython_key_completions_(self):
724744
return self._data._ipython_key_completions_()
725745

726746

727-
def drop_coords(
728-
coords_to_drop: set[Hashable], variables, indexes: Indexes
729-
) -> tuple[dict, dict]:
730-
"""Drop index variables associated with variables in coords_to_drop."""
731-
# Only warn when we're dropping the dimension with the multi-indexed coordinate
732-
# If asked to drop a subset of the levels in a multi-index, we raise an error
733-
# later but skip the warning here.
734-
new_variables = dict(variables.copy())
735-
new_indexes = dict(indexes.copy())
736-
for key in coords_to_drop & set(indexes):
737-
maybe_midx = indexes[key]
738-
idx_coord_names = set(indexes.get_all_coords(key))
739-
if (
740-
isinstance(maybe_midx, PandasMultiIndex)
741-
and key == maybe_midx.dim
742-
and (idx_coord_names - coords_to_drop)
743-
):
744-
warnings.warn(
745-
f"Updating MultiIndexed coordinate {key!r} would corrupt indices for "
746-
f"other variables: {list(maybe_midx.index.names)!r}. "
747-
f"This will raise an error in the future. Use `.drop_vars({idx_coord_names!r})` before "
747+
def drop_indexed_coords(
748+
coords_to_drop: set[Hashable], coords: Coordinates
749+
) -> Coordinates:
750+
"""Drop indexed coordinates associated with coordinates in coords_to_drop.
751+
752+
This will raise an error in case it corrupts any passed index and its
753+
coordinate variables.
754+
755+
"""
756+
new_variables = dict(coords.variables)
757+
new_indexes = dict(coords.xindexes)
758+
759+
for idx, idx_coords in coords.xindexes.group_by_index():
760+
idx_drop_coords = set(idx_coords) & coords_to_drop
761+
762+
# special case for pandas multi-index: still allow but deprecate
763+
# dropping only its dimension coordinate.
764+
# TODO: remove when removing PandasMultiIndex's dimension coordinate.
765+
if isinstance(idx, PandasMultiIndex) and idx_drop_coords == {idx.dim}:
766+
idx_drop_coords.update(idx.index.names)
767+
emit_user_level_warning(
768+
f"updating coordinate {idx.dim!r} with a PandasMultiIndex would leave "
769+
f"the multi-index level coordinates {list(idx.index.names)!r} in an inconsistent state. "
770+
f"This will raise an error in the future. Use `.drop_vars({list(idx_coords)!r})` before "
748771
"assigning new coordinate values.",
749772
FutureWarning,
750-
stacklevel=4,
751773
)
752-
for k in idx_coord_names:
753-
del new_variables[k]
754-
del new_indexes[k]
755-
return new_variables, new_indexes
774+
775+
elif idx_drop_coords and len(idx_drop_coords) != len(idx_coords):
776+
idx_drop_coords_str = ", ".join(f"{k!r}" for k in idx_drop_coords)
777+
idx_coords_str = ", ".join(f"{k!r}" for k in idx_coords)
778+
raise ValueError(
779+
f"cannot drop or update coordinate(s) {idx_drop_coords_str}, which would corrupt "
780+
f"the following index built from coordinates {idx_coords_str}:\n"
781+
f"{idx}"
782+
)
783+
784+
for k in idx_drop_coords:
785+
del new_variables[k]
786+
del new_indexes[k]
787+
788+
return Coordinates._construct_direct(coords=new_variables, indexes=new_indexes)
756789

757790

758791
def assert_coordinate_consistent(
@@ -773,11 +806,15 @@ def assert_coordinate_consistent(
773806

774807

775808
def create_coords_with_default_indexes(
776-
coords: Mapping[Any, Any], data_vars: Mapping[Any, Variable] | None = None
809+
coords: Mapping[Any, Any], data_vars: Mapping[Any, Any] | None = None
777810
) -> Coordinates:
778-
"""Maybe create default indexes from a mapping of coordinates."""
811+
"""Returns a Coordinates object from a mapping of coordinates (arbitrary objects).
812+
813+
Create default (pandas) indexes for each of the input dimension coordinates.
814+
Extract coordinates from each input DataArray.
779815
780-
# Note: data_vars are needed here only because a pd.MultiIndex object
816+
"""
817+
# Note: data_vars is needed here only because a pd.MultiIndex object
781818
# can be promoted as coordinates.
782819
# TODO: It won't be relevant anymore when this behavior will be dropped
783820
# in favor of the more explicit ``Coordinates.from_pandas_multiindex()``.
@@ -791,34 +828,34 @@ def create_coords_with_default_indexes(
791828
indexes: dict[Hashable, Index] = {}
792829
variables: dict[Hashable, Variable] = {}
793830

794-
maybe_index_vars: dict[Hashable, Variable] = {}
795-
mindex_data_vars: list[Hashable] = []
831+
# promote any pandas multi-index in data_vars as coordinates
832+
coords_promoted: dict[Hashable, Any] = {}
833+
pd_mindex_keys: list[Hashable] = []
796834

797835
for k, v in all_variables.items():
798-
if k in coords:
799-
maybe_index_vars[k] = v
800-
elif isinstance(v, pd.MultiIndex):
801-
# TODO: eventually stop promoting multi-index passed via data variables
802-
mindex_data_vars.append(k)
803-
maybe_index_vars[k] = v
804-
805-
if mindex_data_vars:
806-
warnings.warn(
807-
f"passing one or more `pandas.MultiIndex` via data variable(s) {mindex_data_vars} "
808-
"will no longer create indexed coordinates in the future. "
809-
"If you want to keep this behavior, pass it as coordinates instead.",
836+
if isinstance(v, pd.MultiIndex):
837+
coords_promoted[k] = v
838+
pd_mindex_keys.append(k)
839+
elif k in coords:
840+
coords_promoted[k] = v
841+
842+
if pd_mindex_keys:
843+
pd_mindex_keys_fmt = ",".join([f"'{k}'" for k in pd_mindex_keys])
844+
emit_user_level_warning(
845+
f"the `pandas.MultiIndex` object(s) passed as {pd_mindex_keys_fmt} coordinate(s) or "
846+
"data variable(s) will no longer be implicitly promoted and wrapped into "
847+
"multiple indexed coordinates in the future "
848+
"(i.e., one coordinate for each multi-index level + one dimension coordinate). "
849+
"If you want to keep this behavior, you need to first wrap it explicitly using "
850+
"`mindex_coords = xarray.Coordinates.from_pandas_multiindex(mindex_obj, 'dim')` "
851+
"and pass it as coordinates, e.g., `xarray.Dataset(coords=mindex_coords)`, "
852+
"`dataset.assign_coords(mindex_coords)` or `dataarray.assign_coords(mindex_coords)`.",
810853
FutureWarning,
811854
)
812855

813-
maybe_index_vars = {
814-
k: v
815-
for k, v in all_variables.items()
816-
if k in coords or isinstance(v, pd.MultiIndex)
817-
}
818-
819856
dataarray_coords: list[DataArrayCoordinates] = []
820857

821-
for name, obj in maybe_index_vars.items():
858+
for name, obj in coords_promoted.items():
822859
if isinstance(obj, DataArray):
823860
dataarray_coords.append(obj.coords)
824861

xarray/core/dataset.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@
120120
from xarray.backends.api import T_NetcdfEngine, T_NetcdfTypes
121121
from xarray.core.dataarray import DataArray
122122
from xarray.core.groupby import DatasetGroupBy
123-
from xarray.core.merge import CoercibleMapping
123+
from xarray.core.merge import CoercibleMapping, CoercibleValue
124124
from xarray.core.parallelcompat import ChunkManagerEntrypoint
125125
from xarray.core.resample import DatasetResample
126126
from xarray.core.rolling import DatasetCoarsen, DatasetRolling
@@ -6879,6 +6879,10 @@ def assign(
68796879
possible, but you cannot reference other variables created within the
68806880
same ``assign`` call.
68816881
6882+
The new assigned variables that replace existing coordinates in the
6883+
original dataset are still listed as coordinates in the returned
6884+
Dataset.
6885+
68826886
See Also
68836887
--------
68846888
pandas.DataFrame.assign
@@ -6934,11 +6938,23 @@ def assign(
69346938
"""
69356939
variables = either_dict_or_kwargs(variables, variables_kwargs, "assign")
69366940
data = self.copy()
6941+
69376942
# do all calculations first...
69386943
results: CoercibleMapping = data._calc_assign_results(variables)
6939-
data.coords._maybe_drop_multiindex_coords(set(results.keys()))
6944+
6945+
# split data variables to add/replace vs. coordinates to replace
6946+
results_data_vars: dict[Hashable, CoercibleValue] = {}
6947+
results_coords: dict[Hashable, CoercibleValue] = {}
6948+
for k, v in results.items():
6949+
if k in data._coord_names:
6950+
results_coords[k] = v
6951+
else:
6952+
results_data_vars[k] = v
6953+
69406954
# ... and then assign
6941-
data.update(results)
6955+
data.coords.update(results_coords)
6956+
data.update(results_data_vars)
6957+
69426958
return data
69436959

69446960
def to_array(

0 commit comments

Comments
 (0)