From c24b1729b16a01b26f3bde1557b8f1778e748a7c Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 17 Aug 2023 11:21:51 +0200 Subject: [PATCH 01/12] generic warning implicitly wrap a pd.MultiIndex --- xarray/core/coordinates.py | 31 ++++++++++++++----------------- xarray/tests/test_dataset.py | 9 ++++++++- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index f03d98f781a..7cdd2776ee7 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -773,7 +773,7 @@ def assert_coordinate_consistent( def create_coords_with_default_indexes( - coords: Mapping[Any, Any], data_vars: Mapping[Any, Variable] | None = None + coords: Mapping[Any, Any], data_vars: Mapping[Any, Any] | None = None ) -> Coordinates: """Maybe create default indexes from a mapping of coordinates.""" @@ -791,31 +791,28 @@ def create_coords_with_default_indexes( indexes: dict[Hashable, Index] = {} variables: dict[Hashable, Variable] = {} - maybe_index_vars: dict[Hashable, Variable] = {} - mindex_data_vars: list[Hashable] = [] + maybe_index_vars: dict[Hashable, Any] = {} + pd_mindex_keys: list[Hashable] = [] for k, v in all_variables.items(): - if k in coords: - maybe_index_vars[k] = v - elif isinstance(v, pd.MultiIndex): + if isinstance(v, pd.MultiIndex): # TODO: eventually stop promoting multi-index passed via data variables - mindex_data_vars.append(k) + maybe_index_vars[k] = v + pd_mindex_keys.append(k) + elif k in coords: maybe_index_vars[k] = v - if mindex_data_vars: + if pd_mindex_keys: + pd_mindex_keys_fmt = ",".join([f"'{k}'" for k in pd_mindex_keys]) warnings.warn( - f"passing one or more `pandas.MultiIndex` via data variable(s) {mindex_data_vars} " - "will no longer create indexed coordinates in the future. " - "If you want to keep this behavior, pass it as coordinates instead.", + f"the `pandas.MultiIndex` object(s) passed as {pd_mindex_keys_fmt} coordinate(s) or " + "data variable(s) will no longer be implicitly wrapped into " + "multiple indexed coordinates in the future. " + "If you want to keep this behavior, you need to first wrap it explicitly " + "using `xarray.Coordinates.from_pandas_multiindex()`.", FutureWarning, ) - maybe_index_vars = { - k: v - for k, v in all_variables.items() - if k in coords or isinstance(v, pd.MultiIndex) - } - dataarray_coords: list[DataArrayCoordinates] = [] for name, obj in maybe_index_vars.items(): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 5304c54971a..33a803a1a14 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -648,10 +648,17 @@ def test_constructor_multiindex(self) -> None: assert_identical(ds, coords.to_dataset()) with pytest.warns( - FutureWarning, match=".*`pandas.MultiIndex` via data variable.*" + FutureWarning, + match=".*`pandas.MultiIndex`.*no longer be implicitly wrapped.*", ): Dataset(data_vars={"x": midx}) + with pytest.warns( + FutureWarning, + match=".*`pandas.MultiIndex`.*no longer be implicitly wrapped.*", + ): + Dataset(coords={"x": midx}) + def test_constructor_custom_index(self) -> None: class CustomIndex(Index): ... From 62b710b03a33ec83e432bce02e8f284318bafe37 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 21 Aug 2023 14:05:07 +0200 Subject: [PATCH 02/12] refactor update_coords (assign) Fix more cases with multi-coordinate indexes: - do not try to align existing indexed coordinates with the new coordinates that will fully replace them - raise early if the new coordinates would corrupt the existing indexed coordinates - isolate PandasMultiIndex special cases so that it will be easier to drop support for it later (and warn now about deprecation) --- xarray/core/coordinates.py | 153 +++++++++++++++++++++-------------- xarray/core/dataset.py | 2 +- xarray/tests/test_dataset.py | 26 +++--- 3 files changed, 112 insertions(+), 69 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 7cdd2776ee7..05465d5a256 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -83,7 +83,7 @@ def variables(self): def _update_coords(self, coords, indexes): raise NotImplementedError() - def _maybe_drop_multiindex_coords(self, coords): + def _drop_coords(self, coord_names): raise NotImplementedError() def __iter__(self) -> Iterator[Hashable]: @@ -379,9 +379,9 @@ def _update_coords( # redirect to DatasetCoordinates._update_coords self._data.coords._update_coords(coords, indexes) - def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None: - # redirect to DatasetCoordinates._maybe_drop_multiindex_coords - self._data.coords._maybe_drop_multiindex_coords(coords) + def _drop_coords(self, coord_names): + # redirect to DatasetCoordinates._drop_coords + self._data.coords._drop_coords(coord_names) def _merge_raw(self, other, reflexive): """For use with binary arithmetic.""" @@ -454,22 +454,36 @@ def __setitem__(self, key: Hashable, value: Any) -> None: def update(self, other: Mapping[Any, Any]) -> None: """Update this Coordinates variables with other coordinate variables.""" - other_obj: Coordinates | Mapping[Hashable, Variable] + other_coords: Coordinates if isinstance(other, Coordinates): - # special case: default indexes won't be created - other_obj = other + # Coordinates object: just pass it (default indexes won't be created) + other_coords = other else: - other_obj = getattr(other, "variables", other) + other_coords = create_coords_with_default_indexes( + getattr(other, "variables", other) + ) - self._maybe_drop_multiindex_coords(set(other_obj)) + # discard original indexed coordinates allows to + # - fail early if the new coordinates don't preserve the integrity of existing + # multi-coordinate indexes + # - skip unnecessary alignment operations and/or avoid alignment errors + # in `merge_coords` below + coords_noconflict = drop_indexed_coords(set(other_coords), self) + coords_to_drop = self._names - coords_noconflict._names coords, indexes = merge_coords( - [self.variables, other_obj], + [coords_noconflict, other_coords], priority_arg=1, - indexes=self.xindexes, + indexes=coords_noconflict.xindexes, ) + # special case for PandasMultiIndex: updating only its dimension coordinate + # is still allowed but depreciated. + # It is the only case where we need to drop coordinates here (multi-index levels) + # TODO: remove when removing PandasMultiIndex's dimension coordinate. + self._drop_coords(coords_to_drop) + self._update_coords(coords, indexes) def _overwrite_indexes( @@ -610,15 +624,20 @@ def _update_coords( original_indexes.update(indexes) self._data._indexes = original_indexes - def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None: - """Drops variables in coords, and any associated variables as well.""" + def _drop_coords(self, coord_names): + # should drop indexed coordinates only + for name in coord_names: + del self._data._variables[name] + del self._data._indexes[name] + self._data._coord_names.difference_update(coord_names) + + def _drop_indexed_coords(self, coords_to_drop: set[Hashable]) -> None: assert self._data.xindexes is not None - variables, indexes = drop_coords( - coords, self._data._variables, self._data.xindexes - ) - self._data._coord_names.intersection_update(variables) - self._data._variables = variables - self._data._indexes = indexes + new_coords = drop_indexed_coords(coords_to_drop, self) + for name in self._data._coord_names - new_coords._names: + del self._data._variables[name] + self._data._indexes = dict(new_coords.xindexes) + self._data._coord_names.intersection_update(new_coords._names) def __delitem__(self, key: Hashable) -> None: if key in self: @@ -691,13 +710,11 @@ def _update_coords( original_indexes.update(indexes) self._data._indexes = original_indexes - def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None: - """Drops variables in coords, and any associated variables as well.""" - variables, indexes = drop_coords( - coords, self._data._coords, self._data.xindexes - ) - self._data._coords = variables - self._data._indexes = indexes + def _drop_coords(self, coord_names): + # should drop indexed coordinates only + for name in coord_names: + del self._data._coords[name] + del self._data._indexes[name] @property def variables(self): @@ -724,35 +741,49 @@ def _ipython_key_completions_(self): return self._data._ipython_key_completions_() -def drop_coords( - coords_to_drop: set[Hashable], variables, indexes: Indexes -) -> tuple[dict, dict]: - """Drop index variables associated with variables in coords_to_drop.""" - # Only warn when we're dropping the dimension with the multi-indexed coordinate - # If asked to drop a subset of the levels in a multi-index, we raise an error - # later but skip the warning here. - new_variables = dict(variables.copy()) - new_indexes = dict(indexes.copy()) - for key in coords_to_drop & set(indexes): - maybe_midx = indexes[key] - idx_coord_names = set(indexes.get_all_coords(key)) - if ( - isinstance(maybe_midx, PandasMultiIndex) - and key == maybe_midx.dim - and (idx_coord_names - coords_to_drop) - ): +def drop_indexed_coords( + coords_to_drop: set[Hashable], coords: Coordinates +) -> Coordinates: + """Drop indexed coordinates associated with coordinates in coords_to_drop. + + This will raise an error in case it corrupts any passed index and its + coordinate variables. + + """ + new_variables = dict(coords.variables) + new_indexes = dict(coords.xindexes) + + for idx, idx_coords in coords.xindexes.group_by_index(): + idx_drop_coords = set(idx_coords) & coords_to_drop + + # special case for pandas multi-index: still allow but deprecate + # dropping only its dimension coordinate. + # TODO: remove when removing PandasMultiIndex's dimension coordinate. + if isinstance(idx, PandasMultiIndex) and idx_drop_coords == {idx.dim}: + idx_drop_coords.update(idx.index.names) warnings.warn( - f"Updating MultiIndexed coordinate {key!r} would corrupt indices for " - f"other variables: {list(maybe_midx.index.names)!r}. " - f"This will raise an error in the future. Use `.drop_vars({idx_coord_names!r})` before " + f"Updating MultiIndexed coordinate {idx.dim!r} would corrupt indices for " + f"other variables: {list(idx.index.names)!r}. " + f"This will raise an error in the future. Use `.drop_vars({list(idx_coords)!r})` before " "assigning new coordinate values.", FutureWarning, stacklevel=4, ) - for k in idx_coord_names: - del new_variables[k] - del new_indexes[k] - return new_variables, new_indexes + + elif idx_drop_coords and len(idx_drop_coords) != len(idx_coords): + idx_drop_coords_str = ", ".join(f"{k!r}" for k in idx_drop_coords) + idx_coords_str = ", ".join(f"{k!r}" for k in idx_coords) + raise ValueError( + f"cannot drop or update coordinate(s) {idx_drop_coords_str}, which would corrupt " + f"the following index built from coordinates {idx_coords_str}:\n" + f"{idx}" + ) + + for k in idx_drop_coords: + del new_variables[k] + del new_indexes[k] + + return Coordinates._construct_direct(coords=new_variables, indexes=new_indexes) def assert_coordinate_consistent( @@ -775,9 +806,13 @@ def assert_coordinate_consistent( def create_coords_with_default_indexes( coords: Mapping[Any, Any], data_vars: Mapping[Any, Any] | None = None ) -> Coordinates: - """Maybe create default indexes from a mapping of coordinates.""" + """Returns a Coordinates object from a mapping of coordinates (arbitrary objects). - # Note: data_vars are needed here only because a pd.MultiIndex object + Create default (pandas) indexes for each of the input dimension coordinates. + Extract coordinates from each input DataArray. + + """ + # Note: data_vars is needed here only because a pd.MultiIndex object # can be promoted as coordinates. # TODO: It won't be relevant anymore when this behavior will be dropped # in favor of the more explicit ``Coordinates.from_pandas_multiindex()``. @@ -791,31 +826,31 @@ def create_coords_with_default_indexes( indexes: dict[Hashable, Index] = {} variables: dict[Hashable, Variable] = {} - maybe_index_vars: dict[Hashable, Any] = {} + # promote any pandas multi-index in data_vars as coordinates + coords_promoted: dict[Hashable, Any] = {} pd_mindex_keys: list[Hashable] = [] for k, v in all_variables.items(): if isinstance(v, pd.MultiIndex): - # TODO: eventually stop promoting multi-index passed via data variables - maybe_index_vars[k] = v + coords_promoted[k] = v pd_mindex_keys.append(k) elif k in coords: - maybe_index_vars[k] = v + coords_promoted[k] = v if pd_mindex_keys: pd_mindex_keys_fmt = ",".join([f"'{k}'" for k in pd_mindex_keys]) warnings.warn( f"the `pandas.MultiIndex` object(s) passed as {pd_mindex_keys_fmt} coordinate(s) or " - "data variable(s) will no longer be implicitly wrapped into " + "data variable(s) will no longer be implicitly promoted and wrapped into " "multiple indexed coordinates in the future. " "If you want to keep this behavior, you need to first wrap it explicitly " - "using `xarray.Coordinates.from_pandas_multiindex()`.", + "using `xarray.Coordinates.from_pandas_multiindex()` and pass it as coordinates.", FutureWarning, ) dataarray_coords: list[DataArrayCoordinates] = [] - for name, obj in maybe_index_vars.items(): + for name, obj in coords_promoted.items(): if isinstance(obj, DataArray): dataarray_coords.append(obj.coords) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index bdf2d8babe1..651093dca30 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6936,7 +6936,7 @@ def assign( data = self.copy() # do all calculations first... results: CoercibleMapping = data._calc_assign_results(variables) - data.coords._maybe_drop_multiindex_coords(set(results.keys())) + data.coords._drop_indexed_coords(set(results.keys())) # ... and then assign data.update(results) return data diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 33a803a1a14..9a5eb47f23f 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -649,13 +649,13 @@ def test_constructor_multiindex(self) -> None: with pytest.warns( FutureWarning, - match=".*`pandas.MultiIndex`.*no longer be implicitly wrapped.*", + match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", ): Dataset(data_vars={"x": midx}) with pytest.warns( FutureWarning, - match=".*`pandas.MultiIndex`.*no longer be implicitly wrapped.*", + match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", ): Dataset(coords={"x": midx}) @@ -879,7 +879,7 @@ def test_coords_modify(self) -> None: assert_array_equal(actual["z"], ["a", "b"]) actual = data.copy(deep=True) - with pytest.raises(ValueError, match=r"conflicting sizes"): + with pytest.raises(ValueError, match=r"conflicting dimension sizes"): actual.coords["x"] = ("x", [-1]) assert_identical(actual, data) # should not be modified @@ -916,9 +916,7 @@ def test_coords_setitem_with_new_dimension(self) -> None: def test_coords_setitem_multiindex(self) -> None: data = create_test_multiindex() - with pytest.raises( - ValueError, match=r"cannot set or update variable.*corrupt.*index " - ): + with pytest.raises(ValueError, match=r"cannot drop or update.*corrupt.*index "): data.coords["level_1"] = range(4) def test_coords_set(self) -> None: @@ -4251,12 +4249,22 @@ def test_assign_attrs(self) -> None: def test_assign_multiindex_level(self) -> None: data = create_test_multiindex() - with pytest.raises( - ValueError, match=r"cannot set or update variable.*corrupt.*index " - ): + with pytest.raises(ValueError, match=r"cannot drop or update.*corrupt.*index "): data.assign(level_1=range(4)) data.assign_coords(level_1=range(4)) + def test_assign_coords_new_multiindex(self) -> None: + ds = Dataset() + midx = pd.MultiIndex.from_arrays( + [["a", "a", "b", "b"], [0, 1, 0, 1]], names=("one", "two") + ) + + with pytest.warns( + FutureWarning, + match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", + ): + ds.assign_coords({"x": midx}) + def test_assign_coords_existing_multiindex(self) -> None: data = create_test_multiindex() with pytest.warns(FutureWarning, match=r"Updating MultiIndexed coordinate"): From 775b59730cd9f203a7ea4389117aedb44e40a372 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 21 Aug 2023 15:30:47 +0200 Subject: [PATCH 03/12] fix alignment of updated coordinates when DataArray objects are passed as new coordinate objects --- xarray/core/coordinates.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 05465d5a256..a2d0c85d845 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -464,25 +464,25 @@ def update(self, other: Mapping[Any, Any]) -> None: getattr(other, "variables", other) ) - # discard original indexed coordinates allows to + # Discard original indexed coordinates prior to merge allows to: # - fail early if the new coordinates don't preserve the integrity of existing # multi-coordinate indexes - # - skip unnecessary alignment operations and/or avoid alignment errors - # in `merge_coords` below - coords_noconflict = drop_indexed_coords(set(other_coords), self) - coords_to_drop = self._names - coords_noconflict._names + # - drop & replace coordinates without alignment (note: we must keep indexed + # coordinates extracted from the DataArray objects passed as values to + # `other` - if any - as those are still used for aligning the old/new coordinates) + coords_to_align = drop_indexed_coords(set(other_coords) & set(other), self) coords, indexes = merge_coords( - [coords_noconflict, other_coords], + [coords_to_align, other_coords], priority_arg=1, - indexes=coords_noconflict.xindexes, + indexes=coords_to_align.xindexes, ) # special case for PandasMultiIndex: updating only its dimension coordinate # is still allowed but depreciated. - # It is the only case where we need to drop coordinates here (multi-index levels) + # It is the only case where we need to actually drop coordinates here (multi-index levels) # TODO: remove when removing PandasMultiIndex's dimension coordinate. - self._drop_coords(coords_to_drop) + self._drop_coords(self._names - coords_to_align._names) self._update_coords(coords, indexes) From 36e901d2f22e48843a3ee5d8c1edc031fa727208 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 21 Aug 2023 15:31:57 +0200 Subject: [PATCH 04/12] refactor Dataset.assign Need to update (replace) coordinates and data variables separately to ensure it goes through all (indexed) coordinate update checks. --- xarray/core/dataset.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 651093dca30..3b02a76e247 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6879,6 +6879,10 @@ def assign( possible, but you cannot reference other variables created within the same ``assign`` call. + The new assigned variables that replace existing coordinates in the + original dataset are still listed as coordinates in the returned + Dataset. + See Also -------- pandas.DataFrame.assign @@ -6934,11 +6938,23 @@ def assign( """ variables = either_dict_or_kwargs(variables, variables_kwargs, "assign") data = self.copy() + # do all calculations first... results: CoercibleMapping = data._calc_assign_results(variables) - data.coords._drop_indexed_coords(set(results.keys())) + + # split data variables to add vs. coordinates to replace + results_data_vars: CoercibleMapping = {} + results_coords: CoercibleMapping = {} + for k, v in results.items(): + if k in data._coord_names: + results_coords[k] = v + else: + results_data_vars[k] = v + # ... and then assign - data.update(results) + data.coords.update(results_coords) + data.update(results_data_vars) + return data def to_array( From 40d81c444dbdc06c8f7fed2c8e36c1f72d30cf3d Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 21 Aug 2023 15:35:12 +0200 Subject: [PATCH 05/12] fix and update tests --- xarray/tests/test_dataarray.py | 6 +++--- xarray/tests/test_dataset.py | 38 +++++++++++++++++++++++++++------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 56b948dbf16..9eee58bfee0 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1435,7 +1435,7 @@ def test_coords(self) -> None: assert_identical(da, expected) with pytest.raises( - ValueError, match=r"cannot set or update variable.*corrupt.*index " + ValueError, match=r"cannot drop or update coordinate.*corrupt.*index " ): self.mda["level_1"] = ("x", np.arange(4)) self.mda.coords["level_1"] = ("x", np.arange(4)) @@ -1555,7 +1555,7 @@ def test_assign_coords(self) -> None: assert_identical(actual, expected) with pytest.raises( - ValueError, match=r"cannot set or update variable.*corrupt.*index " + ValueError, match=r"cannot drop or update coordinate.*corrupt.*index " ): self.mda.assign_coords(level_1=("x", range(4))) @@ -1608,7 +1608,7 @@ def test_set_coords_update_index(self) -> None: def test_set_coords_multiindex_level(self) -> None: with pytest.raises( - ValueError, match=r"cannot set or update variable.*corrupt.*index " + ValueError, match=r"cannot drop or update coordinate.*corrupt.*index " ): self.mda["level_1"] = range(4) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 9a5eb47f23f..b16c6c1f078 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -199,7 +199,7 @@ def create_test_multiindex() -> Dataset: mindex = pd.MultiIndex.from_product( [["a", "b"], [1, 2]], names=("level_1", "level_2") ) - return Dataset({}, {"x": mindex}) + return Dataset({}, Coordinates.from_pandas_multiindex(mindex, "x")) def create_test_stacked_array() -> tuple[DataArray, DataArray]: @@ -4253,28 +4253,50 @@ def test_assign_multiindex_level(self) -> None: data.assign(level_1=range(4)) data.assign_coords(level_1=range(4)) - def test_assign_coords_new_multiindex(self) -> None: - ds = Dataset() + def test_assign_new_multiindex(self) -> None: + midx = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [0, 1, 0, 1]]) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") + + ds = Dataset(coords={"x": [1, 2]}) + expected = Dataset(coords=midx_coords) + + with pytest.warns( + FutureWarning, + match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", + ): + actual = ds.assign(x=midx) + assert_identical(actual, expected) + + @pytest.mark.parametrize("orig_coords", [{}, {"x": range(4)}]) + def test_assign_coords_new_multiindex(self, orig_coords) -> None: + ds = Dataset(coords=orig_coords) midx = pd.MultiIndex.from_arrays( [["a", "a", "b", "b"], [0, 1, 0, 1]], names=("one", "two") ) + midx_coords = Coordinates.from_pandas_multiindex(midx, "x") + + expected = Dataset(coords=midx_coords) with pytest.warns( FutureWarning, match=".*`pandas.MultiIndex`.*no longer be implicitly promoted.*", ): - ds.assign_coords({"x": midx}) + actual = ds.assign_coords({"x": midx}) + assert_identical(actual, expected) + + actual = ds.assign_coords(midx_coords) + assert_identical(actual, expected) def test_assign_coords_existing_multiindex(self) -> None: data = create_test_multiindex() with pytest.warns(FutureWarning, match=r"Updating MultiIndexed coordinate"): - data.assign_coords(x=range(4)) + updated = data.assign_coords(x=range(4)) + # https://github.com/pydata/xarray/issues/7097 (coord names updated) + assert len(updated.coords) == 1 with pytest.warns(FutureWarning, match=r"Updating MultiIndexed coordinate"): - data.assign(x=range(4)) - + updated = data.assign(x=range(4)) # https://github.com/pydata/xarray/issues/7097 (coord names updated) - updated = data.assign_coords(x=range(4)) assert len(updated.coords) == 1 def test_assign_all_multiindex_coords(self) -> None: From f6e9368944d6c2bebb737fea167f10e60ff8e4ac Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 21 Aug 2023 15:46:26 +0200 Subject: [PATCH 06/12] nit --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3b02a76e247..1f793428838 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6942,7 +6942,7 @@ def assign( # do all calculations first... results: CoercibleMapping = data._calc_assign_results(variables) - # split data variables to add vs. coordinates to replace + # split data variables to add/replace vs. coordinates to replace results_data_vars: CoercibleMapping = {} results_coords: CoercibleMapping = {} for k, v in results.items(): From 81843d1a8d54ade2703e80c671ddc2bc56619108 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 21 Aug 2023 16:08:46 +0200 Subject: [PATCH 07/12] fix mypy? --- xarray/core/dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1f793428838..b0ea1e26435 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -72,6 +72,7 @@ ) from xarray.core.indexing import is_fancy_indexer, map_index_queries from xarray.core.merge import ( + CoercibleValue, dataset_merge_method, dataset_update_method, merge_coordinates_without_align, @@ -6943,8 +6944,8 @@ def assign( results: CoercibleMapping = data._calc_assign_results(variables) # split data variables to add/replace vs. coordinates to replace - results_data_vars: CoercibleMapping = {} - results_coords: CoercibleMapping = {} + results_data_vars: dict[Hashable, CoercibleValue] = {} + results_coords: dict[Hashable, CoercibleValue] = {} for k, v in results.items(): if k in data._coord_names: results_coords[k] = v From f8dde9686aff1632410caf16169382298c868ea8 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 21 Aug 2023 16:15:53 +0200 Subject: [PATCH 08/12] update what's new --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 564c68bfc35..eac86b81b67 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -83,6 +83,9 @@ Deprecations Bug fixes ~~~~~~~~~ +- Improved handling of multi-coordinate indexes when updating coordinates, including bug fixes + (and improved warnings for deprecated features) for pandas multi-indexes (:pull:`8094`). + By `BenoƮt Bovy `_. Documentation ~~~~~~~~~~~~~ From 57c0e3bc721d66f10cc91e16f94c51c7ba61825e Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 21 Aug 2023 16:19:35 +0200 Subject: [PATCH 09/12] fix import error --- xarray/core/dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b0ea1e26435..9a53f24c67c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -72,7 +72,6 @@ ) from xarray.core.indexing import is_fancy_indexer, map_index_queries from xarray.core.merge import ( - CoercibleValue, dataset_merge_method, dataset_update_method, merge_coordinates_without_align, @@ -121,7 +120,7 @@ from xarray.backends.api import T_NetcdfEngine, T_NetcdfTypes from xarray.core.dataarray import DataArray from xarray.core.groupby import DatasetGroupBy - from xarray.core.merge import CoercibleMapping + from xarray.core.merge import CoercibleMapping, CoercibleValue from xarray.core.parallelcompat import ChunkManagerEntrypoint from xarray.core.resample import DatasetResample from xarray.core.rolling import DatasetCoarsen, DatasetRolling From bbe70017a7244ecee11915131ae13f047758f919 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 22 Aug 2023 10:30:40 +0200 Subject: [PATCH 10/12] fix performance regression --- xarray/core/coordinates.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index a2d0c85d845..4e9f6bd34c7 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -454,6 +454,11 @@ def __setitem__(self, key: Hashable, value: Any) -> None: def update(self, other: Mapping[Any, Any]) -> None: """Update this Coordinates variables with other coordinate variables.""" + + # prevent unecessary merge / align operations if there is nothing to update + if not len(other): + return + other_coords: Coordinates if isinstance(other, Coordinates): From 6200d9b2e1ca09d9b53be98e013151cfd74acea3 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 22 Aug 2023 10:34:36 +0200 Subject: [PATCH 11/12] nit --- xarray/core/coordinates.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 4e9f6bd34c7..0fc9982d6f0 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -455,7 +455,6 @@ def __setitem__(self, key: Hashable, value: Any) -> None: def update(self, other: Mapping[Any, Any]) -> None: """Update this Coordinates variables with other coordinate variables.""" - # prevent unecessary merge / align operations if there is nothing to update if not len(other): return From 1fd24815ea377c7cb835aead389e3837a2960405 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 23 Aug 2023 10:24:04 +0200 Subject: [PATCH 12/12] use warning util func and improve messages --- xarray/core/coordinates.py | 21 +++++++++++---------- xarray/tests/test_dataarray.py | 4 +++- xarray/tests/test_dataset.py | 8 ++++++-- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 0fc9982d6f0..7c14a8c3d0a 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -1,6 +1,5 @@ from __future__ import annotations -import warnings from collections.abc import Hashable, Iterator, Mapping, Sequence from contextlib import contextmanager from typing import ( @@ -24,7 +23,7 @@ ) from xarray.core.merge import merge_coordinates_without_align, merge_coords from xarray.core.types import Self, T_DataArray -from xarray.core.utils import Frozen, ReprObject +from xarray.core.utils import Frozen, ReprObject, emit_user_level_warning from xarray.core.variable import Variable, as_variable, calculate_dimensions if TYPE_CHECKING: @@ -765,13 +764,12 @@ def drop_indexed_coords( # TODO: remove when removing PandasMultiIndex's dimension coordinate. if isinstance(idx, PandasMultiIndex) and idx_drop_coords == {idx.dim}: idx_drop_coords.update(idx.index.names) - warnings.warn( - f"Updating MultiIndexed coordinate {idx.dim!r} would corrupt indices for " - f"other variables: {list(idx.index.names)!r}. " + emit_user_level_warning( + f"updating coordinate {idx.dim!r} with a PandasMultiIndex would leave " + f"the multi-index level coordinates {list(idx.index.names)!r} in an inconsistent state. " f"This will raise an error in the future. Use `.drop_vars({list(idx_coords)!r})` before " "assigning new coordinate values.", FutureWarning, - stacklevel=4, ) elif idx_drop_coords and len(idx_drop_coords) != len(idx_coords): @@ -843,12 +841,15 @@ def create_coords_with_default_indexes( if pd_mindex_keys: pd_mindex_keys_fmt = ",".join([f"'{k}'" for k in pd_mindex_keys]) - warnings.warn( + emit_user_level_warning( f"the `pandas.MultiIndex` object(s) passed as {pd_mindex_keys_fmt} coordinate(s) or " "data variable(s) will no longer be implicitly promoted and wrapped into " - "multiple indexed coordinates in the future. " - "If you want to keep this behavior, you need to first wrap it explicitly " - "using `xarray.Coordinates.from_pandas_multiindex()` and pass it as coordinates.", + "multiple indexed coordinates in the future " + "(i.e., one coordinate for each multi-index level + one dimension coordinate). " + "If you want to keep this behavior, you need to first wrap it explicitly using " + "`mindex_coords = xarray.Coordinates.from_pandas_multiindex(mindex_obj, 'dim')` " + "and pass it as coordinates, e.g., `xarray.Dataset(coords=mindex_coords)`, " + "`dataset.assign_coords(mindex_coords)` or `dataarray.assign_coords(mindex_coords)`.", FutureWarning, ) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index fd22b3486cd..f615ae70b6b 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1570,7 +1570,9 @@ def test_assign_coords(self) -> None: def test_assign_coords_existing_multiindex(self) -> None: data = self.mda - with pytest.warns(FutureWarning, match=r"Updating MultiIndexed coordinate"): + with pytest.warns( + FutureWarning, match=r"updating coordinate.*MultiIndex.*inconsistent" + ): data.assign_coords(x=range(4)) def test_assign_coords_custom_index(self) -> None: diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index b16c6c1f078..3110c4e2882 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4289,12 +4289,16 @@ def test_assign_coords_new_multiindex(self, orig_coords) -> None: def test_assign_coords_existing_multiindex(self) -> None: data = create_test_multiindex() - with pytest.warns(FutureWarning, match=r"Updating MultiIndexed coordinate"): + with pytest.warns( + FutureWarning, match=r"updating coordinate.*MultiIndex.*inconsistent" + ): updated = data.assign_coords(x=range(4)) # https://github.com/pydata/xarray/issues/7097 (coord names updated) assert len(updated.coords) == 1 - with pytest.warns(FutureWarning, match=r"Updating MultiIndexed coordinate"): + with pytest.warns( + FutureWarning, match=r"updating coordinate.*MultiIndex.*inconsistent" + ): updated = data.assign(x=range(4)) # https://github.com/pydata/xarray/issues/7097 (coord names updated) assert len(updated.coords) == 1