From 5201eec20d624df50a32b9861a2cb7933d5e6935 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 16 Nov 2020 11:28:26 -0800 Subject: [PATCH 1/4] BUG: raise on missing for listlike indexing with CategoricalIndex --- pandas/core/indexes/category.py | 16 +----- pandas/core/indexing.py | 28 ++++------- pandas/tests/indexing/test_categorical.py | 61 +++++++++++++---------- pandas/tests/indexing/test_loc.py | 7 ++- 4 files changed, 54 insertions(+), 58 deletions(-) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 06df8f85cded7..a1f806509dd83 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -583,23 +583,11 @@ def _convert_list_indexer(self, keyarr): # the categories if self.categories._defer_to_indexing: + # See tests.indexing.interval.test_interval:test_loc_getitem_frame indexer = self.categories._convert_list_indexer(keyarr) return Index(self.codes).get_indexer_for(indexer) - msg = "a list-indexer must only include values that are in the categories" - if self.hasnans: - msg += " or NA" - try: - codes = self._data._validate_setitem_value(keyarr) - except (ValueError, TypeError) as err: - if "Index data must be 1-dimensional" in str(err): - # e.g. test_setitem_ndarray_3d - raise - raise KeyError(msg) - if not self.hasnans and (codes == -1).any(): - raise KeyError(msg) - - return self.get_indexer(keyarr) + return self.get_indexer_for(keyarr) @doc(Index._maybe_cast_slice_bound) def _maybe_cast_slice_bound(self, label, side: str, kind): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index a8951e342e0da..9b824ebad1eb9 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1246,9 +1246,7 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): indexer, keyarr = ax._convert_listlike_indexer(key) # We only act on all found values: if indexer is not None and (indexer != -1).all(): - self._validate_read_indexer( - keyarr, indexer, axis, raise_missing=raise_missing - ) + # _validate_read_indexer is a no-op if no -1s, so skip return ax[indexer], indexer if ax._index_as_unique: @@ -1309,21 +1307,15 @@ def _validate_read_indexer( not_found = list(set(key) - set(ax)) raise KeyError(f"{not_found} not in index") - # we skip the warning on Categorical - # as this check is actually done (check for - # non-missing values), but a bit later in the - # code, so we want to avoid warning & then - # just raising - if not ax.is_categorical(): - not_found = key[missing_mask] - - with option_context("display.max_seq_items", 10, "display.width", 80): - raise KeyError( - "Passing list-likes to .loc or [] with any missing labels " - "is no longer supported. " - f"The following labels were missing: {not_found}. " - "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 - ) + not_found = key[missing_mask] + + with option_context("display.max_seq_items", 10, "display.width", 80): + raise KeyError( + "Passing list-likes to .loc or [] with any missing labels " + "is no longer supported. " + f"The following labels were missing: {not_found}. " + "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 + ) @doc(IndexingMixin.iloc) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 9b9ece68b887e..332c9f0e47a5f 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -261,10 +263,14 @@ def test_loc_listlike(self): expected = self.df.iloc[[4, 0, 1, 5]] tm.assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.loc[["a", "b", "e"]] - exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") - expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index) - tm.assert_frame_equal(result, expected, check_index_type=True) + # listlike containing an element in the categories but not in the values + msg = ( + "The following labels were missing: CategoricalIndex(['e'], " + "categories=['c', 'a', 'b', 'e'], ordered=False, name='B', " + "dtype='category')" + ) + with pytest.raises(KeyError, match=re.escape(msg)): + self.df2.loc[["a", "b", "e"]] # element in the categories but not in the values with pytest.raises(KeyError, match=r"^'e'$"): @@ -278,15 +284,11 @@ def test_loc_listlike(self): expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index) tm.assert_frame_equal(result, expected) - df = self.df2.copy() - result = df.loc[["a", "b", "e"]] - exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") - expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index) - tm.assert_frame_equal(result, expected, check_index_type=True) - # not all labels in the categories - msg = "a list-indexer must only include values that are in the categories" - with pytest.raises(KeyError, match=msg): + msg = ( + "The following labels were missing: Index(['d'], dtype='object', name='B')" + ) + with pytest.raises(KeyError, match=re.escape(msg)): self.df2.loc[["a", "d"]] def test_loc_listlike_dtypes(self): @@ -309,8 +311,8 @@ def test_loc_listlike_dtypes(self): exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = "a list-indexer must only include values that are in the categories" - with pytest.raises(KeyError, match=msg): + msg = "The following labels were missing: Index(['x'], dtype='object')" + with pytest.raises(KeyError, match=re.escape(msg)): df.loc[["a", "x"]] # duplicated categories and codes @@ -332,8 +334,7 @@ def test_loc_listlike_dtypes(self): ) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = "a list-indexer must only include values that are in the categories" - with pytest.raises(KeyError, match=msg): + with pytest.raises(KeyError, match=re.escape(msg)): df.loc[["a", "x"]] # contains unused category @@ -347,13 +348,6 @@ def test_loc_listlike_dtypes(self): ) tm.assert_frame_equal(res, exp, check_index_type=True) - res = df.loc[["a", "e"]] - exp = DataFrame( - {"A": [1, 3, np.nan], "B": [5, 7, np.nan]}, - index=CategoricalIndex(["a", "a", "e"], categories=list("abcde")), - ) - tm.assert_frame_equal(res, exp, check_index_type=True) - # duplicated slice res = df.loc[["a", "a", "b"]] exp = DataFrame( @@ -362,10 +356,27 @@ def test_loc_listlike_dtypes(self): ) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = "a list-indexer must only include values that are in the categories" - with pytest.raises(KeyError, match=msg): + with pytest.raises(KeyError, match=re.escape(msg)): df.loc[["a", "x"]] + def test_loc_getitem_listlike_unused_category_raises_keyerro(self): + # key that is an *unused* category raises + index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde")) + df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index) + + with pytest.raises(KeyError, match="e"): + # For comparison, check the scalar behavior + df.loc["e"] + + msg = ( + "Passing list-likes to .loc or [] with any missing labels is no " + "longer supported. The following labels were missing: " + "CategoricalIndex(['e'], categories=['a', 'b', 'c', 'd', 'e'], " + "ordered=False, dtype='category'). See https" + ) + with pytest.raises(KeyError, match=re.escape(msg)): + df.loc[["a", "e"]] + def test_ix_categorical_index(self): # GH 12531 df = DataFrame(np.random.randn(3, 3), index=list("ABC"), columns=list("XYZ")) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index b45eddc3ac49c..28846bcf2f14d 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1674,7 +1674,12 @@ def test_loc_getitem_list_of_labels_categoricalindex_with_na(self, box): ser2 = ser[:-1] ci2 = ci[1:] # but if there are no NAs present, this should raise KeyError - msg = "a list-indexer must only include values that are in the categories" + msg = ( + r"Passing list-likes to .loc or \[\] with any missing labels is no " + "longer supported. The following labels were missing: " + r"(Categorical)?Index\(\[nan\], .*\). " + "See https" + ) with pytest.raises(KeyError, match=msg): ser2.loc[box(ci2)] From 04901e11cdbe371fe2514e4afbdc061185833555 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 16 Nov 2020 14:00:17 -0800 Subject: [PATCH 2/4] whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 28f7df98cb86b..3f2fd73aacc4b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -584,6 +584,7 @@ Indexing - Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from :class:`MultiIndex` (:issue:`27104`) - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`) - Bug in :meth:`DataFrame.xs` ignored ``droplevel=False`` for columns (:issue:`19056`) +- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`???`) Missing ^^^^^^^ From 7a74b01f2fcbccc67a268696dcc12593c5ef1519 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 16 Nov 2020 14:02:57 -0800 Subject: [PATCH 3/4] GH ref --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3f2fd73aacc4b..9f703a4fb19e8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -584,7 +584,7 @@ Indexing - Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from :class:`MultiIndex` (:issue:`27104`) - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`) - Bug in :meth:`DataFrame.xs` ignored ``droplevel=False`` for columns (:issue:`19056`) -- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`???`) +- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`) Missing ^^^^^^^ From ad8c211ebc053b435e7e389a7743a805c665a0bc Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 16 Nov 2020 19:23:12 -0800 Subject: [PATCH 4/4] split test --- pandas/tests/indexing/test_categorical.py | 24 +++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 332c9f0e47a5f..94fc3960f24c5 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -256,13 +256,14 @@ def test_slicing_doc_examples(self): ) tm.assert_frame_equal(result, expected) - def test_loc_listlike(self): - + def test_loc_getitem_listlike_labels(self): # list of labels result = self.df.loc[["c", "a"]] expected = self.df.iloc[[4, 0, 1, 5]] tm.assert_frame_equal(result, expected, check_index_type=True) + def test_loc_getitem_listlike_unused_category(self): + # GH#37901 a label that is in index.categories but not in index # listlike containing an element in the categories but not in the values msg = ( "The following labels were missing: CategoricalIndex(['e'], " @@ -272,18 +273,12 @@ def test_loc_listlike(self): with pytest.raises(KeyError, match=re.escape(msg)): self.df2.loc[["a", "b", "e"]] + def test_loc_getitem_label_unused_category(self): # element in the categories but not in the values with pytest.raises(KeyError, match=r"^'e'$"): self.df2.loc["e"] - # assign is ok - df = self.df2.copy() - df.loc["e"] = 20 - result = df.loc[["a", "b", "e"]] - exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") - expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index) - tm.assert_frame_equal(result, expected) - + def test_loc_getitem_non_category(self): # not all labels in the categories msg = ( "The following labels were missing: Index(['d'], dtype='object', name='B')" @@ -291,6 +286,15 @@ def test_loc_listlike(self): with pytest.raises(KeyError, match=re.escape(msg)): self.df2.loc[["a", "d"]] + def test_loc_setitem_expansion_label_unused_category(self): + # assigning with a label that is in the categories but not in the index + df = self.df2.copy() + df.loc["e"] = 20 + result = df.loc[["a", "b", "e"]] + exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") + expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index) + tm.assert_frame_equal(result, expected) + def test_loc_listlike_dtypes(self): # GH 11586