diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 6239ddf9442e7..9f120ff9c8ca1 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -243,7 +243,7 @@ Bug fixes Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.set_categories` losing dtype information (:issue:`48812`) -- +- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 69768e1b26bc7..b11bbf35312c9 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import TYPE_CHECKING - import numpy as np from pandas.core.algorithms import unique1d @@ -11,9 +9,6 @@ recode_for_categories, ) -if TYPE_CHECKING: - from pandas.core.indexes.api import CategoricalIndex - def recode_for_groupby( c: Categorical, sort: bool, observed: bool @@ -77,7 +72,7 @@ def recode_for_groupby( # sort=False should order groups in as-encountered order (GH-8868) # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories - all_codes = np.arange(c.categories.nunique(), dtype=np.int8) + all_codes = np.arange(c.categories.nunique()) # GH 38140: exclude nan from indexer for categories unique_notnan_codes = unique1d(c.codes[c.codes != -1]) if c.ordered: @@ -90,32 +85,3 @@ def recode_for_groupby( take_codes = unique_notnan_codes return Categorical(c, c.unique().categories.take(take_codes)), None - - -def recode_from_groupby( - c: Categorical, sort: bool, ci: CategoricalIndex -) -> CategoricalIndex: - """ - Reverse the codes_to_groupby to account for sort / observed. - - Parameters - ---------- - c : Categorical - sort : bool - The value of the sort parameter groupby was called with. - ci : CategoricalIndex - The codes / categories to recode - - Returns - ------- - CategoricalIndex - """ - # we re-order to the original category orderings - if sort: - # error: "CategoricalIndex" has no attribute "set_categories" - return ci.set_categories(c.categories) # type: ignore[attr-defined] - - # we are not sorting, so add unobserved to the end - new_cats = c.categories[~c.categories.isin(ci.categories)] - # error: "CategoricalIndex" has no attribute "add_categories" - return ci.add_categories(new_cats) # type: ignore[attr-defined] diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index dc7679a1744ea..7ae6495f15541 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -39,10 +39,7 @@ import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.groupby import ops -from pandas.core.groupby.categorical import ( - recode_for_groupby, - recode_from_groupby, -) +from pandas.core.groupby.categorical import recode_for_groupby from pandas.core.indexes.api import ( CategoricalIndex, Index, @@ -462,6 +459,7 @@ class Grouping: _group_index: Index | None = None _passed_categorical: bool _all_grouper: Categorical | None + _orig_cats: Index | None _index: Index def __init__( @@ -479,6 +477,7 @@ def __init__( self._orig_grouper = grouper self.grouping_vector = _convert_grouper(index, grouper) self._all_grouper = None + self._orig_cats = None self._index = index self._sort = sort self.obj = obj @@ -529,6 +528,7 @@ def __init__( # a passed Categorical self._passed_categorical = True + self._orig_cats = self.grouping_vector.categories self.grouping_vector, self._all_grouper = recode_for_groupby( self.grouping_vector, sort, observed ) @@ -646,7 +646,9 @@ def result_index(self) -> Index: if self._all_grouper is not None: group_idx = self.group_index assert isinstance(group_idx, CategoricalIndex) - return recode_from_groupby(self._all_grouper, self._sort, group_idx) + categories = self._all_grouper.categories + # set_categories is dynamically added + return group_idx.set_categories(categories) # type: ignore[attr-defined] return self.group_index @cache_readonly @@ -678,6 +680,8 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: uniques = Categorical.from_codes( codes=ucodes, categories=categories, ordered=cat.ordered ) + if not self._observed: + uniques = uniques.reorder_categories(self._orig_cats) return cat.codes, uniques elif isinstance(self.grouping_vector, ops.BaseGrouper): diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index a3821fc2216ec..092fd4a4d6be0 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -781,7 +781,8 @@ def test_preserve_categories(): # ordered=False df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)}) sort_index = CategoricalIndex(categories, categories, ordered=False, name="A") - nosort_index = CategoricalIndex(list("bac"), list("bac"), ordered=False, name="A") + # GH#48749 - don't change order of categories + nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A") tm.assert_index_equal( df.groupby("A", sort=True, observed=False).first().index, sort_index ) @@ -964,7 +965,8 @@ def test_sort2(): index = CategoricalIndex( ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"], - categories=["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"], + # GH#48749 - don't change order of categories + categories=["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range", ) expected_nosort = DataFrame( @@ -1041,27 +1043,35 @@ def test_sort_datetimelike(): # ordered = False df["dt"] = Categorical(df["dt"], ordered=False) - index = [ - datetime(2011, 1, 1), - datetime(2011, 2, 1), - datetime(2011, 5, 1), - datetime(2011, 7, 1), - ] + sort_index = CategoricalIndex( + [ + datetime(2011, 1, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 7, 1), + ], + name="dt", + ) result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"] + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=sort_index ) - result_sort.index = CategoricalIndex(index, name="dt") - index = [ - datetime(2011, 7, 1), - datetime(2011, 2, 1), - datetime(2011, 5, 1), - datetime(2011, 1, 1), - ] + nosort_index = CategoricalIndex( + [ + datetime(2011, 7, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 1, 1), + ], + # GH#48749 - don't change order of categories + categories=sort_index.categories, + name="dt", + ) result_nosort = DataFrame( - [[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"] + [[10, 10], [5, 30], [6, 40], [1, 60]], + columns=["foo", "bar"], + index=nosort_index, ) - result_nosort.index = CategoricalIndex(index, categories=index, name="dt") col = "dt" tm.assert_frame_equal( @@ -1836,3 +1846,203 @@ def test_groupby_categorical_dropna(observed, dropna): expected.index.name = "x" tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) +@pytest.mark.parametrize("ordered", [True, False]) +def test_category_order_reducer( + request, as_index, sort, observed, reduction_func, index_kind, ordered +): + # GH#48749 + if ( + reduction_func in ("idxmax", "idxmin") + and not observed + and index_kind == "range" + ): + msg = "GH#10694 - idxmax/min fail with unused categories" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + elif index_kind != "range" and not as_index: + pytest.skip(reason="Result doesn't have categories, nothing to test") + df = DataFrame( + { + "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered), + "b": range(4), + } + ) + if index_kind == "range": + keys = ["a"] + elif index_kind == "single": + keys = ["a"] + df = df.set_index(keys) + elif index_kind == "multi": + keys = ["a", "a2"] + df["a2"] = df["a"] + df = df.set_index(keys) + args = get_groupby_method_args(reduction_func, df) + gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) + msg = "is deprecated and will be removed in a future version" + warn = FutureWarning if reduction_func == "mad" else None + with tm.assert_produces_warning(warn, match=msg): + op_result = getattr(gb, reduction_func)(*args) + if as_index: + result = op_result.index.get_level_values("a").categories + else: + result = op_result["a"].cat.categories + expected = Index([1, 4, 3, 2]) + tm.assert_index_equal(result, expected) + + if index_kind == "multi": + result = op_result.index.get_level_values("a2").categories + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("index_kind", ["single", "multi"]) +@pytest.mark.parametrize("ordered", [True, False]) +def test_category_order_transformer( + as_index, sort, observed, transformation_func, index_kind, ordered +): + # GH#48749 + df = DataFrame( + { + "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered), + "b": range(4), + } + ) + if index_kind == "single": + keys = ["a"] + df = df.set_index(keys) + elif index_kind == "multi": + keys = ["a", "a2"] + df["a2"] = df["a"] + df = df.set_index(keys) + args = get_groupby_method_args(transformation_func, df) + gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) + msg = "is deprecated and will be removed in a future version" + warn = FutureWarning if transformation_func == "tshift" else None + with tm.assert_produces_warning(warn, match=msg): + op_result = getattr(gb, transformation_func)(*args) + result = op_result.index.get_level_values("a").categories + expected = Index([1, 4, 3, 2]) + tm.assert_index_equal(result, expected) + + if index_kind == "multi": + result = op_result.index.get_level_values("a2").categories + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) +@pytest.mark.parametrize("method", ["head", "tail"]) +@pytest.mark.parametrize("ordered", [True, False]) +def test_category_order_head_tail( + as_index, sort, observed, method, index_kind, ordered +): + # GH#48749 + df = DataFrame( + { + "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered), + "b": range(4), + } + ) + if index_kind == "range": + keys = ["a"] + elif index_kind == "single": + keys = ["a"] + df = df.set_index(keys) + elif index_kind == "multi": + keys = ["a", "a2"] + df["a2"] = df["a"] + df = df.set_index(keys) + gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) + op_result = getattr(gb, method)() + if index_kind == "range": + result = op_result["a"].cat.categories + else: + result = op_result.index.get_level_values("a").categories + expected = Index([1, 4, 3, 2]) + tm.assert_index_equal(result, expected) + + if index_kind == "multi": + result = op_result.index.get_level_values("a2").categories + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) +@pytest.mark.parametrize("method", ["apply", "agg", "transform"]) +@pytest.mark.parametrize("ordered", [True, False]) +def test_category_order_apply(as_index, sort, observed, method, index_kind, ordered): + # GH#48749 + if (method == "transform" and index_kind == "range") or ( + not as_index and index_kind != "range" + ): + pytest.skip("No categories in result, nothing to test") + df = DataFrame( + { + "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered), + "b": range(4), + } + ) + if index_kind == "range": + keys = ["a"] + elif index_kind == "single": + keys = ["a"] + df = df.set_index(keys) + elif index_kind == "multi": + keys = ["a", "a2"] + df["a2"] = df["a"] + df = df.set_index(keys) + gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) + op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) + if (method == "transform" or not as_index) and index_kind == "range": + result = op_result["a"].cat.categories + else: + result = op_result.index.get_level_values("a").categories + expected = Index([1, 4, 3, 2]) + tm.assert_index_equal(result, expected) + + if index_kind == "multi": + result = op_result.index.get_level_values("a2").categories + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) +def test_many_categories(as_index, sort, index_kind, ordered): + # GH#48749 - Test when the grouper has many categories + if index_kind != "range" and not as_index: + pytest.skip(reason="Result doesn't have categories, nothing to test") + categories = np.arange(9999, -1, -1) + grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered) + df = DataFrame({"a": grouper, "b": range(4)}) + if index_kind == "range": + keys = ["a"] + elif index_kind == "single": + keys = ["a"] + df = df.set_index(keys) + elif index_kind == "multi": + keys = ["a", "a2"] + df["a2"] = df["a"] + df = df.set_index(keys) + gb = df.groupby(keys, as_index=as_index, sort=sort, observed=True) + result = gb.sum() + + # Test is setup so that data and index are the same values + # TODO: GH#49223 - Order of values should be the same for all index_kinds + if index_kind == "range": + data = [3, 2, 1] if ordered else [2, 1, 3] + else: + data = [3, 2, 1] if sort else [2, 1, 3] + + index = CategoricalIndex( + data, categories=grouper.categories, ordered=ordered, name="a" + ) + if as_index: + expected = DataFrame({"b": data}) + if index_kind == "multi": + expected.index = MultiIndex.from_frame(DataFrame({"a": index, "a2": index})) + else: + expected.index = index + elif index_kind == "multi": + expected = DataFrame({"a": Series(index), "a2": Series(index), "b": data}) + else: + expected = DataFrame({"a": Series(index), "b": data}) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index a6088e4999402..50eb9aabcc55c 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -465,7 +465,7 @@ def test_no_sort_keep_na(request, sequence_index, dtype, test_series): if dtype == "category": index = pd.CategoricalIndex( [uniques[e] for e in summed], - list({uniques[k]: 0 for k in sequence if not pd.isnull(uniques[k])}), + df["key"].cat.categories, name="key", ) elif isinstance(dtype, str) and dtype.startswith("Sparse"):