From 6ae5ef918a73c987448f3e0a178cadaf627d9703 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 5 Oct 2022 17:46:03 -0400 Subject: [PATCH 01/11] BUG: groupby(..., dropna=False) drops null values with categorical grouper --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/algorithms.py | 22 +++ pandas/core/groupby/groupby.py | 25 ++- pandas/core/groupby/grouper.py | 50 ++++- pandas/tests/groupby/test_categorical.py | 8 +- pandas/tests/groupby/test_groupby_dropna.py | 194 +++++++++++++++++++- pandas/tests/test_algos.py | 8 + 7 files changed, 282 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 715ba95eb950b..3a8c95920fbd4 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -702,7 +702,7 @@ Groupby/resample/rolling - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would not include unobserved categories in result when grouping by categorical indexes (:issue:`49354`) - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would change result order depending on the input index when grouping by categoricals (:issue:`49223`) - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` when grouping on categorical data would sort result values even when used with ``sort=False`` (:issue:`42482`) -- +- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` with ``dropna=False`` would drop NA values when the grouper was categorical (:issue:`36327`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index cc5ff2e756cfa..0ec71dc064b5f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -407,6 +407,28 @@ def unique(values): return unique_with_mask(values) +def nunique_ints(values: ArrayLike) -> int: + """ + Return the number of unique values for integer array-likes. + + Significantly faster than pandas.unique for long enough sequences. + No checks are done to ensure input is integral. + + Parameters + ---------- + values : 1d array-like + + Returns + ------- + int : The number of unique values in ``values`` + """ + if len(values) == 0: + return 0 + values = _ensure_arraylike(values) + result = (np.bincount(values.ravel()) != 0).sum() + return result + + def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): """See algorithms.unique for docs. Takes a mask for masked arrays.""" values = _ensure_arraylike(values) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d10931586d5e0..83f0cb845379d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -42,6 +42,7 @@ class providing the base-class of operations. Timestamp, lib, ) +from pandas._libs.algos import rank_1d import pandas._libs.groupby as libgroupby from pandas._typing import ( AnyArrayLike, @@ -2373,12 +2374,15 @@ def size(self) -> DataFrame | Series: else: result = self._obj_1d_constructor(result) + with com.temp_setattr(self, "as_index", True): + # size already has the desired behavior in GH#49519, but this makes the + # as_index=False path of _reindex_output fail on categorical groupers. + result = self._reindex_output(result, fill_value=0) if not self.as_index: # error: Incompatible types in assignment (expression has # type "DataFrame", variable has type "Series") result = result.rename("size").reset_index() # type: ignore[assignment] - - return self._reindex_output(result, fill_value=0) + return result @final @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) @@ -3384,6 +3388,10 @@ def ngroup(self, ascending: bool = True): else: dtype = np.int64 + if any(ping._passed_categorical for ping in self.grouper.groupings): + # comp_ids reflect non-observed groups, we need only observed + comp_ids = rank_1d(comp_ids, ties_method="dense") - 1 + result = self._obj_1d_constructor(comp_ids, index, dtype=dtype) if not ascending: result = self.ngroups - 1 - result @@ -4073,7 +4081,7 @@ def _reindex_output( names = names + [None] index = MultiIndex.from_product(levels_list, names=names) if self.sort: - index = index.sortlevel()[0] + index = index.sort_values() if self.as_index: # Always holds for SeriesGroupBy unless GH#36507 is implemented @@ -4095,12 +4103,12 @@ def _reindex_output( # reindex `output`, and then reset the in-axis grouper columns. # Select in-axis groupers - in_axis_grps = ( + in_axis_grps = list( (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis ) - g_nums, g_names = zip(*in_axis_grps) - - output = output.drop(labels=list(g_names), axis=1) + if len(in_axis_grps) > 0: + g_nums, g_names = zip(*in_axis_grps) + output = output.drop(labels=list(g_names), axis=1) # Set a temp index and reindex (possibly expanding) output = output.set_index(self.grouper.result_index).reindex( @@ -4109,7 +4117,8 @@ def _reindex_output( # Reset in-axis grouper columns # (using level numbers `g_nums` because level names may not be unique) - output = output.reset_index(level=g_nums) + if len(in_axis_grps) > 0: + output = output.reset_index(level=g_nums) return output.reset_index(drop=True) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 688dcb44c31f3..f64edfccd15e4 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -620,6 +620,9 @@ def group_arraylike(self) -> ArrayLike: # retain dtype for categories, including unobserved ones return self.result_index._values + elif self._passed_categorical: + return self.group_index + return self._codes_and_uniques[1] @cache_readonly @@ -629,21 +632,39 @@ def result_index(self) -> Index: if self._all_grouper is not None: group_idx = self.group_index assert isinstance(group_idx, CategoricalIndex) - categories = self._all_grouper.categories + cats = self._orig_cats # set_categories is dynamically added - return group_idx.set_categories(categories) # type: ignore[attr-defined] + return group_idx.set_categories(cats) # type: ignore[attr-defined] return self.group_index @cache_readonly def group_index(self) -> Index: - if self._group_index is not None: + if self._group_index is not None and self._all_grouper is None: # _group_index is set in __init__ for MultiIndex cases return self._group_index - uniques = self._codes_and_uniques[1] + codes, uniques = self._codes_and_uniques + if not self._dropna and self._passed_categorical: + assert isinstance(uniques, Categorical) + if self._sort and (codes == len(uniques)).any(): + # Add NA value on the end when sorting + uniques = Categorical.from_codes( + np.append(uniques.codes, [-1]), uniques.categories + ) + else: + # Need to determine proper placement of NA value when not sorting + cat = self.grouping_vector + na_idx = (cat.codes < 0).argmax() + if cat.codes[na_idx] < 0: + # count number of unique codes that comes before the nan value + na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx]) + uniques = Categorical.from_codes( + np.insert(uniques.codes, na_unique_idx, -1), uniques.categories + ) return Index._with_infer(uniques, name=self.name) - @cache_readonly + # @cache_readonly + @property def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: if self._passed_categorical: # we make a CategoricalIndex out of the cat grouper @@ -663,9 +684,26 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: uniques = Categorical.from_codes( codes=ucodes, categories=categories, ordered=cat.ordered ) + if not self._dropna and np.any(cat.codes < 0): + if self._sort: + # Replace NA (negative) codes with `largest code + 1` + na_code = len(categories) + codes = np.where(cat.codes < 0, na_code, cat.codes) + else: + # Insert NA code into the codes based on first appearance + # A negative code must exist, no need to check codes[na_idx] < 0 + na_idx = (cat.codes < 0).argmax() + # count number of unique codes that comes before the nan value + na_code = algorithms.nunique_ints(cat.codes[:na_idx]) + codes = np.where(cat.codes >= na_code, cat.codes + 1, cat.codes) + codes = np.where(codes < 0, na_code, codes) + else: + codes = cat.codes + if not self._observed: uniques = uniques.reorder_categories(self._orig_cats) - return cat.codes, uniques + + return codes, uniques elif isinstance(self.grouping_vector, ops.BaseGrouper): # we have a list of groupers diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index ca794d4ae5a3e..af88cd8245195 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -828,6 +828,7 @@ def test_preserve_categories(): df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)}) sort_index = CategoricalIndex(categories, categories, ordered=False, name="A") # GH#48749 - don't change order of categories + # GH#42482 - don't sort result when sort=False, even when ordered=True nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A") tm.assert_index_equal( df.groupby("A", sort=True, observed=False).first().index, sort_index @@ -1213,7 +1214,7 @@ def test_seriesgroupby_observed_true(df_cat, operation): lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A") lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B") index = MultiIndex.from_arrays([lev_a, lev_b]) - expected = Series(data=[2, 4, 1, 3], index=index, name="C") + expected = Series(data=[2, 4, 1, 3], index=index, name="C").sort_index() grouped = df_cat.groupby(["A", "B"], observed=True)["C"] result = getattr(grouped, operation)(sum) @@ -1856,10 +1857,7 @@ def test_category_order_reducer( df = df.set_index(keys) args = get_groupby_method_args(reduction_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - msg = "is deprecated and will be removed in a future version" - warn = FutureWarning if reduction_func == "mad" else None - with tm.assert_produces_warning(warn, match=msg): - op_result = getattr(gb, reduction_func)(*args) + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories else: diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 50eb9aabcc55c..41989dbb1604e 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -7,6 +7,7 @@ import pandas as pd import pandas._testing as tm +from pandas.tests.groupby import get_groupby_method_args @pytest.mark.parametrize( @@ -424,7 +425,7 @@ def test_groupby_drop_nan_with_multi_index(): ], ) @pytest.mark.parametrize("test_series", [True, False]) -def test_no_sort_keep_na(request, sequence_index, dtype, test_series): +def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index): # GH#46584, GH#48794 # Convert sequence_index into a string sequence, e.g. 5 becomes "xxyz" @@ -433,11 +434,6 @@ def test_no_sort_keep_na(request, sequence_index, dtype, test_series): [{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)] ) - if dtype == "category" and "z" in sequence: - # Only xfail when nulls are present - msg = "dropna=False not correct for categorical, GH#48645" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - # Unique values to use for grouper, depends on dtype if dtype in ("string", "string[pyarrow]"): uniques = {"x": "x", "y": "y", "z": pd.NA} @@ -452,7 +448,7 @@ def test_no_sort_keep_na(request, sequence_index, dtype, test_series): "a": [0, 1, 2, 3], } ) - gb = df.groupby("key", dropna=False, sort=False) + gb = df.groupby("key", dropna=False, sort=False, as_index=as_index) if test_series: gb = gb["a"] result = gb.sum() @@ -477,6 +473,10 @@ def test_no_sort_keep_na(request, sequence_index, dtype, test_series): expected = pd.Series(summed.values(), index=index, name="a", dtype=None) if not test_series: expected = expected.to_frame() + if not as_index: + expected = expected.reset_index() + if dtype is not None and dtype.startswith("Sparse"): + expected["key"] = expected["key"].astype(dtype) tm.assert_equal(result, expected) @@ -498,3 +498,183 @@ def test_null_is_null_for_dtype( tm.assert_series_equal(result, expected["a"]) else: tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) +def test_categorical_reducers( + request, reduction_func, observed, sort, as_index, index_kind +): + # GH#36327 + if ( + reduction_func in ("idxmin", "idxmax") + and not observed + and index_kind != "multi" + ): + msg = "GH#10694 - idxmin/max broken for categorical with observed=False" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + + # Ensure there is at least one null value by appending to the end + values = np.append(np.random.choice([1, 2, None], size=19), None) + df = pd.DataFrame( + {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)} + ) + + # Strategy: Compare to dropna=True by filling null values with a new code + df_filled = df.copy() + df_filled["x"] = pd.Categorical(values, categories=[1, 2, 3, 4]).fillna(4) + + if index_kind == "range": + keys = ["x"] + elif index_kind == "single": + keys = ["x"] + df = df.set_index("x") + df_filled = df_filled.set_index("x") + else: + keys = ["x", "x2"] + df["x2"] = df["x"] + df = df.set_index(["x", "x2"]) + df_filled["x2"] = df_filled["x"] + df_filled = df_filled.set_index(["x", "x2"]) + args = get_groupby_method_args(reduction_func, df) + args_filled = get_groupby_method_args(reduction_func, df_filled) + + gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) + expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() + expected["x"] = expected["x"].replace(4, None) + if index_kind == "multi": + expected["x2"] = expected["x2"].replace(4, None) + if as_index: + if index_kind == "multi": + expected = expected.set_index(["x", "x2"]) + else: + expected = expected.set_index("x") + else: + if index_kind != "range" and reduction_func != "size": + # size, unlike other methods, has the desired behavior in GH#49519 + expected = expected.drop(columns="x") + if index_kind == "multi": + expected = expected.drop(columns="x2") + if reduction_func in ("idxmax", "idxmin") and index_kind != "range": + # expected was computed with a RangeIndex; need to translate to index values + values = expected["y"].values.tolist() + if index_kind == "single": + values = [np.nan if e == 4 else e for e in values] + else: + values = [(np.nan, np.nan) if e == (4, 4) else e for e in values] + expected["y"] = values + if reduction_func == "size": + # size, unlike other methods, has the desired behavior in GH#49519 + expected = expected.rename(columns={0: "size"}) + if as_index: + expected = expected["size"].rename(None) + + gb_keepna = df.groupby( + keys, dropna=False, observed=observed, sort=sort, as_index=as_index + ) + result = getattr(gb_keepna, reduction_func)(*args) + + # size will return a Series, others are DataFrame + tm.assert_equal(result, expected) + + +def test_categorical_transformers( + request, transformation_func, observed, sort, as_index +): + # GH#36327 + if transformation_func == "fillna": + msg = "GH#49651 fillna may incorrectly reorders results when dropna=False" + request.node.add_marker(pytest.mark.xfail(reason=msg, strict=False)) + + values = np.append(np.random.choice([1, 2, None], size=19), None) + df = pd.DataFrame( + {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)} + ) + args = get_groupby_method_args(transformation_func, df) + + # Compute result for null group + null_group_values = df[df["x"].isnull()]["y"] + if transformation_func == "cumcount": + null_group_data = list(range(len(null_group_values))) + elif transformation_func == "ngroup": + if sort: + if observed: + na_group = df["x"].nunique(dropna=False) - 1 + else: + # TODO: Should this be 3? + na_group = df["x"].nunique(dropna=False) - 1 + else: + na_group = df.iloc[: null_group_values.index[0]]["x"].nunique() + null_group_data = len(null_group_values) * [na_group] + else: + null_group_data = getattr(null_group_values, transformation_func)(*args) + null_group_result = pd.DataFrame({"y": null_group_data}) + + gb_keepna = df.groupby( + "x", dropna=False, observed=observed, sort=sort, as_index=as_index + ) + gb_dropna = df.groupby("x", dropna=True, observed=observed, sort=sort) + result = getattr(gb_keepna, transformation_func)(*args) + expected = getattr(gb_dropna, transformation_func)(*args) + for iloc, value in zip( + df[df["x"].isnull()].index.tolist(), null_group_result.values + ): + if expected.ndim == 1: + expected.iloc[iloc] = value + else: + expected.iloc[iloc, 0] = value + if transformation_func not in ("rank", "diff", "pct_change", "shift"): + expected = expected.astype(int) + if transformation_func == "ngroup": + expected[df["x"].notnull() & expected.ge(na_group)] += 1 + + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("method", ["head", "tail"]) +def test_categorical_head_tail(method, observed, sort, as_index): + # GH#36327 + values = np.random.choice([1, 2, None], 30) + df = pd.DataFrame( + {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))} + ) + gb = df.groupby("x", dropna=False, observed=observed, sort=sort, as_index=as_index) + result = getattr(gb, method)() + + if method == "tail": + values = values[::-1] + # Take the top 5 values from each group + mask = ( + ((values == 1) & ((values == 1).cumsum() <= 5)) + | ((values == 2) & ((values == 2).cumsum() <= 5)) + # flake8 doesn't like the vectorized check for None, thinks we should use `is` + | ((values == None) & ((values == None).cumsum() <= 5)) # noqa: E711 + ) + if method == "tail": + mask = mask[::-1] + expected = df[mask] + + tm.assert_frame_equal(result, expected) + + +def test_categorical_agg(): + # GH#36327 + values = np.random.choice([1, 2, None], 30) + df = pd.DataFrame( + {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))} + ) + gb = df.groupby("x", dropna=False) + result = gb.agg(lambda x: x.sum()) + expected = gb.sum() + tm.assert_frame_equal(result, expected) + + +def test_categorical_transform(): + # GH#36327 + values = np.random.choice([1, 2, None], 30) + df = pd.DataFrame( + {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))} + ) + gb = df.groupby("x", dropna=False) + result = gb.transform(lambda x: x.sum()) + expected = gb.transform("sum") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b982d247c2707..df83a5e410e71 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -852,6 +852,14 @@ def test_unique_masked(self, any_numeric_ea_dtype): tm.assert_extension_array_equal(result, expected) +def test_nunique_ints(index_or_series_or_array): + # GH#36327 + values = index_or_series_or_array(np.random.randint(0, 20, 30)) + result = algos.nunique_ints(values) + expected = len(algos.unique(values)) + assert result == expected + + class TestIsin: def test_invalid(self): From 20e17ab11d109587c7ccba3320209c1efd0c2feb Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 12 Nov 2022 07:56:10 -0500 Subject: [PATCH 02/11] Use intp --- pandas/tests/groupby/test_groupby_dropna.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 41989dbb1604e..04d086d38b93f 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -623,7 +623,7 @@ def test_categorical_transformers( else: expected.iloc[iloc, 0] = value if transformation_func not in ("rank", "diff", "pct_change", "shift"): - expected = expected.astype(int) + expected = expected.astype("intp") if transformation_func == "ngroup": expected[df["x"].notnull() & expected.ge(na_group)] += 1 From 825461985e25d1d2209aa8d5b09e5a2212f1cfa3 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 12 Nov 2022 23:54:41 -0500 Subject: [PATCH 03/11] Fixups --- pandas/core/algorithms.py | 2 +- pandas/core/groupby/grouper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0ec71dc064b5f..178fe6f1c7b98 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -424,7 +424,7 @@ def nunique_ints(values: ArrayLike) -> int: """ if len(values) == 0: return 0 - values = _ensure_arraylike(values) + values = _ensure_data(values) result = (np.bincount(values.ravel()) != 0).sum() return result diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index f64edfccd15e4..672bd938144c6 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -621,7 +621,7 @@ def group_arraylike(self) -> ArrayLike: return self.result_index._values elif self._passed_categorical: - return self.group_index + return self.group_index._values return self._codes_and_uniques[1] From f679fd736a96f3d23cacf72482a4174e83eac4dc Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 13 Nov 2022 00:02:33 -0500 Subject: [PATCH 04/11] Use intp --- pandas/tests/groupby/test_groupby_dropna.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 04d086d38b93f..e3229bdc52738 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -626,6 +626,7 @@ def test_categorical_transformers( expected = expected.astype("intp") if transformation_func == "ngroup": expected[df["x"].notnull() & expected.ge(na_group)] += 1 + expected = expected.astype("intp") tm.assert_equal(result, expected) From 34760ca27cc6c12a4db538e13f68b3e3a73ee8c8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 15 Nov 2022 20:07:38 -0500 Subject: [PATCH 05/11] int64 --- pandas/tests/groupby/test_groupby_dropna.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index e3229bdc52738..908639a252496 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -626,7 +626,7 @@ def test_categorical_transformers( expected = expected.astype("intp") if transformation_func == "ngroup": expected[df["x"].notnull() & expected.ge(na_group)] += 1 - expected = expected.astype("intp") + expected = expected.astype("int64") tm.assert_equal(result, expected) From 93f306c16cabea6987e7b2caff78750e136dacf2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 19 Nov 2022 12:26:44 -0500 Subject: [PATCH 06/11] dtype fix --- pandas/tests/groupby/test_groupby_dropna.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 908639a252496..9d9c900057f34 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -622,10 +622,9 @@ def test_categorical_transformers( expected.iloc[iloc] = value else: expected.iloc[iloc, 0] = value - if transformation_func not in ("rank", "diff", "pct_change", "shift"): - expected = expected.astype("intp") if transformation_func == "ngroup": expected[df["x"].notnull() & expected.ge(na_group)] += 1 + if transformation_func not in ("rank", "diff", "pct_change", "shift"): expected = expected.astype("int64") tm.assert_equal(result, expected) From f3a3ebba9519b5a7eb942c0f060a52a0bed72f5a Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 19 Nov 2022 20:07:22 -0500 Subject: [PATCH 07/11] Breakup op to debug on CI --- pandas/core/algorithms.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 76f07942f117c..762c0d89b5f8a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -425,7 +425,11 @@ def nunique_ints(values: ArrayLike) -> int: if len(values) == 0: return 0 values = _ensure_data(values) - result = (np.bincount(values.ravel()) != 0).sum() + a = values.ravel() + b = np.bincount(a) + c = b != 0 + result = c.sum() + # result = (np.bincount(values.ravel()) != 0).sum() return result From 4bfeaa1d7ccb2c3275cb909190ad433ed01fdbb2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 20 Nov 2022 08:05:48 -0500 Subject: [PATCH 08/11] Trying with intp --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 76f07942f117c..b18f4a9d810ee 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -425,7 +425,7 @@ def nunique_ints(values: ArrayLike) -> int: if len(values) == 0: return 0 values = _ensure_data(values) - result = (np.bincount(values.ravel()) != 0).sum() + result = (np.bincount(values.ravel().astype("intp")) != 0).sum() return result From 4d72402d74a7345df2957460d3d45da6e9dbaf41 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 28 Nov 2022 21:01:58 -0500 Subject: [PATCH 09/11] Restore cache decorator --- pandas/core/groupby/grouper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 672bd938144c6..ef2f53a905d3f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -663,8 +663,7 @@ def group_index(self) -> Index: ) return Index._with_infer(uniques, name=self.name) - # @cache_readonly - @property + @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: if self._passed_categorical: # we make a CategoricalIndex out of the cat grouper From 1e3bff33ba98deaefb20929ed7fcac9e6d52e80f Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 28 Nov 2022 21:41:57 -0500 Subject: [PATCH 10/11] Add bincount comment --- pandas/core/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 397bac447c402..f40b5a094889c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -425,6 +425,7 @@ def nunique_ints(values: ArrayLike) -> int: if len(values) == 0: return 0 values = _ensure_data(values) + # bincount requires intp result = (np.bincount(values.ravel().astype("intp")) != 0).sum() return result From c8ba7adc4c603b627f0a6a391f1df157f08b6391 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 29 Nov 2022 20:47:16 -0500 Subject: [PATCH 11/11] Rework recoding logic --- pandas/core/groupby/grouper.py | 32 +++++++++++---------- pandas/tests/groupby/test_groupby_dropna.py | 4 +++ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 672bd938144c6..dacabf10cef35 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -684,21 +684,23 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: uniques = Categorical.from_codes( codes=ucodes, categories=categories, ordered=cat.ordered ) - if not self._dropna and np.any(cat.codes < 0): - if self._sort: - # Replace NA (negative) codes with `largest code + 1` - na_code = len(categories) - codes = np.where(cat.codes < 0, na_code, cat.codes) - else: - # Insert NA code into the codes based on first appearance - # A negative code must exist, no need to check codes[na_idx] < 0 - na_idx = (cat.codes < 0).argmax() - # count number of unique codes that comes before the nan value - na_code = algorithms.nunique_ints(cat.codes[:na_idx]) - codes = np.where(cat.codes >= na_code, cat.codes + 1, cat.codes) - codes = np.where(codes < 0, na_code, codes) - else: - codes = cat.codes + + codes = cat.codes + if not self._dropna: + na_mask = codes < 0 + if np.any(na_mask): + if self._sort: + # Replace NA codes with `largest code + 1` + na_code = len(categories) + codes = np.where(na_mask, na_code, codes) + else: + # Insert NA code into the codes based on first appearance + # A negative code must exist, no need to check codes[na_idx] < 0 + na_idx = na_mask.argmax() + # count number of unique codes that comes before the nan value + na_code = algorithms.nunique_ints(codes[:na_idx]) + codes = np.where(codes >= na_code, codes + 1, codes) + codes = np.where(na_mask, na_code, codes) if not self._observed: uniques = uniques.reorder_categories(self._orig_cats) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 9d9c900057f34..5418a2a60dc80 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -537,6 +537,10 @@ def test_categorical_reducers( df_filled = df_filled.set_index(["x", "x2"]) args = get_groupby_method_args(reduction_func, df) args_filled = get_groupby_method_args(reduction_func, df_filled) + if reduction_func == "corrwith" and index_kind == "range": + # Don't include the grouping columns so we can call reset_index + args = (args[0].drop(columns=keys),) + args_filled = (args_filled[0].drop(columns=keys),) gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index()