From 6ae5ef918a73c987448f3e0a178cadaf627d9703 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Wed, 5 Oct 2022 17:46:03 -0400
Subject: [PATCH 01/11] BUG: groupby(..., dropna=False) drops null values with
 categorical grouper

---
 doc/source/whatsnew/v2.0.0.rst              |   2 +-
 pandas/core/algorithms.py                   |  22 +++
 pandas/core/groupby/groupby.py              |  25 ++-
 pandas/core/groupby/grouper.py              |  50 ++++-
 pandas/tests/groupby/test_categorical.py    |   8 +-
 pandas/tests/groupby/test_groupby_dropna.py | 194 +++++++++++++++++++-
 pandas/tests/test_algos.py                  |   8 +
 7 files changed, 282 insertions(+), 27 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 715ba95eb950b..3a8c95920fbd4 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -702,7 +702,7 @@ Groupby/resample/rolling
 - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would not include unobserved categories in result when grouping by categorical indexes (:issue:`49354`)
 - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would change result order depending on the input index when grouping by categoricals (:issue:`49223`)
 - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` when grouping on categorical data would sort result values even when used with ``sort=False`` (:issue:`42482`)
--
+- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` with ``dropna=False`` would drop NA values when the grouper was categorical (:issue:`36327`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index cc5ff2e756cfa..0ec71dc064b5f 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -407,6 +407,28 @@ def unique(values):
     return unique_with_mask(values)
 
 
+def nunique_ints(values: ArrayLike) -> int:
+    """
+    Return the number of unique values for integer array-likes.
+
+    Significantly faster than pandas.unique for long enough sequences.
+    No checks are done to ensure input is integral.
+
+    Parameters
+    ----------
+    values : 1d array-like
+
+    Returns
+    -------
+    int : The number of unique values in ``values``
+    """
+    if len(values) == 0:
+        return 0
+    values = _ensure_arraylike(values)
+    result = (np.bincount(values.ravel()) != 0).sum()
+    return result
+
+
 def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
     """See algorithms.unique for docs. Takes a mask for masked arrays."""
     values = _ensure_arraylike(values)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index d10931586d5e0..83f0cb845379d 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -42,6 +42,7 @@ class providing the base-class of operations.
     Timestamp,
     lib,
 )
+from pandas._libs.algos import rank_1d
 import pandas._libs.groupby as libgroupby
 from pandas._typing import (
     AnyArrayLike,
@@ -2373,12 +2374,15 @@ def size(self) -> DataFrame | Series:
         else:
             result = self._obj_1d_constructor(result)
 
+        with com.temp_setattr(self, "as_index", True):
+            # size already has the desired behavior in GH#49519, but this makes the
+            # as_index=False path of _reindex_output fail on categorical groupers.
+            result = self._reindex_output(result, fill_value=0)
         if not self.as_index:
             # error: Incompatible types in assignment (expression has
             # type "DataFrame", variable has type "Series")
             result = result.rename("size").reset_index()  # type: ignore[assignment]
-
-        return self._reindex_output(result, fill_value=0)
+        return result
 
     @final
     @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
@@ -3384,6 +3388,10 @@ def ngroup(self, ascending: bool = True):
             else:
                 dtype = np.int64
 
+            if any(ping._passed_categorical for ping in self.grouper.groupings):
+                # comp_ids reflect non-observed groups, we need only observed
+                comp_ids = rank_1d(comp_ids, ties_method="dense") - 1
+
             result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
             if not ascending:
                 result = self.ngroups - 1 - result
@@ -4073,7 +4081,7 @@ def _reindex_output(
             names = names + [None]
         index = MultiIndex.from_product(levels_list, names=names)
         if self.sort:
-            index = index.sortlevel()[0]
+            index = index.sort_values()
 
         if self.as_index:
             # Always holds for SeriesGroupBy unless GH#36507 is implemented
@@ -4095,12 +4103,12 @@ def _reindex_output(
         # reindex `output`, and then reset the in-axis grouper columns.
 
         # Select in-axis groupers
-        in_axis_grps = (
+        in_axis_grps = list(
             (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
         )
-        g_nums, g_names = zip(*in_axis_grps)
-
-        output = output.drop(labels=list(g_names), axis=1)
+        if len(in_axis_grps) > 0:
+            g_nums, g_names = zip(*in_axis_grps)
+            output = output.drop(labels=list(g_names), axis=1)
 
         # Set a temp index and reindex (possibly expanding)
         output = output.set_index(self.grouper.result_index).reindex(
@@ -4109,7 +4117,8 @@ def _reindex_output(
 
         # Reset in-axis grouper columns
         # (using level numbers `g_nums` because level names may not be unique)
-        output = output.reset_index(level=g_nums)
+        if len(in_axis_grps) > 0:
+            output = output.reset_index(level=g_nums)
 
         return output.reset_index(drop=True)
 
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 688dcb44c31f3..f64edfccd15e4 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -620,6 +620,9 @@ def group_arraylike(self) -> ArrayLike:
             # retain dtype for categories, including unobserved ones
             return self.result_index._values
 
+        elif self._passed_categorical:
+            return self.group_index
+
         return self._codes_and_uniques[1]
 
     @cache_readonly
@@ -629,21 +632,39 @@ def result_index(self) -> Index:
         if self._all_grouper is not None:
             group_idx = self.group_index
             assert isinstance(group_idx, CategoricalIndex)
-            categories = self._all_grouper.categories
+            cats = self._orig_cats
             # set_categories is dynamically added
-            return group_idx.set_categories(categories)  # type: ignore[attr-defined]
+            return group_idx.set_categories(cats)  # type: ignore[attr-defined]
         return self.group_index
 
     @cache_readonly
     def group_index(self) -> Index:
-        if self._group_index is not None:
+        if self._group_index is not None and self._all_grouper is None:
             # _group_index is set in __init__ for MultiIndex cases
             return self._group_index
 
-        uniques = self._codes_and_uniques[1]
+        codes, uniques = self._codes_and_uniques
+        if not self._dropna and self._passed_categorical:
+            assert isinstance(uniques, Categorical)
+            if self._sort and (codes == len(uniques)).any():
+                # Add NA value on the end when sorting
+                uniques = Categorical.from_codes(
+                    np.append(uniques.codes, [-1]), uniques.categories
+                )
+            else:
+                # Need to determine proper placement of NA value when not sorting
+                cat = self.grouping_vector
+                na_idx = (cat.codes < 0).argmax()
+                if cat.codes[na_idx] < 0:
+                    # count number of unique codes that come before the nan value
+                    na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
+                    uniques = Categorical.from_codes(
+                        np.insert(uniques.codes, na_unique_idx, -1), uniques.categories
+                    )
         return Index._with_infer(uniques, name=self.name)
 
-    @cache_readonly
+    # @cache_readonly
+    @property
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
         if self._passed_categorical:
             # we make a CategoricalIndex out of the cat grouper
@@ -663,9 +684,26 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
             uniques = Categorical.from_codes(
                 codes=ucodes, categories=categories, ordered=cat.ordered
             )
+            if not self._dropna and np.any(cat.codes < 0):
+                if self._sort:
+                    # Replace NA (negative) codes with `largest code + 1`
+                    na_code = len(categories)
+                    codes = np.where(cat.codes < 0, na_code, cat.codes)
+                else:
+                    # Insert NA code into the codes based on first appearance
+                    # A negative code must exist, no need to check codes[na_idx] < 0
+                    na_idx = (cat.codes < 0).argmax()
+                    # count number of unique codes that comes before the nan value
+                    na_code = algorithms.nunique_ints(cat.codes[:na_idx])
+                    codes = np.where(cat.codes >= na_code, cat.codes + 1, cat.codes)
+                    codes = np.where(codes < 0, na_code, codes)
+            else:
+                codes = cat.codes
+
             if not self._observed:
                 uniques = uniques.reorder_categories(self._orig_cats)
-            return cat.codes, uniques
+
+            return codes, uniques
 
         elif isinstance(self.grouping_vector, ops.BaseGrouper):
             # we have a list of groupers
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index ca794d4ae5a3e..af88cd8245195 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -828,6 +828,7 @@ def test_preserve_categories():
     df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
     sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
     # GH#48749 - don't change order of categories
+    # GH#42482 - don't sort result when sort=False, even when ordered=True
     nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A")
     tm.assert_index_equal(
         df.groupby("A", sort=True, observed=False).first().index, sort_index
@@ -1213,7 +1214,7 @@ def test_seriesgroupby_observed_true(df_cat, operation):
     lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A")
     lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B")
     index = MultiIndex.from_arrays([lev_a, lev_b])
-    expected = Series(data=[2, 4, 1, 3], index=index, name="C")
+    expected = Series(data=[2, 4, 1, 3], index=index, name="C").sort_index()
 
     grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
     result = getattr(grouped, operation)(sum)
@@ -1856,10 +1857,7 @@ def test_category_order_reducer(
         df = df.set_index(keys)
     args = get_groupby_method_args(reduction_func, df)
     gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
-    msg = "is deprecated and will be removed in a future version"
-    warn = FutureWarning if reduction_func == "mad" else None
-    with tm.assert_produces_warning(warn, match=msg):
-        op_result = getattr(gb, reduction_func)(*args)
+    op_result = getattr(gb, reduction_func)(*args)
     if as_index:
         result = op_result.index.get_level_values("a").categories
     else:
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 50eb9aabcc55c..41989dbb1604e 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -7,6 +7,7 @@
 
 import pandas as pd
 import pandas._testing as tm
+from pandas.tests.groupby import get_groupby_method_args
 
 
 @pytest.mark.parametrize(
@@ -424,7 +425,7 @@ def test_groupby_drop_nan_with_multi_index():
     ],
 )
 @pytest.mark.parametrize("test_series", [True, False])
-def test_no_sort_keep_na(request, sequence_index, dtype, test_series):
+def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index):
     # GH#46584, GH#48794
 
     # Convert sequence_index into a string sequence, e.g. 5 becomes "xxyz"
@@ -433,11 +434,6 @@ def test_no_sort_keep_na(request, sequence_index, dtype, test_series):
         [{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)]
     )
 
-    if dtype == "category" and "z" in sequence:
-        # Only xfail when nulls are present
-        msg = "dropna=False not correct for categorical, GH#48645"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
-
     # Unique values to use for grouper, depends on dtype
     if dtype in ("string", "string[pyarrow]"):
         uniques = {"x": "x", "y": "y", "z": pd.NA}
@@ -452,7 +448,7 @@ def test_no_sort_keep_na(request, sequence_index, dtype, test_series):
             "a": [0, 1, 2, 3],
         }
     )
-    gb = df.groupby("key", dropna=False, sort=False)
+    gb = df.groupby("key", dropna=False, sort=False, as_index=as_index)
     if test_series:
         gb = gb["a"]
     result = gb.sum()
@@ -477,6 +473,10 @@ def test_no_sort_keep_na(request, sequence_index, dtype, test_series):
     expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
     if not test_series:
         expected = expected.to_frame()
+    if not as_index:
+        expected = expected.reset_index()
+        if dtype is not None and dtype.startswith("Sparse"):
+            expected["key"] = expected["key"].astype(dtype)
 
     tm.assert_equal(result, expected)
 
@@ -498,3 +498,183 @@ def test_null_is_null_for_dtype(
         tm.assert_series_equal(result, expected["a"])
     else:
         tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
+def test_categorical_reducers(
+    request, reduction_func, observed, sort, as_index, index_kind
+):
+    # GH#36327
+    if (
+        reduction_func in ("idxmin", "idxmax")
+        and not observed
+        and index_kind != "multi"
+    ):
+        msg = "GH#10694 - idxmin/max broken for categorical with observed=False"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+
+    # Ensure there is at least one null value by appending to the end
+    values = np.append(np.random.choice([1, 2, None], size=19), None)
+    df = pd.DataFrame(
+        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
+    )
+
+    # Strategy: Compare to dropna=True by filling null values with a new code
+    df_filled = df.copy()
+    df_filled["x"] = pd.Categorical(values, categories=[1, 2, 3, 4]).fillna(4)
+
+    if index_kind == "range":
+        keys = ["x"]
+    elif index_kind == "single":
+        keys = ["x"]
+        df = df.set_index("x")
+        df_filled = df_filled.set_index("x")
+    else:
+        keys = ["x", "x2"]
+        df["x2"] = df["x"]
+        df = df.set_index(["x", "x2"])
+        df_filled["x2"] = df_filled["x"]
+        df_filled = df_filled.set_index(["x", "x2"])
+    args = get_groupby_method_args(reduction_func, df)
+    args_filled = get_groupby_method_args(reduction_func, df_filled)
+
+    gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True)
+    expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index()
+    expected["x"] = expected["x"].replace(4, None)
+    if index_kind == "multi":
+        expected["x2"] = expected["x2"].replace(4, None)
+    if as_index:
+        if index_kind == "multi":
+            expected = expected.set_index(["x", "x2"])
+        else:
+            expected = expected.set_index("x")
+    else:
+        if index_kind != "range" and reduction_func != "size":
+            # size, unlike other methods, has the desired behavior in GH#49519
+            expected = expected.drop(columns="x")
+            if index_kind == "multi":
+                expected = expected.drop(columns="x2")
+    if reduction_func in ("idxmax", "idxmin") and index_kind != "range":
+        # expected was computed with a RangeIndex; need to translate to index values
+        values = expected["y"].values.tolist()
+        if index_kind == "single":
+            values = [np.nan if e == 4 else e for e in values]
+        else:
+            values = [(np.nan, np.nan) if e == (4, 4) else e for e in values]
+        expected["y"] = values
+    if reduction_func == "size":
+        # size, unlike other methods, has the desired behavior in GH#49519
+        expected = expected.rename(columns={0: "size"})
+        if as_index:
+            expected = expected["size"].rename(None)
+
+    gb_keepna = df.groupby(
+        keys, dropna=False, observed=observed, sort=sort, as_index=as_index
+    )
+    result = getattr(gb_keepna, reduction_func)(*args)
+
+    # size will return a Series, others are DataFrame
+    tm.assert_equal(result, expected)
+
+
+def test_categorical_transformers(
+    request, transformation_func, observed, sort, as_index
+):
+    # GH#36327
+    if transformation_func == "fillna":
+        msg = "GH#49651 fillna may incorrectly reorders results when dropna=False"
+        request.node.add_marker(pytest.mark.xfail(reason=msg, strict=False))
+
+    values = np.append(np.random.choice([1, 2, None], size=19), None)
+    df = pd.DataFrame(
+        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
+    )
+    args = get_groupby_method_args(transformation_func, df)
+
+    # Compute result for null group
+    null_group_values = df[df["x"].isnull()]["y"]
+    if transformation_func == "cumcount":
+        null_group_data = list(range(len(null_group_values)))
+    elif transformation_func == "ngroup":
+        if sort:
+            if observed:
+                na_group = df["x"].nunique(dropna=False) - 1
+            else:
+                # TODO: Should this be 3?
+                na_group = df["x"].nunique(dropna=False) - 1
+        else:
+            na_group = df.iloc[: null_group_values.index[0]]["x"].nunique()
+        null_group_data = len(null_group_values) * [na_group]
+    else:
+        null_group_data = getattr(null_group_values, transformation_func)(*args)
+    null_group_result = pd.DataFrame({"y": null_group_data})
+
+    gb_keepna = df.groupby(
+        "x", dropna=False, observed=observed, sort=sort, as_index=as_index
+    )
+    gb_dropna = df.groupby("x", dropna=True, observed=observed, sort=sort)
+    result = getattr(gb_keepna, transformation_func)(*args)
+    expected = getattr(gb_dropna, transformation_func)(*args)
+    for iloc, value in zip(
+        df[df["x"].isnull()].index.tolist(), null_group_result.values
+    ):
+        if expected.ndim == 1:
+            expected.iloc[iloc] = value
+        else:
+            expected.iloc[iloc, 0] = value
+    if transformation_func not in ("rank", "diff", "pct_change", "shift"):
+        expected = expected.astype(int)
+    if transformation_func == "ngroup":
+        expected[df["x"].notnull() & expected.ge(na_group)] += 1
+
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["head", "tail"])
+def test_categorical_head_tail(method, observed, sort, as_index):
+    # GH#36327
+    values = np.random.choice([1, 2, None], 30)
+    df = pd.DataFrame(
+        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
+    )
+    gb = df.groupby("x", dropna=False, observed=observed, sort=sort, as_index=as_index)
+    result = getattr(gb, method)()
+
+    if method == "tail":
+        values = values[::-1]
+    # Take the top 5 values from each group
+    mask = (
+        ((values == 1) & ((values == 1).cumsum() <= 5))
+        | ((values == 2) & ((values == 2).cumsum() <= 5))
+        # flake8 doesn't like the vectorized check for None, thinks we should use `is`
+        | ((values == None) & ((values == None).cumsum() <= 5))  # noqa: E711
+    )
+    if method == "tail":
+        mask = mask[::-1]
+    expected = df[mask]
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_agg():
+    # GH#36327
+    values = np.random.choice([1, 2, None], 30)
+    df = pd.DataFrame(
+        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
+    )
+    gb = df.groupby("x", dropna=False)
+    result = gb.agg(lambda x: x.sum())
+    expected = gb.sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_transform():
+    # GH#36327
+    values = np.random.choice([1, 2, None], 30)
+    df = pd.DataFrame(
+        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
+    )
+    gb = df.groupby("x", dropna=False)
+    result = gb.transform(lambda x: x.sum())
+    expected = gb.transform("sum")
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index b982d247c2707..df83a5e410e71 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -852,6 +852,14 @@ def test_unique_masked(self, any_numeric_ea_dtype):
         tm.assert_extension_array_equal(result, expected)
 
 
+def test_nunique_ints(index_or_series_or_array):
+    # GH#36327
+    values = index_or_series_or_array(np.random.randint(0, 20, 30))
+    result = algos.nunique_ints(values)
+    expected = len(algos.unique(values))
+    assert result == expected
+
+
 class TestIsin:
     def test_invalid(self):
 

From 20e17ab11d109587c7ccba3320209c1efd0c2feb Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sat, 12 Nov 2022 07:56:10 -0500
Subject: [PATCH 02/11] Use intp

---
 pandas/tests/groupby/test_groupby_dropna.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 41989dbb1604e..04d086d38b93f 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -623,7 +623,7 @@ def test_categorical_transformers(
         else:
             expected.iloc[iloc, 0] = value
     if transformation_func not in ("rank", "diff", "pct_change", "shift"):
-        expected = expected.astype(int)
+        expected = expected.astype("intp")
     if transformation_func == "ngroup":
         expected[df["x"].notnull() & expected.ge(na_group)] += 1
 

From 825461985e25d1d2209aa8d5b09e5a2212f1cfa3 Mon Sep 17 00:00:00 2001
From: richard <rhshadrach@gmail.com>
Date: Sat, 12 Nov 2022 23:54:41 -0500
Subject: [PATCH 03/11] Fixups

---
 pandas/core/algorithms.py      | 2 +-
 pandas/core/groupby/grouper.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 0ec71dc064b5f..178fe6f1c7b98 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -424,7 +424,7 @@ def nunique_ints(values: ArrayLike) -> int:
     """
     if len(values) == 0:
         return 0
-    values = _ensure_arraylike(values)
+    values = _ensure_data(values)
     result = (np.bincount(values.ravel()) != 0).sum()
     return result
 
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index f64edfccd15e4..672bd938144c6 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -621,7 +621,7 @@ def group_arraylike(self) -> ArrayLike:
             return self.result_index._values
 
         elif self._passed_categorical:
-            return self.group_index
+            return self.group_index._values
 
         return self._codes_and_uniques[1]
 

From f679fd736a96f3d23cacf72482a4174e83eac4dc Mon Sep 17 00:00:00 2001
From: richard <rhshadrach@gmail.com>
Date: Sun, 13 Nov 2022 00:02:33 -0500
Subject: [PATCH 04/11] Use intp

---
 pandas/tests/groupby/test_groupby_dropna.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 04d086d38b93f..e3229bdc52738 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -626,6 +626,7 @@ def test_categorical_transformers(
         expected = expected.astype("intp")
     if transformation_func == "ngroup":
         expected[df["x"].notnull() & expected.ge(na_group)] += 1
+        expected = expected.astype("intp")
 
     tm.assert_equal(result, expected)
 

From 34760ca27cc6c12a4db538e13f68b3e3a73ee8c8 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Tue, 15 Nov 2022 20:07:38 -0500
Subject: [PATCH 05/11] int64

---
 pandas/tests/groupby/test_groupby_dropna.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index e3229bdc52738..908639a252496 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -626,7 +626,7 @@ def test_categorical_transformers(
         expected = expected.astype("intp")
     if transformation_func == "ngroup":
         expected[df["x"].notnull() & expected.ge(na_group)] += 1
-        expected = expected.astype("intp")
+        expected = expected.astype("int64")
 
     tm.assert_equal(result, expected)
 

From 93f306c16cabea6987e7b2caff78750e136dacf2 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sat, 19 Nov 2022 12:26:44 -0500
Subject: [PATCH 06/11] dtype fix

---
 pandas/tests/groupby/test_groupby_dropna.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 908639a252496..9d9c900057f34 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -622,10 +622,9 @@ def test_categorical_transformers(
             expected.iloc[iloc] = value
         else:
             expected.iloc[iloc, 0] = value
-    if transformation_func not in ("rank", "diff", "pct_change", "shift"):
-        expected = expected.astype("intp")
     if transformation_func == "ngroup":
         expected[df["x"].notnull() & expected.ge(na_group)] += 1
+    if transformation_func not in ("rank", "diff", "pct_change", "shift"):
         expected = expected.astype("int64")
 
     tm.assert_equal(result, expected)

From f3a3ebba9519b5a7eb942c0f060a52a0bed72f5a Mon Sep 17 00:00:00 2001
From: richard <rhshadrach@gmail.com>
Date: Sat, 19 Nov 2022 20:07:22 -0500
Subject: [PATCH 07/11] Breakup op to debug on CI

---
 pandas/core/algorithms.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 76f07942f117c..762c0d89b5f8a 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -425,7 +425,11 @@ def nunique_ints(values: ArrayLike) -> int:
     if len(values) == 0:
         return 0
     values = _ensure_data(values)
-    result = (np.bincount(values.ravel()) != 0).sum()
+    a = values.ravel()
+    b = np.bincount(a)
+    c = b != 0
+    result = c.sum()
+    # result = (np.bincount(values.ravel()) != 0).sum()
     return result
 
 

From 4bfeaa1d7ccb2c3275cb909190ad433ed01fdbb2 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sun, 20 Nov 2022 08:05:48 -0500
Subject: [PATCH 08/11] Trying with intp

---
 pandas/core/algorithms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 76f07942f117c..b18f4a9d810ee 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -425,7 +425,7 @@ def nunique_ints(values: ArrayLike) -> int:
     if len(values) == 0:
         return 0
     values = _ensure_data(values)
-    result = (np.bincount(values.ravel()) != 0).sum()
+    result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
     return result
 
 

From 4d72402d74a7345df2957460d3d45da6e9dbaf41 Mon Sep 17 00:00:00 2001
From: richard <rhshadrach@gmail.com>
Date: Mon, 28 Nov 2022 21:01:58 -0500
Subject: [PATCH 09/11] Restore cache decorator

---
 pandas/core/groupby/grouper.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 672bd938144c6..ef2f53a905d3f 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -663,8 +663,7 @@ def group_index(self) -> Index:
                     )
         return Index._with_infer(uniques, name=self.name)
 
-    # @cache_readonly
-    @property
+    @cache_readonly
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
         if self._passed_categorical:
             # we make a CategoricalIndex out of the cat grouper

From 1e3bff33ba98deaefb20929ed7fcac9e6d52e80f Mon Sep 17 00:00:00 2001
From: richard <rhshadrach@gmail.com>
Date: Mon, 28 Nov 2022 21:41:57 -0500
Subject: [PATCH 10/11] Add bincount comment

---
 pandas/core/algorithms.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 397bac447c402..f40b5a094889c 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -425,6 +425,7 @@ def nunique_ints(values: ArrayLike) -> int:
     if len(values) == 0:
         return 0
     values = _ensure_data(values)
+    # bincount requires intp
     result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
     return result
 

From c8ba7adc4c603b627f0a6a391f1df157f08b6391 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Tue, 29 Nov 2022 20:47:16 -0500
Subject: [PATCH 11/11] Rework recoding logic

---
 pandas/core/groupby/grouper.py              | 32 +++++++++++----------
 pandas/tests/groupby/test_groupby_dropna.py |  4 +++
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 672bd938144c6..dacabf10cef35 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -684,21 +684,23 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
             uniques = Categorical.from_codes(
                 codes=ucodes, categories=categories, ordered=cat.ordered
             )
-            if not self._dropna and np.any(cat.codes < 0):
-                if self._sort:
-                    # Replace NA (negative) codes with `largest code + 1`
-                    na_code = len(categories)
-                    codes = np.where(cat.codes < 0, na_code, cat.codes)
-                else:
-                    # Insert NA code into the codes based on first appearance
-                    # A negative code must exist, no need to check codes[na_idx] < 0
-                    na_idx = (cat.codes < 0).argmax()
-                    # count number of unique codes that comes before the nan value
-                    na_code = algorithms.nunique_ints(cat.codes[:na_idx])
-                    codes = np.where(cat.codes >= na_code, cat.codes + 1, cat.codes)
-                    codes = np.where(codes < 0, na_code, codes)
-            else:
-                codes = cat.codes
+
+            codes = cat.codes
+            if not self._dropna:
+                na_mask = codes < 0
+                if np.any(na_mask):
+                    if self._sort:
+                        # Replace NA codes with `largest code + 1`
+                        na_code = len(categories)
+                        codes = np.where(na_mask, na_code, codes)
+                    else:
+                        # Insert NA code into the codes based on first appearance
+                        # A negative code must exist, no need to check codes[na_idx] < 0
+                        na_idx = na_mask.argmax()
+                        # count number of unique codes that come before the nan value
+                        na_code = algorithms.nunique_ints(codes[:na_idx])
+                        codes = np.where(codes >= na_code, codes + 1, codes)
+                        codes = np.where(na_mask, na_code, codes)
 
             if not self._observed:
                 uniques = uniques.reorder_categories(self._orig_cats)
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 9d9c900057f34..5418a2a60dc80 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -537,6 +537,10 @@ def test_categorical_reducers(
         df_filled = df_filled.set_index(["x", "x2"])
     args = get_groupby_method_args(reduction_func, df)
     args_filled = get_groupby_method_args(reduction_func, df_filled)
+    if reduction_func == "corrwith" and index_kind == "range":
+        # Don't include the grouping columns so we can call reset_index
+        args = (args[0].drop(columns=keys),)
+        args_filled = (args_filled[0].drop(columns=keys),)
 
     gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True)
     expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index()