BUG: Fix remaining cases of groupby(...).transform with dropna=True #46367


Merged: 24 commits, Mar 22, 2022. Changes shown from 17 commits.

Commits:
- 2143c59: BUG: Correct results for groupby(...).transform with null keys (rhshadrach, Feb 4, 2022)
- deb3edb: Merge branch 'main' of https://github.com/pandas-dev/pandas into tran… (rhshadrach, Feb 5, 2022)
- 04a2a6b: Series tests (rhshadrach, Feb 9, 2022)
- ad7bf3e: Remove reliance on Series (rhshadrach, Feb 10, 2022)
- 39e1438: Merge branch 'main' of https://github.com/pandas-dev/pandas into tran… (rhshadrach, Feb 10, 2022)
- 7a4a99b: Merge branch 'main' of https://github.com/pandas-dev/pandas into tran… (rhshadrach, Feb 11, 2022)
- f234a89: Merge branch 'main' of https://github.com/pandas-dev/pandas into tran… (rhshadrach, Feb 28, 2022)
- d1eeb65: Merge cleanup - WIP (rhshadrach, Feb 28, 2022)
- e57025d: Merge branch 'transform_dropna' of https://github.com/rhshadrach/pand… (rhshadrach, Mar 2, 2022)
- be2f45c: Fixes for ngroup (rhshadrach, Mar 2, 2022)
- 8e5df01: Merge branch 'main' of https://github.com/pandas-dev/pandas into tran… (rhshadrach, Mar 2, 2022)
- 2d79a0f: name in some cases (rhshadrach, Mar 2, 2022)
- e2f3080: Merge branch 'main' of https://github.com/pandas-dev/pandas into tran… (rhshadrach, Mar 15, 2022)
- d9da864: Fixups for ngroup (rhshadrach, Mar 15, 2022)
- b973af0: Fixups (rhshadrach, Mar 15, 2022)
- 67ffb60: Fixups (rhshadrach, Mar 15, 2022)
- e31041d: Fixups (rhshadrach, Mar 15, 2022)
- 60e60b0: Merge branch 'main' of https://github.com/pandas-dev/pandas into tran… (rhshadrach, Mar 18, 2022)
- 04293a7: PR feedback (rhshadrach, Mar 18, 2022)
- 9a6dc3c: whatsnew improvements (rhshadrach, Mar 19, 2022)
- d7bb828: Merge branch 'main' of https://github.com/pandas-dev/pandas into tran… (rhshadrach, Mar 19, 2022)
- 6571291: merge cleanup (rhshadrach, Mar 19, 2022)
- a8f524b: type-hint fixup (rhshadrach, Mar 22, 2022)
- 2a02de0: Merge branch 'main' of https://github.com/pandas-dev/pandas into tran… (rhshadrach, Mar 22, 2022)
24 changes: 16 additions & 8 deletions doc/source/whatsnew/v1.5.0.rst
@@ -82,32 +82,40 @@ did not have the same index as the input.

.. code-block:: ipython

In [3]: df.groupby('a', dropna=True).transform('sum')
Review (Contributor): maybe be worth commenting on each of these what is changing (e.g. like you did for [3])

Out[3]:
b
0 5
1 5
2 5

In [3]: df.groupby('a', dropna=True).transform(lambda x: x.sum())
Out[3]:
b
0 5
1 5

In [3]: df.groupby('a', dropna=True).transform('ffill')
Review (Contributor): add a comment here on the casting of the fill value

Review (Member Author): I think you're saying to explain why the old result comes out as -9223372036854775808, which is np.nan when interpreted as an integer. Will do.

Out[3]:
b
0 2
1 3
2 -9223372036854775808

In [3]: df.groupby('a', dropna=True).transform(lambda x: x)
Out[3]:
b
0 2
1 3

In [3]: df.groupby('a', dropna=True).transform('sum')
Out[3]:
b
0 5
1 5
2 5

*New behavior*:

.. ipython:: python

df.groupby('a', dropna=True).transform('sum')
df.groupby('a', dropna=True).transform(lambda x: x.sum())
df.groupby('a', dropna=True).transform('ffill')
df.groupby('a', dropna=True).transform(lambda x: x)
df.groupby('a', dropna=True).transform('sum')

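The behavior change can be reproduced with a small frame (a hypothetical reconstruction; the actual `df` is constructed earlier in the whatsnew entry, and pandas >= 1.5 is assumed):

```python
import numpy as np
import pandas as pd

# Illustrative frame consistent with the outputs quoted above:
# the last row's group key is null and is dropped by dropna=True.
df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4]})

result = df.groupby("a", dropna=True).transform("sum")
# New behavior: the null-key row comes back as NaN rather than a filled value
print(result)
```

Before this fix, row 2 was filled with the group aggregate (or an integer sentinel for ffill), as shown in the old-behavior blocks above.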
.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:

2 changes: 1 addition & 1 deletion pandas/core/groupby/base.py
@@ -70,7 +70,6 @@ class OutputKey:
"mean",
"median",
"min",
"ngroup",
"nth",
"nunique",
"prod",
@@ -113,6 +112,7 @@ def maybe_normalize_deprecated_kernels(kernel):
"diff",
"ffill",
"fillna",
"ngroup",
"pad",
"pct_change",
"rank",
30 changes: 25 additions & 5 deletions pandas/core/groupby/groupby.py
@@ -62,6 +62,7 @@ class providing the base-class of operations.
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
from pandas.core.dtypes.common import (
is_bool_dtype,
is_datetime64_dtype,
@@ -950,7 +951,18 @@ def curried(x):
if name in base.plotting_methods:
return self.apply(curried)

return self._python_apply_general(curried, self._obj_with_exclusions)
result = self._python_apply_general(curried, self._obj_with_exclusions)

if result.ndim == 1 and self.obj.ndim == 1 and result.name != self.obj.name:
Review (Member): result.name != self.obj.name will be wrong with np.nan (and will raise with pd.NA)

Review (Member Author, rhshadrach, Mar 18, 2022): Doh, thanks. This highlights the fact that we shouldn't be fixing this here, but rather the logic in wrapping the apply results. I didn't want this PR spreading out to that method though. I've opened #46369 for this; I'm thinking here we ignore testing the name, and fix properly.

# apply sets the name on each group as the key; if there is one
# group this name will come through on Series results
result.name = None

if self.grouper.has_dropped_na and name in base.transformation_kernels:
# result will have dropped rows due to nans, fill with null
# and ensure index is ordered same as the input
result = self._set_result_index_ordered(result)
return result

wrapper.__name__ = name
return wrapper
@@ -2608,7 +2620,11 @@ def blk_func(values: ArrayLike) -> ArrayLike:
# then there will be no -1s in indexer, so we can use
# the original dtype (no need to ensure_dtype_can_hold_na)
if isinstance(values, np.ndarray):
out = np.empty(values.shape, dtype=values.dtype)
dtype = values.dtype
if self.grouper.has_dropped_na:
# dropped null groups give rise to nan in the result
dtype = ensure_dtype_can_hold_na(values.dtype)
out = np.empty(values.shape, dtype=dtype)
else:
out = type(values)._empty(values.shape, dtype=values.dtype)
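The widening in the branch above matters because NumPy integer and boolean arrays have no representation for NaN. A simplified sketch of what a helper like `ensure_dtype_can_hold_na` does (the real pandas helper also handles datetime-like and extension dtypes; this is only an illustration):

```python
import numpy as np

def ensure_dtype_can_hold_na_sketch(dtype: np.dtype) -> np.dtype:
    # Integer, unsigned, and boolean arrays cannot hold NaN, so any result
    # that may contain missing values must be widened to float64.
    if dtype.kind in "iub":
        return np.dtype(np.float64)
    return dtype

print(ensure_dtype_can_hold_na_sketch(np.dtype(np.int64)))    # float64
print(ensure_dtype_can_hold_na_sketch(np.dtype(np.float32)))  # float32
```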

@@ -3114,9 +3130,13 @@ def ngroup(self, ascending: bool = True):
"""
with self._group_selection_context():
index = self._selected_obj.index
result = self._obj_1d_constructor(
self.grouper.group_info[0], index, dtype=np.int64
)
comp_ids = self.grouper.group_info[0]
if self.grouper.has_dropped_na:
comp_ids = np.where(comp_ids == -1, np.nan, comp_ids)
dtype = np.float64
else:
dtype = np.int64
result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
if not ascending:
result = self.ngroups - 1 - result
return result
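With the change above, `ngroup` over a grouper that dropped null keys returns float64 so the dropped rows can hold NaN (illustrative frame, pandas >= 1.5 assumed):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4]})

result = df.groupby("a", dropna=True).ngroup()
# Rows with a null key no longer get a spurious integer group number
print(result)        # 0.0, 0.0, NaN
print(result.dtype)  # float64
```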
22 changes: 19 additions & 3 deletions pandas/core/groupby/ops.py
@@ -37,6 +37,7 @@
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.cast import (
ensure_dtype_can_hold_na,
maybe_cast_pointwise_result,
maybe_downcast_to_dtype,
)
@@ -104,7 +105,8 @@ class WrappedCythonOp:
# back to the original dtype.
cast_blocklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"])

def __init__(self, kind: str, how: str):
def __init__(self, grouper: BaseGrouper, kind: str, how: str):
self.grouper = grouper
self.kind = kind
self.how = how

@@ -194,7 +196,9 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
values = ensure_float64(values)

elif values.dtype.kind in ["i", "u"]:
if how in ["add", "var", "prod", "mean", "ohlc"]:
if how in ["add", "var", "prod", "mean", "ohlc"] or (
self.kind == "transform" and self.grouper.has_dropped_na
):
# result may still include NaN, so we have to cast
values = ensure_float64(values)
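The extra condition above casts integer inputs to float64 for transform kernels whenever the grouper dropped null keys; the effect is visible with a cumulative kernel (illustrative frame, pandas >= 1.5 assumed):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4]})

# int64 input comes back as float64 so the dropped row can hold NaN
result = df.groupby("a", dropna=True)["b"].transform("cumsum")
print(result)  # 2.0, 5.0, NaN
```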

@@ -260,6 +264,9 @@ def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape:
def _get_out_dtype(self, dtype: np.dtype) -> np.dtype:
how = self.how

if self.kind == "transform" and self.grouper.has_dropped_na:
dtype = ensure_dtype_can_hold_na(dtype)

Review (Member): ATM we do something similar to this L578-L591. could this be rolled into that? (admittedly thats a bit messy so this may just b A Better Solution)

Review (Member Author): It seems better to me to determine the result dtype upfront, rather than cast after, as much as possible.

Review (Member): i dont have an opinion on before vs after so am happy to defer to you, but do strongly prefer doing things Just One Way.

Review (Member Author): Makes sense - I didn't realize that my change to _get_cython_vals actually makes values float, and so this is indeed not necessary at all.

if how == "rank":
out_dtype = "float64"
else:
@@ -462,6 +469,11 @@ def _cython_op_ndim_compat(
# otherwise we have OHLC
return res.T

if self.kind == "transform" and self.grouper.has_dropped_na:
mask = comp_ids == -1
# make mask 2d
Review (Member): can this be rolled into the mask-reshaping done above?

Review (Member Author, rhshadrach, Mar 16, 2022): This block turned out to not be necessary. It will be removed.

mask = mask[None, :]

return self._call_cython_op(
values,
min_count=min_count,
@@ -592,6 +604,10 @@ def _call_cython_op(

result = result.T

if self.how == "rank" and self.grouper.has_dropped_na:
# TODO: Wouldn't need this if group_rank supported mask
Review (Member): would this be a matter of supporting a mask arg in the libgroupby function, or would this be just for Nullable dtype? bc i have a branch on deck that does the former

Review (Member Author): The former - I believe rank had to be singled out here because it was the one transform that didn't support a mask arg.

Review (Member): @rhshadrach mask just got added to group_rank, but commenting this out causes a few failures. is more needed?

Review (Member Author): This comment was incorrect; opened #46953

result = np.where(comp_ids < 0, np.nan, result)

if self.how not in self.cast_blocklist:
# e.g. if we are int64 and need to restore to datetime64/timedelta64
# "rank" is the only member of cast_blocklist we get here
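The `np.where` masking above is what gives grouped rank NaN output for rows whose key was dropped (illustrative frame, pandas >= 1.5 assumed):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4]})

# Ranks are computed within the a == 1 group; the null-key row is masked out
result = df.groupby("a", dropna=True)["b"].rank()
print(result)  # 1.0, 2.0, NaN
```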
@@ -969,7 +985,7 @@ def _cython_operation(
"""
assert kind in ["transform", "aggregate"]

cy_op = WrappedCythonOp(kind=kind, how=how)
cy_op = WrappedCythonOp(grouper=self, kind=kind, how=how)
Review (Member Author): @jbrockmendel - It would suffice to pass self.has_dropped_na instead of the full grouper here; wasn't sure if there was a preference one way or the other.

Review (Member): I'd much rather just have has_dropped_na be part of WrappedCythonOp's state


ids, _, _ = self.group_info
ngroups = self.ngroups
11 changes: 10 additions & 1 deletion pandas/tests/apply/test_frame_transform.py
@@ -139,6 +139,10 @@ def test_transform_bad_dtype(op, frame_or_series, request):
raises=ValueError, reason="GH 40418: rank does not raise a TypeError"
)
)
elif op == "ngroup":
request.node.add_marker(
pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
)

obj = DataFrame({"A": 3 * [object]}) # DataFrame that will fail on most transforms
obj = tm.get_obj(obj, frame_or_series)
@@ -157,9 +161,14 @@ def test_transform_bad_dtype(op, frame_or_series, request):


@pytest.mark.parametrize("op", frame_kernels_raise)
def test_transform_partial_failure_typeerror(op):
def test_transform_partial_failure_typeerror(request, op):
# GH 35964

if op == "ngroup":
request.node.add_marker(
pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
)

# Using object makes most transform kernels fail
df = DataFrame({"A": 3 * [object], "B": [1, 2, 3]})

14 changes: 12 additions & 2 deletions pandas/tests/apply/test_str.py
@@ -243,8 +243,12 @@ def test_agg_cython_table_transform_frame(df, func, expected, axis):


@pytest.mark.parametrize("op", series_transform_kernels)
def test_transform_groupby_kernel_series(string_series, op):
def test_transform_groupby_kernel_series(request, string_series, op):
# GH 35964
if op == "ngroup":
request.node.add_marker(
pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
)
# TODO(2.0) Remove after pad/backfill deprecation enforced
op = maybe_normalize_deprecated_kernels(op)
args = [0.0] if op == "fillna" else []
@@ -255,9 +259,15 @@ def test_transform_groupby_kernel_series(string_series, op):


@pytest.mark.parametrize("op", frame_transform_kernels)
def test_transform_groupby_kernel_frame(axis, float_frame, op):
def test_transform_groupby_kernel_frame(request, axis, float_frame, op):
# TODO(2.0) Remove after pad/backfill deprecation enforced
op = maybe_normalize_deprecated_kernels(op)

if op == "ngroup":
request.node.add_marker(
pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
)

# GH 35964

args = [0.0] if op == "fillna" else []
4 changes: 3 additions & 1 deletion pandas/tests/groupby/test_rank.py
@@ -658,6 +658,8 @@ def test_non_unique_index():
)
result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
expected = Series(
[1.0] * 4, index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, name="value"
[1.0, 1.0, 1.0, np.nan],
index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
name="value",
)
tm.assert_series_equal(result, expected)