diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 52c8b82ab47f9..86c44685e7df9 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -429,8 +429,9 @@ Groupby/resample/rolling - Bug in :meth:`.ExponentialMovingWindow.mean` with ``axis=1`` and ``engine='numba'`` when the :class:`DataFrame` has more columns than rows (:issue:`46086`) - Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`) - Bug in :meth:`.DataFrameGroupby.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`) -- Bug in :meth:`DataFrameGroupby.cumsum` with ``skipna=False`` giving incorrect results (:issue:`??`) -- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`??`) +- Bug in :meth:`DataFrameGroupby.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`) +- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`) +- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` with nullable dtypes incorrectly altering the original data in place (:issue:`46220`) - Reshaping diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 05023028dccb3..f751d57c186b0 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1477,7 +1477,8 @@ def group_min( cdef group_cummin_max( iu_64_floating_t[:, ::1] out, ndarray[iu_64_floating_t, ndim=2] values, - uint8_t[:, ::1] mask, + const uint8_t[:, ::1] mask, + uint8_t[:, ::1] result_mask, const intp_t[::1] labels, int ngroups, bint is_datetimelike, @@ -1496,6 +1497,9 @@ cdef group_cummin_max( mask : np.ndarray[bool] or None If not None, indices represent missing values, otherwise the mask will not be used + result_mask : ndarray[bool, ndim=2], optional + If not None, these specify locations in the output that are NA. + Modified in-place. labels : np.ndarray[np.intp] Labels to group by. ngroups : int @@ -1558,7 +1562,7 @@ cdef group_cummin_max( if not skipna and na_possible and seen_na[lab, j]: if uses_mask: - mask[i, j] = 1 # FIXME: shouldn't alter inplace + result_mask[i, j] = 1 # Set to 0 ensures that we are deterministic and can # downcast if appropriate out[i, j] = 0 @@ -1595,19 +1599,21 @@ def group_cummin( const intp_t[::1] labels, int ngroups, bint is_datetimelike, - uint8_t[:, ::1] mask=None, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: """See group_cummin_max.__doc__""" group_cummin_max( - out, - values, - mask, - labels, - ngroups, - is_datetimelike, - skipna, - compute_max=False + out=out, + values=values, + mask=mask, + result_mask=result_mask, + labels=labels, + ngroups=ngroups, + is_datetimelike=is_datetimelike, + skipna=skipna, + compute_max=False, ) @@ -1619,17 +1625,19 @@ def group_cummax( const intp_t[::1] labels, int ngroups, bint is_datetimelike, - uint8_t[:, ::1] mask=None, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: """See group_cummin_max.__doc__""" group_cummin_max( - out, - values, - mask, - labels, - ngroups, - is_datetimelike, - skipna, - compute_max=True + out=out, + values=values, + mask=mask, + result_mask=result_mask, + labels=labels, + ngroups=ngroups, + is_datetimelike=is_datetimelike, + skipna=skipna, + compute_max=True, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e984279c17a65..7a25a6b9008dd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -418,9 +418,13 @@ def _masked_ea_wrap_cython_operation( """ orig_values = values - # Copy to ensure input and result masks don't end up shared - mask = values._mask.copy() - result_mask = np.zeros(ngroups, dtype=bool) + # libgroupby functions are responsible for NOT altering mask + mask = values._mask + if self.kind != "aggregate": + result_mask = mask.copy() + else: + result_mask = np.zeros(ngroups, dtype=bool) + arr = values._data res_values = self._cython_op_ndim_compat( @@ -438,12 +442,7 @@ def _masked_ea_wrap_cython_operation( # dtype; last attempt ran into trouble on 32bit linux build res_values = res_values.astype(dtype.type, copy=False) - if self.kind != "aggregate": - out_mask = mask - else: - out_mask = result_mask - - return orig_values._maybe_mask_result(res_values, out_mask) + return orig_values._maybe_mask_result(res_values, result_mask) @final def _cython_op_ndim_compat( @@ -579,6 +578,7 @@ def _call_cython_op( ngroups=ngroups, is_datetimelike=is_datetimelike, mask=mask, + result_mask=result_mask, **kwargs, ) else: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1555e9d02c8ca..ec7f3200b682b 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -843,11 +843,15 @@ def test_cummax(dtypes_for_minmax): def test_cummin_max_skipna(method, dtype, groups, expected_data): # GH-34047 df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) + orig = df.copy() gb = df.groupby(groups)["a"] result = getattr(gb, method)(skipna=False) expected = Series(expected_data, dtype=dtype, name="a") + # check we didn't accidentally alter df + tm.assert_frame_equal(df, orig) + tm.assert_series_equal(result, expected)