Skip to content

BUG: GroupBy.cummin altering nullable arrays inplace #46220

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits on
Mar 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -429,8 +429,9 @@ Groupby/resample/rolling
- Bug in :meth:`.ExponentialMovingWindow.mean` with ``axis=1`` and ``engine='numba'`` when the :class:`DataFrame` has more columns than rows (:issue:`46086`)
- Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`)
- Bug in :meth:`.DataFrameGroupby.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`)
- Bug in :meth:`DataFrameGroupby.cumsum` with ``skipna=False`` giving incorrect results (:issue:`??`)
- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`??`)
- Bug in :meth:`DataFrameGroupby.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`)
- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`)
- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` with nullable dtypes incorrectly altering the original data in place (:issue:`46220`)
-

Reshaping
Expand Down
48 changes: 28 additions & 20 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1477,7 +1477,8 @@ def group_min(
cdef group_cummin_max(
iu_64_floating_t[:, ::1] out,
ndarray[iu_64_floating_t, ndim=2] values,
uint8_t[:, ::1] mask,
const uint8_t[:, ::1] mask,
uint8_t[:, ::1] result_mask,
const intp_t[::1] labels,
int ngroups,
bint is_datetimelike,
Expand All @@ -1496,6 +1497,9 @@ cdef group_cummin_max(
mask : np.ndarray[bool] or None
If not None, indices represent missing values,
otherwise the mask will not be used
result_mask : ndarray[bool, ndim=2], optional
If not None, these specify locations in the output that are NA.
Modified in-place.
labels : np.ndarray[np.intp]
Labels to group by.
ngroups : int
Expand Down Expand Up @@ -1558,7 +1562,7 @@ cdef group_cummin_max(

if not skipna and na_possible and seen_na[lab, j]:
if uses_mask:
mask[i, j] = 1 # FIXME: shouldn't alter inplace
result_mask[i, j] = 1
# Set to 0 ensures that we are deterministic and can
# downcast if appropriate
out[i, j] = 0
Expand Down Expand Up @@ -1595,19 +1599,21 @@ def group_cummin(
const intp_t[::1] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
values,
mask,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=False
out=out,
values=values,
mask=mask,
result_mask=result_mask,
labels=labels,
ngroups=ngroups,
is_datetimelike=is_datetimelike,
skipna=skipna,
compute_max=False,
)


Expand All @@ -1619,17 +1625,19 @@ def group_cummax(
const intp_t[::1] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
values,
mask,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=True
out=out,
values=values,
mask=mask,
result_mask=result_mask,
labels=labels,
ngroups=ngroups,
is_datetimelike=is_datetimelike,
skipna=skipna,
compute_max=True,
)
18 changes: 9 additions & 9 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,9 +418,13 @@ def _masked_ea_wrap_cython_operation(
"""
orig_values = values

# Copy to ensure input and result masks don't end up shared
mask = values._mask.copy()
result_mask = np.zeros(ngroups, dtype=bool)
# libgroupby functions are responsible for NOT altering mask
mask = values._mask
if self.kind != "aggregate":
result_mask = mask.copy()
else:
result_mask = np.zeros(ngroups, dtype=bool)

arr = values._data

res_values = self._cython_op_ndim_compat(
Expand All @@ -438,12 +442,7 @@ def _masked_ea_wrap_cython_operation(
# dtype; last attempt ran into trouble on 32bit linux build
res_values = res_values.astype(dtype.type, copy=False)

if self.kind != "aggregate":
out_mask = mask
else:
out_mask = result_mask

return orig_values._maybe_mask_result(res_values, out_mask)
return orig_values._maybe_mask_result(res_values, result_mask)

@final
def _cython_op_ndim_compat(
Expand Down Expand Up @@ -579,6 +578,7 @@ def _call_cython_op(
ngroups=ngroups,
is_datetimelike=is_datetimelike,
mask=mask,
result_mask=result_mask,
**kwargs,
)
else:
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,11 +843,15 @@ def test_cummax(dtypes_for_minmax):
def test_cummin_max_skipna(method, dtype, groups, expected_data):
# GH-34047
df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
orig = df.copy()
gb = df.groupby(groups)["a"]

result = getattr(gb, method)(skipna=False)
expected = Series(expected_data, dtype=dtype, name="a")

# check we didn't accidentally alter df
tm.assert_frame_equal(df, orig)

tm.assert_series_equal(result, expected)


Expand Down