Skip to content

CLN: Cleanups in groupby due to numeric_only deprecations #49761

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,12 +671,8 @@ class String:
["str", "string[python]"],
[
"sum",
"prod",
"min",
"max",
"mean",
"median",
"var",
"first",
"last",
"any",
Expand Down
78 changes: 28 additions & 50 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,7 +863,7 @@ def skew(
self,
axis: Axis | lib.NoDefault = lib.no_default,
skipna: bool = True,
numeric_only: bool | None = None,
numeric_only: bool = False,
**kwargs,
) -> Series:
result = self._op_via_apply(
Expand Down Expand Up @@ -1357,9 +1357,7 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:

# We could use `mgr.apply` here and not have to set_axis, but
# we would have to do shape gymnastics for ArrayManager compat
res_mgr = mgr.grouped_reduce(
arr_func, ignore_failures=numeric_only is lib.no_default
)
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=False)
res_mgr.set_axis(1, mgr.axes[1])

if len(res_mgr) < orig_mgr_len:
Expand Down Expand Up @@ -1785,84 +1783,64 @@ def nunique(self, dropna: bool = True) -> DataFrame:

@doc(
_shared_docs["idxmax"],
numeric_only_default="True for axis=0, False for axis=1",
numeric_only_default="False",
)
def idxmax(
self,
axis: Axis = 0,
skipna: bool = True,
numeric_only: bool | lib.NoDefault = lib.no_default,
numeric_only: bool = False,
) -> DataFrame:
axis = DataFrame._get_axis_number(axis)
if numeric_only is lib.no_default:
# Cannot use self._resolve_numeric_only; we must pass None to
# DataFrame.idxmax for backwards compatibility
numeric_only_arg = None if axis == 0 else False
else:
numeric_only_arg = numeric_only

def func(df):
with warnings.catch_warnings():
# Suppress numeric_only warnings here, will warn below
warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmax")
res = df._reduce(
nanops.nanargmax,
"argmax",
axis=axis,
skipna=skipna,
numeric_only=numeric_only_arg,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)
res = df._reduce(
nanops.nanargmax,
"argmax",
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)

func.__name__ = "idxmax"
result = self._python_apply_general(
func, self._obj_with_exclusions, not_indexed_same=True
)
self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only)
return result

@doc(
_shared_docs["idxmin"],
numeric_only_default="True for axis=0, False for axis=1",
numeric_only_default="False",
)
def idxmin(
self,
axis: Axis = 0,
skipna: bool = True,
numeric_only: bool | lib.NoDefault = lib.no_default,
numeric_only: bool = False,
) -> DataFrame:
axis = DataFrame._get_axis_number(axis)
if numeric_only is lib.no_default:
# Cannot use self._resolve_numeric_only; we must pass None to
# DataFrame.idxmin for backwards compatibility
numeric_only_arg = None if axis == 0 else False
else:
numeric_only_arg = numeric_only

def func(df):
with warnings.catch_warnings():
# Suppress numeric_only warnings here, will warn below
warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmin")
res = df._reduce(
nanops.nanargmin,
"argmin",
axis=axis,
skipna=skipna,
numeric_only=numeric_only_arg,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)
res = df._reduce(
nanops.nanargmin,
"argmin",
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)

func.__name__ = "idxmin"
result = self._python_apply_general(
func, self._obj_with_exclusions, not_indexed_same=True
)
self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only)
return result

boxplot = boxplot_frame_groupby
Expand Down
36 changes: 19 additions & 17 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,12 @@ class providing the base-class of operations.
Parameters
----------
numeric_only : bool, default {no}
Include only float, int, boolean columns. If None, will attempt to use
everything, then use only numeric data.
Include only float, int, boolean columns.

.. versionchanged:: 2.0.0

numeric_only no longer accepts ``None``.

min_count : int, default {mc}
The required number of valid values to perform the operation. If fewer
than ``min_count`` non-NA values are present the result will be NA.
Expand Down Expand Up @@ -1654,7 +1658,6 @@ def _agg_general(
alt=npfunc,
numeric_only=numeric_only,
min_count=min_count,
ignore_failures=numeric_only is lib.no_default,
)
return result.__finalize__(self.obj, method="groupby")

Expand Down Expand Up @@ -1705,7 +1708,6 @@ def _cython_agg_general(
alt: Callable,
numeric_only: bool | lib.NoDefault,
min_count: int = -1,
ignore_failures: bool = True,
**kwargs,
):
# Note: we never get here with how="ohlc" for DataFrameGroupBy;
Expand Down Expand Up @@ -1749,7 +1751,7 @@ def array_func(values: ArrayLike) -> ArrayLike:

# TypeError -> we may have an exception in trying to aggregate
# continue and exclude the block
new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)
new_mgr = data.grouped_reduce(array_func, ignore_failures=False)

if not is_ser and len(new_mgr) < orig_len:
warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)
Expand Down Expand Up @@ -2054,8 +2056,11 @@ def mean(
Parameters
----------
numeric_only : bool, default True
Include only float, int, boolean columns. If None, will attempt to use
everything, then use only numeric data.
Include only float, int, boolean columns.

.. versionchanged:: 2.0.0

numeric_only no longer accepts ``None``.

engine : str, default None
* ``'cython'`` : Runs the operation through C-extensions from cython.
Expand Down Expand Up @@ -2123,7 +2128,6 @@ def mean(
"mean",
alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool),
numeric_only=numeric_only,
ignore_failures=numeric_only is lib.no_default,
)
return result.__finalize__(self.obj, method="groupby")

Expand All @@ -2139,8 +2143,11 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
Parameters
----------
numeric_only : bool, default True
Include only float, int, boolean columns. If None, will attempt to use
everything, then use only numeric data.
Include only float, int, boolean columns.

.. versionchanged:: 2.0.0

numeric_only no longer accepts ``None``.

Returns
-------
Expand All @@ -2153,7 +2160,6 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
"median",
alt=lambda x: Series(x).median(numeric_only=numeric_only_bool),
numeric_only=numeric_only,
ignore_failures=numeric_only is lib.no_default,
)
return result.__finalize__(self.obj, method="groupby")

Expand Down Expand Up @@ -2287,7 +2293,6 @@ def var(
"var",
alt=lambda x: Series(x).var(ddof=ddof),
numeric_only=numeric_only,
ignore_failures=numeric_only is lib.no_default,
ddof=ddof,
)

Expand Down Expand Up @@ -3286,8 +3291,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
is_ser = obj.ndim == 1
mgr = self._get_data_to_aggregate()
data = mgr.get_numeric_data() if numeric_only_bool else mgr
ignore_failures = numeric_only_bool
res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)
res_mgr = data.grouped_reduce(blk_func, ignore_failures=False)

if (
numeric_only is lib.no_default
Expand Down Expand Up @@ -3765,9 +3769,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
if numeric_only_bool:
mgr = mgr.get_numeric_data()

res_mgr = mgr.grouped_reduce(
blk_func, ignore_failures=numeric_only is lib.no_default
)
res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=False)

if not is_ser and len(res_mgr.items) != orig_mgr_len:
howstr = how.replace("group_", "")
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,9 @@ def test_cython_agg_nothing_to_agg():

frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})

with tm.assert_produces_warning(FutureWarning):
result = frame[["b"]].groupby(frame["a"]).mean()
with pytest.raises(TypeError, match="Could not convert"):
frame[["b"]].groupby(frame["a"]).mean()
result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates())
tm.assert_frame_equal(result, expected)

Expand Down
23 changes: 10 additions & 13 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,7 @@ def test_basic(): # TODO: split this test
gb = df.groupby("A", observed=False)
exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
msg = "The default value of numeric_only"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = gb.sum()
result = gb.sum()
tm.assert_frame_equal(result, expected)

# GH 8623
Expand Down Expand Up @@ -857,12 +855,14 @@ def test_preserve_categorical_dtype():
}
)
for col in ["C1", "C2"]:
msg = "The default value of numeric_only"
with tm.assert_produces_warning(FutureWarning, match=msg):
result1 = df.groupby(by=col, as_index=False, observed=False).mean()
result2 = (
df.groupby(by=col, as_index=True, observed=False).mean().reset_index()
)
result1 = df.groupby(by=col, as_index=False, observed=False).mean(
numeric_only=True
)
result2 = (
df.groupby(by=col, as_index=True, observed=False)
.mean(numeric_only=True)
.reset_index()
)
expected = exp_full.reindex(columns=result1.columns)
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)
Expand Down Expand Up @@ -1856,10 +1856,7 @@ def test_category_order_reducer(
df = df.set_index(keys)
args = get_groupby_method_args(reduction_func, df)
gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
msg = "is deprecated and will be removed in a future version"
warn = FutureWarning if reduction_func == "mad" else None
with tm.assert_produces_warning(warn, match=msg):
op_result = getattr(gb, reduction_func)(*args)
op_result = getattr(gb, reduction_func)(*args)
if as_index:
result = op_result.index.get_level_values("a").categories
else:
Expand Down