Skip to content

Commit 2f751ad

Browse files
authored
CLN: Cleanups in groupby due to numeric_only deprecations (#49761)
* DEPR: Enforce deprecation of dropping columns when numeric_only=False in groupby / resample
* Change to TypeError
* Better error message
* WIP
* WIP
* CLN: Cleanups in groupby due to numeric_only deprecations
* revert
* Remove ops from groupby.String
* fixup
* fixup
1 parent 025fbd0 commit 2f751ad

File tree

5 files changed: 60 additions (+60) and 86 deletions (-86).

asv_bench/benchmarks/groupby.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -671,12 +671,8 @@ class String:
671671
["str", "string[python]"],
672672
[
673673
"sum",
674-
"prod",
675674
"min",
676675
"max",
677-
"mean",
678-
"median",
679-
"var",
680676
"first",
681677
"last",
682678
"any",

pandas/core/groupby/generic.py

Lines changed: 28 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -863,7 +863,7 @@ def skew(
863863
self,
864864
axis: Axis | lib.NoDefault = lib.no_default,
865865
skipna: bool = True,
866-
numeric_only: bool | None = None,
866+
numeric_only: bool = False,
867867
**kwargs,
868868
) -> Series:
869869
result = self._op_via_apply(
@@ -1357,9 +1357,7 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
13571357

13581358
# We could use `mgr.apply` here and not have to set_axis, but
13591359
# we would have to do shape gymnastics for ArrayManager compat
1360-
res_mgr = mgr.grouped_reduce(
1361-
arr_func, ignore_failures=numeric_only is lib.no_default
1362-
)
1360+
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=False)
13631361
res_mgr.set_axis(1, mgr.axes[1])
13641362

13651363
if len(res_mgr) < orig_mgr_len:
@@ -1785,84 +1783,64 @@ def nunique(self, dropna: bool = True) -> DataFrame:
17851783

17861784
@doc(
17871785
_shared_docs["idxmax"],
1788-
numeric_only_default="True for axis=0, False for axis=1",
1786+
numeric_only_default="False",
17891787
)
17901788
def idxmax(
17911789
self,
17921790
axis: Axis = 0,
17931791
skipna: bool = True,
1794-
numeric_only: bool | lib.NoDefault = lib.no_default,
1792+
numeric_only: bool = False,
17951793
) -> DataFrame:
17961794
axis = DataFrame._get_axis_number(axis)
1797-
if numeric_only is lib.no_default:
1798-
# Cannot use self._resolve_numeric_only; we must pass None to
1799-
# DataFrame.idxmax for backwards compatibility
1800-
numeric_only_arg = None if axis == 0 else False
1801-
else:
1802-
numeric_only_arg = numeric_only
18031795

18041796
def func(df):
1805-
with warnings.catch_warnings():
1806-
# Suppress numeric_only warnings here, will warn below
1807-
warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmax")
1808-
res = df._reduce(
1809-
nanops.nanargmax,
1810-
"argmax",
1811-
axis=axis,
1812-
skipna=skipna,
1813-
numeric_only=numeric_only_arg,
1814-
)
1815-
indices = res._values
1816-
index = df._get_axis(axis)
1817-
result = [index[i] if i >= 0 else np.nan for i in indices]
1818-
return df._constructor_sliced(result, index=res.index)
1797+
res = df._reduce(
1798+
nanops.nanargmax,
1799+
"argmax",
1800+
axis=axis,
1801+
skipna=skipna,
1802+
numeric_only=numeric_only,
1803+
)
1804+
indices = res._values
1805+
index = df._get_axis(axis)
1806+
result = [index[i] if i >= 0 else np.nan for i in indices]
1807+
return df._constructor_sliced(result, index=res.index)
18191808

18201809
func.__name__ = "idxmax"
18211810
result = self._python_apply_general(
18221811
func, self._obj_with_exclusions, not_indexed_same=True
18231812
)
1824-
self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only)
18251813
return result
18261814

18271815
@doc(
18281816
_shared_docs["idxmin"],
1829-
numeric_only_default="True for axis=0, False for axis=1",
1817+
numeric_only_default="False",
18301818
)
18311819
def idxmin(
18321820
self,
18331821
axis: Axis = 0,
18341822
skipna: bool = True,
1835-
numeric_only: bool | lib.NoDefault = lib.no_default,
1823+
numeric_only: bool = False,
18361824
) -> DataFrame:
18371825
axis = DataFrame._get_axis_number(axis)
1838-
if numeric_only is lib.no_default:
1839-
# Cannot use self._resolve_numeric_only; we must pass None to
1840-
# DataFrame.idxmin for backwards compatibility
1841-
numeric_only_arg = None if axis == 0 else False
1842-
else:
1843-
numeric_only_arg = numeric_only
18441826

18451827
def func(df):
1846-
with warnings.catch_warnings():
1847-
# Suppress numeric_only warnings here, will warn below
1848-
warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmin")
1849-
res = df._reduce(
1850-
nanops.nanargmin,
1851-
"argmin",
1852-
axis=axis,
1853-
skipna=skipna,
1854-
numeric_only=numeric_only_arg,
1855-
)
1856-
indices = res._values
1857-
index = df._get_axis(axis)
1858-
result = [index[i] if i >= 0 else np.nan for i in indices]
1859-
return df._constructor_sliced(result, index=res.index)
1828+
res = df._reduce(
1829+
nanops.nanargmin,
1830+
"argmin",
1831+
axis=axis,
1832+
skipna=skipna,
1833+
numeric_only=numeric_only,
1834+
)
1835+
indices = res._values
1836+
index = df._get_axis(axis)
1837+
result = [index[i] if i >= 0 else np.nan for i in indices]
1838+
return df._constructor_sliced(result, index=res.index)
18601839

18611840
func.__name__ = "idxmin"
18621841
result = self._python_apply_general(
18631842
func, self._obj_with_exclusions, not_indexed_same=True
18641843
)
1865-
self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only)
18661844
return result
18671845

18681846
boxplot = boxplot_frame_groupby

pandas/core/groupby/groupby.py

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -324,8 +324,12 @@ class providing the base-class of operations.
324324
Parameters
325325
----------
326326
numeric_only : bool, default {no}
327-
Include only float, int, boolean columns. If None, will attempt to use
328-
everything, then use only numeric data.
327+
Include only float, int, boolean columns.
328+
329+
.. versionchanged:: 2.0.0
330+
331+
numeric_only no longer accepts ``None``.
332+
329333
min_count : int, default {mc}
330334
The required number of valid values to perform the operation. If fewer
331335
than ``min_count`` non-NA values are present the result will be NA.
@@ -1654,7 +1658,6 @@ def _agg_general(
16541658
alt=npfunc,
16551659
numeric_only=numeric_only,
16561660
min_count=min_count,
1657-
ignore_failures=numeric_only is lib.no_default,
16581661
)
16591662
return result.__finalize__(self.obj, method="groupby")
16601663

@@ -1705,7 +1708,6 @@ def _cython_agg_general(
17051708
alt: Callable,
17061709
numeric_only: bool | lib.NoDefault,
17071710
min_count: int = -1,
1708-
ignore_failures: bool = True,
17091711
**kwargs,
17101712
):
17111713
# Note: we never get here with how="ohlc" for DataFrameGroupBy;
@@ -1749,7 +1751,7 @@ def array_func(values: ArrayLike) -> ArrayLike:
17491751

17501752
# TypeError -> we may have an exception in trying to aggregate
17511753
# continue and exclude the block
1752-
new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)
1754+
new_mgr = data.grouped_reduce(array_func, ignore_failures=False)
17531755

17541756
if not is_ser and len(new_mgr) < orig_len:
17551757
warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)
@@ -2054,8 +2056,11 @@ def mean(
20542056
Parameters
20552057
----------
20562058
numeric_only : bool, default True
2057-
Include only float, int, boolean columns. If None, will attempt to use
2058-
everything, then use only numeric data.
2059+
Include only float, int, boolean columns.
2060+
2061+
.. versionchanged:: 2.0.0
2062+
2063+
numeric_only no longer accepts ``None``.
20592064
20602065
engine : str, default None
20612066
* ``'cython'`` : Runs the operation through C-extensions from cython.
@@ -2123,7 +2128,6 @@ def mean(
21232128
"mean",
21242129
alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool),
21252130
numeric_only=numeric_only,
2126-
ignore_failures=numeric_only is lib.no_default,
21272131
)
21282132
return result.__finalize__(self.obj, method="groupby")
21292133

@@ -2139,8 +2143,11 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
21392143
Parameters
21402144
----------
21412145
numeric_only : bool, default True
2142-
Include only float, int, boolean columns. If None, will attempt to use
2143-
everything, then use only numeric data.
2146+
Include only float, int, boolean columns.
2147+
2148+
.. versionchanged:: 2.0.0
2149+
2150+
numeric_only no longer accepts ``None``.
21442151
21452152
Returns
21462153
-------
@@ -2153,7 +2160,6 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
21532160
"median",
21542161
alt=lambda x: Series(x).median(numeric_only=numeric_only_bool),
21552162
numeric_only=numeric_only,
2156-
ignore_failures=numeric_only is lib.no_default,
21572163
)
21582164
return result.__finalize__(self.obj, method="groupby")
21592165

@@ -2287,7 +2293,6 @@ def var(
22872293
"var",
22882294
alt=lambda x: Series(x).var(ddof=ddof),
22892295
numeric_only=numeric_only,
2290-
ignore_failures=numeric_only is lib.no_default,
22912296
ddof=ddof,
22922297
)
22932298

@@ -3286,8 +3291,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
32863291
is_ser = obj.ndim == 1
32873292
mgr = self._get_data_to_aggregate()
32883293
data = mgr.get_numeric_data() if numeric_only_bool else mgr
3289-
ignore_failures = numeric_only_bool
3290-
res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)
3294+
res_mgr = data.grouped_reduce(blk_func, ignore_failures=False)
32913295

32923296
if (
32933297
numeric_only is lib.no_default
@@ -3765,9 +3769,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
37653769
if numeric_only_bool:
37663770
mgr = mgr.get_numeric_data()
37673771

3768-
res_mgr = mgr.grouped_reduce(
3769-
blk_func, ignore_failures=numeric_only is lib.no_default
3770-
)
3772+
res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=False)
37713773

37723774
if not is_ser and len(res_mgr.items) != orig_mgr_len:
37733775
howstr = how.replace("group_", "")

pandas/tests/groupby/aggregate/test_cython.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,9 @@ def test_cython_agg_nothing_to_agg():
101101

102102
frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
103103

104-
with tm.assert_produces_warning(FutureWarning):
105-
result = frame[["b"]].groupby(frame["a"]).mean()
104+
with pytest.raises(TypeError, match="Could not convert"):
105+
frame[["b"]].groupby(frame["a"]).mean()
106+
result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
106107
expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates())
107108
tm.assert_frame_equal(result, expected)
108109

pandas/tests/groupby/test_categorical.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,7 @@ def test_basic(): # TODO: split this test
103103
gb = df.groupby("A", observed=False)
104104
exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
105105
expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
106-
msg = "The default value of numeric_only"
107-
with tm.assert_produces_warning(FutureWarning, match=msg):
108-
result = gb.sum()
106+
result = gb.sum()
109107
tm.assert_frame_equal(result, expected)
110108

111109
# GH 8623
@@ -857,12 +855,14 @@ def test_preserve_categorical_dtype():
857855
}
858856
)
859857
for col in ["C1", "C2"]:
860-
msg = "The default value of numeric_only"
861-
with tm.assert_produces_warning(FutureWarning, match=msg):
862-
result1 = df.groupby(by=col, as_index=False, observed=False).mean()
863-
result2 = (
864-
df.groupby(by=col, as_index=True, observed=False).mean().reset_index()
865-
)
858+
result1 = df.groupby(by=col, as_index=False, observed=False).mean(
859+
numeric_only=True
860+
)
861+
result2 = (
862+
df.groupby(by=col, as_index=True, observed=False)
863+
.mean(numeric_only=True)
864+
.reset_index()
865+
)
866866
expected = exp_full.reindex(columns=result1.columns)
867867
tm.assert_frame_equal(result1, expected)
868868
tm.assert_frame_equal(result2, expected)
@@ -1856,10 +1856,7 @@ def test_category_order_reducer(
18561856
df = df.set_index(keys)
18571857
args = get_groupby_method_args(reduction_func, df)
18581858
gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
1859-
msg = "is deprecated and will be removed in a future version"
1860-
warn = FutureWarning if reduction_func == "mad" else None
1861-
with tm.assert_produces_warning(warn, match=msg):
1862-
op_result = getattr(gb, reduction_func)(*args)
1859+
op_result = getattr(gb, reduction_func)(*args)
18631860
if as_index:
18641861
result = op_result.index.get_level_values("a").categories
18651862
else:

0 commit comments

Comments
 (0)