From 1ae124cd8d25525764f3d7a38cf4ea9ddf9dd224 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 12 Nov 2022 10:08:22 -0500 Subject: [PATCH 01/10] DEPR: Enforce deprecation of dropping columns when numeric_only=False in groupby / resample --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/groupby/generic.py | 4 +- pandas/core/groupby/groupby.py | 7 +- pandas/tests/groupby/test_function.py | 155 +++++++++++---------- pandas/tests/groupby/test_groupby.py | 13 +- pandas/tests/groupby/test_min_max.py | 10 +- pandas/tests/resample/test_resample_api.py | 15 +- 7 files changed, 115 insertions(+), 90 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 032bcf09244e5..2ddd9c705f21d 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -557,6 +557,7 @@ Removal of prior version deprecations/changes - Enforced deprecation ``numeric_only=None`` (the default) in DataFrame reductions that would silently drop columns that raised; ``numeric_only`` now defaults to ``False`` (:issue:`41480`) - Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`) - Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`) +- Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 571559dc838f5..32802f6429c4c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1357,7 +1357,9 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: # We could use `mgr.apply` here and not have to set_axis, but # we would have to do shape gymnastics for ArrayManager compat - res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True) + res_mgr = mgr.grouped_reduce( + arr_func, ignore_failures=numeric_only is lib.no_default + ) res_mgr.set_axis(1, mgr.axes[1]) if len(res_mgr) < orig_mgr_len: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d10931586d5e0..f16a8cce0863b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1664,6 +1664,7 @@ def _agg_general( alt=npfunc, numeric_only=numeric_only, min_count=min_count, + ignore_failures=numeric_only is lib.no_default, ) return result.__finalize__(self.obj, method="groupby") @@ -2132,6 +2133,7 @@ def mean( "mean", alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool), numeric_only=numeric_only, + ignore_failures=numeric_only is lib.no_default, ) return result.__finalize__(self.obj, method="groupby") @@ -2161,6 +2163,7 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): "median", alt=lambda x: Series(x).median(numeric_only=numeric_only_bool), numeric_only=numeric_only, + ignore_failures=numeric_only is lib.no_default, ) return result.__finalize__(self.obj, method="groupby") @@ -3761,7 +3764,9 @@ def blk_func(values: ArrayLike) -> ArrayLike: if numeric_only_bool: mgr = mgr.get_numeric_data() - res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True) + res_mgr = mgr.grouped_reduce( + blk_func, ignore_failures=numeric_only is lib.no_default + ) if not is_ser and len(res_mgr.items) != orig_mgr_len: howstr = how.replace("group_", "") diff --git 
a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index f05874c3286c7..8e6122a66de09 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -163,14 +163,12 @@ def test_averages(self, df, method): "int", "float", "category_int", - "datetime", - "datetimetz", - "timedelta", ], ) - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): - result = getattr(gb, method)(numeric_only=False) + with pytest.raises(TypeError, match="[Cc]ould not convert"): + getattr(gb, method)(numeric_only=False) + result = getattr(gb, method)() tm.assert_frame_equal(result.reindex_like(expected), expected) expected_columns = expected.columns @@ -252,30 +250,37 @@ def test_cummin_cummax(self, df, method): def _check(self, df, method, expected_columns, expected_columns_numeric): gb = df.groupby("group") - # cummin, cummax dont have numeric_only kwarg, always use False - warn = None - if method in ["cummin", "cummax"]: - # these dont have numeric_only kwarg, always use False - warn = FutureWarning - elif method in ["min", "max"]: - # these have numeric_only kwarg, but default to False - warn = FutureWarning - - with tm.assert_produces_warning( - warn, match="Dropping invalid columns", raise_on_extra_warnings=False - ): + if method in ("min", "max"): + # The methods default to numeric_only=False and raise TypeError + with pytest.raises(TypeError, match="Categorical is not ordered"): + getattr(gb, method)() + elif method in ("cummin", "cummax"): + # The methods default to numeric_only=False and raise NotImplementedError + msg = "function is not implemented for this dtype" + with pytest.raises(NotImplementedError, match=msg): + getattr(gb, method)() + else: result = getattr(gb, method)() - - tm.assert_index_equal(result.columns, expected_columns_numeric) - - # GH#41475 deprecated silently ignoring nuisance columns - warn = None - if len(expected_columns) < len(gb._obj_with_exclusions.columns): - warn = 
FutureWarning - with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + tm.assert_index_equal(result.columns, expected_columns_numeric) + + if method in ("cumsum", "cumprod", "cummin", "cummax"): + msg = "function is not implemented for this dtype" + with pytest.raises(NotImplementedError, match=msg): + getattr(gb, method)(numeric_only=False) + elif method not in ("first", "last"): + msg = "|".join( + [ + "[Cc]ould not convert", + "Categorical is not ordered", + "category type does not support", + "can't multiply sequence", + ] + ) + with pytest.raises(TypeError, match=msg): + getattr(gb, method)(numeric_only=False) + else: result = getattr(gb, method)(numeric_only=False) - - tm.assert_index_equal(result.columns, expected_columns) + tm.assert_index_equal(result.columns, expected_columns) class TestGroupByNonCythonPaths: @@ -1323,45 +1328,45 @@ def test_groupby_sum_timedelta_with_nat(): @pytest.mark.parametrize( - "kernel, numeric_only_default, drops_nuisance, has_arg", + "kernel, numeric_only_default, has_arg", [ - ("all", False, False, False), - ("any", False, False, False), - ("bfill", False, False, False), - ("corr", True, False, True), - ("corrwith", True, False, True), - ("cov", True, False, True), - ("cummax", False, True, True), - ("cummin", False, True, True), - ("cumprod", True, True, True), - ("cumsum", True, True, True), - ("diff", False, False, False), - ("ffill", False, False, False), - ("fillna", False, False, False), - ("first", False, False, True), - ("idxmax", True, False, True), - ("idxmin", True, False, True), - ("last", False, False, True), - ("max", False, True, True), - ("mean", True, True, True), - ("median", True, True, True), - ("min", False, True, True), - ("nth", False, False, False), - ("nunique", False, False, False), - ("pct_change", False, False, False), - ("prod", True, True, True), - ("quantile", True, False, True), - ("sem", True, True, True), - ("skew", True, False, True), - ("std", True, True, True), - ("sum", 
True, True, True), - ("var", True, False, True), + ("all", False, False), + ("any", False, False), + ("bfill", False, False), + ("corr", True, True), + ("corrwith", True, True), + ("cov", True, True), + ("cummax", False, True), + ("cummin", False, True), + ("cumprod", True, True), + ("cumsum", True, True), + ("diff", False, False), + ("ffill", False, False), + ("fillna", False, False), + ("first", False, True), + ("idxmax", True, True), + ("idxmin", True, True), + ("last", False, True), + ("max", False, True), + ("mean", True, True), + ("median", True, True), + ("min", False, True), + ("nth", False, False), + ("nunique", False, False), + ("pct_change", False, False), + ("prod", True, True), + ("quantile", True, True), + ("sem", True, True), + ("skew", True, True), + ("std", True, True), + ("sum", True, True), + ("var", True, True), ], ) @pytest.mark.parametrize("numeric_only", [True, False, lib.no_default]) @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) def test_deprecate_numeric_only( - kernel, numeric_only_default, drops_nuisance, has_arg, numeric_only, keys + kernel, numeric_only_default, has_arg, numeric_only, keys ): # GH#46072 # drops_nuisance: Whether the op drops nuisance columns even when numeric_only=False @@ -1380,10 +1385,9 @@ def test_deprecate_numeric_only( # Cases where b does not appear in the result numeric_only is True or (numeric_only is lib.no_default and numeric_only_default) - or drops_nuisance ) ): - if numeric_only is True or (not numeric_only_default and not drops_nuisance): + if numeric_only is True or not numeric_only_default: warn = None else: warn = FutureWarning @@ -1408,17 +1412,24 @@ def test_deprecate_numeric_only( assert "b" in result.columns elif has_arg or kernel in ("idxmax", "idxmin"): assert numeric_only is not True - assert not drops_nuisance # kernels that are successful on any dtype were above; this will fail - msg = ( - "(not allowed for this dtype" - "|must be a string or a number" - "|cannot be performed 
against 'object' dtypes" - "|must be a string or a real number" - "|unsupported operand type)" - ) - with pytest.raises(TypeError, match=msg): - method(*args, **kwargs) + if kernel in ("cummax", "cummin", "cumprod", "cumsum"): + msg = "function is not implemented for this dtype" + with pytest.raises(NotImplementedError, match=msg): + method(*args, **kwargs) + else: + msg = "|".join( + [ + "not allowed for this dtype", + "must be a string or a number", + "cannot be performed against 'object' dtypes", + "must be a string or a real number", + "unsupported operand type", + "not supported between instances of", + ] + ) + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: with pytest.raises( TypeError, match="got an unexpected keyword argument 'numeric_only'" diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7fd52d3cf5bb8..96be7a0cb785c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -915,11 +915,13 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): grouped = df.groupby("A") - if agg_function in ("var", "std", "sem") and numeric_only is False: + no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median") + if agg_function in no_drop_nuisance and numeric_only is False: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False - klass = TypeError if agg_function == "var" else ValueError - with pytest.raises(klass, match="could not convert string to float"): + klass = ValueError if agg_function in ("std", "sem") else TypeError + msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"]) + with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: if numeric_only is lib.no_default: @@ -2049,10 +2051,13 @@ def get_result(): and isinstance(values, Categorical) and len(keys) == 1 ): + if op in 
("min", "max"): + with pytest.raises(TypeError, match="Categorical is not ordered"): + get_result() + return # Categorical doesn't implement, so with numeric_only=True # these are dropped and we get an empty DataFrame back result = get_result() - expected = df.set_index(keys)[[]] # with numeric_only=True, these are dropped, and we get # an empty DataFrame back diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index b26ee057d2041..72772775b3fa1 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -48,15 +48,17 @@ def test_max_min_object_multiple_columns(using_array_manager): gb = df.groupby("A") - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): - result = gb.max(numeric_only=False) + with pytest.raises(TypeError, match="not supported between instances"): + gb.max(numeric_only=False) + result = gb[["C"]].max() # "max" is valid for column "C" but not for "B" ei = Index([1, 2, 3], name="A") expected = DataFrame({"C": ["b", "d", "e"]}, index=ei) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): - result = gb.min(numeric_only=False) + with pytest.raises(TypeError, match="not supported between instances"): + gb.max(numeric_only=False) + result = gb[["C"]].min() # "min" is valid for column "C" but not for "B" ei = Index([1, 2, 3], name="A") expected = DataFrame({"C": ["a", "c", "e"]}, index=ei) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 5721de9e5f3bb..ca5444fd4e62f 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -821,8 +821,8 @@ def test_end_and_end_day_origin( ("sum", False, {"cat": ["cat_1cat_2"], "num": [25]}), ("sum", lib.no_default, {"num": [25]}), ("prod", True, {"num": [100]}), - ("prod", False, {"num": [100]}), - ("prod", lib.no_default, {"num": [100]}), + ("prod", False, 
"can't multiply sequence"), + ("prod", lib.no_default, "can't multiply sequence"), ("min", True, {"num": [5]}), ("min", False, {"cat": ["cat_1"], "num": [5]}), ("min", lib.no_default, {"cat": ["cat_1"], "num": [5]}), @@ -836,10 +836,10 @@ def test_end_and_end_day_origin( ("last", False, {"cat": ["cat_2"], "num": [20]}), ("last", lib.no_default, {"cat": ["cat_2"], "num": [20]}), ("mean", True, {"num": [12.5]}), - ("mean", False, {"num": [12.5]}), + ("mean", False, "Could not convert"), ("mean", lib.no_default, {"num": [12.5]}), ("median", True, {"num": [12.5]}), - ("median", False, {"num": [12.5]}), + ("median", False, "could not convert"), ("median", lib.no_default, {"num": [12.5]}), ("std", True, {"num": [10.606601717798213]}), ("std", False, "could not convert string to float"), @@ -876,15 +876,14 @@ def test_frame_downsample_method(method, numeric_only, expected_data): msg = ( f"default value of numeric_only in DataFrameGroupBy.{method} is deprecated" ) - elif method in ("prod", "mean", "median") and numeric_only is not True: - warn = FutureWarning - msg = f"Dropping invalid columns in DataFrameGroupBy.{method} is deprecated" else: warn = None msg = "" with tm.assert_produces_warning(warn, match=msg): if isinstance(expected_data, str): - klass = TypeError if method == "var" else ValueError + klass = ( + TypeError if method in ("var", "mean", "median", "prod") else ValueError + ) with pytest.raises(klass, match=expected_data): _ = func(**kwargs) else: From cbd87eb3e5eccff17e341d9f3ac92ba65bf6edf6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 12 Nov 2022 10:40:03 -0500 Subject: [PATCH 02/10] Change to TypeError --- pandas/core/groupby/generic.py | 10 ++++-- pandas/tests/groupby/test_function.py | 52 ++++++++++++--------------- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 32802f6429c4c..97cdefe64ba5e 100644 --- a/pandas/core/groupby/generic.py +++ 
b/pandas/core/groupby/generic.py @@ -1357,9 +1357,13 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: # We could use `mgr.apply` here and not have to set_axis, but # we would have to do shape gymnastics for ArrayManager compat - res_mgr = mgr.grouped_reduce( - arr_func, ignore_failures=numeric_only is lib.no_default - ) + try: + res_mgr = mgr.grouped_reduce( + arr_func, ignore_failures=numeric_only is lib.no_default + ) + except NotImplementedError as err: + msg = f"{how} is not supported for at least one provided dtype" + raise TypeError(msg) from err res_mgr.set_axis(1, mgr.axes[1]) if len(res_mgr) < orig_mgr_len: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 8e6122a66de09..69c839c3b2273 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -250,30 +250,28 @@ def test_cummin_cummax(self, df, method): def _check(self, df, method, expected_columns, expected_columns_numeric): gb = df.groupby("group") - if method in ("min", "max"): + if method in ("min", "max", "cummin", "cummax"): # The methods default to numeric_only=False and raise TypeError - with pytest.raises(TypeError, match="Categorical is not ordered"): - getattr(gb, method)() - elif method in ("cummin", "cummax"): - # The methods default to numeric_only=False and raise NotImplementedError - msg = "function is not implemented for this dtype" - with pytest.raises(NotImplementedError, match=msg): + msg = "|".join( + [ + "Categorical is not ordered", + "is not supported for at least one provided dtype", + ] + ) + with pytest.raises(TypeError, match=msg): getattr(gb, method)() else: result = getattr(gb, method)() tm.assert_index_equal(result.columns, expected_columns_numeric) - if method in ("cumsum", "cumprod", "cummin", "cummax"): - msg = "function is not implemented for this dtype" - with pytest.raises(NotImplementedError, match=msg): - getattr(gb, method)(numeric_only=False) - elif method not in ("first", "last"): 
+ if method not in ("first", "last"): msg = "|".join( [ "[Cc]ould not convert", "Categorical is not ordered", "category type does not support", "can't multiply sequence", + "is not supported for at least one provided dtype", ] ) with pytest.raises(TypeError, match=msg): @@ -1413,23 +1411,19 @@ def test_deprecate_numeric_only( elif has_arg or kernel in ("idxmax", "idxmin"): assert numeric_only is not True # kernels that are successful on any dtype were above; this will fail - if kernel in ("cummax", "cummin", "cumprod", "cumsum"): - msg = "function is not implemented for this dtype" - with pytest.raises(NotImplementedError, match=msg): - method(*args, **kwargs) - else: - msg = "|".join( - [ - "not allowed for this dtype", - "must be a string or a number", - "cannot be performed against 'object' dtypes", - "must be a string or a real number", - "unsupported operand type", - "not supported between instances of", - ] - ) - with pytest.raises(TypeError, match=msg): - method(*args, **kwargs) + msg = "|".join( + [ + "not allowed for this dtype", + "must be a string or a number", + "cannot be performed against 'object' dtypes", + "must be a string or a real number", + "unsupported operand type", + "not supported between instances of", + "is not supported for at least one provided dtype", + ] + ) + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: with pytest.raises( TypeError, match="got an unexpected keyword argument 'numeric_only'" From 4179204aacf76c59424ee1d34f6ca012845beffe Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 12 Nov 2022 12:56:42 -0500 Subject: [PATCH 03/10] Better error message --- pandas/core/groupby/generic.py | 4 ++-- pandas/tests/groupby/test_function.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 97cdefe64ba5e..4ac701952e258 100644 --- a/pandas/core/groupby/generic.py 
+++ b/pandas/core/groupby/generic.py @@ -1362,8 +1362,8 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: arr_func, ignore_failures=numeric_only is lib.no_default ) except NotImplementedError as err: - msg = f"{how} is not supported for at least one provided dtype" - raise TypeError(msg) from err + # For NotImplementedError, args[0] is the error message + raise TypeError(err.args[0]) from err res_mgr.set_axis(1, mgr.axes[1]) if len(res_mgr) < orig_mgr_len: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 69c839c3b2273..4adf2d5f38ead 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -255,7 +255,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): msg = "|".join( [ "Categorical is not ordered", - "is not supported for at least one provided dtype", + "function is not implemented for this dtype", ] ) with pytest.raises(TypeError, match=msg): @@ -271,7 +271,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "Categorical is not ordered", "category type does not support", "can't multiply sequence", - "is not supported for at least one provided dtype", + "function is not implemented for this dtype", ] ) with pytest.raises(TypeError, match=msg): @@ -1419,7 +1419,7 @@ def test_deprecate_numeric_only( "must be a string or a real number", "unsupported operand type", "not supported between instances of", - "is not supported for at least one provided dtype", + "function is not implemented for this dtype", ] ) with pytest.raises(TypeError, match=msg): From 93c8cc1f09803adbb539351d25e9c83cd11bc515 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 15 Nov 2022 19:46:27 -0500 Subject: [PATCH 04/10] WIP --- pandas/core/groupby/generic.py | 107 +++----- pandas/core/groupby/groupby.py | 299 ++++++----------------- pandas/core/groupby/ops.py | 1 + pandas/tests/groupby/test_categorical.py | 20 +- pandas/tests/groupby/test_function.py | 68 
+----- pandas/tests/groupby/test_groupby.py | 166 ++++++------- 6 files changed, 211 insertions(+), 450 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4ac701952e258..b75c6e61b5bde 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -87,7 +87,6 @@ _agg_template, _apply_docs, _transform_template, - warn_dropping_nuisance_columns_deprecated, ) from pandas.core.groupby.grouper import get_grouper from pandas.core.indexes.api import ( @@ -437,7 +436,7 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): ) def _cython_transform( - self, how: str, numeric_only: bool = True, axis: AxisInt = 0, **kwargs + self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs ): assert axis == 0 # handled by caller @@ -863,7 +862,7 @@ def skew( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True, - numeric_only: bool | None = None, + numeric_only: bool = False, **kwargs, ) -> Series: result = self._op_via_apply( @@ -1332,13 +1331,12 @@ def _wrap_applied_output_series( def _cython_transform( self, how: str, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, axis: AxisInt = 0, **kwargs, ) -> DataFrame: assert axis == 0 # handled by caller # TODO: no tests with self.ndim == 1 for DataFrameGroupBy - numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis) # With self.axis == 0, we have multi-block tests # e.g. test_rank_min_int, test_cython_transform_frame @@ -1346,8 +1344,7 @@ def _cython_transform( # With self.axis == 1, _get_data_to_aggregate does a transpose # so we always have a single block. 
mgr: Manager2D = self._get_data_to_aggregate() - orig_mgr_len = len(mgr) - if numeric_only_bool: + if numeric_only: mgr = mgr.get_numeric_data(copy=False) def arr_func(bvalues: ArrayLike) -> ArrayLike: @@ -1359,16 +1356,13 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: # we would have to do shape gymnastics for ArrayManager compat try: res_mgr = mgr.grouped_reduce( - arr_func, ignore_failures=numeric_only is lib.no_default + arr_func, ignore_failures=False ) except NotImplementedError as err: # For NotImplementedError, args[0] is the error message raise TypeError(err.args[0]) from err res_mgr.set_axis(1, mgr.axes[1]) - if len(res_mgr) < orig_mgr_len: - warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) - res_df = self.obj._constructor(res_mgr) if self.axis == 1: res_df = res_df.T @@ -1498,15 +1492,8 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: output = {} inds = [] for i, (colname, sgb) in enumerate(self._iterate_column_groupbys(obj)): - try: - output[i] = sgb.transform(wrapper) - except TypeError: - # e.g. 
trying to call nanmean with string values - warn_dropping_nuisance_columns_deprecated( - type(self), "transform", numeric_only=False - ) - else: - inds.append(i) + output[i] = sgb.transform(wrapper) + inds.append(i) if not output: raise TypeError("Transform function invalid for data types") @@ -1789,84 +1776,64 @@ def nunique(self, dropna: bool = True) -> DataFrame: @doc( _shared_docs["idxmax"], - numeric_only_default="True for axis=0, False for axis=1", + numeric_only_default="False", ) def idxmax( self, axis: Axis = 0, skipna: bool = True, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> DataFrame: axis = DataFrame._get_axis_number(axis) - if numeric_only is lib.no_default: - # Cannot use self._resolve_numeric_only; we must pass None to - # DataFrame.idxmax for backwards compatibility - numeric_only_arg = None if axis == 0 else False - else: - numeric_only_arg = numeric_only def func(df): - with warnings.catch_warnings(): - # Suppress numeric_only warnings here, will warn below - warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmax") - res = df._reduce( - nanops.nanargmax, - "argmax", - axis=axis, - skipna=skipna, - numeric_only=numeric_only_arg, - ) - indices = res._values - index = df._get_axis(axis) - result = [index[i] if i >= 0 else np.nan for i in indices] - return df._constructor_sliced(result, index=res.index) + res = df._reduce( + nanops.nanargmax, + "argmax", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + ) + indices = res._values + index = df._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return df._constructor_sliced(result, index=res.index) func.__name__ = "idxmax" result = self._python_apply_general( func, self._obj_with_exclusions, not_indexed_same=True ) - self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only) return result @doc( _shared_docs["idxmin"], - numeric_only_default="True for axis=0, False for axis=1", + 
numeric_only_default="False", ) def idxmin( self, axis: Axis = 0, skipna: bool = True, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> DataFrame: axis = DataFrame._get_axis_number(axis) - if numeric_only is lib.no_default: - # Cannot use self._resolve_numeric_only; we must pass None to - # DataFrame.idxmin for backwards compatibility - numeric_only_arg = None if axis == 0 else False - else: - numeric_only_arg = numeric_only def func(df): - with warnings.catch_warnings(): - # Suppress numeric_only warnings here, will warn below - warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmin") - res = df._reduce( - nanops.nanargmin, - "argmin", - axis=axis, - skipna=skipna, - numeric_only=numeric_only_arg, - ) - indices = res._values - index = df._get_axis(axis) - result = [index[i] if i >= 0 else np.nan for i in indices] - return df._constructor_sliced(result, index=res.index) + res = df._reduce( + nanops.nanargmin, + "argmin", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + ) + indices = res._values + index = df._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return df._constructor_sliced(result, index=res.index) func.__name__ = "idxmin" result = self._python_apply_general( func, self._obj_with_exclusions, not_indexed_same=True ) - self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only) return result boxplot = boxplot_frame_groupby @@ -2245,7 +2212,7 @@ def skew( self, axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool = True, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, **kwargs, ) -> DataFrame: result = self._op_via_apply( @@ -2268,7 +2235,7 @@ def corr( self, method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", min_periods: int = 1, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> DataFrame: result = self._op_via_apply( "corr", method=method, 
min_periods=min_periods, numeric_only=numeric_only @@ -2280,7 +2247,7 @@ def cov( self, min_periods: int | None = None, ddof: int | None = 1, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> DataFrame: result = self._op_via_apply( "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only @@ -2341,7 +2308,7 @@ def corrwith( axis: Axis = 0, drop: bool = False, method: CorrelationMethod = "pearson", - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> DataFrame: result = self._op_via_apply( "corrwith", diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f16a8cce0863b..d595c2b9195d0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -324,8 +324,7 @@ class providing the base-class of operations. Parameters ---------- numeric_only : bool, default {no} - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. + Include only float, int, boolean columns. min_count : int, default {mc} The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. 
@@ -1012,15 +1011,8 @@ def _op_via_apply(self, name: str, *args, **kwargs): if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default: kwargs["axis"] = self.axis - numeric_only = kwargs.get("numeric_only", lib.no_default) - def curried(x): - with warnings.catch_warnings(): - # Catch any warnings from dispatch to DataFrame; we'll emit - # a warning for groupby below - match = "The default value of numeric_only " - warnings.filterwarnings("ignore", match, FutureWarning) - return f(x, *args, **kwargs) + return f(x, *args, **kwargs) # preserve the name so we can detect it when calling plot methods, # to avoid duplicates @@ -1042,13 +1034,6 @@ def curried(x): not_indexed_same=not is_transform, ) - if self._selected_obj.ndim != 1 and self.axis != 1 and result.ndim != 1: - missing = self._obj_with_exclusions.columns.difference(result.columns) - if len(missing) > 0: - warn_dropping_nuisance_columns_deprecated( - type(self), name, numeric_only - ) - if self.grouper.has_dropped_na and is_transform: # result will have dropped rows due to nans, fill with null # and ensure index is ordered same as the input @@ -1314,80 +1299,6 @@ def _wrap_applied_output( ): raise AbstractMethodError(self) - def _resolve_numeric_only( - self, how: str, numeric_only: bool | lib.NoDefault, axis: AxisInt - ) -> bool: - """ - Determine subclass-specific default value for 'numeric_only'. - - For SeriesGroupBy we want the default to be False (to match Series behavior). - For DataFrameGroupBy we want it to be True (for backwards-compat). - - Parameters - ---------- - numeric_only : bool or lib.no_default - axis : int - Axis passed to the groupby op (not self.axis). - - Returns - ------- - bool - """ - # GH#41291 - if numeric_only is lib.no_default: - # i.e. not explicitly passed by user - if self.obj.ndim == 2: - # i.e. 
DataFrameGroupBy - numeric_only = axis != 1 - # GH#42395 GH#43108 GH#43154 - # Regression from 1.2.5 to 1.3 caused object columns to be dropped - if self.axis: - obj = self._obj_with_exclusions.T - else: - obj = self._obj_with_exclusions - check = obj._get_numeric_data() - if len(obj.columns) and not len(check.columns) and not obj.empty: - numeric_only = False - - else: - numeric_only = False - - if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): - # GH#47500 - warnings.warn( - f"{type(self).__name__}.{how} called with " - f"numeric_only={numeric_only} and dtype {self.obj.dtype}. This will " - "raise a TypeError in a future version of pandas", - category=FutureWarning, - stacklevel=find_stack_level(), - ) - raise NotImplementedError( - f"{type(self).__name__}.{how} does not implement numeric_only" - ) - - return numeric_only - - def _maybe_warn_numeric_only_depr( - self, how: str, result: DataFrame | Series, numeric_only: bool | lib.NoDefault - ) -> None: - """Emit warning on numeric_only behavior deprecation when appropriate. - - Parameters - ---------- - how : str - Groupby kernel name. - result : - Result of the groupby operation. - numeric_only : bool or lib.no_default - Argument as passed by user. - """ - if ( - self._obj_with_exclusions.ndim != 1 - and result.ndim > 1 - and len(result.columns) < len(self._obj_with_exclusions.columns) - ): - warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) - # ----------------------------------------------------------------- # numba @@ -1627,18 +1538,7 @@ def _python_agg_general( for idx, obj in enumerate(self._iterate_slices()): name = obj.name - - try: - # if this function is invalid for this dtype, we will ignore it. 
- result = self.grouper.agg_series(obj, f) - except TypeError: - if raise_on_typeerror: - raise - warn_dropping_nuisance_columns_deprecated( - type(self), "agg", numeric_only=False - ) - continue - + result = self.grouper.agg_series(obj, f) key = base.OutputKey(label=name, position=idx) output[key] = result @@ -1650,7 +1550,7 @@ def _python_agg_general( @final def _agg_general( self, - numeric_only: bool | lib.NoDefault = True, + numeric_only: bool = False, min_count: int = -1, *, alias: str, @@ -1664,7 +1564,7 @@ def _agg_general( alt=npfunc, numeric_only=numeric_only, min_count=min_count, - ignore_failures=numeric_only is lib.no_default, + ignore_failures=False, ) return result.__finalize__(self.obj, method="groupby") @@ -1713,20 +1613,17 @@ def _cython_agg_general( self, how: str, alt: Callable, - numeric_only: bool | lib.NoDefault, + numeric_only: bool, min_count: int = -1, ignore_failures: bool = True, **kwargs, ): # Note: we never get here with how="ohlc" for DataFrameGroupBy; # that goes through SeriesGroupBy - numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0) - data = self._get_data_to_aggregate() is_ser = data.ndim == 1 - orig_len = len(data) - if numeric_only_bool: + if numeric_only: if is_ser and not is_numeric_dtype(self._selected_obj.dtype): # GH#41291 match Series behavior kwd_name = "numeric_only" @@ -1749,9 +1646,6 @@ def array_func(values: ArrayLike) -> ArrayLike: **kwargs, ) except NotImplementedError: - # generally if we have numeric_only=False - # and non-applicable functions - # try to python agg # TODO: shouldn't min_count matter? 
result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) @@ -1761,9 +1655,6 @@ def array_func(values: ArrayLike) -> ArrayLike: # continue and exclude the block new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures) - if not is_ser and len(new_mgr) < orig_len: - warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) - res = self._wrap_agged_manager(new_mgr) if is_ser: res.index = self.grouper.result_index @@ -1772,7 +1663,7 @@ def array_func(values: ArrayLike) -> ArrayLike: return res def _cython_transform( - self, how: str, numeric_only: bool = True, axis: AxisInt = 0, **kwargs + self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs ): raise AbstractMethodError(self) @@ -1953,6 +1844,7 @@ def result_to_bool( return self._get_cythonized_result( libgroupby.group_any_all, + # TODO: Double check this numeric_only=False, cython_dtype=np.dtype(np.int8), needs_mask=True, @@ -2054,7 +1946,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: @Substitution(see_also=_common_see_also) def mean( self, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, engine: str = "cython", engine_kwargs: dict[str, bool] | None = None, ): @@ -2063,9 +1955,12 @@ def mean( Parameters ---------- - numeric_only : bool, default True - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False`` and does not accept ``None``. engine : str, default None * ``'cython'`` : Runs the operation through C-extensions from cython. 
@@ -2122,8 +2017,6 @@ def mean( 2 4.0 Name: B, dtype: float64 """ - numeric_only_bool = self._resolve_numeric_only("mean", numeric_only, axis=0) - if maybe_use_numba(engine): from pandas.core._numba.kernels import sliding_mean @@ -2131,16 +2024,16 @@ def mean( else: result = self._cython_agg_general( "mean", - alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool), + alt=lambda x: Series(x).mean(numeric_only=numeric_only), numeric_only=numeric_only, - ignore_failures=numeric_only is lib.no_default, + ignore_failures=False, ) return result.__finalize__(self.obj, method="groupby") @final @Substitution(name="groupby") @Appender(_common_see_also) - def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): + def median(self, numeric_only: bool = False): """ Compute median of groups, excluding missing values. @@ -2148,22 +2041,23 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): Parameters ---------- - numeric_only : bool, default True - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only defaults to ``False`` and no longer accepts ``None``. Returns ------- Series or DataFrame Median of values within each group. 
""" - numeric_only_bool = self._resolve_numeric_only("median", numeric_only, axis=0) - result = self._cython_agg_general( "median", - alt=lambda x: Series(x).median(numeric_only=numeric_only_bool), + alt=lambda x: Series(x).median(numeric_only=numeric_only), numeric_only=numeric_only, - ignore_failures=numeric_only is lib.no_default, + ignore_failures=False, ) return result.__finalize__(self.obj, method="groupby") @@ -2175,7 +2069,7 @@ def std( ddof: int = 1, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ): """ Compute standard deviation of groups, excluding missing values. @@ -2204,11 +2098,15 @@ def std( .. versionadded:: 1.4.0 - numeric_only : bool, default True + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. .. versionadded:: 1.5.0 + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + Returns ------- Series or DataFrame @@ -2219,10 +2117,8 @@ def std( return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof)) else: - # Resolve numeric_only so that var doesn't warn - numeric_only_bool = self._resolve_numeric_only("std", numeric_only, axis=0) if ( - numeric_only_bool + numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype) ): @@ -2233,12 +2129,11 @@ def std( result = self._get_cythonized_result( libgroupby.group_var, cython_dtype=np.dtype(np.float64), - numeric_only=numeric_only_bool, + numeric_only=numeric_only, needs_counts=True, post_processing=lambda vals, inference: np.sqrt(vals), ddof=ddof, ) - self._maybe_warn_numeric_only_depr("std", result, numeric_only) return result @final @@ -2249,7 +2144,7 @@ def var( ddof: int = 1, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ): """ Compute variance of groups, excluding missing values. 
@@ -2278,11 +2173,15 @@ def var( .. versionadded:: 1.4.0 - numeric_only : bool, default True + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. .. versionadded:: 1.5.0 + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + Returns ------- Series or DataFrame @@ -2297,14 +2196,14 @@ def var( "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only, - ignore_failures=numeric_only is lib.no_default, + ignore_failures=False, ddof=ddof, ) @final @Substitution(name="groupby") @Appender(_common_see_also) - def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default): + def sem(self, ddof: int = 1, numeric_only: bool = False): """ Compute standard error of the mean of groups, excluding missing values. @@ -2315,20 +2214,22 @@ def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default ddof : int, default 1 Degrees of freedom. - numeric_only : bool, default True + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. .. versionadded:: 1.5.0 + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + Returns ------- Series or DataFrame Standard error of the mean of values within each group. 
""" - # Reolve numeric_only so that std doesn't warn - numeric_only_bool = self._resolve_numeric_only("sem", numeric_only, axis=0) if ( - numeric_only_bool + numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype) ): @@ -2336,8 +2237,7 @@ def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default f"{type(self).__name__}.sem called with " f"numeric_only={numeric_only} and dtype {self.obj.dtype}" ) - result = self.std(ddof=ddof, numeric_only=numeric_only_bool) - self._maybe_warn_numeric_only_depr("sem", result, numeric_only) + result = self.std(ddof=ddof, numeric_only=numeric_only) if result.ndim == 1: result /= np.sqrt(self.count()) @@ -2384,10 +2284,10 @@ def size(self) -> DataFrame | Series: return self._reindex_output(result, fill_value=0) @final - @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) + @doc(_groupby_agg_method_template, fname="sum", no=False, mc=0) def sum( self, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, min_count: int = 0, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, @@ -2414,9 +2314,9 @@ def sum( return self._reindex_output(result, fill_value=0) @final - @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) + @doc(_groupby_agg_method_template, fname="prod", no=False, mc=0) def prod( - self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 + self, numeric_only: bool = False, min_count: int = 0 ): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod @@ -2544,8 +2444,12 @@ def last(self, numeric_only: bool = False, min_count: int = -1): Parameters ---------- numeric_only : bool, default False - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts the value ``None``. 
+ min_count : int, default -1 The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. @@ -3095,7 +2999,7 @@ def quantile( self, q: float | AnyArrayLike = 0.5, interpolation: str = "linear", - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ): """ Return group values at the given quantile, a la numpy.percentile. @@ -3106,11 +3010,15 @@ def quantile( Value(s) between 0 and 1 providing the quantile(s) to compute. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} Method to use when the desired quantile falls between two points. - numeric_only : bool, default True + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. .. versionadded:: 1.5.0 + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + Returns ------- Series or DataFrame @@ -3134,9 +3042,8 @@ def quantile( a 2.0 b 3.0 """ - numeric_only_bool = self._resolve_numeric_only("quantile", numeric_only, axis=0) if ( - numeric_only_bool + numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype) ): @@ -3284,26 +3191,8 @@ def blk_func(values: ArrayLike) -> ArrayLike: obj = self._obj_with_exclusions is_ser = obj.ndim == 1 mgr = self._get_data_to_aggregate() - data = mgr.get_numeric_data() if numeric_only_bool else mgr - ignore_failures = numeric_only_bool - res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures) - - if ( - numeric_only is lib.no_default - and not is_ser - and len(res_mgr.items) != len(mgr.items) - ): - warn_dropping_nuisance_columns_deprecated( - type(self), "quantile", numeric_only - ) - - if len(res_mgr.items) == 0: - # re-call grouped_reduce to get the desired exception message - mgr.grouped_reduce(blk_func, ignore_failures=False) - # grouped_reduce _should_ raise, so this should not be reached - raise TypeError( # pragma: no cover - "All columns were dropped in grouped_reduce" - ) + 
data = mgr.get_numeric_data() if numeric_only else mgr + res_mgr = data.grouped_reduce(blk_func, ignore_failures=False) if is_ser: res = self._wrap_agged_manager(res_mgr) @@ -3602,9 +3491,8 @@ def cummin( skipna = kwargs.get("skipna", True) if axis != 0: f = lambda x: np.minimum.accumulate(x, axis) - numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis) obj = self._selected_obj - if numeric_only_bool: + if numeric_only: obj = obj._get_numeric_data() return self._python_apply_general(f, obj, is_transform=True) @@ -3628,9 +3516,8 @@ def cummax( skipna = kwargs.get("skipna", True) if axis != 0: f = lambda x: np.maximum.accumulate(x, axis) - numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis) obj = self._selected_obj - if numeric_only_bool: + if numeric_only: obj = obj._get_numeric_data() return self._python_apply_general(f, obj, is_transform=True) @@ -3643,7 +3530,7 @@ def _get_cythonized_result( self, base_func: Callable, cython_dtype: np.dtype, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, needs_counts: bool = False, needs_nullable: bool = False, needs_mask: bool = False, @@ -3659,7 +3546,7 @@ def _get_cythonized_result( base_func : callable, Cythonized function to be called cython_dtype : np.dtype Type of the array that will be modified by the Cython call. 
- numeric_only : bool, default True + numeric_only : bool, default False Whether only numeric datatypes should be computed needs_counts : bool, default False Whether the counts should be a part of the Cython call @@ -3691,7 +3578,6 @@ def _get_cythonized_result( `Series` or `DataFrame` with filled values """ how = base_func.__name__ - numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0) if post_processing and not callable(post_processing): raise ValueError("'post_processing' must be a callable!") @@ -3761,25 +3647,13 @@ def blk_func(values: ArrayLike) -> ArrayLike: mgr = self._get_data_to_aggregate() orig_mgr_len = len(mgr) - if numeric_only_bool: + if numeric_only: mgr = mgr.get_numeric_data() res_mgr = mgr.grouped_reduce( - blk_func, ignore_failures=numeric_only is lib.no_default + blk_func, ignore_failures=False ) - if not is_ser and len(res_mgr.items) != orig_mgr_len: - howstr = how.replace("group_", "") - warn_dropping_nuisance_columns_deprecated(type(self), howstr, numeric_only) - - if len(res_mgr.items) == 0: - # We re-call grouped_reduce to get the right exception message - mgr.grouped_reduce(blk_func, ignore_failures=False) - # grouped_reduce _should_ raise, so this should not be reached - raise TypeError( # pragma: no cover - "All columns were dropped in grouped_reduce" - ) - if is_ser: out = self._wrap_agged_manager(res_mgr) else: @@ -4323,26 +4197,3 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde mi = MultiIndex.from_product([idx, qs]) return mi - -def warn_dropping_nuisance_columns_deprecated(cls, how: str, numeric_only) -> None: - if numeric_only is not lib.no_default and not numeric_only: - # numeric_only was specified and falsey but still dropped nuisance columns - warnings.warn( - "Dropping invalid columns in " - f"{cls.__name__}.{how} is deprecated. " - "In a future version, a TypeError will be raised. 
" - f"Before calling .{how}, select only columns which " - "should be valid for the function.", - FutureWarning, - stacklevel=find_stack_level(), - ) - elif numeric_only is lib.no_default: - warnings.warn( - "The default value of numeric_only in " - f"{cls.__name__}.{how} is deprecated. " - "In a future version, numeric_only will default to False. " - f"Either specify numeric_only or select only columns which " - "should be valid for the function.", - FutureWarning, - stacklevel=find_stack_level(), - ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c20fe34a178f5..e3d91e34e74bd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1028,6 +1028,7 @@ def agg_series( npvalues = lib.maybe_convert_objects(result, try_float=False) if preserve_dtype: + # numeric_only = True? out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) else: out = npvalues diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index ca794d4ae5a3e..fb3471c026497 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -103,9 +103,10 @@ def test_basic(): # TODO: split this test gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) - msg = "The default value of numeric_only" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "category type does not support sum operations" + with pytest.raises(TypeError, match=msg): result = gb.sum() + result = gb.sum(numeric_only=True) tm.assert_frame_equal(result, expected) # GH 8623 @@ -857,12 +858,10 @@ def test_preserve_categorical_dtype(): } ) for col in ["C1", "C2"]: - msg = "The default value of numeric_only" - with tm.assert_produces_warning(FutureWarning, match=msg): - result1 = df.groupby(by=col, as_index=False, observed=False).mean() - result2 = ( - 
df.groupby(by=col, as_index=True, observed=False).mean().reset_index() - ) + result1 = df.groupby(by=col, as_index=False, observed=False).mean(numeric_only=True) + result2 = ( + df.groupby(by=col, as_index=True, observed=False).mean(numeric_only=True).reset_index() + ) expected = exp_full.reindex(columns=result1.columns) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) @@ -1856,10 +1855,7 @@ def test_category_order_reducer( df = df.set_index(keys) args = get_groupby_method_args(reduction_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - msg = "is deprecated and will be removed in a future version" - warn = FutureWarning if reduction_func == "mad" else None - with tm.assert_produces_warning(warn, match=msg): - op_result = getattr(gb, reduction_func)(*args) + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories else: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 4adf2d5f38ead..72b36a70ec084 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -168,7 +168,7 @@ def test_averages(self, df, method): with pytest.raises(TypeError, match="[Cc]ould not convert"): getattr(gb, method)(numeric_only=False) - result = getattr(gb, method)() + result = getattr(gb, method)(numeric_only=True) tm.assert_frame_equal(result.reindex_like(expected), expected) expected_columns = expected.columns @@ -250,12 +250,14 @@ def test_cummin_cummax(self, df, method): def _check(self, df, method, expected_columns, expected_columns_numeric): gb = df.groupby("group") - if method in ("min", "max", "cummin", "cummax"): + if method in ("min", "max", "cummin", "cummax", "sum", "cumsum", "prod", "cumprod"): # The methods default to numeric_only=False and raise TypeError msg = "|".join( [ "Categorical is not ordered", "function is not implemented for this dtype", + "category type does 
not support sum operations", + "can't multiply sequence by non-int of type 'str'", ] ) with pytest.raises(TypeError, match=msg): @@ -1361,41 +1363,22 @@ def test_groupby_sum_timedelta_with_nat(): ("var", True, True), ], ) -@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default]) +@pytest.mark.parametrize("numeric_only", [True, False]) @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) def test_deprecate_numeric_only( kernel, numeric_only_default, has_arg, numeric_only, keys ): # GH#46072 - # drops_nuisance: Whether the op drops nuisance columns even when numeric_only=False # has_arg: Whether the op has a numeric_only arg df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]}) args = get_groupby_method_args(kernel, df) - kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only} + kwargs = {"numeric_only": numeric_only} gb = df.groupby(keys) method = getattr(gb, kernel) - if ( - has_arg - and (kernel not in ("idxmax", "idxmin") or numeric_only is True) - and ( - # Cases where b does not appear in the result - numeric_only is True - or (numeric_only is lib.no_default and numeric_only_default) - ) - ): - if numeric_only is True or not numeric_only_default: - warn = None - else: - warn = FutureWarning - if numeric_only is lib.no_default and numeric_only_default: - msg = f"The default value of numeric_only in DataFrameGroupBy.{kernel}" - else: - msg = f"Dropping invalid columns in DataFrameGroupBy.{kernel}" - with tm.assert_produces_warning(warn, match=msg): - result = method(*args, **kwargs) - + if has_arg and numeric_only: + result = method(*args, **kwargs) assert "b" not in result.columns elif ( # kernels that work on any dtype and have numeric_only arg @@ -1403,13 +1386,12 @@ def test_deprecate_numeric_only( or ( # kernels that work on any dtype and don't have numeric_only arg kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique") - and numeric_only is lib.no_default ) ): result = 
method(*args, **kwargs) assert "b" in result.columns elif has_arg or kernel in ("idxmax", "idxmin"): - assert numeric_only is not True + assert not numeric_only # kernels that are successful on any dtype were above; this will fail msg = "|".join( [ @@ -1424,17 +1406,6 @@ def test_deprecate_numeric_only( ) with pytest.raises(TypeError, match=msg): method(*args, **kwargs) - elif not has_arg and numeric_only is not lib.no_default: - with pytest.raises( - TypeError, match="got an unexpected keyword argument 'numeric_only'" - ): - method(*args, **kwargs) - else: - assert kernel in ("diff", "pct_change") - assert numeric_only is lib.no_default - # Doesn't have numeric_only argument and fails on nuisance columns - with pytest.raises(TypeError, match=r"unsupported operand type"): - method(*args, **kwargs) @pytest.mark.parametrize("dtype", [bool, int, float, object]) @@ -1524,24 +1495,9 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): with pytest.raises(TypeError, match=msg): method(*args, numeric_only=True) elif dtype is object: - err_category = NotImplementedError - err_msg = f"{groupby_func} does not implement numeric_only" - if groupby_func.startswith("cum"): - # cum ops already exhibit future behavior - warn_category = None - warn_msg = "" - err_category = TypeError - err_msg = f"{groupby_func} is not supported for object dtype" - elif groupby_func == "skew": - warn_category = FutureWarning - warn_msg = "will raise a TypeError in the future" - else: - warn_category = FutureWarning - warn_msg = "This will raise a TypeError" - - with tm.assert_produces_warning(warn_category, match=warn_msg): - with pytest.raises(err_category, match=err_msg): - method(*args, numeric_only=True) + msg = "asdf" + with pytest.raises(TypeError, match=msg): + method(*args, numeric_only=True) else: result = method(*args, numeric_only=True) expected = method(*args, numeric_only=False) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py 
index 96be7a0cb785c..4458779ad90c5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -434,17 +434,20 @@ def test_frame_groupby_columns(tsframe): def test_frame_set_name_single(df): grouped = df.groupby("A") - msg = "The default value of numeric_only" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.mean() + msg = "Could not convert" + with pytest.raises(TypeError, match=msg): + grouped.mean() + result = grouped.mean(numeric_only=True) assert result.index.name == "A" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A", as_index=False).mean() + with pytest.raises(TypeError, match=msg): + df.groupby("A", as_index=False).mean() + result = df.groupby("A", as_index=False).mean(numeric_only=True) assert result.index.name != "A" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.agg(np.mean) + with pytest.raises(TypeError, match=msg): + grouped.agg(np.mean) + result = grouped.agg(np.mean, numeric_only=True) assert result.index.name == "A" result = grouped.agg({"C": np.mean, "D": np.std}) @@ -467,10 +470,13 @@ def test_multi_func(df): col2 = df["B"] grouped = df.groupby([col1.get, col2.get]) - msg = "The default value of numeric_only" - with tm.assert_produces_warning(FutureWarning, match=msg): - agged = grouped.mean() - expected = df.groupby(["A", "B"]).mean() + msg = "Could not convert" + with pytest.raises(TypeError, match=msg): + grouped.mean() + with pytest.raises(TypeError, match=msg): + df.groupby(["A", "B"]).mean() + agged = grouped.mean(numeric_only=True) + expected = df.groupby(["A", "B"]).mean(numeric_only=True) # TODO groupby get drops names tm.assert_frame_equal( @@ -665,16 +671,22 @@ def test_groupby_as_index_agg(df): # single-key - msg = "The default value of numeric_only" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.agg(np.mean) - expected = grouped.mean() + msg = "Could not convert" + 
with pytest.raises(TypeError, match=msg): + grouped.agg(np.mean) + with pytest.raises(TypeError, match=msg): + grouped.mean() + result = grouped.agg(np.mean) + expected = grouped.mean() tm.assert_frame_equal(result, expected) result2 = grouped.agg({"C": np.mean, "D": np.sum}) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected2 = grouped.mean() - expected2["D"] = grouped.sum()["D"] + with pytest.raises(TypeError, match=msg): + grouped.mean() + with pytest.raises(TypeError, match=msg): + grouped.sum() + expected2 = grouped.mean() + expected2["D"] = grouped.sum()["D"] tm.assert_frame_equal(result2, expected2) grouped = df.groupby("A", as_index=True) @@ -793,10 +805,13 @@ def test_groupby_as_index_cython(df): # single-key grouped = data.groupby("A", as_index=False) - msg = "The default value of numeric_only" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.mean() - expected = data.groupby(["A"]).mean() + msg = "Could not convert" + with pytest.raises(TypeError, match=msg): + grouped.mean() + with pytest.raises(TypeError, match=msg): + data.groupby(["A"]).mean() + result = grouped.mean(numeric_only=True) + expected = data.groupby(["A"]).mean(numeric_only=True) expected.insert(0, "A", expected.index) expected.index = np.arange(len(expected)) tm.assert_frame_equal(result, expected) @@ -863,21 +878,21 @@ def test_groupby_multi_corner(df): tm.assert_frame_equal(agged, expected) -def test_omit_nuisance(df): +def test_raises_on_nuisance(df): grouped = df.groupby("A") - msg = "The default value of numeric_only" - with tm.assert_produces_warning(FutureWarning, match=msg): - agged = grouped.agg(np.mean) - exp = grouped.mean() - tm.assert_frame_equal(agged, exp) + msg = "Could not convert" + with pytest.raises(TypeError, match=msg): + grouped.agg(np.mean) + with pytest.raises(TypeError, match=msg): + grouped.mean() df = df.loc[:, ["A", "C", "D"]] df["E"] = datetime.now() grouped = df.groupby("A") - with 
tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.agg(np.sum) - expected = grouped.sum() - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match=msg): + grouped.agg(np.sum) + with pytest.raises(TypeError, match=msg): + grouped.sum() # won't work with axis = 1 grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) @@ -904,19 +919,12 @@ def test_keep_nuisance_agg(df, agg_function): "agg_function", ["sum", "mean", "prod", "std", "var", "sem", "median"], ) -@pytest.mark.parametrize("numeric_only", [lib.no_default, True, False]) -def test_omit_nuisance_agg(df, agg_function, numeric_only): +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_raises_on_nuisance_agg(df, agg_function, numeric_only): # GH 38774, GH 38815 - if numeric_only is lib.no_default or (not numeric_only and agg_function != "sum"): - # sum doesn't drop strings - warn = FutureWarning - else: - warn = None - grouped = df.groupby("A") - no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median") - if agg_function in no_drop_nuisance and numeric_only is False: + if agg_function != "sum" and not numeric_only: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False klass = ValueError if agg_function in ("std", "sem") else TypeError @@ -924,54 +932,33 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: - if numeric_only is lib.no_default: - msg = ( - f"The default value of numeric_only in DataFrameGroupBy.{agg_function}" - ) - else: - msg = "Dropping invalid columns" - with tm.assert_produces_warning(warn, match=msg): - result = getattr(grouped, agg_function)(numeric_only=numeric_only) - if ( - (numeric_only is lib.no_default or not numeric_only) - # These methods drop non-numeric columns even when numeric_only is False - and agg_function not in ("mean", "prod", "median") - 
): + result = getattr(grouped, agg_function)(numeric_only=numeric_only) + if not numeric_only: columns = ["A", "B", "C", "D"] else: columns = ["A", "C", "D"] - if agg_function == "sum" and numeric_only is False: - # sum doesn't drop nuisance string columns - warn = None - elif agg_function in ("sum", "std", "var", "sem") and numeric_only is not True: - warn = FutureWarning - else: - warn = None - msg = "The default value of numeric_only" - with tm.assert_produces_warning(warn, match=msg): - expected = getattr(df.loc[:, columns].groupby("A"), agg_function)( - numeric_only=numeric_only - ) + expected = getattr(df.loc[:, columns].groupby("A"), agg_function)( + numeric_only=numeric_only + ) tm.assert_frame_equal(result, expected) -def test_omit_nuisance_warnings(df): +def test_skew_raises_on_nuisance(df): # GH 38815 - with tm.assert_produces_warning(FutureWarning, filter_level="always"): - grouped = df.groupby("A") + grouped = df.groupby("A") + with pytest.raises(TypeError, match="could not convert string to float"): result = grouped.skew() - expected = df.loc[:, ["A", "C", "D"]].groupby("A").skew() - tm.assert_frame_equal(result, expected) + result = grouped.skew(numeric_only=True) + expected = df.loc[:, ["A", "C", "D"]].groupby("A").skew() + tm.assert_frame_equal(result, expected) -def test_omit_nuisance_python_multiple(three_group): +def test_raises_on_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) - - msg = "The default value of numeric_only" - with tm.assert_produces_warning(FutureWarning, match=msg): - agged = grouped.agg(np.mean) - exp = grouped.mean() - tm.assert_frame_equal(agged, exp) + with pytest.raises(TypeError, match="Could not convert"): + grouped.agg(np.mean) + with pytest.raises(TypeError, match="Could not convert"): + grouped.mean() def test_empty_groups_corner(mframe): @@ -987,10 +974,12 @@ def test_empty_groups_corner(mframe): ) grouped = df.groupby(["k1", "k2"]) - msg = "The default value of numeric_only" - with
tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.agg(np.mean) - expected = grouped.mean() + with pytest.raises(TypeError, match="Could not convert"): + grouped.agg(np.mean) + with pytest.raises(TypeError, match="Could not convert"): + grouped.mean() + result = grouped.agg(np.mean, numeric_only=True) + expected = grouped.mean(numeric_only=True) tm.assert_frame_equal(result, expected) grouped = mframe[3:5].groupby(level=0) @@ -1012,9 +1001,9 @@ def test_wrap_aggregated_output_multindex(mframe): df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] - msg = "The default value of numeric_only" - with tm.assert_produces_warning(FutureWarning, match=msg): - agged = df.groupby(keys).agg(np.mean) + with pytest.raises(TypeError, match="Could not convert"): + df.groupby(keys).agg(np.mean) + agged = df.groupby(keys).mean(numeric_only=True) assert isinstance(agged.columns, MultiIndex) def aggfun(ser): @@ -1022,8 +1011,9 @@ def aggfun(ser): raise TypeError return ser.sum() - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"): - agged2 = df.groupby(keys).aggregate(aggfun) + with pytest.raises(TypeError): + df.groupby(keys).aggregate(aggfun) + agged2 = df.drop(columns=[("foo", "one")]).groupby(keys).aggregate(aggfun) assert len(agged2.columns) + 1 == len(df.columns) From 5e8d988d1ab8442f0a08795cbc74b597fee5785a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 16 Nov 2022 11:41:41 -0500 Subject: [PATCH 05/10] WIP --- pandas/tests/groupby/test_function.py | 35 +++++--- pandas/tests/groupby/test_groupby.py | 123 ++++++++++---------------- 2 files changed, 67 insertions(+), 91 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 72b36a70ec084..d0838aa58d4a5 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -250,7 +250,18 @@ def test_cummin_cummax(self, df, method): def
_check(self, df, method, expected_columns, expected_columns_numeric): gb = df.groupby("group") - if method in ("min", "max", "cummin", "cummax", "sum", "cumsum", "prod", "cumprod"): + if method in ( + "min", + "max", + "cummin", + "cummax", + "sum", + "cumsum", + "prod", + "cumprod", + "mean", + "median", + ): # The methods default to numeric_only=False and raise TypeError msg = "|".join( [ @@ -1380,16 +1391,14 @@ def test_deprecate_numeric_only( if has_arg and numeric_only: result = method(*args, **kwargs) assert "b" not in result.columns - elif ( + elif kernel in ("first", "last"): # kernels that work on any dtype and have numeric_only arg - kernel in ("first", "last") - or ( - # kernels that work on any dtype and don't have numeric_only arg - kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique") - ) - ): result = method(*args, **kwargs) assert "b" in result.columns + elif kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique"): + # kernels that work on any dtype and don't have numeric_only arg + result = method(*args) + assert "b" in result.columns elif has_arg or kernel in ("idxmax", "idxmin"): assert not numeric_only # kernels that are successful on any dtype were above; this will fail @@ -1411,9 +1420,9 @@ def test_deprecate_numeric_only( @pytest.mark.parametrize("dtype", [bool, int, float, object]) def test_deprecate_numeric_only_series(dtype, groupby_func, request): # GH#46560 - if groupby_func in ("backfill", "pad"): - pytest.skip("method is deprecated") - elif groupby_func == "corrwith": + # if groupby_func in ("backfill", "pad"): + # pytest.skip("method is deprecated") + if groupby_func == "corrwith": msg = "corrwith is not implemented on SeriesGroupBy" request.node.add_marker(pytest.mark.xfail(reason=msg)) @@ -1495,8 +1504,8 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): with pytest.raises(TypeError, match=msg): method(*args, numeric_only=True) elif dtype is object: - msg = "asdf" - with 
pytest.raises(TypeError, match=msg): + msg = "does not implement numeric_only" + with pytest.raises(NotImplementedError, match=msg): method(*args, numeric_only=True) else: result = method(*args, numeric_only=True) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4458779ad90c5..961a3c703d676 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1897,9 +1897,6 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] ) -@pytest.mark.filterwarnings("ignore:The default value of numeric_only:FutureWarning") -@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning") -@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") def test_empty_groupby(columns, keys, values, method, op, request, using_array_manager): # GH8093 & GH26411 override_dtype = None @@ -1920,29 +1917,29 @@ def test_empty_groupby(columns, keys, values, method, op, request, using_array_m raises=ValueError, match="attempt to get arg(min|max) of an empty sequence" ) request.node.add_marker(mark) - elif ( - isinstance(values, Categorical) - and len(keys) == 1 - and not isinstance(columns, list) - ): - mark = pytest.mark.xfail( - raises=TypeError, match="'Categorical' does not implement" - ) - request.node.add_marker(mark) - elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]: - mark = pytest.mark.xfail( - raises=AssertionError, match="(DataFrame|Series) are different" - ) - request.node.add_marker(mark) - elif ( - isinstance(values, Categorical) - and len(keys) == 2 - and op in ["min", "max", "sum"] - ): - mark = pytest.mark.xfail( - raises=AssertionError, match="(DataFrame|Series) are different" - ) - request.node.add_marker(mark) + # elif ( + # isinstance(values, Categorical) + # and len(keys) == 1 + # and not isinstance(columns, list) + # ): + # mark = pytest.mark.xfail( + # 
raises=TypeError, match="'Categorical' does not implement" + # ) + # request.node.add_marker(mark) + # elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]: + # mark = pytest.mark.xfail( + # raises=AssertionError, match="(DataFrame|Series) are different" + # ) + # request.node.add_marker(mark) + # elif ( + # isinstance(values, Categorical) + # and len(keys) == 2 + # and op in ["min", "max", "sum"] + # ): + # mark = pytest.mark.xfail( + # raises=AssertionError, match="(DataFrame|Series) are different" + # ) + # request.node.add_marker(mark) elif isinstance(values, BooleanArray) and op in ["sum", "prod"]: # We expect to get Int64 back for these @@ -1983,7 +1980,7 @@ def get_result(): get_result() return - if op in ["prod", "sum", "skew"]: + if isinstance(values, Categorical): # GH#41291 if op == "skew": @@ -1996,76 +1993,46 @@ def get_result(): return else: # ie. DataFrameGroupBy - if op in ["prod", "sum"]: + if op in ["prod", "sum", "skew"]: # ops that require more than just ordered-ness if df.dtypes[0].kind == "M": # GH#41291 # datetime64 -> prod and sum are invalid - result = get_result() + with pytest.raises(TypeError, match="datetime64 type does not support"): + get_result() - # with numeric_only=True, these are dropped, and we get - # an empty DataFrame back - expected = df.set_index(keys)[[]] - tm.assert_equal(result, expected) + # # with numeric_only=True, these are dropped, and we get + # # an empty DataFrame back + # expected = df.set_index(keys)[[]] + # tm.assert_equal(result, expected) return elif isinstance(values, Categorical): # GH#41291 # Categorical doesn't implement sum or prod - result = get_result() - - # with numeric_only=True, these are dropped, and we get - # an empty DataFrame back - expected = df.set_index(keys)[[]] - if len(keys) != 1 and op == "prod": - # TODO: why just prod and not sum? 
- # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - mi = MultiIndex.from_product([lev, lev], names=["A", "B"]) - expected = DataFrame([], columns=[], index=mi) - - tm.assert_equal(result, expected) + with pytest.raises(TypeError, match="category type does not support"): + get_result() return elif df.dtypes[0] == object: - # FIXME: the test is actually wrong here, xref #41341 result = get_result() # In this case we have list-of-list, will raise TypeError, # and subsequently be dropped as nuisance columns - expected = df.set_index(keys)[[]] + expected = df.set_index(keys)[["C"]] tm.assert_equal(result, expected) return - if ( - op in ["min", "max", "skew"] - and isinstance(values, Categorical) - and len(keys) == 1 - ): - if op in ("min", "max"): - with pytest.raises(TypeError, match="Categorical is not ordered"): - get_result() - return - # Categorical doesn't implement, so with numeric_only=True - # these are dropped and we get an empty DataFrame back - result = get_result() - - # with numeric_only=True, these are dropped, and we get - # an empty DataFrame back - if len(keys) != 1: - # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - mi = MultiIndex.from_product([lev, lev], names=keys) - expected = DataFrame([], columns=[], index=mi) - else: - # all columns are dropped, but we end up with one row - # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - ci = Index(lev, name=keys[0]) - expected = DataFrame([], columns=[], index=ci) - # expected = df.set_index(keys)[columns] - - tm.assert_equal(result, expected) - return + if op in ["min", "max", "skew"] and isinstance(values, Categorical): + msg = "|".join( + ["Categorical is not ordered", "dtype category does not support reduction"] + ) + with pytest.raises(TypeError, match=msg): + print(df) + print(keys) + print(columns) + print(df.dtypes) + get_result() + return result = get_result() 
expected = df.set_index(keys)[columns] From b3956c95a84db1811e56f1edbd7c0742d36b19d0 Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 17 Nov 2022 21:51:38 -0500 Subject: [PATCH 06/10] CLN: Cleanups in groupby due to numeric_only deprecations --- pandas/core/groupby/generic.py | 77 +++++++------------ pandas/core/groupby/groupby.py | 36 +++++---- pandas/tests/groupby/aggregate/test_cython.py | 5 +- pandas/tests/groupby/test_categorical.py | 20 ++--- 4 files changed, 59 insertions(+), 79 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4ac701952e258..7ff00d8f98555 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -863,7 +863,7 @@ def skew( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True, - numeric_only: bool | None = None, + numeric_only: bool = False, **kwargs, ) -> Series: result = self._op_via_apply( @@ -1359,7 +1359,8 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: # we would have to do shape gymnastics for ArrayManager compat try: res_mgr = mgr.grouped_reduce( - arr_func, ignore_failures=numeric_only is lib.no_default + arr_func, + ignore_failures=False, ) except NotImplementedError as err: # For NotImplementedError, args[0] is the error message @@ -1789,84 +1790,64 @@ def nunique(self, dropna: bool = True) -> DataFrame: @doc( _shared_docs["idxmax"], - numeric_only_default="True for axis=0, False for axis=1", + numeric_only_default="False", ) def idxmax( self, axis: Axis = 0, skipna: bool = True, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> DataFrame: axis = DataFrame._get_axis_number(axis) - if numeric_only is lib.no_default: - # Cannot use self._resolve_numeric_only; we must pass None to - # DataFrame.idxmax for backwards compatibility - numeric_only_arg = None if axis == 0 else False - else: - numeric_only_arg = numeric_only def func(df): - with warnings.catch_warnings(): - # Suppress numeric_only warnings 
here, will warn below - warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmax") - res = df._reduce( - nanops.nanargmax, - "argmax", - axis=axis, - skipna=skipna, - numeric_only=numeric_only_arg, - ) - indices = res._values - index = df._get_axis(axis) - result = [index[i] if i >= 0 else np.nan for i in indices] - return df._constructor_sliced(result, index=res.index) + res = df._reduce( + nanops.nanargmax, + "argmax", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + ) + indices = res._values + index = df._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return df._constructor_sliced(result, index=res.index) func.__name__ = "idxmax" result = self._python_apply_general( func, self._obj_with_exclusions, not_indexed_same=True ) - self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only) return result @doc( _shared_docs["idxmin"], - numeric_only_default="True for axis=0, False for axis=1", + numeric_only_default="False", ) def idxmin( self, axis: Axis = 0, skipna: bool = True, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> DataFrame: axis = DataFrame._get_axis_number(axis) - if numeric_only is lib.no_default: - # Cannot use self._resolve_numeric_only; we must pass None to - # DataFrame.idxmin for backwards compatibility - numeric_only_arg = None if axis == 0 else False - else: - numeric_only_arg = numeric_only def func(df): - with warnings.catch_warnings(): - # Suppress numeric_only warnings here, will warn below - warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmin") - res = df._reduce( - nanops.nanargmin, - "argmin", - axis=axis, - skipna=skipna, - numeric_only=numeric_only_arg, - ) - indices = res._values - index = df._get_axis(axis) - result = [index[i] if i >= 0 else np.nan for i in indices] - return df._constructor_sliced(result, index=res.index) + res = df._reduce( + nanops.nanargmin, + "argmin", + axis=axis, + skipna=skipna, + 
numeric_only=numeric_only, + ) + indices = res._values + index = df._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return df._constructor_sliced(result, index=res.index) func.__name__ = "idxmin" result = self._python_apply_general( func, self._obj_with_exclusions, not_indexed_same=True ) - self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only) return result boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1d8271a845f9a..b5e904a7d3882 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -324,8 +324,12 @@ class providing the base-class of operations. Parameters ---------- numeric_only : bool, default {no} - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + min_count : int, default {mc} The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. 
@@ -1654,7 +1658,6 @@ def _agg_general( alt=npfunc, numeric_only=numeric_only, min_count=min_count, - ignore_failures=numeric_only is lib.no_default, ) return result.__finalize__(self.obj, method="groupby") @@ -1705,7 +1708,6 @@ def _cython_agg_general( alt: Callable, numeric_only: bool | lib.NoDefault, min_count: int = -1, - ignore_failures: bool = True, **kwargs, ): # Note: we never get here with how="ohlc" for DataFrameGroupBy; @@ -1749,7 +1751,7 @@ def array_func(values: ArrayLike) -> ArrayLike: # TypeError -> we may have an exception in trying to aggregate # continue and exclude the block - new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures) + new_mgr = data.grouped_reduce(array_func, ignore_failures=False) if not is_ser and len(new_mgr) < orig_len: warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) @@ -2054,8 +2056,11 @@ def mean( Parameters ---------- numeric_only : bool, default True - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. engine : str, default None * ``'cython'`` : Runs the operation through C-extensions from cython. @@ -2123,7 +2128,6 @@ def mean( "mean", alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool), numeric_only=numeric_only, - ignore_failures=numeric_only is lib.no_default, ) return result.__finalize__(self.obj, method="groupby") @@ -2139,8 +2143,11 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): Parameters ---------- numeric_only : bool, default True - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. 
Returns ------- @@ -2153,7 +2160,6 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): "median", alt=lambda x: Series(x).median(numeric_only=numeric_only_bool), numeric_only=numeric_only, - ignore_failures=numeric_only is lib.no_default, ) return result.__finalize__(self.obj, method="groupby") @@ -2287,7 +2293,6 @@ def var( "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only, - ignore_failures=numeric_only is lib.no_default, ddof=ddof, ) @@ -3286,8 +3291,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: is_ser = obj.ndim == 1 mgr = self._get_data_to_aggregate() data = mgr.get_numeric_data() if numeric_only_bool else mgr - ignore_failures = numeric_only_bool - res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures) + res_mgr = data.grouped_reduce(blk_func, ignore_failures=False) if ( numeric_only is lib.no_default @@ -3765,9 +3769,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: if numeric_only_bool: mgr = mgr.get_numeric_data() - res_mgr = mgr.grouped_reduce( - blk_func, ignore_failures=numeric_only is lib.no_default - ) + res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=False) if not is_ser and len(res_mgr.items) != orig_mgr_len: howstr = how.replace("group_", "") diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 0a3845617b32d..b8d2350cf6267 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -101,8 +101,9 @@ def test_cython_agg_nothing_to_agg(): frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) - with tm.assert_produces_warning(FutureWarning): - result = frame[["b"]].groupby(frame["a"]).mean() + with pytest.raises(TypeError, match="Could not convert"): + frame[["b"]].groupby(frame["a"]).mean() + result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True) expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates()) 
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index ca794d4ae5a3e..fb3471c026497 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -103,9 +103,10 @@ def test_basic(): # TODO: split this test gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) - msg = "The default value of numeric_only" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "category type does not support sum operations" + with pytest.raises(TypeError, match=msg): result = gb.sum() + result = gb.sum(numeric_only=True) tm.assert_frame_equal(result, expected) # GH 8623 @@ -857,12 +858,10 @@ def test_preserve_categorical_dtype(): } ) for col in ["C1", "C2"]: - msg = "The default value of numeric_only" - with tm.assert_produces_warning(FutureWarning, match=msg): - result1 = df.groupby(by=col, as_index=False, observed=False).mean() - result2 = ( - df.groupby(by=col, as_index=True, observed=False).mean().reset_index() - ) + result1 = df.groupby(by=col, as_index=False, observed=False).mean(numeric_only=True) + result2 = ( + df.groupby(by=col, as_index=True, observed=False).mean(numeric_only=True).reset_index() + ) expected = exp_full.reindex(columns=result1.columns) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) @@ -1856,10 +1855,7 @@ def test_category_order_reducer( df = df.set_index(keys) args = get_groupby_method_args(reduction_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - msg = "is deprecated and will be removed in a future version" - warn = FutureWarning if reduction_func == "mad" else None - with tm.assert_produces_warning(warn, match=msg): - op_result = getattr(gb, reduction_func)(*args) + op_result = getattr(gb, reduction_func)(*args) if as_index: result 
= op_result.index.get_level_values("a").categories else: From 9e9cfc55e9d4d251858054b24093312ae1f8b10b Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 17 Nov 2022 22:53:43 -0500 Subject: [PATCH 07/10] revert --- pandas/tests/groupby/test_categorical.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index fb3471c026497..9f6943006c5b7 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -103,10 +103,9 @@ def test_basic(): # TODO: split this test gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) - msg = "category type does not support sum operations" - with pytest.raises(TypeError, match=msg): + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): result = gb.sum() - result = gb.sum(numeric_only=True) tm.assert_frame_equal(result, expected) # GH 8623 @@ -858,10 +857,12 @@ def test_preserve_categorical_dtype(): } ) for col in ["C1", "C2"]: - result1 = df.groupby(by=col, as_index=False, observed=False).mean(numeric_only=True) - result2 = ( - df.groupby(by=col, as_index=True, observed=False).mean(numeric_only=True).reset_index() - ) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.groupby(by=col, as_index=False, observed=False).mean() + result2 = ( + df.groupby(by=col, as_index=True, observed=False).mean().reset_index() + ) expected = exp_full.reindex(columns=result1.columns) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) From 2d42aad832bd9af0730a38bd3598c394a5f190ee Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 20 Nov 2022 08:24:55 -0500 Subject: [PATCH 08/10] Remove ops from groupby.String --- 
asv_bench/benchmarks/groupby.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 6f0bb3091133f..58d8ec39120e6 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -671,12 +671,8 @@ class String: ["str", "string[python]"], [ "sum", - "prod", "min", "max", - "mean", - "median", - "var", "first", "last", "any", From 9c2eb849300720d966c767474e6e4db47ac213de Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 20 Nov 2022 17:45:43 -0500 Subject: [PATCH 09/10] fixup --- pandas/tests/groupby/test_categorical.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index fb3471c026497..6fa5d210b8d15 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -103,10 +103,7 @@ def test_basic(): # TODO: split this test gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) - msg = "category type does not support sum operations" - with pytest.raises(TypeError, match=msg): - result = gb.sum() - result = gb.sum(numeric_only=True) + result = gb.sum() tm.assert_frame_equal(result, expected) # GH 8623 @@ -858,9 +855,13 @@ def test_preserve_categorical_dtype(): } ) for col in ["C1", "C2"]: - result1 = df.groupby(by=col, as_index=False, observed=False).mean(numeric_only=True) + result1 = df.groupby(by=col, as_index=False, observed=False).mean( + numeric_only=True + ) result2 = ( - df.groupby(by=col, as_index=True, observed=False).mean(numeric_only=True).reset_index() + df.groupby(by=col, as_index=True, observed=False) + .mean(numeric_only=True) + .reset_index() ) expected = exp_full.reindex(columns=result1.columns) tm.assert_frame_equal(result1, expected) From e64ace28ec8421917c5f3c81bf1fb09752e8dcd5 Mon 
Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 20 Nov 2022 17:49:06 -0500 Subject: [PATCH 10/10] fixup --- pandas/core/groupby/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e3d91e34e74bd..c20fe34a178f5 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1028,7 +1028,6 @@ def agg_series( npvalues = lib.maybe_convert_objects(result, try_float=False) if preserve_dtype: - # numeric_only = True? out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) else: out = npvalues