From 2f8666f7b50b1f9d9a19f30ebb508ac41ad85e32 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 16 Mar 2023 20:15:07 -0700 Subject: [PATCH 1/8] DEPR: support axis=None in DataFrame reductions --- pandas/core/generic.py | 49 +++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 060197e337f41..f5d42a7579be0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11077,6 +11077,8 @@ def _logical_func( name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs ) return res._logical_func(name, func, skipna=skipna, **kwargs) + elif axis is None: + axis = 0 if ( self.ndim > 1 @@ -11181,7 +11183,7 @@ def _stat_function_ddof( self, name: str, func, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11189,7 +11191,19 @@ def _stat_function_ddof( ) -> Series | float: nv.validate_stat_ddof_func((), kwargs, fname=name) validate_bool_kwarg(skipna, "skipna", none_allowed=False) + if axis is None: + if self.ndim > 1: + warnings.warn( + f"The behavior of {type(self).__name__}.{name} with axis=None " + "is deprecated, in a future version this will reduce over both " + "axes and return a scalar. To retain the old behavior, pass " + "axis=0 (or do not pass axis)", + FutureWarning, + stacklevel=find_stack_level(), + ) + axis = self._stat_axis_number + elif axis is lib.no_default: axis = self._stat_axis_number return self._reduce( @@ -11198,7 +11212,7 @@ def _stat_function_ddof( def sem( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11210,7 +11224,7 @@ def sem( def var( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11222,7 +11236,7 @@ def var( def std( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11336,7 +11350,7 @@ def _min_count_stat_function( self, name: str, func, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, numeric_only: bool_t = False, min_count: int = 0, @@ -11352,6 +11366,17 @@ def _min_count_stat_function( validate_bool_kwarg(skipna, "skipna", none_allowed=False) if axis is None: + if self.ndim > 1: + warnings.warn( + f"The behavior of {type(self).__name__}.{name} with axis=None " + "is deprecated, in a future version this will reduce over both " + "axes and return a scalar. To retain the old behavior, pass " + "axis=0 (or do not pass axis)", + FutureWarning, + stacklevel=find_stack_level(), + ) + axis = self._stat_axis_number + elif axis is lib.no_default: axis = self._stat_axis_number return self._reduce( @@ -11365,7 +11390,7 @@ def _min_count_stat_function( def sum( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, numeric_only: bool_t = False, min_count: int = 0, @@ -11377,7 +11402,7 @@ def sum( def prod( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, numeric_only: bool_t = False, min_count: int = 0, @@ -11498,7 +11523,7 @@ def all( ) def sem( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11520,7 +11545,7 @@ def sem( ) def var( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11543,7 +11568,7 @@ def var( ) def std( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11631,7 +11656,7 @@ def cumprod( ) def sum( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, numeric_only: bool_t = False, min_count: int = 0, @@ -11653,7 +11678,7 @@ def sum( ) def prod( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, numeric_only: bool_t = False, min_count: int = 0, From 5b8e0787a04c9b8ba33f9184867c27795445fb7d Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 22 Mar 2023 10:23:01 -0700 Subject: [PATCH 2/8] test, whatsnew --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/tests/frame/test_npfuncs.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index db900ddd1f85b..daaf51066c8e2 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -109,6 +109,7 @@ Deprecations - Deprecated the ``axis`` keyword in :meth:`DataFrame.ewm`, :meth:`Series.ewm`, :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, :meth:`Series.expanding` (:issue:`51778`) - Deprecated 'method', 'limit', and 'fill_axis' keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call ``fillna`` on the alignment results instead (:issue:`51856`) - Deprecated the 'axis' keyword in :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.fillna`, :meth:`.GroupBy.take`, :meth:`.GroupBy.skew`, :meth:`.GroupBy.rank`, :meth:`.GroupBy.cumprod`, :meth:`.GroupBy.cumsum`, :meth:`.GroupBy.cummax`, :meth:`.GroupBy.cummin`, :meth:`.GroupBy.pct_change`, :meth:`GroupBy.diff`, :meth:`.GroupBy.shift`, and :meth:`DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`) +- Deprecated behavior of :class:`DataFrame` reductions ``sum``, ``prod``, ``std``, ``var``, ``sem`` with ``axis=None``, in a future version this will operate over both axes returning a scalar instead of behaving like ``axis=0``; note this also affects numpy functions e.g. ``np.sum(df)`` (:issue:`21597`) - .. --------------------------------------------------------------------------- diff --git a/pandas/tests/frame/test_npfuncs.py b/pandas/tests/frame/test_npfuncs.py index 0b7699e46d720..6873c9c1c0124 100644 --- a/pandas/tests/frame/test_npfuncs.py +++ b/pandas/tests/frame/test_npfuncs.py @@ -26,3 +26,20 @@ def test_np_sqrt(self, float_frame): assert result.columns is float_frame.columns tm.assert_frame_equal(result, float_frame.apply(np.sqrt)) + + def test_sum_deprecated_axis_behavior(self): + # GH#52042 deprecated behavior of df.sum(axis=None), which gets + # called when we do np.sum(df) + + arr = np.random.randn(4, 3) + df = DataFrame(arr) + + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + res = np.sum(df) + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.sum(axis=None) + tm.assert_series_equal(res, expected) From 6a99ba5da2a17dcf01440c920829d53ace9b801d Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 29 Apr 2023 09:07:21 -0700 Subject: [PATCH 3/8] catch in apply(sum) --- doc/source/whatsnew/v0.15.1.rst | 1 + pandas/tests/groupby/test_apply.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst index a1d4f9d14a905..a696307f69f06 100644 --- a/doc/source/whatsnew/v0.15.1.rst +++ b/doc/source/whatsnew/v0.15.1.rst @@ -97,6 +97,7 @@ API changes current behavior: .. ipython:: python + :okwarning: gr.apply(sum) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 0699b7c1369f2..cee6ef0b0e9b8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1053,14 +1053,17 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # Check output when no other methods are called before .apply() grp = df.groupby(by="a") - result = grp.apply(sum) + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): + result = grp.apply(sum) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() grp = df.groupby(by="a") args = get_groupby_method_args(reduction_func, df) _ = getattr(grp, reduction_func)(*args) - result = grp.apply(sum) + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): + result = grp.apply(sum) tm.assert_frame_equal(result, expected) From b1d4ab7314f5cf08fa0c58cee2552f717114996e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 May 2023 07:50:02 -0700 Subject: [PATCH 4/8] Fix defaults --- pandas/core/frame.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 19564afc41b49..98274f6bfbc21 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11004,7 +11004,7 @@ def max( @doc(make_doc("sum", ndim=2)) def sum( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool = True, numeric_only: bool = False, min_count: int = 0, @@ -11016,7 +11016,7 @@ def sum( @doc(make_doc("prod", ndim=2)) def prod( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool = True, numeric_only: bool = False, min_count: int = 0, @@ -11047,7 +11047,7 @@ def median( @doc(make_doc("sem", ndim=2)) def sem( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool = True, ddof: int = 1, numeric_only: bool = False, @@ -11058,7 +11058,7 @@ def sem( @doc(make_doc("var", ndim=2)) def var( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool = True, ddof: int = 1, numeric_only: bool = False, @@ -11069,7 +11069,7 @@ def var( @doc(make_doc("std", ndim=2)) def std( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool = True, ddof: int = 1, numeric_only: bool = False, From adafb33b3b5f8fe5edac63d226113ce8ddc2350a Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 May 2023 10:32:44 -0700 Subject: [PATCH 5/8] catch warnings --- pandas/tests/groupby/aggregate/test_aggregate.py | 6 +++++- pandas/tests/groupby/test_function.py | 6 +++++- pandas/tests/groupby/test_groupby.py | 6 +++++- pandas/tests/window/test_expanding.py | 7 ++++++- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 24b42421b3208..7c58761834aef 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -153,7 +153,11 @@ def test_agg_apply_corner(ts, tsframe): ) tm.assert_frame_equal(grouped.sum(), exp_df) tm.assert_frame_equal(grouped.agg(np.sum), exp_df) - tm.assert_frame_equal(grouped.apply(np.sum), exp_df) + + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = grouped.apply(np.sum) + tm.assert_frame_equal(res, exp_df) def test_agg_grouping_is_list_tuple(ts): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index cf4d8a9c879b6..502b27b726d57 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -73,7 +73,11 @@ def test_builtins_apply(keys, f): gb = df.groupby(keys) fname = f.__name__ - result = gb.apply(f) + + warn = None if f is not sum else FutureWarning + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = gb.apply(f) ngroups = len(df.drop_duplicates(subset=keys)) assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7bda7c575d994..fae5c9dca1f14 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -752,7 +752,11 @@ def test_groupby_as_index_agg(df): gr = df.groupby(ts) gr.nth(0) # invokes set_selection_from_grouper internally - tm.assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum)) + + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = gr.apply(sum) + tm.assert_frame_equal(res, df.groupby(ts).apply(sum)) for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]: gr = df.groupby(ts, as_index=False) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index b4c5edeae949b..af16fd7fe9b7b 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -333,7 +333,12 @@ def test_expanding_func(func, static_comp, frame_or_series): result = getattr(obj, func)() assert isinstance(result, frame_or_series) - expected = static_comp(data[:11]) + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + warn = None + if frame_or_series is DataFrame and static_comp is np.sum: + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + expected = static_comp(data[:11]) if frame_or_series is Series: tm.assert_almost_equal(result[10], expected) else: From 3b2f589aade2f58259837ada91de155867c7f3aa Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 22 May 2023 11:41:36 -0700 Subject: [PATCH 6/8] dont check stacklevel --- pandas/tests/groupby/aggregate/test_aggregate.py | 2 +- pandas/tests/groupby/test_function.py | 2 +- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/window/test_expanding.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index bdbec4eaf2815..3558377907931 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -155,7 +155,7 @@ def test_agg_apply_corner(ts, tsframe): tm.assert_frame_equal(grouped.agg(np.sum), exp_df) msg = "The behavior of DataFrame.sum with axis=None is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): res = grouped.apply(np.sum) tm.assert_frame_equal(res, exp_df) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 502b27b726d57..98fce9d668e44 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -76,7 +76,7 @@ def test_builtins_apply(keys, f): warn = None if f is not sum else FutureWarning msg = "The behavior of DataFrame.sum with axis=None is deprecated" - with tm.assert_produces_warning(warn, match=msg): + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): result = gb.apply(f) ngroups = len(df.drop_duplicates(subset=keys)) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index bbdb2f3152386..4ee75378dce97 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -754,7 +754,7 @@ def test_groupby_as_index_agg(df): gr.nth(0) # invokes set_selection_from_grouper internally msg = "The behavior of DataFrame.sum with axis=None is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): res = gr.apply(sum) tm.assert_frame_equal(res, df.groupby(ts).apply(sum)) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 6f7fe9d31babd..bbcc260aa779e 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -337,7 +337,7 @@ def test_expanding_func(func, static_comp, frame_or_series): warn = None if frame_or_series is DataFrame and static_comp is np.sum: warn = FutureWarning - with tm.assert_produces_warning(warn, match=msg): + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): expected = static_comp(data[:11]) if frame_or_series is Series: tm.assert_almost_equal(result[10], expected) From b335e651146565eb188c8aa8687679c555597c5d Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 May 2023 13:27:12 -0700 Subject: [PATCH 7/8] mypy fixup --- pandas/core/generic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f86f7d662b320..bcfbfa1a2b713 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11425,7 +11425,7 @@ def _stat_function_ddof( def sem( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | None = 0, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11437,7 +11437,7 @@ def sem( def var( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | None = 0, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11449,7 +11449,7 @@ def var( def std( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | None = 0, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11597,7 +11597,7 @@ def _min_count_stat_function( def sum( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, min_count: int = 0, @@ -11609,7 +11609,7 @@ def sum( def prod( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, min_count: int = 0, From ecef60196f4f8f66185a56e8c4888f8cd41974f4 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 May 2023 19:08:37 -0700 Subject: [PATCH 8/8] catch warning --- pandas/tests/groupby/test_groupby.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 636b642495943..0c6661b49d917 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -756,7 +756,9 @@ def test_groupby_as_index_agg(df): msg = "The behavior of DataFrame.sum with axis=None is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): res = gr.apply(sum) - tm.assert_frame_equal(res, df.groupby(ts).apply(sum)) + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): + alt = df.groupby(ts).apply(sum) + tm.assert_frame_equal(res, alt) for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]: gr = df.groupby(ts, as_index=False)