Skip to content

DEPR: Enforce deprecation of dropping columns when numeric_only=False in groupby / resample #49665

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,7 @@ Removal of prior version deprecations/changes
- Enforced deprecation ``numeric_only=None`` (the default) in DataFrame reductions that would silently drop columns that raised; ``numeric_only`` now defaults to ``False`` (:issue:`41480`)
- Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`)
- Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`)
- Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`)
-

.. ---------------------------------------------------------------------------
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1357,7 +1357,13 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:

# We could use `mgr.apply` here and not have to set_axis, but
# we would have to do shape gymnastics for ArrayManager compat
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
try:
res_mgr = mgr.grouped_reduce(
arr_func, ignore_failures=numeric_only is lib.no_default
)
except NotImplementedError as err:
# For NotImplementedError, args[0] is the error message
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this still the case if it was raised with just raise NotImplementedError?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah - no, it's not. I'll fix in a followup. Thanks.

raise TypeError(err.args[0]) from err
res_mgr.set_axis(1, mgr.axes[1])

if len(res_mgr) < orig_mgr_len:
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1655,6 +1655,7 @@ def _agg_general(
alt=npfunc,
numeric_only=numeric_only,
min_count=min_count,
ignore_failures=numeric_only is lib.no_default,
)
return result.__finalize__(self.obj, method="groupby")

Expand Down Expand Up @@ -2123,6 +2124,7 @@ def mean(
"mean",
alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool),
numeric_only=numeric_only,
ignore_failures=numeric_only is lib.no_default,
)
return result.__finalize__(self.obj, method="groupby")

Expand Down Expand Up @@ -2152,6 +2154,7 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
"median",
alt=lambda x: Series(x).median(numeric_only=numeric_only_bool),
numeric_only=numeric_only,
ignore_failures=numeric_only is lib.no_default,
)
return result.__finalize__(self.obj, method="groupby")

Expand Down Expand Up @@ -3756,7 +3759,9 @@ def blk_func(values: ArrayLike) -> ArrayLike:
if numeric_only_bool:
mgr = mgr.get_numeric_data()

res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
res_mgr = mgr.grouped_reduce(
blk_func, ignore_failures=numeric_only is lib.no_default
)

if not is_ser and len(res_mgr.items) != orig_mgr_len:
howstr = how.replace("group_", "")
Expand Down
143 changes: 74 additions & 69 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,14 +163,12 @@ def test_averages(self, df, method):
"int",
"float",
"category_int",
"datetime",
"datetimetz",
"timedelta",
],
)

with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
result = getattr(gb, method)(numeric_only=False)
with pytest.raises(TypeError, match="[Cc]ould not convert"):
getattr(gb, method)(numeric_only=False)
result = getattr(gb, method)()
tm.assert_frame_equal(result.reindex_like(expected), expected)

expected_columns = expected.columns
Expand Down Expand Up @@ -252,30 +250,35 @@ def test_cummin_cummax(self, df, method):
def _check(self, df, method, expected_columns, expected_columns_numeric):
gb = df.groupby("group")

# cummin, cummax dont have numeric_only kwarg, always use False
warn = None
if method in ["cummin", "cummax"]:
# these dont have numeric_only kwarg, always use False
warn = FutureWarning
elif method in ["min", "max"]:
# these have numeric_only kwarg, but default to False
warn = FutureWarning

with tm.assert_produces_warning(
warn, match="Dropping invalid columns", raise_on_extra_warnings=False
):
if method in ("min", "max", "cummin", "cummax"):
# The methods default to numeric_only=False and raise TypeError
msg = "|".join(
[
"Categorical is not ordered",
"function is not implemented for this dtype",
]
)
with pytest.raises(TypeError, match=msg):
getattr(gb, method)()
else:
result = getattr(gb, method)()

tm.assert_index_equal(result.columns, expected_columns_numeric)

# GH#41475 deprecated silently ignoring nuisance columns
warn = None
if len(expected_columns) < len(gb._obj_with_exclusions.columns):
warn = FutureWarning
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
tm.assert_index_equal(result.columns, expected_columns_numeric)

if method not in ("first", "last"):
msg = "|".join(
[
"[Cc]ould not convert",
"Categorical is not ordered",
"category type does not support",
"can't multiply sequence",
"function is not implemented for this dtype",
]
)
with pytest.raises(TypeError, match=msg):
getattr(gb, method)(numeric_only=False)
else:
result = getattr(gb, method)(numeric_only=False)

tm.assert_index_equal(result.columns, expected_columns)
tm.assert_index_equal(result.columns, expected_columns)


class TestGroupByNonCythonPaths:
Expand Down Expand Up @@ -1323,45 +1326,45 @@ def test_groupby_sum_timedelta_with_nat():


@pytest.mark.parametrize(
"kernel, numeric_only_default, drops_nuisance, has_arg",
"kernel, numeric_only_default, has_arg",
[
("all", False, False, False),
("any", False, False, False),
("bfill", False, False, False),
("corr", True, False, True),
("corrwith", True, False, True),
("cov", True, False, True),
("cummax", False, True, True),
("cummin", False, True, True),
("cumprod", True, True, True),
("cumsum", True, True, True),
("diff", False, False, False),
("ffill", False, False, False),
("fillna", False, False, False),
("first", False, False, True),
("idxmax", True, False, True),
("idxmin", True, False, True),
("last", False, False, True),
("max", False, True, True),
("mean", True, True, True),
("median", True, True, True),
("min", False, True, True),
("nth", False, False, False),
("nunique", False, False, False),
("pct_change", False, False, False),
("prod", True, True, True),
("quantile", True, False, True),
("sem", True, True, True),
("skew", True, False, True),
("std", True, True, True),
("sum", True, True, True),
("var", True, False, True),
("all", False, False),
("any", False, False),
("bfill", False, False),
("corr", True, True),
("corrwith", True, True),
("cov", True, True),
("cummax", False, True),
("cummin", False, True),
("cumprod", True, True),
("cumsum", True, True),
("diff", False, False),
("ffill", False, False),
("fillna", False, False),
("first", False, True),
("idxmax", True, True),
("idxmin", True, True),
("last", False, True),
("max", False, True),
("mean", True, True),
("median", True, True),
("min", False, True),
("nth", False, False),
("nunique", False, False),
("pct_change", False, False),
("prod", True, True),
("quantile", True, True),
("sem", True, True),
("skew", True, True),
("std", True, True),
("sum", True, True),
("var", True, True),
],
)
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_deprecate_numeric_only(
kernel, numeric_only_default, drops_nuisance, has_arg, numeric_only, keys
kernel, numeric_only_default, has_arg, numeric_only, keys
):
# GH#46072
# drops_nuisance: Whether the op drops nuisance columns even when numeric_only=False
Expand All @@ -1380,10 +1383,9 @@ def test_deprecate_numeric_only(
# Cases where b does not appear in the result
numeric_only is True
or (numeric_only is lib.no_default and numeric_only_default)
or drops_nuisance
)
):
if numeric_only is True or (not numeric_only_default and not drops_nuisance):
if numeric_only is True or not numeric_only_default:
warn = None
else:
warn = FutureWarning
Expand All @@ -1408,14 +1410,17 @@ def test_deprecate_numeric_only(
assert "b" in result.columns
elif has_arg or kernel in ("idxmax", "idxmin"):
assert numeric_only is not True
assert not drops_nuisance
# kernels that are successful on any dtype were above; this will fail
msg = (
"(not allowed for this dtype"
"|must be a string or a number"
"|cannot be performed against 'object' dtypes"
"|must be a string or a real number"
"|unsupported operand type)"
msg = "|".join(
[
"not allowed for this dtype",
"must be a string or a number",
"cannot be performed against 'object' dtypes",
"must be a string or a real number",
"unsupported operand type",
"not supported between instances of",
"function is not implemented for this dtype",
]
)
with pytest.raises(TypeError, match=msg):
method(*args, **kwargs)
Expand Down
13 changes: 9 additions & 4 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -915,11 +915,13 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):

grouped = df.groupby("A")

if agg_function in ("var", "std", "sem") and numeric_only is False:
no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
if agg_function in no_drop_nuisance and numeric_only is False:
# Added numeric_only as part of GH#46560; these do not drop nuisance
# columns when numeric_only is False
klass = TypeError if agg_function == "var" else ValueError
with pytest.raises(klass, match="could not convert string to float"):
klass = ValueError if agg_function in ("std", "sem") else TypeError
msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"])
with pytest.raises(klass, match=msg):
getattr(grouped, agg_function)(numeric_only=numeric_only)
else:
if numeric_only is lib.no_default:
Expand Down Expand Up @@ -2049,10 +2051,13 @@ def get_result():
and isinstance(values, Categorical)
and len(keys) == 1
):
if op in ("min", "max"):
with pytest.raises(TypeError, match="Categorical is not ordered"):
get_result()
return
# Categorical doesn't implement, so with numeric_only=True
# these are dropped and we get an empty DataFrame back
result = get_result()
expected = df.set_index(keys)[[]]

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/groupby/test_min_max.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,17 @@ def test_max_min_object_multiple_columns(using_array_manager):

gb = df.groupby("A")

with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
result = gb.max(numeric_only=False)
with pytest.raises(TypeError, match="not supported between instances"):
gb.max(numeric_only=False)
result = gb[["C"]].max()
# "max" is valid for column "C" but not for "B"
ei = Index([1, 2, 3], name="A")
expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
result = gb.min(numeric_only=False)
with pytest.raises(TypeError, match="not supported between instances"):
gb.max(numeric_only=False)
result = gb[["C"]].min()
# "min" is valid for column "C" but not for "B"
ei = Index([1, 2, 3], name="A")
expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
Expand Down
15 changes: 7 additions & 8 deletions pandas/tests/resample/test_resample_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,8 +821,8 @@ def test_end_and_end_day_origin(
("sum", False, {"cat": ["cat_1cat_2"], "num": [25]}),
("sum", lib.no_default, {"num": [25]}),
("prod", True, {"num": [100]}),
("prod", False, {"num": [100]}),
("prod", lib.no_default, {"num": [100]}),
("prod", False, "can't multiply sequence"),
("prod", lib.no_default, "can't multiply sequence"),
("min", True, {"num": [5]}),
("min", False, {"cat": ["cat_1"], "num": [5]}),
("min", lib.no_default, {"cat": ["cat_1"], "num": [5]}),
Expand All @@ -836,10 +836,10 @@ def test_end_and_end_day_origin(
("last", False, {"cat": ["cat_2"], "num": [20]}),
("last", lib.no_default, {"cat": ["cat_2"], "num": [20]}),
("mean", True, {"num": [12.5]}),
("mean", False, {"num": [12.5]}),
("mean", False, "Could not convert"),
("mean", lib.no_default, {"num": [12.5]}),
("median", True, {"num": [12.5]}),
("median", False, {"num": [12.5]}),
("median", False, "could not convert"),
("median", lib.no_default, {"num": [12.5]}),
("std", True, {"num": [10.606601717798213]}),
("std", False, "could not convert string to float"),
Expand Down Expand Up @@ -876,15 +876,14 @@ def test_frame_downsample_method(method, numeric_only, expected_data):
msg = (
f"default value of numeric_only in DataFrameGroupBy.{method} is deprecated"
)
elif method in ("prod", "mean", "median") and numeric_only is not True:
warn = FutureWarning
msg = f"Dropping invalid columns in DataFrameGroupBy.{method} is deprecated"
else:
warn = None
msg = ""
with tm.assert_produces_warning(warn, match=msg):
if isinstance(expected_data, str):
klass = TypeError if method == "var" else ValueError
klass = (
TypeError if method in ("var", "mean", "median", "prod") else ValueError
)
with pytest.raises(klass, match=expected_data):
_ = func(**kwargs)
else:
Expand Down