Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ Other enhancements
- :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
- :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
- :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.GroupBy.quantile` (:issue:`46560`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.notable_bug_fixes:
Expand Down
33 changes: 23 additions & 10 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10536,11 +10536,17 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
"""
return self.apply(Series.nunique, axis=axis, dropna=dropna)

@doc(_shared_docs["idxmin"])
def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
@doc(_shared_docs["idxmin"], numeric_only_default="False")
def idxmin(
self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
) -> Series:
axis = self._get_axis_number(axis)
if numeric_only:
data = self._get_numeric_data()
else:
data = self

res = self._reduce(
res = data._reduce(
nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
)
indices = res._values
Expand All @@ -10550,15 +10556,22 @@ def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
# error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
assert isinstance(indices, np.ndarray) # for mypy

index = self._get_axis(axis)
index = data._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return self._constructor_sliced(result, index=self._get_agg_axis(axis))
return data._constructor_sliced(result, index=data._get_agg_axis(axis))

@doc(_shared_docs["idxmax"], numeric_only_default="False")
def idxmax(
self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
) -> Series:

@doc(_shared_docs["idxmax"])
def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
axis = self._get_axis_number(axis)
if numeric_only:
data = self._get_numeric_data()
else:
data = self

res = self._reduce(
res = data._reduce(
nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
)
indices = res._values
Expand All @@ -10568,9 +10581,9 @@ def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
# error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
assert isinstance(indices, np.ndarray) # for mypy

index = self._get_axis(axis)
index = data._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return self._constructor_sliced(result, index=self._get_agg_axis(axis))
return data._constructor_sliced(result, index=data._get_agg_axis(axis))

def _get_agg_axis(self, axis_num: int) -> Index:
"""
Expand Down
22 changes: 15 additions & 7 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1553,10 +1553,14 @@ def nunique(self, dropna: bool = True) -> DataFrame:

return results

@doc(_shared_docs["idxmax"])
def idxmax(self, axis=0, skipna: bool = True):
@doc(
_shared_docs["idxmax"],
numeric_only_default="True for axis=0, False for axis=1",
)
def idxmax(self, axis=0, skipna: bool = True, numeric_only: bool | None = None):
axis = DataFrame._get_axis_number(axis)
numeric_only = None if axis == 0 else False
if numeric_only is None:
numeric_only = None if axis == 0 else False

def func(df):
# NB: here we use numeric_only=None, in DataFrame it is False GH#38217
Expand All @@ -1575,13 +1579,17 @@ def func(df):
func.__name__ = "idxmax"
return self._python_apply_general(func, self._obj_with_exclusions)

@doc(_shared_docs["idxmin"])
def idxmin(self, axis=0, skipna: bool = True):
@doc(
_shared_docs["idxmin"],
numeric_only_default="True for axis=0, False for axis=1",
)
def idxmin(self, axis=0, skipna: bool = True, numeric_only: bool | None = None):
axis = DataFrame._get_axis_number(axis)
numeric_only = None if axis == 0 else False
if numeric_only is None:
numeric_only = None if axis == 0 else False

def func(df):
# NB: here we use numeric_only=None, in DataFrame it is False GH#38217
# NB: here we use numeric_only=None, in DataFrame it is False GH#46560
res = df._reduce(
nanops.nanargmin,
"argmin",
Expand Down
82 changes: 69 additions & 13 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1502,7 +1502,7 @@ def _python_apply_general(
)

@final
def _python_agg_general(self, func, *args, **kwargs):
def _python_agg_general(self, func, *args, raise_on_typeerror=False, **kwargs):
func = com.is_builtin_func(func)
f = lambda x: func(x, *args, **kwargs)

Expand All @@ -1520,6 +1520,8 @@ def _python_agg_general(self, func, *args, **kwargs):
# if this function is invalid for this dtype, we will ignore it.
result = self.grouper.agg_series(obj, f)
except TypeError:
if raise_on_typeerror:
raise
warn_dropping_nuisance_columns_deprecated(type(self), "agg")
continue

Expand Down Expand Up @@ -1593,7 +1595,12 @@ def _agg_py_fallback(

@final
def _cython_agg_general(
self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
self,
how: str,
alt: Callable,
numeric_only: bool,
min_count: int = -1,
ignore_failures: bool = True,
):
# Note: we never get here with how="ohlc" for DataFrameGroupBy;
# that goes through SeriesGroupBy
Expand Down Expand Up @@ -1629,7 +1636,7 @@ def array_func(values: ArrayLike) -> ArrayLike:

# TypeError -> we may have an exception in trying to aggregate
# continue and exclude the block
new_mgr = data.grouped_reduce(array_func, ignore_failures=True)
new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)

if not is_ser and len(new_mgr) < len(data):
warn_dropping_nuisance_columns_deprecated(type(self), how)
Expand Down Expand Up @@ -2041,6 +2048,7 @@ def std(
ddof: int = 1,
engine: str | None = None,
engine_kwargs: dict[str, bool] | None = None,
numeric_only: bool | lib.NoDefault = lib.no_default,
):
"""
Compute standard deviation of groups, excluding missing values.
Expand Down Expand Up @@ -2069,6 +2077,11 @@ def std(

.. versionadded:: 1.4.0

numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
Series or DataFrame
Expand All @@ -2081,8 +2094,9 @@ def std(
else:
return self._get_cythonized_result(
libgroupby.group_var,
needs_counts=True,
cython_dtype=np.dtype(np.float64),
numeric_only=numeric_only,
needs_counts=True,
post_processing=lambda vals, inference: np.sqrt(vals),
ddof=ddof,
)
Expand All @@ -2095,6 +2109,7 @@ def var(
ddof: int = 1,
engine: str | None = None,
engine_kwargs: dict[str, bool] | None = None,
numeric_only: bool | lib.NoDefault = lib.no_default,
):
"""
Compute variance of groups, excluding missing values.
Expand Down Expand Up @@ -2123,6 +2138,11 @@ def var(

.. versionadded:: 1.4.0

numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
Series or DataFrame
Expand All @@ -2133,22 +2153,37 @@ def var(

return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
else:
ignore_failures = numeric_only is lib.no_default
numeric_only = self._resolve_numeric_only(numeric_only)
if ddof == 1:
numeric_only = self._resolve_numeric_only(lib.no_default)
return self._cython_agg_general(
"var",
alt=lambda x: Series(x).var(ddof=ddof),
numeric_only=numeric_only,
ignore_failures=ignore_failures,
)
else:
func = lambda x: x.var(ddof=ddof)
with self._group_selection_context():
return self._python_agg_general(func)
if numeric_only:
nonnumeric_exclusions = frozenset(
self.obj.columns.difference(self.exclusions).difference(
self.obj._get_numeric_data().columns
)
)
else:
nonnumeric_exclusions = frozenset()
with com.temp_setattr(
self, "exclusions", self.exclusions | nonnumeric_exclusions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is the purpose here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks - I was able to remove this hack by setting the appropriate value of raise_on_typeerror.

):
with self._group_selection_context():
return self._python_agg_general(
func, raise_on_typeerror=not ignore_failures
)

@final
@Substitution(name="groupby")
@Appender(_common_see_also)
def sem(self, ddof: int = 1):
def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default):
"""
Compute standard error of the mean of groups, excluding missing values.

Expand All @@ -2159,12 +2194,17 @@ def sem(self, ddof: int = 1):
ddof : int, default 1
Degrees of freedom.

numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
Series or DataFrame
Standard error of the mean of values within each group.
"""
result = self.std(ddof=ddof)
result = self.std(ddof=ddof, numeric_only=numeric_only)
if result.ndim == 1:
result /= np.sqrt(self.count())
else:
Expand Down Expand Up @@ -2968,7 +3008,12 @@ def nth(
return result

@final
def quantile(self, q=0.5, interpolation: str = "linear"):
def quantile(
self,
q=0.5,
interpolation: str = "linear",
numeric_only: bool | lib.NoDefault = lib.no_default,
):
"""
Return group values at the given quantile, a la numpy.percentile.

Expand All @@ -2978,6 +3023,10 @@ def quantile(self, q=0.5, interpolation: str = "linear"):
Value(s) between 0 and 1 providing the quantile(s) to compute.
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
Method to use when the desired quantile falls between two points.
numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
Expand All @@ -3002,6 +3051,7 @@ def quantile(self, q=0.5, interpolation: str = "linear"):
a 2.0
b 3.0
"""
numeric_only_bool = self._resolve_numeric_only(numeric_only)

def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]:
if is_object_dtype(vals):
Expand Down Expand Up @@ -3095,9 +3145,15 @@ def blk_func(values: ArrayLike) -> ArrayLike:
obj = self._obj_with_exclusions
is_ser = obj.ndim == 1
mgr = self._get_data_to_aggregate()

res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
if not is_ser and len(res_mgr.items) != len(mgr.items):
data = mgr.get_numeric_data() if numeric_only_bool else mgr
ignore_failures = numeric_only_bool
res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)

if (
numeric_only is lib.no_default
and not is_ser
and len(res_mgr.items) != len(mgr.items)
):
warn_dropping_nuisance_columns_deprecated(type(self), "quantile")

if len(res_mgr.items) == 0:
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/shared_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,10 @@
skipna : bool, default True
Exclude NA/null values. If an entire row/column is NA, the result
will be NA.
numeric_only : bool, default {numeric_only_default}
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
Expand Down Expand Up @@ -812,6 +816,10 @@
skipna : bool, default True
Exclude NA/null values. If an entire row/column is NA, the result
will be NA.
numeric_only : bool, default {numeric_only_default}
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,6 +897,17 @@ def test_idxmin(self, float_frame, int_frame, skipna, axis):
expected = df.apply(Series.idxmin, axis=axis, skipna=skipna)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("numeric_only", [True, False])
def test_idxmin_numeric_only(self, numeric_only):
    # GH#46560: DataFrame.idxmin should honor the numeric_only argument.
    frame = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
    if not numeric_only:
        # The object column "c" cannot take part in an argmin reduction,
        # so the whole call must raise.
        with pytest.raises(TypeError, match="not allowed for this dtype"):
            frame.idxmin(numeric_only=numeric_only)
    else:
        # Only the numeric columns survive; "c" is dropped from the result.
        expected = Series([2, 1], index=["a", "b"])
        result = frame.idxmin(numeric_only=numeric_only)
        tm.assert_series_equal(result, expected)

def test_idxmin_axis_2(self, float_frame):
frame = float_frame
msg = "No axis named 2 for object type DataFrame"
Expand All @@ -914,6 +925,17 @@ def test_idxmax(self, float_frame, int_frame, skipna, axis):
expected = df.apply(Series.idxmax, axis=axis, skipna=skipna)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("numeric_only", [True, False])
def test_idxmax_numeric_only(self, numeric_only):
    # GH#46560: DataFrame.idxmax should honor the numeric_only argument.
    # With numeric_only=True the object column "c" is dropped; with
    # numeric_only=False the object column makes the reduction raise.
    df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
    if numeric_only:
        result = df.idxmax(numeric_only=numeric_only)
        expected = Series([1, 0], index=["a", "b"])
        tm.assert_series_equal(result, expected)
    else:
        with pytest.raises(TypeError, match="not allowed for this dtype"):
            # Fixed: the original called df.idxmin here (copy/paste slip),
            # leaving the idxmax error path untested.
            df.idxmax(numeric_only=numeric_only)

def test_idxmax_axis_2(self, float_frame):
frame = float_frame
msg = "No axis named 2 for object type DataFrame"
Expand Down
Loading