Skip to content

PERF: extract_array #52351

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,17 +907,12 @@ def _map_values(self, mapper, na_action=None, convert: bool = True):
If the function returns a tuple with more than one element
a MultiIndex will be returned.
"""
arr = extract_array(self, extract_numpy=True, extract_range=True)
arr = self._values

if isinstance(arr, ExtensionArray):
return arr.map(mapper, na_action=na_action)

# Argument 1 to "map_array" has incompatible type
# "Union[IndexOpsMixin, ndarray[Any, Any]]";
# expected "Union[ExtensionArray, ndarray[Any, Any]]
return algorithms.map_array(
arr, mapper, na_action=na_action, convert=convert # type: ignore[arg-type]
)
return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)

@final
def value_counts(
Expand Down
40 changes: 28 additions & 12 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@
ABCDataFrame,
ABCExtensionArray,
ABCIndex,
ABCPandasArray,
ABCRangeIndex,
ABCSeries,
)
from pandas.core.dtypes.missing import isna
Expand Down Expand Up @@ -379,6 +377,21 @@ def array(
return PandasArray._from_sequence(data, dtype=dtype, copy=copy)


_typs = frozenset(
{
"index",
"rangeindex",
"multiindex",
"datetimeindex",
"timedeltaindex",
"periodindex",
"categoricalindex",
"intervalindex",
"series",
}
)


@overload
def extract_array(
obj: Series | Index, extract_numpy: bool = ..., extract_range: bool = ...
Expand Down Expand Up @@ -438,19 +451,22 @@ def extract_array(
>>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True)
array([1, 2, 3])
"""
if isinstance(obj, (ABCIndex, ABCSeries)):
if isinstance(obj, ABCRangeIndex):
typ = getattr(obj, "_typ", None)
if typ in _typs:
# i.e. isinstance(obj, (ABCIndex, ABCSeries))
if typ == "rangeindex":
if extract_range:
return obj._values
# https://github.com/python/mypy/issues/1081
# error: Incompatible return value type (got "RangeIndex", expected
# "Union[T, Union[ExtensionArray, ndarray[Any, Any]]]")
return obj # type: ignore[return-value]
# error: "T" has no attribute "_values"
return obj._values # type: ignore[attr-defined]
return obj

return obj._values
# error: "T" has no attribute "_values"
return obj._values # type: ignore[attr-defined]

elif extract_numpy and isinstance(obj, ABCPandasArray):
return obj.to_numpy()
elif extract_numpy and typ == "npy_extension":
# i.e. isinstance(obj, ABCPandasArray)
# error: "T" has no attribute "to_numpy"
return obj.to_numpy() # type: ignore[attr-defined]

return obj

Expand Down
40 changes: 14 additions & 26 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@
notna,
)

from pandas.core.construction import extract_array

bn = import_optional_dependency("bottleneck", errors="warn")
_BOTTLENECK_INSTALLED = bn is not None
_USE_BOTTLENECK = False
Expand Down Expand Up @@ -308,9 +306,6 @@ def _get_values(
# with scalar fill_value. This guarantee is important for the
# np.where call below
assert is_scalar(fill_value)
# error: Incompatible types in assignment (expression has type "Union[Any,
# Union[ExtensionArray, ndarray]]", variable has type "ndarray")
values = extract_array(values, extract_numpy=True) # type: ignore[assignment]

mask = _maybe_get_mask(values, skipna, mask)

Expand Down Expand Up @@ -522,12 +517,12 @@ def nanany(
--------
>>> from pandas.core import nanops
>>> s = pd.Series([1, 2])
>>> nanops.nanany(s)
>>> nanops.nanany(s.values)
True

>>> from pandas.core import nanops
>>> s = pd.Series([np.nan])
>>> nanops.nanany(s)
>>> nanops.nanany(s.values)
False
"""
if needs_i8_conversion(values.dtype) and values.dtype.kind != "m":
Expand Down Expand Up @@ -577,12 +572,12 @@ def nanall(
--------
>>> from pandas.core import nanops
>>> s = pd.Series([1, 2, np.nan])
>>> nanops.nanall(s)
>>> nanops.nanall(s.values)
True

>>> from pandas.core import nanops
>>> s = pd.Series([1, 0])
>>> nanops.nanall(s)
>>> nanops.nanall(s.values)
False
"""
if needs_i8_conversion(values.dtype) and values.dtype.kind != "m":
Expand Down Expand Up @@ -637,7 +632,7 @@ def nansum(
--------
>>> from pandas.core import nanops
>>> s = pd.Series([1, 2, np.nan])
>>> nanops.nansum(s)
>>> nanops.nansum(s.values)
3.0
"""
values, mask, dtype, dtype_max, _ = _get_values(
Expand Down Expand Up @@ -705,7 +700,7 @@ def nanmean(
--------
>>> from pandas.core import nanops
>>> s = pd.Series([1, 2, np.nan])
>>> nanops.nanmean(s)
>>> nanops.nanmean(s.values)
1.5
"""
values, mask, dtype, dtype_max, _ = _get_values(
Expand Down Expand Up @@ -761,7 +756,7 @@ def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask=
--------
>>> from pandas.core import nanops
>>> s = pd.Series([1, np.nan, 2, 2])
>>> nanops.nanmedian(s)
>>> nanops.nanmedian(s.values)
2.0
"""

Expand Down Expand Up @@ -928,7 +923,7 @@ def nanstd(
--------
>>> from pandas.core import nanops
>>> s = pd.Series([1, np.nan, 2, 3])
>>> nanops.nanstd(s)
>>> nanops.nanstd(s.values)
1.0
"""
if values.dtype == "M8[ns]":
Expand All @@ -944,7 +939,7 @@ def nanstd(
@disallow("M8", "m8")
@bottleneck_switch(ddof=1)
def nanvar(
values,
values: np.ndarray,
*,
axis: AxisInt | None = None,
skipna: bool = True,
Expand Down Expand Up @@ -975,10 +970,9 @@ def nanvar(
--------
>>> from pandas.core import nanops
>>> s = pd.Series([1, np.nan, 2, 3])
>>> nanops.nanvar(s)
>>> nanops.nanvar(s.values)
1.0
"""
values = extract_array(values, extract_numpy=True)
dtype = values.dtype
mask = _maybe_get_mask(values, skipna, mask)
if is_any_int_dtype(dtype):
Expand Down Expand Up @@ -1050,7 +1044,7 @@ def nansem(
--------
>>> from pandas.core import nanops
>>> s = pd.Series([1, np.nan, 2, 3])
>>> nanops.nansem(s)
>>> nanops.nansem(s.values)
0.5773502691896258
"""
# This checks if non-numeric-like data is passed with numeric_only=False
Expand Down Expand Up @@ -1229,12 +1223,9 @@ def nanskew(
--------
>>> from pandas.core import nanops
>>> s = pd.Series([1, np.nan, 1, 2])
>>> nanops.nanskew(s)
>>> nanops.nanskew(s.values)
1.7320508075688787
"""
# error: Incompatible types in assignment (expression has type "Union[Any,
# Union[ExtensionArray, ndarray]]", variable has type "ndarray")
values = extract_array(values, extract_numpy=True) # type: ignore[assignment]
mask = _maybe_get_mask(values, skipna, mask)
if not is_float_dtype(values.dtype):
values = values.astype("f8")
Expand Down Expand Up @@ -1319,12 +1310,9 @@ def nankurt(
--------
>>> from pandas.core import nanops
>>> s = pd.Series([1, np.nan, 1, 3, 2])
>>> nanops.nankurt(s)
>>> nanops.nankurt(s.values)
-1.2892561983471076
"""
# error: Incompatible types in assignment (expression has type "Union[Any,
# Union[ExtensionArray, ndarray]]", variable has type "ndarray")
values = extract_array(values, extract_numpy=True) # type: ignore[assignment]
mask = _maybe_get_mask(values, skipna, mask)
if not is_float_dtype(values.dtype):
values = values.astype("f8")
Expand Down Expand Up @@ -1413,7 +1401,7 @@ def nanprod(
--------
>>> from pandas.core import nanops
>>> s = pd.Series([1, 2, 3, np.nan])
>>> nanops.nanprod(s)
>>> nanops.nanprod(s.values)
6.0
"""
mask = _maybe_get_mask(values, skipna, mask)
Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
date_range,
)
import pandas._testing as tm
from pandas.core import nanops
from pandas.tests.groupby import get_groupby_method_args
from pandas.util import _test_decorators as td

Expand Down Expand Up @@ -365,7 +364,7 @@ def test_cython_median():
labels[::17] = np.nan

result = df.groupby(labels).median()
exp = df.groupby(labels).agg(nanops.nanmedian)
exp = df.groupby(labels).agg(np.nanmedian)
tm.assert_frame_equal(result, exp)

df = DataFrame(np.random.randn(1000, 5))
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/test_nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1238,8 +1238,8 @@ def test_nanops_independent_of_mask_param(operation):
# GH22764
ser = Series([1, 2, np.nan, 3, np.nan, 4])
mask = ser.isna()
median_expected = operation(ser)
median_result = operation(ser, mask=mask)
median_expected = operation(ser._values)
median_result = operation(ser._values, mask=mask)
assert median_expected == median_result


Expand Down