From 0d725e87f4c7d81eb84e6d0df2849222ff1760ab Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 6 Oct 2020 06:46:31 -0500 Subject: [PATCH 01/10] Implement DataFrame.__array_ufunc__ For some cases, this will preserve extension types of arrays by calling the ufunc blockwise. ```python In [1]: import pandas as pd; import numpy as np In [2]: df = pd.DataFrame({"A": pd.array([0, 1], dtype="Sparse")}) In [3]: np.sin(df).dtypes Out[3]: A Sparse[float64, nan] dtype: object ``` We don't currently handle the multi-input case well (aside from ufuncs that are implemented as dunder ops like `np.add`). For these, we fall back to the old implementation of converting to an ndarray. --- doc/source/whatsnew/v1.2.0.rst | 3 + pandas/core/frame.py | 4 ++ pandas/core/generic.py | 100 +++++++++++++++++++++++++++- pandas/core/ops/__init__.py | 5 +- pandas/core/ops/common.py | 53 ++++++++++++++- pandas/core/series.py | 75 --------------------- pandas/tests/frame/test_ufunc.py | 111 +++++++++++++++++++++++++++++++ 7 files changed, 272 insertions(+), 79 deletions(-) create mode 100644 pandas/tests/frame/test_ufunc.py diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 5be9155b3ff0b..3dd209d6cdd3f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -189,6 +189,8 @@ Other enhancements - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) +- Calling a NumPy ufunc on a ``DataFrame`` with extension 
types now preserves the extension types when possible (:issue:`23743`). +- Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`). - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) .. _whatsnew_120.api_breaking.python: @@ -289,6 +291,7 @@ Deprecations - Deprecated :meth:`Index.is_all_dates` (:issue:`27744`) - Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`) - :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`) +- Using "outer" ufuncs on DataFrames to return 4d ndarray is now deprecated. Convert to an ndarray first (:issue:`23743`) - :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8c1ed9025f2c8..e4ad863fe5f4f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -556,6 +556,10 @@ def __init__( NDFrame.__init__(self, mgr) + # ---------------------------------------------------------------------- + # Array interface + _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) + # ---------------------------------------------------------------------- @property diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 19801025b7672..e8fbe62d4d766 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -87,11 +87,11 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas.core import missing, nanops +from pandas.core import missing, nanops, ops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com -from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.construction import create_series_with_explicit_dtype, extract_array from pandas.core.flags import Flags from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index @@ -1912,6 +1912,102 @@ def __array_wrap__( self, method="__array_wrap__" ) + @ops.defer_or_dispatch_ufunc + def __array_ufunc__( + self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any + ): + # XXX: check outer + # align all the inputs. + types = tuple(type(x) for x in inputs) + alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)] + + if len(alignable) > 1: + # This triggers alignment. + # At the moment, there aren't any ufuncs with more than two inputs + # so this ends up just being x1.index | x2.index, but we write + # it to handle *args. + + if len(set(types)) > 1: + # We currently don't handle ufunc(DataFrame, Series) + # well. 
Previously this raised an internal ValueError. We might + # support it someday, so raise a NotImplementedError. + raise NotImplementedError( + "Cannot apply ufunc {} to mixed DataFrame and Series " + "inputs.".format(ufunc) + ) + axes = self.axes + for obj in alignable[1:]: + # this relies on the fact that we aren't handling mixed + # series / frame ufuncs. + for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)): + axes[i] = ax1 | ax2 + + reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes)) + inputs = tuple( + x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x + for x, t in zip(inputs, types) + ) + else: + reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes)) + + if self.ndim == 1: + names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] + name = names[0] if len(set(names)) == 1 else None + reconstruct_kwargs = {"name": name} + else: + reconstruct_kwargs = {} + + def reconstruct(result): + if lib.is_scalar(result): + return result + if result.ndim != self.ndim: + if method == "outer": + if self.ndim == 2: + # we already deprecated for Series + msg = ( + "outer method for ufunc {} is not implemented on " + "pandas objects. Returning an ndarray, but in the " + "future this will raise a 'NotImplementedError'. " + "Consider explicitly converting the DataFrame " + "to an array with '.to_numpy()' first." + ) + warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=4) + return result + raise NotImplementedError + return result + if isinstance(result, BlockManager): + # we went through BlockManager.apply + return self._constructor(result, **reconstruct_kwargs, copy=False) + else: + # we converted an array, lost our axes + return self._constructor( + result, **reconstruct_axes, **reconstruct_kwargs, copy=False + ) + + if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1): + # Just give up on preserving types in the complex case. + # In theory we could preserve them for them. 
+ # * nout>1 is doable if BlockManager.apply took nout and + # returned a Tuple[BlockManager]. + # * len(inputs) > 1 is doable when we know that we have + # aligned blocks / dtypes. + inputs = tuple(np.asarray(x) for x in inputs) + result = getattr(ufunc, method)(*inputs) + elif self.ndim == 1: + # ufunc(series, ...) + inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + else: + # ufunc(dataframe) + mgr = inputs[0]._mgr + result = mgr.apply(getattr(ufunc, method)) + + if ufunc.nout > 1: + result = tuple(reconstruct(x) for x in result) + else: + result = reconstruct(result) + return result + # ideally we would define this to avoid the getattr checks, but # is slower # @property diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 84319b69d9a35..80e76e3167df3 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -27,7 +27,10 @@ get_array_op, logical_op, ) -from pandas.core.ops.common import unpack_zerodim_and_defer +from pandas.core.ops.common import ( # noqa:F401 + defer_or_dispatch_ufunc, + unpack_zerodim_and_defer, +) from pandas.core.ops.docstrings import ( _arith_doc_FRAME, _flex_comp_doc_FRAME, diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py index 515a0a5198d74..abcaac1e67d69 100644 --- a/pandas/core/ops/common.py +++ b/pandas/core/ops/common.py @@ -2,9 +2,12 @@ Boilerplate functions used in defining binary operations. 
""" from functools import wraps -from typing import Callable +from typing import Any, Callable + +import numpy as np from pandas._libs.lib import item_from_zerodim +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op from pandas._typing import F from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries @@ -65,3 +68,51 @@ def new_method(self, other): return method(self, other) return new_method + + +def defer_or_dispatch_ufunc(meth): + """ + Boilerplate for pandas conventions in arithmetic and comparison methods. + + Ensure method returns NotImplemented when operating against "senior" + classes. Ensure zero-dimensional ndarrays are always unpacked. + + Parameters + ---------- + method : binary method + + Returns + ------- + method + """ + + @wraps(meth) + def new_method(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): + cls = type(self) + + # for binary ops, use our custom dunder methods + result = maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + # Determine if we should defer. 
+ no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) + + for item in inputs: + higher_priority = ( + hasattr(item, "__array_priority__") + and item.__array_priority__ > self.__array_priority__ + ) + has_array_ufunc = ( + hasattr(item, "__array_ufunc__") + and type(item).__array_ufunc__ not in no_defer + and not isinstance(item, self._HANDLED_TYPES) + ) + if higher_priority or has_array_ufunc: + return NotImplemented + + return meth(self, ufunc, method, *inputs, **kwargs) + + return new_method diff --git a/pandas/core/series.py b/pandas/core/series.py index 5cc163807fac6..4d1478a9498df 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -683,81 +683,6 @@ def view(self, dtype=None) -> "Series": # NDArray Compat _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) - def __array_ufunc__( - self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any - ): - # TODO: handle DataFrame - cls = type(self) - - # for binary ops, use our custom dunder methods - result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs - ) - if result is not NotImplemented: - return result - - # Determine if we should defer. - no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) - - for item in inputs: - higher_priority = ( - hasattr(item, "__array_priority__") - and item.__array_priority__ > self.__array_priority__ - ) - has_array_ufunc = ( - hasattr(item, "__array_ufunc__") - and type(item).__array_ufunc__ not in no_defer - and not isinstance(item, self._HANDLED_TYPES) - ) - if higher_priority or has_array_ufunc: - return NotImplemented - - # align all the inputs. - names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] - types = tuple(type(x) for x in inputs) - # TODO: dataframe - alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)] - - if len(alignable) > 1: - # This triggers alignment. 
- # At the moment, there aren't any ufuncs with more than two inputs - # so this ends up just being x1.index | x2.index, but we write - # it to handle *args. - index = alignable[0].index - for s in alignable[1:]: - index |= s.index - inputs = tuple( - x.reindex(index) if issubclass(t, Series) else x - for x, t in zip(inputs, types) - ) - else: - index = self.index - - inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) - result = getattr(ufunc, method)(*inputs, **kwargs) - - name = names[0] if len(set(names)) == 1 else None - - def construct_return(result): - if lib.is_scalar(result): - return result - elif result.ndim > 1: - # e.g. np.subtract.outer - if method == "outer": - # GH#27198 - raise NotImplementedError - return result - return self._constructor(result, index=index, name=name, copy=False) - - if type(result) is tuple: - # multiple return values - return tuple(construct_return(x) for x in result) - elif method == "at": - # no return value - return None - else: - return construct_return(result) - def __array__(self, dtype=None) -> np.ndarray: """ Return the values as a NumPy array. 
diff --git a/pandas/tests/frame/test_ufunc.py b/pandas/tests/frame/test_ufunc.py new file mode 100644 index 0000000000000..159fe23f2b090 --- /dev/null +++ b/pandas/tests/frame/test_ufunc.py @@ -0,0 +1,111 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +dtypes = [ + "int64", + "Int64", + dict(A="int64", B="Int64"), +] + + +@pytest.mark.parametrize("dtype", dtypes) +def test_unary_unary(dtype): + # unary input, unary output + values = np.array([[-1, -1], [1, 1]], dtype="int64") + df = pd.DataFrame(values, columns=["A", "B"], index=["a", "b"]).astype(dtype=dtype) + result = np.positive(df) + expected = pd.DataFrame( + np.positive(values), index=df.index, columns=df.columns + ).astype(dtype) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", dtypes) +def test_unary_binary(dtype): + # unary input, binary output + if pd.api.types.is_extension_array_dtype(dtype) or isinstance(dtype, dict): + pytest.xfail(reason="Extension / mixed with multiple outuputs not implemented.") + + values = np.array([[-1, -1], [1, 1]], dtype="int64") + df = pd.DataFrame(values, columns=["A", "B"], index=["a", "b"]).astype(dtype=dtype) + result_pandas = np.modf(df) + assert isinstance(result_pandas, tuple) + assert len(result_pandas) == 2 + expected_numpy = np.modf(values) + + for result, b in zip(result_pandas, expected_numpy): + expected = pd.DataFrame(b, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", dtypes) +def test_binary_input_dispatch_binop(dtype): + # binop ufuncs are dispatched to our dunder methods. 
+ values = np.array([[-1, -1], [1, 1]], dtype="int64") + df = pd.DataFrame(values, columns=["A", "B"], index=["a", "b"]).astype(dtype=dtype) + result = np.add(df, df) + expected = pd.DataFrame( + np.add(values, values), index=df.index, columns=df.columns + ).astype(dtype) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype_a", dtypes) +@pytest.mark.parametrize("dtype_b", dtypes) +def test_binary_input_aligns_columns(dtype_a, dtype_b): + if ( + pd.api.types.is_extension_array_dtype(dtype_a) + or isinstance(dtype_a, dict) + or pd.api.types.is_extension_array_dtype(dtype_b) + or isinstance(dtype_b, dict) + ): + pytest.xfail(reason="Extension / mixed with multiple inputs not implemented.") + + df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}).astype(dtype_a) + + if isinstance(dtype_a, dict) and isinstance(dtype_b, dict): + dtype_b["C"] = dtype_b.pop("B") + + df2 = pd.DataFrame({"A": [1, 2], "C": [3, 4]}).astype(dtype_b) + result = np.logaddexp(df1, df2) + expected = np.logaddexp( + np.array([[1, 3, np.nan], [2, 4, np.nan]]), + np.array([[1, np.nan, 3], [2, np.nan, 4]]), + ) + expected = pd.DataFrame(expected, index=[0, 1], columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", dtypes) +def test_binary_input_aligns_index(dtype): + if pd.api.types.is_extension_array_dtype(dtype) or isinstance(dtype, dict): + pytest.xfail(reason="Extension / mixed with multiple inputs not implemented.") + df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).astype(dtype) + df2 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "c"]).astype(dtype) + result = np.logaddexp(df1, df2) + expected = np.logaddexp( + np.array([[1, 3], [3, 4], [np.nan, np.nan]]), + np.array([[1, 3], [np.nan, np.nan], [3, 4]]), + ) + # TODO(FloatArray): this will be Float64Dtype. 
+ expected = pd.DataFrame(expected, index=["a", "b", "c"], columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +def test_binary_frame_series_raises(): + # We don't currently implement + df = pd.DataFrame({"A": [1, 2]}) + with pytest.raises(NotImplementedError, match="logaddexp"): + np.logaddexp(df, df["A"]) + + with pytest.raises(NotImplementedError, match="logaddexp"): + np.logaddexp(df["A"], df) + + +def test_frame_outer_deprecated(): + df = pd.DataFrame({"A": [1, 2]}) + with tm.assert_produces_warning(FutureWarning): + np.subtract.outer(df, df) From c4c1470fae439197ba6c9ee66b77cc1eed739042 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Oct 2020 13:43:07 -0500 Subject: [PATCH 02/10] remove unnecessary decorator --- pandas/core/generic.py | 29 +++++++++++++++++--- pandas/core/ops/__init__.py | 5 +--- pandas/core/ops/common.py | 53 +------------------------------------ 3 files changed, 28 insertions(+), 59 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e8fbe62d4d766..c070377fbeb1c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1912,11 +1912,34 @@ def __array_wrap__( self, method="__array_wrap__" ) - @ops.defer_or_dispatch_ufunc def __array_ufunc__( self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any ): - # XXX: check outer + cls = type(self) + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + # Determine if we should defer. 
+ no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) + + for item in inputs: + higher_priority = ( + hasattr(item, "__array_priority__") + and item.__array_priority__ > self.__array_priority__ + ) + has_array_ufunc = ( + hasattr(item, "__array_ufunc__") + and type(item).__array_ufunc__ not in no_defer + and not isinstance(item, self._HANDLED_TYPES) + ) + if higher_priority or has_array_ufunc: + return NotImplemented + # align all the inputs. types = tuple(type(x) for x in inputs) alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)] @@ -1971,7 +1994,7 @@ def reconstruct(result): "Consider explicitly converting the DataFrame " "to an array with '.to_numpy()' first." ) - warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=4) + warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=3) return result raise NotImplementedError return result diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 80e76e3167df3..84319b69d9a35 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -27,10 +27,7 @@ get_array_op, logical_op, ) -from pandas.core.ops.common import ( # noqa:F401 - defer_or_dispatch_ufunc, - unpack_zerodim_and_defer, -) +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.docstrings import ( _arith_doc_FRAME, _flex_comp_doc_FRAME, diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py index abcaac1e67d69..515a0a5198d74 100644 --- a/pandas/core/ops/common.py +++ b/pandas/core/ops/common.py @@ -2,12 +2,9 @@ Boilerplate functions used in defining binary operations. 
""" from functools import wraps -from typing import Any, Callable - -import numpy as np +from typing import Callable from pandas._libs.lib import item_from_zerodim -from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op from pandas._typing import F from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries @@ -68,51 +65,3 @@ def new_method(self, other): return method(self, other) return new_method - - -def defer_or_dispatch_ufunc(meth): - """ - Boilerplate for pandas conventions in arithmetic and comparison methods. - - Ensure method returns NotImplemented when operating against "senior" - classes. Ensure zero-dimensional ndarrays are always unpacked. - - Parameters - ---------- - method : binary method - - Returns - ------- - method - """ - - @wraps(meth) - def new_method(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): - cls = type(self) - - # for binary ops, use our custom dunder methods - result = maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs - ) - if result is not NotImplemented: - return result - - # Determine if we should defer. 
- no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) - - for item in inputs: - higher_priority = ( - hasattr(item, "__array_priority__") - and item.__array_priority__ > self.__array_priority__ - ) - has_array_ufunc = ( - hasattr(item, "__array_ufunc__") - and type(item).__array_ufunc__ not in no_defer - and not isinstance(item, self._HANDLED_TYPES) - ) - if higher_priority or has_array_ufunc: - return NotImplemented - - return meth(self, ufunc, method, *inputs, **kwargs) - - return new_method From 4fcb1a4dd3659fa22ac6d5b04cc372f07689fc61 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Oct 2020 17:03:43 -0500 Subject: [PATCH 03/10] Fixup --- pandas/core/frame.py | 5 +---- pandas/core/generic.py | 15 +++++++++++---- pandas/core/series.py | 2 +- pandas/tests/generic/test_finalize.py | 2 +- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e4ad863fe5f4f..452678b23c593 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -410,6 +410,7 @@ class DataFrame(NDFrame): _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set _typ = "dataframe" + _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) @property def _constructor(self) -> Type[DataFrame]: @@ -556,10 +557,6 @@ def __init__( NDFrame.__init__(self, mgr) - # ---------------------------------------------------------------------- - # Array interface - _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) - # ---------------------------------------------------------------------- @property diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c070377fbeb1c..817feecc09cc7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2000,14 +2000,21 @@ def reconstruct(result): return result if isinstance(result, BlockManager): # we went through BlockManager.apply - return self._constructor(result, **reconstruct_kwargs, copy=False) + result = self._constructor(result, 
**reconstruct_kwargs, copy=False) else: # we converted an array, lost our axes - return self._constructor( + result = self._constructor( result, **reconstruct_axes, **reconstruct_kwargs, copy=False ) + # TODO: When we support multiple values in __finalize__, this + # should pass alignable to `__fianlize__` instead of self. + # Then `np.add(a, b)` would consider attrs from both a and b + # when a and b are NDFrames. + return result.__finalize__(self) - if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1): + if self.ndim > 1 and ( + len(inputs) > 1 or ufunc.nout > 1 + ): # type: ignore[attr-defined] # Just give up on preserving types in the complex case. # In theory we could preserve them for them. # * nout>1 is doable if BlockManager.apply took nout and @@ -2025,7 +2032,7 @@ def reconstruct(result): mgr = inputs[0]._mgr result = mgr.apply(getattr(ufunc, method)) - if ufunc.nout > 1: + if ufunc.nout > 1: # type: ignore[attr-defined] result = tuple(reconstruct(x) for x in result) else: result = reconstruct(result) diff --git a/pandas/core/series.py b/pandas/core/series.py index 4d1478a9498df..90b2c9ec13359 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -176,6 +176,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): """ _typ = "series" + _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) _name: Label _metadata: List[str] = ["name"] @@ -681,7 +682,6 @@ def view(self, dtype=None) -> "Series": # ---------------------------------------------------------------------- # NDArray Compat - _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) def __array__(self, dtype=None) -> np.ndarray: """ diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 6692102bc9008..0df0c45d1b222 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -330,7 +330,7 @@ (pd.DataFrame, frame_data, operator.inv), (pd.Series, [1], operator.inv), (pd.DataFrame, frame_data, abs), - 
pytest.param((pd.Series, [1], abs), marks=not_implemented_mark), + (pd.Series, [1], abs), pytest.param((pd.DataFrame, frame_data, round), marks=not_implemented_mark), (pd.Series, [1], round), (pd.DataFrame, frame_data, operator.methodcaller("take", [0, 0])), From 971659e91ac082708520157b0155d5d8281fa9dc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Oct 2020 17:10:56 -0500 Subject: [PATCH 04/10] fixup finalize --- pandas/core/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 817feecc09cc7..f9aa4fa21dfed 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2010,7 +2010,9 @@ def reconstruct(result): # should pass alignable to `__fianlize__` instead of self. # Then `np.add(a, b)` would consider attrs from both a and b # when a and b are NDFrames. - return result.__finalize__(self) + if len(alignable) == 1: + result = result.__finalize__(self) + return result if self.ndim > 1 and ( len(inputs) > 1 or ufunc.nout > 1 From 6bd73dcb4055820c587f1cc17f789012eab79db3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Oct 2020 17:11:46 -0500 Subject: [PATCH 05/10] whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3dd209d6cdd3f..60dbc914affd6 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -474,6 +474,7 @@ Other - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) - Fixed metadata propagation in the :class:`Series.dt` accessor (:issue:`28283`) +- Fixed metadata propagation in :meth:`Series.abs` and ufuncs called on Series (:issue:`28283`) - Bug in 
:meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`) .. --------------------------------------------------------------------------- From 0afdf493e4f411e1023db32b5480be4e6fa6d92d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Oct 2020 18:14:28 -0500 Subject: [PATCH 06/10] fixup --- pandas/core/generic.py | 4 ++-- pandas/tests/generic/test_duplicate_labels.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f9aa4fa21dfed..47ecefde660fc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2015,8 +2015,8 @@ def reconstruct(result): return result if self.ndim > 1 and ( - len(inputs) > 1 or ufunc.nout > 1 - ): # type: ignore[attr-defined] + len(inputs) > 1 or ufunc.nout > 1 # type: ignore[attr-defined] + ): # Just give up on preserving types in the complex case. # In theory we could preserve them for them. # * nout>1 is doable if BlockManager.apply took nout and diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py index 97468e1f10a8b..8077a65ed07d8 100644 --- a/pandas/tests/generic/test_duplicate_labels.py +++ b/pandas/tests/generic/test_duplicate_labels.py @@ -37,8 +37,8 @@ def test_construction_ok(self, cls, data): operator.methodcaller("add", 1), operator.methodcaller("rename", str.upper), operator.methodcaller("rename", "name"), - pytest.param(operator.methodcaller("abs"), marks=not_implemented), - # TODO: test np.abs + operator.methodcaller("abs"), + np.abs, ], ) def test_preserved_series(self, func): From 2260c836455247a9abc5c09cf3edd13b7d96af78 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Oct 2020 08:59:47 -0500 Subject: [PATCH 07/10] fixup --- pandas/tests/frame/test_ufunc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/test_ufunc.py b/pandas/tests/frame/test_ufunc.py index 
159fe23f2b090..7bc9aa29af3b4 100644 --- a/pandas/tests/frame/test_ufunc.py +++ b/pandas/tests/frame/test_ufunc.py @@ -70,8 +70,8 @@ def test_binary_input_aligns_columns(dtype_a, dtype_b): dtype_b["C"] = dtype_b.pop("B") df2 = pd.DataFrame({"A": [1, 2], "C": [3, 4]}).astype(dtype_b) - result = np.logaddexp(df1, df2) - expected = np.logaddexp( + result = np.heaviside(df1, df2) + expected = np.heaviside( np.array([[1, 3, np.nan], [2, 4, np.nan]]), np.array([[1, np.nan, 3], [2, np.nan, 4]]), ) @@ -85,8 +85,8 @@ def test_binary_input_aligns_index(dtype): pytest.xfail(reason="Extension / mixed with multiple inputs not implemented.") df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).astype(dtype) df2 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "c"]).astype(dtype) - result = np.logaddexp(df1, df2) - expected = np.logaddexp( + result = np.heaviside(df1, df2) + expected = np.heaviside( np.array([[1, 3], [3, 4], [np.nan, np.nan]]), np.array([[1, 3], [np.nan, np.nan], [3, 4]]), ) From 919ebb55a0c5e3d9082883886d2232627cc46f6a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 18 Oct 2020 14:06:32 -0500 Subject: [PATCH 08/10] Move to arraylike --- pandas/core/arraylike.py | 137 ++++++++++++++++++++++++++++++++++++++- pandas/core/generic.py | 129 +----------------------------------- 2 files changed, 139 insertions(+), 127 deletions(-) diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index da366c9abf0a4..b5fdc3fe4589d 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -5,8 +5,15 @@ ExtensionArray """ import operator +from typing import Any, Callable +import warnings -from pandas.core.ops import roperator +import numpy as np + +from pandas._libs import lib + +from pandas.core.construction import extract_array +from pandas.core.ops import maybe_dispatch_ufunc_to_dunder_op, roperator from pandas.core.ops.common import unpack_zerodim_and_defer @@ -140,3 +147,131 @@ def __pow__(self, other): 
@unpack_zerodim_and_defer("__rpow__") def __rpow__(self, other): return self._arith_method(other, roperator.rpow) + + +def array_ufunc(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): + from pandas.core.generic import NDFrame + from pandas.core.internals import BlockManager + + cls = type(self) + + # for binary ops, use our custom dunder methods + result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs) + if result is not NotImplemented: + return result + + # Determine if we should defer. + no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) + + for item in inputs: + higher_priority = ( + hasattr(item, "__array_priority__") + and item.__array_priority__ > self.__array_priority__ + ) + has_array_ufunc = ( + hasattr(item, "__array_ufunc__") + and type(item).__array_ufunc__ not in no_defer + and not isinstance(item, self._HANDLED_TYPES) + ) + if higher_priority or has_array_ufunc: + return NotImplemented + + # align all the inputs. + types = tuple(type(x) for x in inputs) + alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)] + + if len(alignable) > 1: + # This triggers alignment. + # At the moment, there aren't any ufuncs with more than two inputs + # so this ends up just being x1.index | x2.index, but we write + # it to handle *args. + + if len(set(types)) > 1: + # We currently don't handle ufunc(DataFrame, Series) + # well. Previously this raised an internal ValueError. We might + # support it someday, so raise a NotImplementedError. + raise NotImplementedError( + "Cannot apply ufunc {} to mixed DataFrame and Series " + "inputs.".format(ufunc) + ) + axes = self.axes + for obj in alignable[1:]: + # this relies on the fact that we aren't handling mixed + # series / frame ufuncs. 
+ for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)): + axes[i] = ax1 | ax2 + + reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes)) + inputs = tuple( + x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x + for x, t in zip(inputs, types) + ) + else: + reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes)) + + if self.ndim == 1: + names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] + name = names[0] if len(set(names)) == 1 else None + reconstruct_kwargs = {"name": name} + else: + reconstruct_kwargs = {} + + def reconstruct(result): + if lib.is_scalar(result): + return result + if result.ndim != self.ndim: + if method == "outer": + if self.ndim == 2: + # we already deprecated for Series + msg = ( + "outer method for ufunc {} is not implemented on " + "pandas objects. Returning an ndarray, but in the " + "future this will raise a 'NotImplementedError'. " + "Consider explicitly converting the DataFrame " + "to an array with '.to_numpy()' first." + ) + warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=4) + return result + raise NotImplementedError + return result + if isinstance(result, BlockManager): + # we went through BlockManager.apply + result = self._constructor(result, **reconstruct_kwargs, copy=False) + else: + # we converted an array, lost our axes + result = self._constructor( + result, **reconstruct_axes, **reconstruct_kwargs, copy=False + ) + # TODO: When we support multiple values in __finalize__, this + # should pass alignable to `__finalize__` instead of self. + # Then `np.add(a, b)` would consider attrs from both a and b + # when a and b are NDFrames. + if len(alignable) == 1: + result = result.__finalize__(self) + return result + + if self.ndim > 1 and ( + len(inputs) > 1 or ufunc.nout > 1 # type: ignore[attr-defined] + ): + # Just give up on preserving types in the complex case. + # In theory we could preserve them for these cases.
+ # * nout>1 is doable if BlockManager.apply took nout and + # returned a Tuple[BlockManager]. + # * len(inputs) > 1 is doable when we know that we have + # aligned blocks / dtypes. + inputs = tuple(np.asarray(x) for x in inputs) + result = getattr(ufunc, method)(*inputs) + elif self.ndim == 1: + # ufunc(series, ...) + inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + else: + # ufunc(dataframe) + mgr = inputs[0]._mgr + result = mgr.apply(getattr(ufunc, method)) + + if ufunc.nout > 1: # type: ignore[attr-defined] + result = tuple(reconstruct(x) for x in result) + else: + result = reconstruct(result) + return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index df87977cda901..551eb00ff30a6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -86,11 +86,11 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas.core import indexing, missing, nanops, ops +from pandas.core import arraylike, indexing, missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com -from pandas.core.construction import create_series_with_explicit_dtype, extract_array +from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.flags import Flags from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( @@ -1920,130 +1920,7 @@ def __array_wrap__( def __array_ufunc__( self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any ): - cls = type(self) - - # for binary ops, use our custom dunder methods - result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs - ) - if result is not NotImplemented: - return result - - # Determine if we should defer. 
- no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) - - for item in inputs: - higher_priority = ( - hasattr(item, "__array_priority__") - and item.__array_priority__ > self.__array_priority__ - ) - has_array_ufunc = ( - hasattr(item, "__array_ufunc__") - and type(item).__array_ufunc__ not in no_defer - and not isinstance(item, self._HANDLED_TYPES) - ) - if higher_priority or has_array_ufunc: - return NotImplemented - - # align all the inputs. - types = tuple(type(x) for x in inputs) - alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)] - - if len(alignable) > 1: - # This triggers alignment. - # At the moment, there aren't any ufuncs with more than two inputs - # so this ends up just being x1.index | x2.index, but we write - # it to handle *args. - - if len(set(types)) > 1: - # We currently don't handle ufunc(DataFrame, Series) - # well. Previously this raised an internal ValueError. We might - # support it someday, so raise a NotImplementedError. - raise NotImplementedError( - "Cannot apply ufunc {} to mixed DataFrame and Series " - "inputs.".format(ufunc) - ) - axes = self.axes - for obj in alignable[1:]: - # this relies on the fact that we aren't handling mixed - # series / frame ufuncs. 
- for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)): - axes[i] = ax1 | ax2 - - reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes)) - inputs = tuple( - x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x - for x, t in zip(inputs, types) - ) - else: - reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes)) - - if self.ndim == 1: - names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] - name = names[0] if len(set(names)) == 1 else None - reconstruct_kwargs = {"name": name} - else: - reconstruct_kwargs = {} - - def reconstruct(result): - if lib.is_scalar(result): - return result - if result.ndim != self.ndim: - if method == "outer": - if self.ndim == 2: - # we already deprecated for Series - msg = ( - "outer method for ufunc {} is not implemented on " - "pandas objects. Returning an ndarray, but in the " - "future this will raise a 'NotImplementedError'. " - "Consider explicitly converting the DataFrame " - "to an array with '.to_numpy()' first." - ) - warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=3) - return result - raise NotImplementedError - return result - if isinstance(result, BlockManager): - # we went through BlockManager.apply - result = self._constructor(result, **reconstruct_kwargs, copy=False) - else: - # we converted an array, lost our axes - result = self._constructor( - result, **reconstruct_axes, **reconstruct_kwargs, copy=False - ) - # TODO: When we support multiple values in __finalize__, this - # should pass alignable to `__fianlize__` instead of self. - # Then `np.add(a, b)` would consider attrs from both a and b - # when a and b are NDFrames. - if len(alignable) == 1: - result = result.__finalize__(self) - return result - - if self.ndim > 1 and ( - len(inputs) > 1 or ufunc.nout > 1 # type: ignore[attr-defined] - ): - # Just give up on preserving types in the complex case. - # In theory we could preserve them for them. 
- # * nout>1 is doable if BlockManager.apply took nout and - # returned a Tuple[BlockManager]. - # * len(inputs) > 1 is doable when we know that we have - # aligned blocks / dtypes. - inputs = tuple(np.asarray(x) for x in inputs) - result = getattr(ufunc, method)(*inputs) - elif self.ndim == 1: - # ufunc(series, ...) - inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) - result = getattr(ufunc, method)(*inputs, **kwargs) - else: - # ufunc(dataframe) - mgr = inputs[0]._mgr - result = mgr.apply(getattr(ufunc, method)) - - if ufunc.nout > 1: # type: ignore[attr-defined] - result = tuple(reconstruct(x) for x in result) - else: - result = reconstruct(result) - return result + return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs) # ideally we would define this to avoid the getattr checks, but # is slower From acfe4344008b05dfbe3f16e0c70afc1440e3bbf6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 13 Nov 2020 10:18:41 -0600 Subject: [PATCH 09/10] union --- pandas/core/arraylike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index b5fdc3fe4589d..e71084f8f630f 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -199,7 +199,7 @@ def array_ufunc(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any) # this relies on the fact that we aren't handling mixed # series / frame ufuncs. 
for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)): - axes[i] = ax1 | ax2 + axes[i] = ax1.union(ax2) reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes)) inputs = tuple( From a6b120a1ff07e6a2f22f91b34155c8816febd3ff Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 25 Nov 2020 08:46:08 -0800 Subject: [PATCH 10/10] docstring, typo fixup --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/arraylike.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e1110020367f8..9f7aff0a30bd3 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -238,7 +238,7 @@ Other enhancements - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) - - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) -- Calling a NumPy ufunc on a ``DataFrame`` with extension types now presrves the extension types when possible (:issue:`23743`). +- Calling a NumPy ufunc on a ``DataFrame`` with extension types now preserves the extension types when possible (:issue:`23743`). - Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`). 
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) - :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`) diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index e71084f8f630f..6b28f8f135769 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -150,6 +150,13 @@ def __rpow__(self, other): def array_ufunc(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): + """ + Compatibility with numpy ufuncs. + + See Also + -------- + numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__ + """ from pandas.core.generic import NDFrame from pandas.core.internals import BlockManager