From c35f0e5c9fd110793e589611368a2b15bcaceb9b Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 12 Jan 2023 11:59:07 -0800
Subject: [PATCH 1/5] ENH: support any/all for pyarrow numeric and duration dtypes

---
 pandas/core/arrays/arrow/array.py    | 25 ++++++++++++++++++++++++-
 pandas/tests/extension/test_arrow.py | 16 +++++++++++++++-
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index de85ed67e7e8c..453be7ed27c9c 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -10,7 +10,10 @@
 
 import numpy as np
 
-from pandas._libs import lib
+from pandas._libs import (
+    Timedelta,
+    lib,
+)
 from pandas._typing import (
     ArrayLike,
     AxisInt,
@@ -971,6 +974,26 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         ------
         TypeError : subclass does not define reductions
         """
+        pa_type = self._data.type
+
+        if name in ["any", "all"] and (
+            pa.types.is_integer(pa_type)
+            or pa.types.is_floating(pa_type)
+            or pa.types.is_duration(pa_type)
+        ):
+            # pyarrow only supports any/all for boolean dtype, we allow
+            # for other dtypes, matching our non-pyarrow behavior
+
+            if pa.types.is_duration(pa_type):
+                zero = Timedelta(0)
+            else:
+                zero = 0
+            result = (self != zero)._reduce(name, skipna=skipna, **kwargs)
+            if isinstance(result, np.bool_):
+                # need to rule out pd.NA
+                result = bool(result)
+            return result
+
         if name == "sem":
 
             def pyarrow_meth(data, skip_nulls, **kwargs):
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 0a7303ea239ed..5ee7c09c4e898 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -491,10 +491,24 @@ def test_reduce_series(
                 f"pyarrow={pa.__version__} for {pa_dtype}"
             ),
         )
-        if not pa.types.is_boolean(pa_dtype):
+        if pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype):
+            # We *might* want to make this behave like the non-pyarrow cases,
+            # but have not yet decided.
             request.node.add_marker(xfail_mark)
+
         op_name = all_boolean_reductions
         ser = pd.Series(data)
+
+        if pa.types.is_temporal(pa_dtype) and not pa.types.is_duration(pa_dtype):
+            # xref GH#34479 we support this in our non-pyarrow datetime64 dtypes,
+            # but it isn't obvious we _should_. For now, we keep the pyarrow
+            # behavior which does not support this.
+
+            with pytest.raises(TypeError, match="does not support reduction"):
+                getattr(ser, op_name)(skipna=skipna)
+
+            return
+
         result = getattr(ser, op_name)(skipna=skipna)
         assert result is (op_name == "any")
 

From ce6b444681cee2dae83261b414e0e315859468e1 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 12 Jan 2023 14:32:29 -0800
Subject: [PATCH 2/5] mypy fixup

---
 pandas/core/arrays/arrow/array.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 453be7ed27c9c..6686583b5134d 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -984,6 +984,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
             # pyarrow only supports any/all for boolean dtype, we allow
             # for other dtypes, matching our non-pyarrow behavior
 
+            zero: int | Timedelta
             if pa.types.is_duration(pa_type):
                 zero = Timedelta(0)
             else:

From 3fcffd5a05c6de693f3895788ccbf1291fd150f9 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 19 Jan 2023 13:31:01 -0800
Subject: [PATCH 3/5] use pc.not_equal

---
 pandas/core/arrays/arrow/array.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 9c01f0f1e611c..27f839918cf24 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -1026,7 +1026,9 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
                 zero = Timedelta(0)
             else:
                 zero = 0
-            result = (self != zero)._reduce(name, skipna=skipna, **kwargs)
+
+            not_eq = pc.not_equal(self._data, zero)
+            result = not_eq._reduce(name, skipna=skipna, **kwargs)
             if isinstance(result, np.bool_):
                 # need to rule out pd.NA
                 result = bool(result)

From 4b040ab66de8f63a6ee6eed0642b6f324dab7e32 Mon Sep 17 00:00:00 2001
From: Brock
Date: Fri, 20 Jan 2023 10:19:36 -0800
Subject: [PATCH 4/5] use suggested pattern

---
 pandas/core/arrays/arrow/array.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 935b09a32bd78..1f4ed3faaccd5 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -1017,6 +1017,8 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         """
         pa_type = self._data.type
 
+        data_to_reduce = self._data
+
         if name in ["any", "all"] and (
             pa.types.is_integer(pa_type)
             or pa.types.is_floating(pa_type)
@@ -1032,11 +1034,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
                 zero = 0
 
             not_eq = pc.not_equal(self._data, zero)
-            result = not_eq._reduce(name, skipna=skipna, **kwargs)
-            if isinstance(result, np.bool_):
-                # need to rule out pd.NA
-                result = bool(result)
-            return result
+            data_to_reduce = not_eq
 
         if name == "sem":
 
@@ -1059,8 +1057,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
         if pyarrow_meth is None:
             # Let ExtensionArray._reduce raise the TypeError
             return super()._reduce(name, skipna=skipna, **kwargs)
+
         try:
-            result = pyarrow_meth(self._data, skip_nulls=skipna, **kwargs)
+            result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs)
         except (AttributeError, NotImplementedError, TypeError) as err:
             msg = (
                 f"'{type(self).__name__}' with dtype {self.dtype} "

From 2308f849716d36b117ba9086957498409962a30c Mon Sep 17 00:00:00 2001
From: Brock
Date: Sat, 21 Jan 2023 15:31:29 -0800
Subject: [PATCH 5/5] fix not_eq

---
 pandas/core/arrays/arrow/array.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 1f4ed3faaccd5..87caed654815a 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -10,10 +10,7 @@
 
 import numpy as np
 
-from pandas._libs import (
-    Timedelta,
-    lib,
-)
+from pandas._libs import lib
 from pandas._typing import (
     ArrayLike,
     AxisInt,
@@ -1027,13 +1024,12 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
             # pyarrow only supports any/all for boolean dtype, we allow
             # for other dtypes, matching our non-pyarrow behavior
 
-            zero: int | Timedelta
             if pa.types.is_duration(pa_type):
-                zero = Timedelta(0)
+                data_to_cmp = self._data.cast(pa.int64())
             else:
-                zero = 0
+                data_to_cmp = self._data
 
-            not_eq = pc.not_equal(self._data, zero)
+            not_eq = pc.not_equal(data_to_cmp, 0)
             data_to_reduce = not_eq
 
         if name == "sem":