From f9215596d1423c6adb24a69bcfb68afd686ec2bd Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 27 Dec 2022 06:39:33 -0500 Subject: [PATCH 1/6] PERF: ArrowExtensionArray.searchsorted --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 12b0d90e68ab9..6da4ad842bbcd 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -745,6 +745,7 @@ Performance improvements - Performance improvement in :meth:`MultiIndex.putmask` (:issue:`49830`) - Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`) - Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`) +- Performance improvement in :meth:`Series.searchsorted` for pyarrow-backed dtypes (:issue:`#####`) - Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`) - Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked dtypes when :class:`Index` is monotonic (:issue:`50310`) - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7e954b3d1d1ec..616d0a6f31668 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -3,6 +3,7 @@ from typing import ( TYPE_CHECKING, Any, + Literal, TypeVar, cast, ) @@ -40,6 +41,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core import algorithms as algos from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray import pandas.core.common as com @@ -116,6 +118,11 @@ def floordiv_compat( } if TYPE_CHECKING: + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + from pandas import Series ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray") @@ -661,6 +668,23 @@ def reshape(self, *args, **kwargs): f"as backed by a 1D pyarrow.ChunkedArray." ) + @doc(ExtensionArray.searchsorted) + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp] | np.intp: + if self._hasna: + raise ValueError( + "searchsorted requires array to be sorted, which is impossible " + "with NAs present." + ) + if isinstance(value, ExtensionArray): + value = value.astype(object) + # Base class searchsorted would cast to object, which is *much* slower. + return algos.searchsorted(self.to_numpy(), value, side=side, sorter=sorter) + def take( self, indices: TakeIndexer, From 48c3830189665b357ab46bcbe39390adcbab344d Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 27 Dec 2022 06:44:59 -0500 Subject: [PATCH 2/6] gh ref --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 6da4ad842bbcd..7ebba3ed7c74d 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -745,7 +745,7 @@ Performance improvements - Performance improvement in :meth:`MultiIndex.putmask` (:issue:`49830`) - Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`) - Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`) -- Performance improvement in :meth:`Series.searchsorted` for pyarrow-backed dtypes (:issue:`#####`) +- Performance improvement in :meth:`Series.searchsorted` for pyarrow-backed dtypes (:issue:`50447`) - Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`) - Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked dtypes when :class:`Index` is monotonic (:issue:`50310`) - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) From ed45d75d5a24567db0f7d1493e895ed6ea285a03 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 27 Dec 2022 21:08:46 -0500 Subject: [PATCH 3/6] fix --- pandas/core/arrays/arrow/array.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 616d0a6f31668..87d6474e9fa8a 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -41,7 +41,6 @@ ) from pandas.core.dtypes.missing import isna -from pandas.core import algorithms as algos from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray import pandas.core.common as com @@ -683,7 +682,7 @@ def searchsorted( if isinstance(value, ExtensionArray): value = value.astype(object) # Base class searchsorted would cast to object, which is *much* slower. - return algos.searchsorted(self.to_numpy(), value, side=side, sorter=sorter) + return self.to_numpy().searchsorted(value, side=side, sorter=sorter) def take( self, From 43d5f45768969b077f12bba14008b673df509dab Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 29 Dec 2022 20:56:03 -0500 Subject: [PATCH 4/6] add test for searchsorted when array contains pd.NA --- pandas/core/arrays/string_.py | 24 +++++++++++++++++++++++- pandas/tests/extension/base/methods.py | 13 +++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e5fb3fc3ff836..9b26db07fc28f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Literal, +) import numpy as np @@ -54,6 +57,11 @@ if TYPE_CHECKING: import pyarrow + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + from pandas import Series @@ -492,6 +500,20 @@ def memory_usage(self, deep: bool = False) -> int: return result + lib.memory_usage_of_objects(self._ndarray) return result + @doc(ExtensionArray.searchsorted) + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp] | np.intp: + if self._hasna: + raise ValueError( + "searchsorted requires array to be sorted, which is impossible " + "with NAs present." + ) + return super().searchsorted(value=value, side=side, sorter=sorter) + def _cmp_method(self, other, op): from pandas.arrays import BooleanArray diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 2df410dff2b00..4513aec71860d 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -438,6 +438,19 @@ def test_searchsorted(self, data_for_sorting, as_series): sorter = np.array([1, 2, 0]) assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + # arr containing na value + if arr._can_hold_na and arr.dtype.na_value is pd.NA: + arr_with_na = pd.array([a, b, pd.NA], dtype=arr.dtype) + if as_series: + arr_with_na = pd.Series(arr_with_na) + err_msg = ( + "searchsorted requires array to be sorted, " + "which is impossible with NAs present." + ) + with pytest.raises(ValueError, match=err_msg): + print(arr_with_na.dtype) + arr_with_na.searchsorted(a) + def test_where_series(self, data, na_value, as_frame): assert data[0] != data[1] cls = type(data) From 504cc29a8ccacbe5c093a9a9afbce6bcdbfd2149 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 3 Jan 2023 17:51:14 -0500 Subject: [PATCH 5/6] remove print statement --- pandas/tests/extension/base/methods.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 4513aec71860d..9ae37a07189ca 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -448,7 +448,6 @@ def test_searchsorted(self, data_for_sorting, as_series): "which is impossible with NAs present." ) with pytest.raises(ValueError, match=err_msg): - print(arr_with_na.dtype) arr_with_na.searchsorted(a) def test_where_series(self, data, na_value, as_frame): From 240031e94a6653e83b2d9d1f4f604724c45955d5 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 4 Jan 2023 19:42:21 -0500 Subject: [PATCH 6/6] move tests --- pandas/tests/extension/base/methods.py | 12 ------------ pandas/tests/extension/test_arrow.py | 17 +++++++++++++++++ pandas/tests/extension/test_string.py | 17 +++++++++++++++++ 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 9ae37a07189ca..2df410dff2b00 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -438,18 +438,6 @@ def test_searchsorted(self, data_for_sorting, as_series): sorter = np.array([1, 2, 0]) assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 - # arr containing na value - if arr._can_hold_na and arr.dtype.na_value is pd.NA: - arr_with_na = pd.array([a, b, pd.NA], dtype=arr.dtype) - if as_series: - arr_with_na = pd.Series(arr_with_na) - err_msg = ( - "searchsorted requires array to be sorted, " - "which is impossible with NAs present." - ) - with pytest.raises(ValueError, match=err_msg): - arr_with_na.searchsorted(a) - def test_where_series(self, data, na_value, as_frame): assert data[0] != data[1] cls = type(data) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c1785591f41a9..02f72d67673ae 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1553,3 +1553,20 @@ def test_round(): result = ser.round(-1) expected = pd.Series([120.0, pd.NA, 60.0], dtype=dtype) tm.assert_series_equal(result, expected) + + +def test_searchsorted_with_na_raises(data_for_sorting, as_series): + # GH50447 + b, c, a = data_for_sorting + arr = data_for_sorting.take([2, 0, 1]) # to get [a, b, c] + arr[-1] = pd.NA + + if as_series: + arr = pd.Series(arr) + + msg = ( + "searchsorted requires array to be sorted, " + "which is impossible with NAs present." + ) + with pytest.raises(ValueError, match=msg): + arr.searchsorted(b) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index de7967a8578b5..3e865947aa968 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -420,3 +420,20 @@ def arrow_not_supported(self, data, request): reason="2D support not implemented for ArrowStringArray" ) request.node.add_marker(mark) + + +def test_searchsorted_with_na_raises(data_for_sorting, as_series): + # GH50447 + b, c, a = data_for_sorting + arr = data_for_sorting.take([2, 0, 1]) # to get [a, b, c] + arr[-1] = pd.NA + + if as_series: + arr = pd.Series(arr) + + msg = ( + "searchsorted requires array to be sorted, " + "which is impossible with NAs present." + ) + with pytest.raises(ValueError, match=msg): + arr.searchsorted(b)