Skip to content

PERF: ArrowExtensionArray.searchsorted #50447

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jan 5, 2023
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -757,6 +757,7 @@ Performance improvements
- Performance improvement in :meth:`MultiIndex.putmask` (:issue:`49830`)
- Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`)
- Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`)
- Performance improvement in :meth:`Series.searchsorted` for pyarrow-backed dtypes (:issue:`50447`)
- Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`)
- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked dtypes when :class:`Index` is monotonic (:issue:`50310`)
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
Expand Down
23 changes: 23 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import (
TYPE_CHECKING,
Any,
Literal,
TypeVar,
cast,
)
Expand Down Expand Up @@ -116,6 +117,11 @@ def floordiv_compat(
}

if TYPE_CHECKING:
from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
)

from pandas import Series

ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")
Expand Down Expand Up @@ -693,6 +699,23 @@ def round(
"""
return type(self)(pc.round(self._data, ndigits=decimals))

@doc(ExtensionArray.searchsorted)
def searchsorted(
self,
value: NumpyValueArrayLike | ExtensionArray,
side: Literal["left", "right"] = "left",
sorter: NumpySorter = None,
) -> npt.NDArray[np.intp] | np.intp:
if self._hasna:
raise ValueError(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this the same error the base class would raise? Are there tests for this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, the base class raises:

TypeError: boolean value of NA is ambiguous

The ArrowExtensionArray.searchsorted method added here is pretty much a copy of BaseMaskedArray.searchsorted. I will add a test. (I'll add one for BaseMaskedArray as well - I don't see it being tested there)

"searchsorted requires array to be sorted, which is impossible "
"with NAs present."
)
if isinstance(value, ExtensionArray):
value = value.astype(object)
# Base class searchsorted would cast to object, which is *much* slower.
return self.to_numpy().searchsorted(value, side=side, sorter=sorter)

def take(
self,
indices: TakeIndexer,
Expand Down
24 changes: 23 additions & 1 deletion pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import (
TYPE_CHECKING,
Literal,
)

import numpy as np

Expand Down Expand Up @@ -54,6 +57,11 @@
if TYPE_CHECKING:
import pyarrow

from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
)

from pandas import Series


Expand Down Expand Up @@ -492,6 +500,20 @@ def memory_usage(self, deep: bool = False) -> int:
return result + lib.memory_usage_of_objects(self._ndarray)
return result

@doc(ExtensionArray.searchsorted)
def searchsorted(
    self,
    value: NumpyValueArrayLike | ExtensionArray,
    side: Literal["left", "right"] = "left",
    sorter: NumpySorter = None,
) -> npt.NDArray[np.intp] | np.intp:
    # Docstring intentionally omitted: the decorator above references
    # ExtensionArray.searchsorted (presumably to share its documentation).
    if not self._hasna:
        # No missing values: hand the search off to the parent class unchanged.
        return super().searchsorted(value=value, side=side, sorter=sorter)

    # An array holding NAs cannot be in sorted order, so the search is refused.
    raise ValueError(
        "searchsorted requires array to be sorted, which is impossible "
        "with NAs present."
    )

def _cmp_method(self, other, op):
from pandas.arrays import BooleanArray

Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1553,3 +1553,20 @@ def test_round():
result = ser.round(-1)
expected = pd.Series([120.0, pd.NA, 60.0], dtype=dtype)
tm.assert_series_equal(result, expected)


def test_searchsorted_with_na_raises(data_for_sorting, as_series):
    # GH50447
    # The fixture unpacks as (b, c, a); only ``b`` is needed as the search value.
    needle, _, _ = data_for_sorting

    # Reorder to [a, b, c], then overwrite the last element with NA.
    haystack = data_for_sorting.take([2, 0, 1])
    haystack[-1] = pd.NA

    # Exercise either the Series wrapper or the bare array.
    haystack = pd.Series(haystack) if as_series else haystack

    expected_msg = (
        "searchsorted requires array to be sorted, "
        "which is impossible with NAs present."
    )
    with pytest.raises(ValueError, match=expected_msg):
        haystack.searchsorted(needle)
17 changes: 17 additions & 0 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,3 +420,20 @@ def arrow_not_supported(self, data, request):
reason="2D support not implemented for ArrowStringArray"
)
request.node.add_marker(mark)


def test_searchsorted_with_na_raises(data_for_sorting, as_series):
    # GH50447
    # Fixture yields three values in (b, c, a) order; ``b`` is the lookup value.
    lookup, _, _ = data_for_sorting

    # take([2, 0, 1]) gives [a, b, c]; the final slot is then replaced by NA.
    values = data_for_sorting.take([2, 0, 1])
    values[-1] = pd.NA

    # Run against a Series when requested, otherwise against the array itself.
    values = pd.Series(values) if as_series else values

    error_text = (
        "searchsorted requires array to be sorted, "
        "which is impossible with NAs present."
    )
    with pytest.raises(ValueError, match=error_text):
        values.searchsorted(lookup)