Skip to content
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -754,6 +754,7 @@ Performance improvements
- Performance improvement in :meth:`MultiIndex.putmask` (:issue:`49830`)
- Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`)
- Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`)
- Performance improvement in :meth:`Series.searchsorted` for pyarrow-backed dtypes (:issue:`50447`)
- Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`)
- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked dtypes when :class:`Index` is monotonic (:issue:`50310`)
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
Expand Down
23 changes: 23 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import (
TYPE_CHECKING,
Any,
Literal,
TypeVar,
cast,
)
Expand Down Expand Up @@ -116,6 +117,11 @@ def floordiv_compat(
}

if TYPE_CHECKING:
from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
)

from pandas import Series

ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")
Expand Down Expand Up @@ -687,6 +693,23 @@ def round(
"""
return type(self)(pc.round(self._data, ndigits=decimals))

@doc(ExtensionArray.searchsorted)
def searchsorted(
self,
value: NumpyValueArrayLike | ExtensionArray,
side: Literal["left", "right"] = "left",
sorter: NumpySorter = None,
) -> npt.NDArray[np.intp] | np.intp:
if self._hasna:
raise ValueError(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this the same error the base class would raise? Are there tests for this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, the base class raises:

TypeError: boolean value of NA is ambiguous

The ArrowExtensionArray.searchsorted method added here is pretty much a copy of BaseMaskedArray.searchsorted. I will add a test. (I'll add one for BaseMaskedArray as well - I don't see it being tested there)

"searchsorted requires array to be sorted, which is impossible "
"with NAs present."
)
if isinstance(value, ExtensionArray):
value = value.astype(object)
# Base class searchsorted would cast to object, which is *much* slower.
return self.to_numpy().searchsorted(value, side=side, sorter=sorter)

def take(
self,
indices: TakeIndexer,
Expand Down
24 changes: 23 additions & 1 deletion pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import (
TYPE_CHECKING,
Literal,
)

import numpy as np

Expand Down Expand Up @@ -54,6 +57,11 @@
if TYPE_CHECKING:
import pyarrow

from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
)

from pandas import Series


Expand Down Expand Up @@ -492,6 +500,20 @@ def memory_usage(self, deep: bool = False) -> int:
return result + lib.memory_usage_of_objects(self._ndarray)
return result

@doc(ExtensionArray.searchsorted)
def searchsorted(
    self,
    value: NumpyValueArrayLike | ExtensionArray,
    side: Literal["left", "right"] = "left",
    sorter: NumpySorter = None,
) -> npt.NDArray[np.intp] | np.intp:
    # NOTE(review): user-facing documentation is presumably supplied by the
    # ``@doc`` decorator above, so no docstring is written here -- confirm.

    # Fast exit for the NA-free case: hand the lookup to the parent class
    # with the arguments forwarded unchanged.
    if not self._hasna:
        return super().searchsorted(value=value, side=side, sorter=sorter)

    # Reaching this point means the array holds at least one NA; the search
    # is rejected rather than attempted, as the message explains.
    raise ValueError(
        "searchsorted requires array to be sorted, which is impossible "
        "with NAs present."
    )

def _cmp_method(self, other, op):
from pandas.arrays import BooleanArray

Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,19 @@ def test_searchsorted(self, data_for_sorting, as_series):
sorter = np.array([1, 2, 0])
assert data_for_sorting.searchsorted(a, sorter=sorter) == 0

# arr containing na value
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that, since this behavior only applies to the pandas-specific masked arrays and the Arrow array, this test should live in those specific test files. Technically, these base extension tests should pass for any EA developer, who might not define searchsorted to raise in this way.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, I've moved the tests.

if arr._can_hold_na and arr.dtype.na_value is pd.NA:
arr_with_na = pd.array([a, b, pd.NA], dtype=arr.dtype)
if as_series:
arr_with_na = pd.Series(arr_with_na)
err_msg = (
"searchsorted requires array to be sorted, "
"which is impossible with NAs present."
)
with pytest.raises(ValueError, match=err_msg):
print(arr_with_na.dtype)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you remove this print?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oops, removed. thx

arr_with_na.searchsorted(a)

def test_where_series(self, data, na_value, as_frame):
assert data[0] != data[1]
cls = type(data)
Expand Down