Skip to content

Commit ea241c6

Browse files
committed
Make common impl. with Index.searchsorted
1 parent 700e4b4 commit ea241c6

File tree

4 files changed

+95
-14
lines changed

4 files changed

+95
-14
lines changed

doc/source/whatsnew/v0.24.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,7 @@ Performance Improvements
501501
both when indexing by label (using .loc) and by position (using .iloc).
502502
Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
503503
- Improved performance of :func:`Series.searchsorted` (:issue:`22034`)
504+
- Improved performance of :func:`Index.searchsorted` when dtype is uint64, float64 or object (:issue:`22034`)
504505
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
505506
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
506507
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)

pandas/core/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1230,8 +1230,8 @@ def factorize(self, sort=False, na_sentinel=-1):
12301230
@Appender(_shared_docs['searchsorted'])
12311231
@deprecate_kwarg(old_arg_name='key', new_arg_name='value')
12321232
def searchsorted(self, value, side='left', sorter=None):
1233-
# needs coercion on the key (DatetimeIndex does already)
1234-
return self.values.searchsorted(value, side=side, sorter=sorter)
1233+
return com.searchsorted(self._values, value,
1234+
side=side, sorter=sorter)
12351235

12361236
def drop_duplicates(self, keep='first', inplace=False):
12371237
inplace = validate_bool_kwarg(inplace, 'inplace')

pandas/core/common.py

Lines changed: 74 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@
1313
from pandas._libs import lib, tslibs
1414

1515
from pandas import compat
16-
from pandas.compat import iteritems, PY36, OrderedDict
16+
from pandas.compat import iteritems, PY2, PY36, OrderedDict
1717
from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass
18-
from pandas.core.dtypes.common import is_integer
18+
from pandas.core.dtypes.common import (is_integer, is_integer_dtype,
19+
is_numeric_dtype, is_number,
20+
is_scalar, ensure_platform_int)
1921
from pandas.core.dtypes.inference import _iterable_not_string
2022
from pandas.core.dtypes.missing import isna, isnull, notnull # noqa
2123
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
@@ -430,3 +432,73 @@ def _pipe(obj, func, *args, **kwargs):
430432
return func(*args, **kwargs)
431433
else:
432434
return func(obj, *args, **kwargs)
435+
436+
437+
def ensure_within_integer_bounds(value, dtype):
    """Ensure that ``value`` is within the integer bounds in ``dtype``.

    Parameters
    ----------
    value : a number or array of numbers
        Scalars are wrapped in a 1-element array before being compared;
        anything else is viewed as an array without copying when possible.
    dtype : a numpy integer dtype
        Anything accepted by :class:`numpy.iinfo`.

    Returns
    -------
    None
        The function only validates; it returns nothing on success.

    Raises
    ------
    ValueError
        If any element of ``value`` is outside the bounds given by
        ``np.iinfo(dtype)``. On Python 2 it is also raised when ``value``
        is neither a number nor of a numeric dtype.
    """
    if PY2:
        # python 2 allows e.g. "a" < 1, avoid this
        if not (is_number(value) or is_numeric_dtype(value)):
            msg = "value must be a number, was type {}"
            # report the offending *type*, as the message text promises
            raise ValueError(msg.format(type(value).__name__))

    # check if value is within integer bounds
    iinfo = np.iinfo(dtype)
    if is_scalar(value):
        value_arr = np.array([value])
    else:
        # asarray avoids copying input that already is an ndarray
        value_arr = np.asarray(value)

    if (value_arr < iinfo.min).any() or (value_arr > iinfo.max).any():
        msg = "Value {} out of bound for dtype {}"
        raise ValueError(msg.format(value, dtype))
461+
462+
463+
def searchsorted_integer(arr, value, side="left", sorter=None):
    """searchsorted implementation, but only for integer arrays.

    We get a speedup if the dtype of arr and value is the same, so
    integer-valued input is cast to ``arr.dtype`` before searching.

    See :func:`searchsorted` for a more general searchsorted implementation.

    Parameters
    ----------
    arr : array with an integer dtype
        The array to search; must expose ``dtype`` and ``searchsorted``.
    value : number or array-like of numbers
        Value(s) to locate. Must lie within the bounds of ``arr.dtype``.
    side : {"left", "right"}, default "left"
        Passed through to ``arr.searchsorted``.
    sorter : array-like, optional
        Passed through to ``arr.searchsorted`` after coercion with
        ``ensure_platform_int``.

    Returns
    -------
    The result of ``arr.searchsorted(value, side=side, sorter=sorter)``.

    Raises
    ------
    ValueError
        If ``value`` is out of bounds for ``arr.dtype`` (raised by
        :func:`ensure_within_integer_bounds`).
    """
    # bounds are validated first, so the casts below cannot overflow
    ensure_within_integer_bounds(value, arr.dtype)

    if sorter is not None:
        sorter = ensure_platform_int(sorter)

    # convert dtype of value for better searchsorted speed
    if is_integer(value) or is_integer_dtype(value):
        cast_to_arr_dtype = True
    elif hasattr(value, 'is_integer') and value.is_integer():
        # float 2.0 should be converted to int 2
        # but float 2.2 should *not* be converted to int 2
        cast_to_arr_dtype = True
    else:
        # plain sequences (list/tuple) carry no dtype of their own, so
        # inspect the dtype numpy would infer for them
        cast_to_arr_dtype = (not is_scalar(value) and
                             is_integer_dtype(np.asarray(value)))

    if cast_to_arr_dtype:
        value = np.asarray(value, dtype=arr.dtype)

    return arr.searchsorted(value, side=side, sorter=sorter)
485+
486+
487+
def searchsorted(arr, value, side="left", sorter=None):
    """
    Do an ``arr.searchsorted(value)`` with adjustments for dtypes.

    :func:`numpy.searchsorted` is only fast if value is of same dtype
    as the searched array. Else numpy recasts arr to a higher dtype, which
    causes a slowdown. Below we ensure that value has the right dtype
    for giving fast results for arr.searchsorted, when possible.

    See :meth:`Index.searchsorted` for details on parameters and return value.

    Parameters
    ----------
    arr : array-like exposing a ``searchsorted`` method
    value : scalar or array-like
    side : {"left", "right"}, default "left"
    sorter : array-like, optional
        Coerced with ``ensure_platform_int`` before being used.

    Returns
    -------
    Whatever ``arr.searchsorted`` returns for the (possibly re-typed) value.
    """
    if is_integer_dtype(arr):
        # searchsorted_integer coerces ``sorter`` itself, so it is passed
        # through untouched here rather than being converted twice
        return searchsorted_integer(arr, value, side=side, sorter=sorter)

    if sorter is not None:
        sorter = ensure_platform_int(sorter)
    return arr.searchsorted(value, side=side, sorter=sorter)

pandas/core/series.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2087,16 +2087,24 @@ def __rmatmul__(self, other):
20872087
@Appender(base._shared_docs['searchsorted'])
20882088
@deprecate_kwarg(old_arg_name='v', new_arg_name='value')
20892089
def searchsorted(self, value, side='left', sorter=None):
2090-
if sorter is not None:
2091-
sorter = ensure_platform_int(sorter)
2092-
if not is_extension_type(self._values):
2093-
# numpy searchsorted is only fast if value is of same dtype as the
2094-
# searched array. Below we ensure that value has the right dtype,
2095-
# and is not 0-dimensional.
2096-
value = np.asarray(value, dtype=self._values.dtype)
2097-
value = value[..., np.newaxis] if value.ndim == 0 else value
2098-
2099-
return self._values.searchsorted(value, side=side, sorter=sorter)
2090+
simple_types = (is_integer_dtype, is_float_dtype, is_object_dtype,
2091+
is_categorical_dtype)
2092+
2093+
if any(is_dtype(self) for is_dtype in simple_types):
2094+
result = com.searchsorted(self._values, value,
2095+
side=side, sorter=sorter)
2096+
else:
2097+
# e.g. self is datetimelike and value is a pd.Timestamp
2098+
if sorter is not None:
2099+
sorter = ensure_platform_int(sorter)
2100+
value = Series(value)._values
2101+
result = self._values.searchsorted(value, side=side, sorter=sorter)
2102+
2103+
if is_scalar(result):
2104+
# ensure that a 1-dim array is returned
2105+
result = np.array([result])
2106+
2107+
return result
21002108

21012109
# -------------------------------------------------------------------
21022110
# Combination

0 commit comments

Comments
 (0)