Share a common searchsorted implementation between Series and Index

topper-123 · topper-123 · commit ea241c6b4c8d · 2018-08-14T21:34:26.000+01:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -501,6 +501,7 @@ Performance Improvements
   both when indexing by label (using .loc) and position(.iloc).
   Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
 - Improved performance of :func:`Series.searchsorted` (:issue:`22034`)
+- Improved performance of :func:`Index.searchsorted` when dtype is uint64, float64 or object (:issue:`22034`)
 - Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
 - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
 - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`,:issue:`21606`)
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -1230,8 +1230,8 @@ def factorize(self, sort=False, na_sentinel=-1):
     @Appender(_shared_docs['searchsorted'])
     @deprecate_kwarg(old_arg_name='key', new_arg_name='value')
     def searchsorted(self, value, side='left', sorter=None):
-        # needs coercion on the key (DatetimeIndex does already)
-        return self.values.searchsorted(value, side=side, sorter=sorter)
+        return com.searchsorted(self._values, value,
+                                side=side, sorter=sorter)  # dtype-aware
 
     def drop_duplicates(self, keep='first', inplace=False):
         inplace = validate_bool_kwarg(inplace, 'inplace')
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -13,9 +13,11 @@
 from pandas._libs import lib, tslibs
 
 from pandas import compat
-from pandas.compat import iteritems, PY36, OrderedDict
+from pandas.compat import iteritems, PY2, PY36, OrderedDict
 from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass
-from pandas.core.dtypes.common import is_integer
+from pandas.core.dtypes.common import (is_integer, is_integer_dtype,
+                                       is_numeric_dtype, is_number,
+                                       is_scalar, ensure_platform_int)
 from pandas.core.dtypes.inference import _iterable_not_string
 from pandas.core.dtypes.missing import isna, isnull, notnull  # noqa
 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
@@ -430,3 +432,73 @@ def _pipe(obj, func, *args, **kwargs):
         return func(*args, **kwargs)
     else:
         return func(obj, *args, **kwargs)
+
+
+def ensure_within_integer_bounds(value, dtype):
+    """Ensure that ``value`` is within the integer bounds in ``dtype``.
+
+    Parameters
+    ----------
+    value : a number or array of numbers
+    dtype : a numpy integer dtype
+
+    Raises
+    ------
+    ValueError : if value is non-numeric (checked on Python 2 only) or
+        lies outside the bounds given by ``np.iinfo(dtype)``
+    """
+    # python 2 allows e.g. "a" < 1, so reject non-numbers explicitly there
+    if PY2 and not (is_number(value) or is_numeric_dtype(value)):
+        msg = "value must be a number, was type {}"
+        raise ValueError(msg.format(type(value).__name__))
+
+    # wrap scalars in an array so both cases share the same bounds check
+    iinfo = np.iinfo(dtype)
+    value_arr = np.array([value]) if is_scalar(value) else np.array(value)
+    if (value_arr < iinfo.min).any() or (value_arr > iinfo.max).any():
+        msg = "Value {} out of bound for dtype {}"
+        raise ValueError(msg.format(value, dtype))
+
+
+def searchsorted_integer(arr, value, side="left", sorter=None):
+    """Run ``arr.searchsorted(value)`` for an integer-dtype ``arr``.
+
+    ``value`` is bounds-checked against ``arr.dtype`` and, when it is
+    integral, cast to that dtype so numpy need not upcast ``arr``.
+
+    See :func:`searchsorted` for a more general searchsorted implementation.
+    """
+    target_dtype = arr.dtype
+    ensure_within_integer_bounds(value, target_dtype)
+
+    sorter = None if sorter is None else ensure_platform_int(sorter)
+
+    # ints and integer arrays are cast to the dtype of arr; floats only
+    # when whole: 2.0 becomes 2, but 2.2 must *not* be converted to 2
+    castable = is_integer(value) or is_integer_dtype(value)
+    if not castable and hasattr(value, 'is_integer'):
+        castable = value.is_integer()
+    if castable:
+        value = np.asarray(value, dtype=target_dtype)
+
+    return arr.searchsorted(value, side=side, sorter=sorter)
+
+
+def searchsorted(arr, value, side="left", sorter=None):
+    """
+    Call ``arr.searchsorted(value)``, adjusting ``value`` to the dtype.
+
+    :func:`numpy.searchsorted` is only fast when ``value`` has the same
+    dtype as the searched array; otherwise numpy recasts ``arr`` to a
+    higher dtype, which is slow. Integer arrays are therefore routed to
+    :func:`searchsorted_integer`, which casts ``value`` when possible.
+
+    See :meth:`Index.searchsorted` for details on parameters and return value.
+    """
+    sorter = None if sorter is None else ensure_platform_int(sorter)
+
+    if not is_integer_dtype(arr):
+        # no dtype adjustment for non-integer arrays; defer to arr itself
+        return arr.searchsorted(value, side=side, sorter=sorter)
+
+    return searchsorted_integer(arr, value, side=side, sorter=sorter)
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2087,16 +2087,24 @@ def __rmatmul__(self, other):
     @Appender(base._shared_docs['searchsorted'])
     @deprecate_kwarg(old_arg_name='v', new_arg_name='value')
     def searchsorted(self, value, side='left', sorter=None):
-        if sorter is not None:
-            sorter = ensure_platform_int(sorter)
-        if not is_extension_type(self._values):
-            # numpy searchsorted is only fast if value is of same dtype as the
-            # searched array. Below we ensure that value has the right dtype,
-            # and is not 0-dimensional.
-            value = np.asarray(value, dtype=self._values.dtype)
-            value = value[..., np.newaxis] if value.ndim == 0 else value
-
-        return self._values.searchsorted(value, side=side, sorter=sorter)
+        simple_types = (is_integer_dtype, is_float_dtype, is_object_dtype,
+                        is_categorical_dtype)
+
+        if any(is_dtype(self) for is_dtype in simple_types):
+            result = com.searchsorted(self._values, value,
+                                      side=side, sorter=sorter)
+        else:
+            # e.g. datetimelike self: pass value through Series first
+            if sorter is not None:
+                sorter = ensure_platform_int(sorter)
+            value = Series(value)._values
+            result = self._values.searchsorted(value, side=side, sorter=sorter)
+
+        if is_scalar(result):
+            # wrap a scalar result so callers always get a 1-dim array
+            result = np.array([result])
+
+        return result
 
     # -------------------------------------------------------------------
     # Combination