diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 11cf2450d2f28..1a6234625ab93 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -66,6 +66,7 @@ Enhancements - Added ability to export Categorical data to Stata (:issue:`8633`). See :ref:`here ` for limitations of categorical variables exported to Stata data files. - Added ability to export Categorical data to to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas. +- Added support for ``searchsorted()`` on `Categorical` class (:issue:`8420`). - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`). - Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See :ref:`here`. - Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`). See :ref:`here ` for more information on importing categorical variables from Stata data files. diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 5b3e9e8a22b12..b91b46283e2fe 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -776,7 +776,61 @@ def nbytes(self): return self._codes.nbytes + self._categories.values.nbytes def searchsorted(self, v, side='left', sorter=None): - raise NotImplementedError("See https://github.com/pydata/pandas/issues/8420") + """Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted Categorical `self` such that, if the + corresponding elements in `v` were inserted before the indices, the + order of `self` would be preserved. + + Parameters + ---------- + v : array_like + Array-like values or a scalar value, to insert/search for in `self`. 
+ side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + + Returns + ------- + indices : array of ints + Array of insertion points with the same shape as `v`. + + See Also + -------- + Series.searchsorted + numpy.searchsorted + + Notes + ----- + Binary search is used to find the required insertion points. The values in `v` are first located within the categories by binary search as well, so the categories are assumed to be stored in sorted order. + + Examples + -------- + >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk' ]) + [apple, bread, bread, cheese, milk] + Categories (4, object): [apple < bread < cheese < milk] + >>> x.searchsorted('bread') + array([1]) # Note: an array, not a scalar + >>> x.searchsorted(['bread']) + array([1]) + >>> x.searchsorted(['bread', 'eggs']) + array([1, 4]) + >>> x.searchsorted(['bread', 'eggs'], side='right') + array([3, 4]) # eggs before milk + >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]) + >>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) + array([3, 5]) # eggs after donuts, after switching milk and donuts + """ + if not self.ordered: + raise ValueError("searchsorted requires an ordered Categorical.") + + from pandas.core.series import Series + values_as_codes = self.categories.values.searchsorted(Series(v).values, side) + return self.codes.searchsorted(values_as_codes, sorter=sorter) def isnull(self): """ diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 196ad8b7680b9..e04be787d04ee 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -888,13 +888,47 @@ def test_nbytes(self): self.assertEqual(cat.nbytes, exp) def test_searchsorted(self): + #
https://github.com/pydata/pandas/issues/8420 + s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ]) + s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]) + c1 = pd.Categorical(s1) + c2 = pd.Categorical(s2) + + # Single item array + res = c1.searchsorted(['bread']) + chk = s1.searchsorted(['bread']) + exp = np.array([1]) + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) - # See https://github.com/pydata/pandas/issues/8420 - # TODO: implement me... - cat = pd.Categorical([1,2,3]) - def f(): - cat.searchsorted(3) - self.assertRaises(NotImplementedError, f) + # Scalar version of single item array + # Categorical returns an np.array, like pd.Series but unlike np.array.searchsorted() + res = c1.searchsorted('bread') + chk = s1.searchsorted('bread') + exp = np.array([1]) + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + + # Searching for a value that is not present in the Categorical + res = c1.searchsorted(['bread', 'eggs']) + chk = s1.searchsorted(['bread', 'eggs']) + exp = np.array([1, 4]) + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + + # Searching for a value that is not present, to the right + res = c1.searchsorted(['bread', 'eggs'], side='right') + chk = s1.searchsorted(['bread', 'eggs'], side='right') + exp = np.array([3, 4]) # eggs before milk + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + + # As above, but with a sorter array to reorder an unsorted array + res = c2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) + chk = s2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) + exp = np.array([3, 5]) # eggs after donuts, after switching milk and donuts + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) def test_deprecated_labels(self): # TODO: labels is deprecated and should be removed in 0.18 or 2017,
whatever is earlier