diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 23675752a4593..b49a9d7957d51 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -448,9 +448,11 @@ def isin(comps, values) -> np.ndarray: return f(comps, values) -def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=None): +def _factorize_array( + values, na_sentinel: int = -1, size_hint=None, na_value=None +) -> Tuple[np.ndarray, np.ndarray]: """ - Factorize an array-like to labels and uniques. + Factorize an array-like to codes and uniques. This doesn't do any coercion of types or unboxing before factorization. @@ -468,18 +470,16 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non Returns ------- - labels : ndarray + codes : ndarray uniques : ndarray """ hash_klass, values = _get_data_algo(values) table = hash_klass(size_hint or len(values)) - uniques, labels = table.factorize( - values, na_sentinel=na_sentinel, na_value=na_value - ) + uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value) - labels = ensure_platform_int(labels) - return labels, uniques + codes = ensure_platform_int(codes) + return codes, uniques _shared_docs[ @@ -1924,33 +1924,34 @@ def diff(arr, n: int, axis: int = 0): # this module. def safe_sort( values, - labels=None, + codes=None, na_sentinel: int = -1, assume_unique: bool = False, verify: bool = True, -): +) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: """ - Sort ``values`` and reorder corresponding ``labels``. - ``values`` should be unique if ``labels`` is not None. + Sort ``values`` and reorder corresponding ``codes``. + + ``values`` should be unique if ``codes`` is not None. Safe for use with mixed types (int, str), orders ints before strs. Parameters ---------- values : list-like - Sequence; must be unique if ``labels`` is not None. - labels : list_like + Sequence; must be unique if ``codes`` is not None. + codes : list_like, optional Indices to ``values``. All out of bound indices are treated as "not found" and will be masked with ``na_sentinel``. na_sentinel : int, default -1 - Value in ``labels`` to mark "not found". - Ignored when ``labels`` is None. + Value in ``codes`` to mark "not found". + Ignored when ``codes`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up - the calculation. Ignored when ``labels`` is None. + the calculation. Ignored when ``codes`` is None. verify : bool, default True - Check if labels are out of bound for the values and put out of bound - labels equal to na_sentinel. If ``verify=False``, it is assumed there - are no out of bound labels. Ignored when ``labels`` is None. + Check if codes are out of bound for the values and put out of bound + codes equal to na_sentinel. If ``verify=False``, it is assumed there + are no out of bound codes. Ignored when ``codes`` is None. .. versionadded:: 0.25.0 @@ -1958,17 +1959,17 @@ def safe_sort( ------- ordered : ndarray Sorted ``values`` - new_labels : ndarray - Reordered ``labels``; returned when ``labels`` is not None. + new_codes : ndarray + Reordered ``codes``; returned when ``codes`` is not None. Raises ------ TypeError - * If ``values`` is not list-like or if ``labels`` is neither None + * If ``values`` is not list-like or if ``codes`` is neither None nor list-like * If ``values`` cannot be sorted ValueError - * If ``labels`` is not None and ``values`` contain duplicates. + * If ``codes`` is not None and ``values`` contain duplicates. """ if not is_list_like(values): raise TypeError( @@ -2002,22 +2003,22 @@ def sort_mixed(values): # try this anyway ordered = sort_mixed(values) - # labels: + # codes: - if labels is None: + if codes is None: return ordered - if not is_list_like(labels): + if not is_list_like(codes): raise TypeError( "Only list-like objects or None are allowed to be" - "passed to safe_sort as labels" + "passed to safe_sort as codes" ) - labels = ensure_platform_int(np.asarray(labels)) + codes = ensure_platform_int(np.asarray(codes)) from pandas import Index if not assume_unique and not Index(values).is_unique: - raise ValueError("values should be unique if labels is not None") + raise ValueError("values should be unique if codes is not None") if sorter is None: # mixed types @@ -2029,9 +2030,9 @@ def sort_mixed(values): if na_sentinel == -1: # take_1d is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() - new_labels = take_1d(order2, labels, fill_value=-1) + new_codes = take_1d(order2, codes, fill_value=-1) if verify: - mask = (labels < -len(values)) | (labels >= len(values)) + mask = (codes < -len(values)) | (codes >= len(values)) else: mask = None else: @@ -2039,13 +2040,13 @@ def sort_mixed(values): reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `na_sentinel` next, so we # may deal with them here without performance loss using `mode='wrap'` - new_labels = reverse_indexer.take(labels, mode="wrap") + new_codes = reverse_indexer.take(codes, mode="wrap") - mask = labels == na_sentinel + mask = codes == na_sentinel if verify: - mask = mask | (labels < -len(values)) | (labels >= len(values)) + mask = mask | (codes < -len(values)) | (codes >= len(values)) if mask is not None: - np.putmask(new_labels, mask, na_sentinel) + np.putmask(new_codes, mask, na_sentinel) - return ordered, ensure_platform_int(new_labels) + return ordered, ensure_platform_int(new_codes) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 5d7eb70817a11..90cd9cc3e006d 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -314,27 +314,27 @@ def verify_order(df): def test_decons(): - def testit(label_list, shape): - group_index = get_group_index(label_list, shape, sort=True, xnull=True) - label_list2 = decons_group_index(group_index, shape) + def testit(codes_list, shape): + group_index = get_group_index(codes_list, shape, sort=True, xnull=True) + codes_list2 = decons_group_index(group_index, shape) - for a, b in zip(label_list, label_list2): + for a, b in zip(codes_list, codes_list2): tm.assert_numpy_array_equal(a, b) shape = (4, 5, 6) - label_list = [ + codes_list = [ np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64), ] - testit(label_list, shape) + testit(codes_list, shape) shape = (10000, 10000) - label_list = [ + codes_list = [ np.tile(np.arange(10000, dtype=np.int64), 5), np.tile(np.arange(10000, dtype=np.int64), 5), ] - testit(label_list, shape) + testit(codes_list, shape) class TestSafeSort: @@ -355,42 +355,42 @@ def test_basic_sort(self): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("verify", [True, False]) - def test_labels(self, verify): + def test_codes(self, verify): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) - labels = [0, 1, 1, 2, 3, 0, -1, 4] - result, result_labels = safe_sort(values, labels, verify=verify) - expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) + codes = [0, 1, 1, 2, 3, 0, -1, 4] + result, result_codes = safe_sort(values, codes, verify=verify) + expected_codes = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) # na_sentinel - labels = [0, 1, 1, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels, na_sentinel=99, verify=verify) - expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) + codes = [0, 1, 1, 2, 3, 0, 99, 4] + result, result_codes = safe_sort(values, codes, na_sentinel=99, verify=verify) + expected_codes = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) - labels = [] - result, result_labels = safe_sort(values, labels, verify=verify) - expected_labels = np.array([], dtype=np.intp) + codes = [] + result, result_codes = safe_sort(values, codes, verify=verify) + expected_codes = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) @pytest.mark.parametrize("na_sentinel", [-1, 99]) - def test_labels_out_of_bound(self, na_sentinel): + def test_codes_out_of_bound(self, na_sentinel): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) # out of bound indices - labels = [0, 101, 102, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels, na_sentinel=na_sentinel) - expected_labels = np.array( + codes = [0, 101, 102, 2, 3, 0, 99, 4] + result, result_codes = safe_sort(values, codes, na_sentinel=na_sentinel) + expected_codes = np.array( [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], dtype=np.intp ) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) def test_mixed_integer(self): values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object) @@ -399,12 +399,12 @@ def test_mixed_integer(self): tm.assert_numpy_array_equal(result, expected) values = np.array(["b", 1, 0, "a"], dtype=object) - labels = [0, 1, 2, 3, 0, -1, 1] - result, result_labels = safe_sort(values, labels) + codes = [0, 1, 2, 3, 0, -1, 1] + result, result_codes = safe_sort(values, codes) expected = np.array([0, 1, "a", "b"], dtype=object) - expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) + expected_codes = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) def test_mixed_integer_from_list(self): values = ["b", 1, 0, "a", 0, "b"] @@ -428,10 +428,10 @@ def test_exceptions(self): safe_sort(values=1) with pytest.raises(TypeError, match="Only list-like objects or None"): - safe_sort(values=[0, 1, 2], labels=1) + safe_sort(values=[0, 1, 2], codes=1) with pytest.raises(ValueError, match="values should be unique"): - safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) + safe_sort(values=[0, 1, 2, 1], codes=[0, 1]) def test_extension_array(self): # a = array([1, 3, np.nan, 2], dtype='Int64') @@ -443,12 +443,12 @@ def test_extension_array(self): @pytest.mark.parametrize("verify", [True, False]) @pytest.mark.parametrize("na_sentinel", [-1, 99]) - def test_extension_array_labels(self, verify, na_sentinel): + def test_extension_array_codes(self, verify, na_sentinel): a = array([1, 3, 2], dtype="Int64") - result, labels = safe_sort( + result, codes = safe_sort( a, [0, 1, na_sentinel, 2], na_sentinel=na_sentinel, verify=verify ) expected_values = array([1, 2, 3], dtype="Int64") - expected_labels = np.array([0, 2, na_sentinel, 1], dtype=np.intp) + expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp) tm.assert_extension_array_equal(result, expected_values) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes)