diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 5783d3c2353aa..047eb848b7540 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -199,8 +199,10 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): Returns ------- - tuple - 1-d indexer ordered by groups, group counts. + ndarray[intp_t, ndim=1] + Indexer + ndarray[int64_t, ndim=1] + Group Counts Notes ----- @@ -208,11 +210,12 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): """ cdef: Py_ssize_t i, loc, label, n - ndarray[int64_t] counts, where, result + ndarray[int64_t] counts, where + ndarray[intp_t] indexer counts = np.zeros(ngroups + 1, dtype=np.int64) n = len(index) - result = np.zeros(n, dtype=np.int64) + indexer = np.zeros(n, dtype=np.intp) where = np.zeros(ngroups + 1, dtype=np.int64) with nogil: @@ -228,10 +231,10 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): # this is our indexer for i in range(n): label = index[i] + 1 - result[where[label]] = i + indexer[where[label]] = i where[label] += 1 - return result, counts + return indexer, counts @cython.boundscheck(False) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 4eefd9d1f7267..cdf4ef3b119d2 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -66,7 +66,7 @@ def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values, {{else}} def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, {{endif}} - const int64_t[:] indexer, + const intp_t[:] indexer, {{c_type_out}}[:] out, fill_value=np.nan): @@ -102,7 +102,7 @@ def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values, {{else}} def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{endif}} - ndarray[int64_t] indexer, + ndarray[intp_t] indexer, {{c_type_out}}[:, :] out, fill_value=np.nan): cdef: @@ -156,7 +156,7 @@ def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values, {{else}} def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{endif}} - ndarray[int64_t] indexer, + ndarray[intp_t] indexer, {{c_type_out}}[:, :] out, fill_value=np.nan): diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 1bfb66cbf21ac..89020f2078584 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -19,6 +19,7 @@ from numpy cimport ( int16_t, int32_t, int64_t, + intp_t, ndarray, uint8_t, uint16_t, @@ -141,6 +142,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, Py_ssize_t i, j, N, K, ngroups, size ndarray[int64_t] _counts ndarray[float64_t, ndim=2] data + ndarray[intp_t] indexer float64_t* ptr assert min_count == -1, "'min_count' only used in add and prod" diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 511b373bc7e1f..c2947de943e1a 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -33,7 +33,8 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[int64_t] left_count, right_count, left_sorter, right_sorter + ndarray[intp_t] left_sorter, right_sorter + ndarray[int64_t] left_count, right_count ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 @@ -84,8 +85,8 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[int64_t] left_count, right_count, left_sorter, right_sorter - ndarray rev + ndarray[int64_t] left_count, right_count + ndarray[intp_t] rev, left_sorter, right_sorter ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 @@ -157,7 +158,8 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[int64_t] left_count, right_count, left_sorter, right_sorter + ndarray[intp_t] left_sorter, right_sorter + ndarray[int64_t] left_count, right_count ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc int64_t left_pos = 0, right_pos = 0 @@ -215,12 +217,16 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, _get_result_indexer(right_sorter, right_indexer)) -cdef _get_result_indexer(ndarray[int64_t] sorter, ndarray[int64_t] indexer): +cdef ndarray[int64_t] _get_result_indexer( + ndarray[intp_t] sorter, ndarray[int64_t] indexer +): if len(sorter) > 0: # cython-only equivalent to # `res = algos.take_nd(sorter, indexer, fill_value=-1)` res = np.empty(len(indexer), dtype=np.int64) - take_1d_int64_int64(sorter, indexer, res, -1) + take_1d_int64_int64(ensure_int64(sorter), ensure_platform_int(indexer), res, -1) + # FIXME: sorter is intp_t, not int64_t, opposite for indexer; + # will this break on 32bit builds? else: # length-0 case res = np.empty(len(indexer), dtype=np.int64) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index c1abd8bbf39d0..ba1b2a0f0e76e 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -117,10 +117,10 @@ def _take_nd_ndarray( ) -> np.ndarray: if indexer is None: - indexer = np.arange(arr.shape[axis], dtype=np.int64) + indexer = np.arange(arr.shape[axis], dtype=np.intp) dtype, fill_value = arr.dtype, arr.dtype.type() else: - indexer = ensure_int64(indexer, copy=False) + indexer = ensure_platform_int(indexer) indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( arr, indexer, out, fill_value, allow_fill ) @@ -317,7 +317,7 @@ def _get_take_nd_function( if func is None: def func(arr, indexer, out, fill_value=np.nan): - indexer = ensure_int64(indexer) + indexer = ensure_platform_int(indexer) _take_nd_object( arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info ) @@ -468,7 +468,7 @@ def wrapper( def _take_nd_object( arr: np.ndarray, - indexer: np.ndarray, + indexer: np.ndarray, # np.ndarray[np.intp] out: np.ndarray, axis: int, fill_value, @@ -544,4 +544,5 @@ def _take_preprocess_indexer_and_fill_value( # to crash when trying to cast it to dtype) dtype, fill_value = arr.dtype, arr.dtype.type() + indexer = ensure_platform_int(indexer) return indexer, dtype, fill_value, mask_info diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 864bd0684d445..00667aae5c9ff 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1961,7 +1961,8 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: Returns ------- - dict of categories -> indexers + Dict[Hashable, np.ndarray[np.intp]] + dict of categories -> indexers Examples -------- @@ -1979,7 +1980,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ categories = self.categories r, counts = libalgos.groupsort_indexer( - self.codes.astype("int64"), categories.size + self.codes.astype("int64", copy=False), categories.size ) counts = counts.cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 10c13327c79d3..3aa4d26f7dc8f 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -606,6 +606,7 @@ def get_group_index_sorter( ) if do_groupsort: sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups) + # sorter _should_ already be intp, but mypy is not yet able to verify else: sorter = group_index.argsort(kind="mergesort") return ensure_platform_int(sorter) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f685680515a8f..da438826a939a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1717,9 +1717,9 @@ def test_groupby_categorical_indices_unused_categories(): grouped = df.groupby("key", sort=False) result = grouped.indices expected = { - "b": np.array([0, 1], dtype="int64"), - "a": np.array([2], dtype="int64"), - "c": np.array([], dtype="int64"), + "b": np.array([0, 1], dtype="intp"), + "a": np.array([2], dtype="intp"), + "c": np.array([], dtype="intp"), } assert result.keys() == expected.keys() for key in result.keys(): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 876df69ae7f63..c8df18ddaeebe 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2123,19 +2123,19 @@ def test_groupsort_indexer(): # need to use a stable sort # np.argsort returns int, groupsort_indexer - # always returns int64 + # always returns intp expected = np.argsort(a, kind="mergesort") - expected = expected.astype(np.int64) + expected = expected.astype(np.intp) tm.assert_numpy_array_equal(result, expected) # compare with lexsort # np.lexsort returns int, groupsort_indexer - # always returns int64 + # always returns intp key = a * 1000 + b result = libalgos.groupsort_indexer(key, 1000000)[0] expected = np.lexsort((b, a)) - expected = expected.astype(np.int64) + expected = expected.astype(np.intp) tm.assert_numpy_array_equal(result, expected)