From 857386c073473c4b3ca9217d1c8a132e0b88d697 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 29 Sep 2021 22:54:04 -0400 Subject: [PATCH 1/4] wip --- pandas/_libs/algos.pxd | 4 +- pandas/_libs/algos.pyx | 121 +++++++++++------------- pandas/_libs/dtypes.pxd | 27 ++++++ pandas/_libs/groupby.pyx | 133 +++++++++++++-------------- pandas/_libs/join.pyx | 101 +++++++------------- pandas/_libs/reshape.pyx | 28 +----- pandas/_libs/util.pxd | 15 --- pandas/_libs/window/aggregations.pyx | 26 +++--- 8 files changed, 196 insertions(+), 259 deletions(-) diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 4f7cc9345ed30..fdeff2ed11805 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -1,7 +1,7 @@ -from pandas._libs.util cimport numeric +from pandas._libs.dtypes cimport numeric_t -cdef numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nogil +cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) nogil cdef enum TiebreakEnumType: TIEBREAK_AVERAGE diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 22e2abc9b9c36..7bb95ecf1ce35 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -45,7 +45,7 @@ from numpy cimport ( cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.dtypes cimport numeric_object_t +from pandas._libs.dtypes cimport numeric_object_t, numeric_t, iu_64_floating_obj_t from pandas._libs.khash cimport ( kh_destroy_int64, kh_get_int64, @@ -57,7 +57,6 @@ from pandas._libs.khash cimport ( ) from pandas._libs.util cimport ( get_nat, - numeric, ) import pandas._libs.missing as missing @@ -240,9 +239,9 @@ def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups): return indexer.base, counts.base -cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: +cdef inline Py_ssize_t swap(numeric_t *a, numeric_t *b) nogil: cdef: - numeric t + numeric_t t # cython doesn't allow pointer dereference so use array syntax t = a[0] @@ -251,7 +250,7 @@ cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: return 0 -cdef inline numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nogil: +cdef inline numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) nogil: """ See kth_smallest.__doc__. The additional parameter n specifies the maximum number of elements considered in arr, needed for compatibility with usage @@ -259,7 +258,7 @@ cdef inline numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nog """ cdef: Py_ssize_t i, j, l, m - numeric x + numeric_t x l = 0 m = n - 1 @@ -291,7 +290,7 @@ cdef inline numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nog @cython.boundscheck(False) @cython.wraparound(False) -def kth_smallest(numeric[::1] arr, Py_ssize_t k) -> numeric: +def kth_smallest(numeric_t[::1] arr, Py_ssize_t k) -> numeric_t: """ Compute the kth smallest value in arr. Note that the input array will be modified. @@ -309,7 +308,7 @@ def kth_smallest(numeric[::1] arr, Py_ssize_t k) -> numeric: The kth smallest value in arr """ cdef: - numeric result + numeric_t result with nogil: result = kth_smallest_c(&arr[0], k, arr.shape[0]) @@ -514,20 +513,6 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr # ---------------------------------------------------------------------- -ctypedef fused algos_t: - float64_t - float32_t - object - int64_t - int32_t - int16_t - int8_t - uint64_t - uint32_t - uint16_t - uint8_t - - def validate_limit(nobs: int | None, limit=None) -> int: """ Check that the `limit` argument is a positive integer. @@ -556,12 +541,12 @@ def validate_limit(nobs: int | None, limit=None) -> int: @cython.boundscheck(False) @cython.wraparound(False) -def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: +def pad(ndarray[numeric_object_t] old, ndarray[numeric_object_t] new, limit=None) -> ndarray: # -> ndarray[intp_t, ndim=1] cdef: Py_ssize_t i, j, nleft, nright ndarray[intp_t, ndim=1] indexer - algos_t cur, next_val + numeric_object_t cur, next_val int lim, fill_count = 0 nleft = len(old) @@ -614,10 +599,10 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): +def pad_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None): cdef: Py_ssize_t i, N - algos_t val + numeric_object_t val uint8_t prev_mask int lim, fill_count = 0 @@ -646,10 +631,10 @@ def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): @cython.boundscheck(False) @cython.wraparound(False) -def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None): +def pad_2d_inplace(numeric_object_t[:, :] values, const uint8_t[:, :] mask, limit=None): cdef: Py_ssize_t i, j, N, K - algos_t val + numeric_object_t val int lim, fill_count = 0 K, N = (values).shape @@ -702,12 +687,12 @@ D @cython.boundscheck(False) @cython.wraparound(False) -def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: +def backfill(ndarray[numeric_object_t] old, ndarray[numeric_object_t] new, limit=None) -> ndarray: # -> ndarray[intp_t, ndim=1] cdef: Py_ssize_t i, j, nleft, nright ndarray[intp_t, ndim=1] indexer - algos_t cur, prev + numeric_object_t cur, prev int lim, fill_count = 0 nleft = len(old) @@ -759,11 +744,11 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: return indexer -def backfill_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): +def backfill_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None): pad_inplace(values[::-1], mask[::-1], limit=limit) -def backfill_2d_inplace(algos_t[:, :] values, +def backfill_2d_inplace(numeric_object_t[:, :] values, const uint8_t[:, :] mask, limit=None): pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit) @@ -771,7 +756,7 @@ def backfill_2d_inplace(algos_t[:, :] values, @cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): +def is_monotonic(ndarray[numeric_object_t, ndim=1] arr, bint timelike): """ Returns ------- @@ -782,7 +767,7 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): """ cdef: Py_ssize_t i, n - algos_t prev, cur + numeric_object_t prev, cur bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 bint is_unique = 1 @@ -802,7 +787,7 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): if timelike and arr[0] == NPY_NAT: return False, False, True - if algos_t is not object: + if numeric_object_t is not object: with nogil: prev = arr[0] for i in range(1, n): @@ -861,9 +846,9 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): # rank_1d, rank_2d # ---------------------------------------------------------------------- -cdef numeric_object_t get_rank_nan_fill_val( +cdef iu_64_floating_obj_t get_rank_nan_fill_val( bint rank_nans_highest, - numeric_object_t[:] _=None + iu_64_floating_obj_t[:] _=None ): """ Return the value we'll use to represent missing values when sorting depending @@ -871,20 +856,20 @@ cdef numeric_object_t get_rank_nan_fill_val( is unused, but needed for fused type specialization) """ if rank_nans_highest: - if numeric_object_t is object: + if iu_64_floating_obj_t is object: return Infinity() - elif numeric_object_t is int64_t: + elif iu_64_floating_obj_t is int64_t: return util.INT64_MAX - elif numeric_object_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: return util.UINT64_MAX else: return np.inf else: - if numeric_object_t is object: + if iu_64_floating_obj_t is object: return NegInfinity() - elif numeric_object_t is int64_t: + elif iu_64_floating_obj_t is int64_t: return NPY_NAT - elif numeric_object_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: return 0 else: return -np.inf @@ -893,7 +878,7 @@ cdef numeric_object_t get_rank_nan_fill_val( @cython.wraparound(False) @cython.boundscheck(False) def rank_1d( - ndarray[numeric_object_t, ndim=1] values, + ndarray[iu_64_floating_obj_t, ndim=1] values, const intp_t[:] labels=None, bint is_datetimelike=False, ties_method="average", @@ -906,7 +891,7 @@ def rank_1d( Parameters ---------- - values : array of numeric_object_t values to be ranked + values : array of iu_64_floating_obj_t values to be ranked labels : np.ndarray[np.intp] or None Array containing unique label for each group, with its ordering matching up to the corresponding record in `values`. If not called @@ -936,11 +921,11 @@ def rank_1d( int64_t[::1] grp_sizes intp_t[:] lexsort_indexer float64_t[::1] out - ndarray[numeric_object_t, ndim=1] masked_vals - numeric_object_t[:] masked_vals_memview + ndarray[iu_64_floating_obj_t, ndim=1] masked_vals + iu_64_floating_obj_t[:] masked_vals_memview uint8_t[:] mask bint keep_na, nans_rank_highest, check_labels, check_mask - numeric_object_t nan_fill_val + iu_64_floating_obj_t nan_fill_val tiebreak = tiebreakers[ties_method] if tiebreak == TIEBREAK_FIRST: @@ -961,22 +946,22 @@ def rank_1d( check_labels = labels is not None # For cases where a mask is not possible, we can avoid mask checks - check_mask = not (numeric_object_t is uint64_t or - (numeric_object_t is int64_t and not is_datetimelike)) + check_mask = not (iu_64_floating_obj_t is uint64_t or + (iu_64_floating_obj_t is int64_t and not is_datetimelike)) # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array - if numeric_object_t is object and values.dtype != np.object_: + if iu_64_floating_obj_t is object and values.dtype != np.object_: masked_vals = values.astype('O') else: masked_vals = values.copy() - if numeric_object_t is object: + if iu_64_floating_obj_t is object: mask = missing.isnaobj(masked_vals) - elif numeric_object_t is int64_t and is_datetimelike: + elif iu_64_floating_obj_t is int64_t and is_datetimelike: mask = (masked_vals == NPY_NAT).astype(np.uint8) - elif numeric_object_t is float64_t: + elif iu_64_floating_obj_t is float64_t: mask = np.isnan(masked_vals).astype(np.uint8) else: mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) @@ -988,7 +973,7 @@ def rank_1d( # will flip the ordering to still end up with lowest rank. # Symmetric logic applies to `na_option == 'bottom'` nans_rank_highest = ascending ^ (na_option == 'top') - nan_fill_val = get_rank_nan_fill_val[numeric_object_t](nans_rank_highest) + nan_fill_val = get_rank_nan_fill_val[iu_64_floating_obj_t](nans_rank_highest) if nans_rank_highest: order = [masked_vals, mask] else: @@ -1035,7 +1020,7 @@ cdef void rank_sorted_1d( int64_t[::1] grp_sizes, const intp_t[:] sort_indexer, # Can make const with cython3 (https://github.com/cython/cython/issues/3222) - numeric_object_t[:] masked_vals, + iu_64_floating_obj_t[:] masked_vals, const uint8_t[:] mask, bint check_mask, Py_ssize_t N, @@ -1059,7 +1044,7 @@ cdef void rank_sorted_1d( if labels is None. sort_indexer : intp_t[:] Array of indices which sorts masked_vals - masked_vals : numeric_object_t[:] + masked_vals : iu_64_floating_obj_t[:] The values input to rank_1d, with missing values replaced by fill values mask : uint8_t[:] Array where entries are True if the value is missing, False otherwise. @@ -1091,7 +1076,7 @@ cdef void rank_sorted_1d( # that sorted value for retrieval back from the original # values / masked_vals arrays # TODO: de-duplicate once cython supports conditional nogil - if numeric_object_t is object: + if iu_64_floating_obj_t is object: with gil: for i in range(N): at_end = i == N - 1 @@ -1299,7 +1284,7 @@ cdef void rank_sorted_1d( def rank_2d( - ndarray[numeric_object_t, ndim=2] in_arr, + ndarray[iu_64_floating_obj_t, ndim=2] in_arr, int axis=0, bint is_datetimelike=False, ties_method="average", @@ -1314,13 +1299,13 @@ def rank_2d( Py_ssize_t k, n, col float64_t[::1, :] out # Column-major so columns are contiguous int64_t[::1] grp_sizes - ndarray[numeric_object_t, ndim=2] values - numeric_object_t[:, :] masked_vals + ndarray[iu_64_floating_obj_t, ndim=2] values + iu_64_floating_obj_t[:, :] masked_vals intp_t[:, :] sort_indexer uint8_t[:, :] mask TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest - numeric_object_t nan_fill_val + iu_64_floating_obj_t nan_fill_val tiebreak = tiebreakers[ties_method] if tiebreak == TIEBREAK_FIRST: @@ -1330,25 +1315,25 @@ def rank_2d( keep_na = na_option == 'keep' # For cases where a mask is not possible, we can avoid mask checks - check_mask = not (numeric_object_t is uint64_t or - (numeric_object_t is int64_t and not is_datetimelike)) + check_mask = not (iu_64_floating_obj_t is uint64_t or + (iu_64_floating_obj_t is int64_t and not is_datetimelike)) if axis == 1: values = np.asarray(in_arr).T.copy() else: values = np.asarray(in_arr).copy() - if numeric_object_t is object: + if iu_64_floating_obj_t is object: if values.dtype != np.object_: values = values.astype('O') nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: - nan_fill_val = get_rank_nan_fill_val[numeric_object_t](nans_rank_highest) + nan_fill_val = get_rank_nan_fill_val[iu_64_floating_obj_t](nans_rank_highest) - if numeric_object_t is object: + if iu_64_floating_obj_t is object: mask = missing.isnaobj2d(values).view(np.uint8) - elif numeric_object_t is float64_t: + elif iu_64_floating_obj_t is float64_t: mask = np.isnan(values).view(np.uint8) # int64 and datetimelike diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd index ef95b8aab6e70..ad579ada7417a 100644 --- a/pandas/_libs/dtypes.pxd +++ b/pandas/_libs/dtypes.pxd @@ -5,13 +5,40 @@ Common location for shared fused types from numpy cimport ( float32_t, float64_t, + int8_t, + int16_t, + int32_t, int64_t, + uint8_t, + uint16_t, + uint32_t, uint64_t, ) +ctypedef fused numeric_t: + int8_t + int16_t + int32_t + int64_t + + uint8_t + uint16_t + uint32_t + uint64_t + + float32_t + float64_t + ctypedef fused numeric_object_t: + numeric_t + object + +ctypedef fused iu_64_floating_t: float64_t float32_t int64_t uint64_t + +ctypedef fused iu_64_floating_obj_t: + iu_64_floating_t object diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index bbdc5a8287502..6988c2cf7f28e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -33,7 +33,6 @@ cnp.import_array() from pandas._libs.algos cimport kth_smallest_c from pandas._libs.util cimport ( get_nat, - numeric, ) from pandas._libs.algos import ( @@ -43,7 +42,7 @@ from pandas._libs.algos import ( take_2d_axis1_float64_float64, ) -from pandas._libs.dtypes cimport numeric_object_t +from pandas._libs.dtypes cimport iu_64_floating_obj_t, iu_64_floating_t, numeric_t from pandas._libs.missing cimport checknull @@ -201,8 +200,8 @@ def group_cumprod_float64(float64_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cumsum(numeric[:, ::1] out, - ndarray[numeric, ndim=2] values, +def group_cumsum(numeric_t[:, ::1] out, + ndarray[numeric_t, ndim=2] values, const intp_t[::1] labels, int ngroups, is_datetimelike, @@ -231,8 +230,8 @@ def group_cumsum(numeric[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, size - numeric val, y, t - numeric[:, ::1] accum, compensation + numeric_t val, y, t + numeric_t[:, ::1] accum, compensation intp_t lab N, K = (values).shape @@ -250,7 +249,7 @@ def group_cumsum(numeric[:, ::1] out, # For floats, use Kahan summation to reduce floating-point # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm) - if numeric == float32_t or numeric == float64_t: + if numeric_t == float32_t or numeric_t == float64_t: if val == val: y = val - compensation[lab, j] t = accum[lab, j] + y @@ -806,7 +805,7 @@ def group_ohlc(floating[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) def group_quantile(ndarray[float64_t, ndim=2] out, - ndarray[numeric, ndim=1] values, + ndarray[numeric_t, ndim=1] values, ndarray[intp_t] labels, ndarray[uint8_t] mask, const intp_t[:] sort_indexer, @@ -922,15 +921,15 @@ def group_quantile(ndarray[float64_t, ndim=2] out, # group_nth, group_last, group_rank # ---------------------------------------------------------------------- -cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil: - if numeric_object_t is object: +cdef inline bint _treat_as_na(iu_64_floating_obj_t val, bint is_datetimelike) nogil: + if iu_64_floating_obj_t is object: # Should never be used, but we need to avoid the `val != val` below # or else cython will raise about gil acquisition. raise NotImplementedError - elif numeric_object_t is int64_t: + elif iu_64_floating_obj_t is int64_t: return is_datetimelike and val == NPY_NAT - elif numeric_object_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: # There is no NA value for uint64 return False else: @@ -938,12 +937,12 @@ cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil: # GH#31710 use memorviews once cython 0.30 is released so we can -# use `const numeric_object_t[:, :] values` +# use `const iu_64_floating_obj_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) -def group_last(numeric_object_t[:, ::1] out, +def group_last(iu_64_floating_obj_t[:, ::1] out, int64_t[::1] counts, - ndarray[numeric_object_t, ndim=2] values, + ndarray[iu_64_floating_obj_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1) -> None: """ @@ -951,8 +950,8 @@ def group_last(numeric_object_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - numeric_object_t val - ndarray[numeric_object_t, ndim=2] resx + iu_64_floating_obj_t val + ndarray[iu_64_floating_obj_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs bint runtime_error = False @@ -963,14 +962,14 @@ def group_last(numeric_object_t[:, ::1] out, min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) - if numeric_object_t is object: + if iu_64_floating_obj_t is object: resx = np.empty((out).shape, dtype=object) else: resx = np.empty_like(out) N, K = (values).shape - if numeric_object_t is object: + if iu_64_floating_obj_t is object: # TODO: De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] @@ -1012,9 +1011,9 @@ def group_last(numeric_object_t[:, ::1] out, for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - if numeric_object_t is int64_t: + if iu_64_floating_obj_t is int64_t: out[i, j] = NPY_NAT - elif numeric_object_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: runtime_error = True break else: @@ -1030,12 +1029,12 @@ def group_last(numeric_object_t[:, ::1] out, # GH#31710 use memorviews once cython 0.30 is released so we can -# use `const numeric_object_t[:, :] values` +# use `const iu_64_floating_obj_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) -def group_nth(numeric_object_t[:, ::1] out, +def group_nth(iu_64_floating_obj_t[:, ::1] out, int64_t[::1] counts, - ndarray[numeric_object_t, ndim=2] values, + ndarray[iu_64_floating_obj_t, ndim=2] values, const intp_t[::1] labels, int64_t min_count=-1, int64_t rank=1, @@ -1045,8 +1044,8 @@ def group_nth(numeric_object_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - numeric_object_t val - ndarray[numeric_object_t, ndim=2] resx + iu_64_floating_obj_t val + ndarray[iu_64_floating_obj_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs bint runtime_error = False @@ -1057,14 +1056,14 @@ def group_nth(numeric_object_t[:, ::1] out, min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) - if numeric_object_t is object: + if iu_64_floating_obj_t is object: resx = np.empty((out).shape, dtype=object) else: resx = np.empty_like(out) N, K = (values).shape - if numeric_object_t is object: + if iu_64_floating_obj_t is object: # TODO: De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] @@ -1109,9 +1108,9 @@ def group_nth(numeric_object_t[:, ::1] out, for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - if numeric_object_t is int64_t: + if iu_64_floating_obj_t is int64_t: out[i, j] = NPY_NAT - elif numeric_object_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: runtime_error = True break else: @@ -1128,7 +1127,7 @@ def group_nth(numeric_object_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) def group_rank(float64_t[:, ::1] out, - ndarray[numeric_object_t, ndim=2] values, + ndarray[iu_64_floating_obj_t, ndim=2] values, const intp_t[::1] labels, int ngroups, bint is_datetimelike, str ties_method="average", @@ -1140,7 +1139,7 @@ def group_rank(float64_t[:, ::1] out, ---------- out : np.ndarray[np.float64, ndim=2] Values to which this method will write its results. - values : np.ndarray of numeric_object_t values to be ranked + values : np.ndarray of iu_64_floating_obj_t values to be ranked labels : np.ndarray[np.intp] Array containing unique label for each group, with its ordering matching up to the corresponding record in `values` @@ -1195,18 +1194,12 @@ def group_rank(float64_t[:, ::1] out, # ---------------------------------------------------------------------- # TODO: consider implementing for more dtypes -ctypedef fused groupby_t: - float64_t - float32_t - int64_t - uint64_t - @cython.wraparound(False) @cython.boundscheck(False) -cdef group_min_max(groupby_t[:, ::1] out, +cdef group_min_max(iu_64_floating_t[:, ::1] out, int64_t[::1] counts, - ndarray[groupby_t, ndim=2] values, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1218,7 +1211,7 @@ cdef group_min_max(groupby_t[:, ::1] out, Parameters ---------- - out : np.ndarray[groupby_t, ndim=2] + out : np.ndarray[iu_64_floating_t, ndim=2] Array to store result in. counts : np.ndarray[int64] Input as a zeroed array, populated by group sizes during algorithm @@ -1247,8 +1240,8 @@ cdef group_min_max(groupby_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, lab, ngroups = len(counts) - groupby_t val, nan_val - ndarray[groupby_t, ndim=2] group_min_or_max + iu_64_floating_t val, nan_val + ndarray[iu_64_floating_t, ndim=2] group_min_or_max bint runtime_error = False int64_t[:, ::1] nobs bint uses_mask = mask is not None @@ -1263,10 +1256,10 @@ cdef group_min_max(groupby_t[:, ::1] out, nobs = np.zeros((out).shape, dtype=np.int64) group_min_or_max = np.empty_like(out) - if groupby_t is int64_t: + if iu_64_floating_t is int64_t: group_min_or_max[:] = -_int64_max if compute_max else _int64_max nan_val = NPY_NAT - elif groupby_t is uint64_t: + elif iu_64_floating_t is uint64_t: # NB: We do not define nan_val because there is no such thing # for uint64_t. We carefully avoid having to reference it in this # case. @@ -1304,7 +1297,7 @@ cdef group_min_max(groupby_t[:, ::1] out, for i in range(ngroups): for j in range(K): if nobs[i, j] < min_count: - if groupby_t is uint64_t: + if iu_64_floating_t is uint64_t: runtime_error = True break else: @@ -1323,9 +1316,9 @@ cdef group_min_max(groupby_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_max(groupby_t[:, ::1] out, +def group_max(iu_64_floating_t[:, ::1] out, int64_t[::1] counts, - ndarray[groupby_t, ndim=2] values, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1347,9 +1340,9 @@ def group_max(groupby_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_min(groupby_t[:, ::1] out, +def group_min(iu_64_floating_t[:, ::1] out, int64_t[::1] counts, - ndarray[groupby_t, ndim=2] values, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1371,8 +1364,8 @@ def group_min(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -cdef group_cummin_max(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +cdef group_cummin_max(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, uint8_t[:, ::1] mask, const intp_t[::1] labels, int ngroups, @@ -1384,9 +1377,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, Parameters ---------- - out : np.ndarray[groupby_t, ndim=2] + out : np.ndarray[iu_64_floating_t, ndim=2] Array to store cummin/max in. - values : np.ndarray[groupby_t, ndim=2] + values : np.ndarray[iu_64_floating_t, ndim=2] Values to take cummin/max of. mask : np.ndarray[bool] or None If not None, indices represent missing values, @@ -1408,12 +1401,12 @@ cdef group_cummin_max(groupby_t[:, ::1] out, This method modifies the `out` parameter, rather than returning an object. """ cdef: - groupby_t[:, ::1] accum + iu_64_floating_t[:, ::1] accum accum = np.empty((ngroups, (values).shape[1]), dtype=values.dtype) - if groupby_t is int64_t: + if iu_64_floating_t is int64_t: accum[:] = -_int64_max if compute_max else _int64_max - elif groupby_t is uint64_t: + elif iu_64_floating_t is uint64_t: accum[:] = 0 if compute_max else np.iinfo(np.uint64).max else: accum[:] = -np.inf if compute_max else np.inf @@ -1426,10 +1419,10 @@ cdef group_cummin_max(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -cdef cummin_max(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +cdef cummin_max(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, - groupby_t[:, ::1] accum, + iu_64_floating_t[:, ::1] accum, bint skipna, bint is_datetimelike, bint compute_max): @@ -1439,12 +1432,12 @@ cdef cummin_max(groupby_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K - groupby_t val, mval, na_val + iu_64_floating_t val, mval, na_val uint8_t[:, ::1] seen_na intp_t lab bint na_possible - if groupby_t is float64_t or groupby_t is float32_t: + if iu_64_floating_t is float64_t or iu_64_floating_t is float32_t: na_val = NaN na_possible = True elif is_datetimelike: @@ -1485,11 +1478,11 @@ cdef cummin_max(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -cdef masked_cummin_max(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +cdef masked_cummin_max(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, uint8_t[:, ::1] mask, const intp_t[::1] labels, - groupby_t[:, ::1] accum, + iu_64_floating_t[:, ::1] accum, bint skipna, bint compute_max): """ @@ -1498,7 +1491,7 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K - groupby_t val, mval + iu_64_floating_t val, mval uint8_t[:, ::1] seen_na intp_t lab @@ -1529,8 +1522,8 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cummin(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +def group_cummin(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, int ngroups, bint is_datetimelike, @@ -1551,8 +1544,8 @@ def group_cummin(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cummax(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +def group_cummax(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, int ngroups, bint is_datetimelike, diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index b6acf8914c0a6..286a1a189db4c 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -4,23 +4,16 @@ import numpy as np cimport numpy as cnp from numpy cimport ( - float32_t, - float64_t, - int8_t, - int16_t, - int32_t, int64_t, intp_t, ndarray, - uint8_t, - uint16_t, - uint32_t, uint64_t, ) cnp.import_array() from pandas._libs.algos import groupsort_indexer +from pandas._libs.dtypes cimport numeric_object_t, numeric_t @cython.wraparound(False) @@ -257,31 +250,17 @@ def ffill_indexer(const intp_t[:] indexer) -> np.ndarray: # left_join_indexer, inner_join_indexer, outer_join_indexer # ---------------------------------------------------------------------- -ctypedef fused join_t: - float64_t - float32_t - object - int8_t - int16_t - int32_t - int64_t - uint8_t - uint16_t - uint32_t - uint64_t - - # Joins on ordered, unique indices # right might contain non-unique values @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): +def left_join_indexer_unique(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right): cdef: Py_ssize_t i, j, nleft, nright ndarray[intp_t] indexer - join_t lval, rval + numeric_object_t lval, rval i = 0 j = 0 @@ -322,15 +301,15 @@ def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): +def left_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right): """ Two-pass algorithm for monotonic indexes. Handles many-to-one merges. """ cdef: Py_ssize_t i, j, k, nright, nleft, count - join_t lval, rval + numeric_object_t lval, rval ndarray[intp_t] lindexer, rindexer - ndarray[join_t] result + ndarray[numeric_object_t] result nleft = len(left) nright = len(right) @@ -425,15 +404,15 @@ def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): @cython.wraparound(False) @cython.boundscheck(False) -def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): +def inner_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right): """ Two-pass algorithm for monotonic indexes. Handles many-to-one merges. """ cdef: Py_ssize_t i, j, k, nright, nleft, count - join_t lval, rval + numeric_object_t lval, rval ndarray[intp_t] lindexer, rindexer - ndarray[join_t] result + ndarray[numeric_object_t] result nleft = len(left) nright = len(right) @@ -518,12 +497,12 @@ def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): @cython.wraparound(False) @cython.boundscheck(False) -def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): +def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right): cdef: Py_ssize_t i, j, nright, nleft, count - join_t lval, rval + numeric_object_t lval, rval ndarray[intp_t] lindexer, rindexer - ndarray[join_t] result + ndarray[numeric_object_t] result nleft = len(left) nright = len(right) @@ -656,26 +635,14 @@ from pandas._libs.hashtable cimport ( UInt64HashTable, ) -ctypedef fused asof_t: - uint8_t - uint16_t - uint32_t - uint64_t - int8_t - int16_t - int32_t - int64_t - float - float64_t - ctypedef fused by_t: object int64_t uint64_t -def asof_join_backward_on_X_by_Y(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_backward_on_X_by_Y(numeric_t[:] left_values, + numeric_t[:] right_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=True, @@ -685,8 +652,8 @@ def asof_join_backward_on_X_by_Y(asof_t[:] left_values, Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False - asof_t tolerance_ = 0 - asof_t diff = 0 + numeric_t tolerance_ = 0 + numeric_t diff = 0 HashTable hash_table by_t by_value @@ -743,8 +710,8 @@ def asof_join_backward_on_X_by_Y(asof_t[:] left_values, return left_indexer, right_indexer -def asof_join_forward_on_X_by_Y(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_forward_on_X_by_Y(numeric_t[:] left_values, + numeric_t[:] right_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=1, @@ -754,8 +721,8 @@ def asof_join_forward_on_X_by_Y(asof_t[:] left_values, Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False - asof_t tolerance_ = 0 - asof_t diff = 0 + numeric_t tolerance_ = 0 + numeric_t diff = 0 HashTable hash_table by_t by_value @@ -812,8 +779,8 @@ def asof_join_forward_on_X_by_Y(asof_t[:] left_values, return left_indexer, right_indexer -def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_nearest_on_X_by_Y(numeric_t[:] left_values, + numeric_t[:] right_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=True, @@ -822,7 +789,7 @@ def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, cdef: Py_ssize_t left_size, right_size, i ndarray[intp_t] left_indexer, right_indexer, bli, bri, fli, fri - asof_t bdiff, fdiff + numeric_t bdiff, fdiff left_size = len(left_values) right_size = len(right_values) @@ -865,8 +832,8 @@ def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, # asof_join # ---------------------------------------------------------------------- -def asof_join_backward(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_backward(numeric_t[:] left_values, + numeric_t[:] right_values, bint allow_exact_matches=True, tolerance=None): @@ -874,8 +841,8 @@ def asof_join_backward(asof_t[:] left_values, Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False - asof_t tolerance_ = 0 - asof_t diff = 0 + numeric_t tolerance_ = 0 + numeric_t diff = 0 # if we are using tolerance, set our objects if tolerance is not None: @@ -918,8 +885,8 @@ def asof_join_backward(asof_t[:] left_values, return left_indexer, right_indexer -def asof_join_forward(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_forward(numeric_t[:] left_values, + numeric_t[:] right_values, bint allow_exact_matches=True, tolerance=None): @@ -927,8 +894,8 @@ def asof_join_forward(asof_t[:] left_values, Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False - asof_t tolerance_ = 0 - asof_t diff = 0 + numeric_t tolerance_ = 0 + numeric_t diff = 0 # if we are using tolerance, set our objects if tolerance is not None: @@ -972,15 +939,15 @@ def asof_join_forward(asof_t[:] left_values, return left_indexer, right_indexer -def asof_join_nearest(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_nearest(numeric_t[:] left_values, + numeric_t[:] right_values, bint allow_exact_matches=True, tolerance=None): cdef: Py_ssize_t left_size, right_size, i ndarray[intp_t] left_indexer, right_indexer, bli, bri, fli, fri - asof_t bdiff, fdiff + numeric_t bdiff, fdiff left_size = len(left_values) right_size = len(right_values) diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 959d83a55d4f3..45e9da52c0663 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -2,17 +2,9 @@ import cython from cython import Py_ssize_t from numpy cimport ( - float32_t, - float64_t, - int8_t, - int16_t, - int32_t, int64_t, ndarray, uint8_t, - uint16_t, - uint32_t, - uint64_t, ) import numpy as np @@ -22,26 +14,14 @@ cimport numpy as cnp cnp.import_array() from pandas._libs.lib cimport c_is_list_like - -ctypedef fused reshape_t: - uint8_t - uint16_t - uint32_t - uint64_t - int8_t - int16_t - int32_t - int64_t - float32_t - float64_t - object +from pandas._libs.dtypes cimport numeric_object_t @cython.wraparound(False) @cython.boundscheck(False) -def unstack(reshape_t[:, :] values, const uint8_t[:] mask, +def unstack(numeric_object_t[:, :] values, const uint8_t[:] mask, Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width, - reshape_t[:, :] new_values, uint8_t[:, :] new_mask) -> None: + numeric_object_t[:, :] new_values, uint8_t[:, :] new_mask) -> None: """ Transform long values to wide new_values. @@ -60,7 +40,7 @@ def unstack(reshape_t[:, :] values, const uint8_t[:] mask, cdef: Py_ssize_t i, j, w, nulls, s, offset - if reshape_t is not object: + if numeric_object_t is not object: # evaluated at compile-time with nogil: for i in range(stride): diff --git a/pandas/_libs/util.pxd b/pandas/_libs/util.pxd index be22fc368c28f..df88c896ac593 100644 --- a/pandas/_libs/util.pxd +++ b/pandas/_libs/util.pxd @@ -16,18 +16,3 @@ cdef extern from "src/headers/stdint.h": enum: INT32_MIN enum: INT64_MAX enum: INT64_MIN - - -ctypedef fused numeric: - cnp.int8_t - cnp.int16_t - cnp.int32_t - cnp.int64_t - - cnp.uint8_t - cnp.uint16_t - cnp.uint32_t - cnp.uint64_t - - cnp.float32_t - cnp.float64_t diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index ea52bd24a3689..4fdc9ad393470 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -28,7 +28,7 @@ cdef extern from "src/headers/cmath" namespace "std": from pandas._libs.algos import is_monotonic -from pandas._libs.util cimport numeric +from pandas._libs.dtypes cimport numeric_t cdef extern from "../src/skiplist.h": @@ -851,18 +851,18 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, # https://github.com/pydata/bottleneck -cdef inline numeric init_mm(numeric ai, Py_ssize_t *nobs, bint is_max) nogil: +cdef inline numeric_t init_mm(numeric_t ai, Py_ssize_t *nobs, bint is_max) nogil: - if numeric in cython.floating: + if numeric_t in cython.floating: if ai == ai: nobs[0] = nobs[0] + 1 elif is_max: - if numeric == cython.float: + if numeric_t == cython.float: ai = MINfloat32 else: ai = MINfloat64 else: - if numeric == cython.float: + if numeric_t == cython.float: ai = MAXfloat32 else: ai = MAXfloat64 @@ -873,18 +873,18 @@ cdef inline numeric init_mm(numeric ai, Py_ssize_t *nobs, bint is_max) nogil: return ai -cdef inline void remove_mm(numeric aold, Py_ssize_t *nobs) nogil: +cdef inline void remove_mm(numeric_t aold, Py_ssize_t *nobs) nogil: """ remove a value from the mm calc """ - if numeric in cython.floating and aold == aold: + if numeric_t in cython.floating and aold == aold: nobs[0] = nobs[0] - 1 -cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, - numeric value) nogil: +cdef inline numeric_t calc_mm(int64_t minp, Py_ssize_t nobs, + numeric_t value) nogil: cdef: - numeric result + numeric_t result - if numeric in cython.floating: + if numeric_t in cython.floating: if nobs >= minp: result = value else: @@ -940,13 +940,13 @@ def roll_min(ndarray[float64_t] values, ndarray[int64_t] start, return _roll_min_max(values, start, end, minp, is_max=0) -cdef _roll_min_max(ndarray[numeric] values, +cdef _roll_min_max(ndarray[numeric_t ] values, ndarray[int64_t] starti, ndarray[int64_t] endi, int64_t minp, bint is_max): cdef: - numeric ai + numeric_t ai int64_t curr_win_size, start Py_ssize_t i, k, nobs = 0, N = len(values) deque Q[int64_t] # min/max always the front From 0eef472594b9432f558b7db5394a1e1d1bbfe1a3 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 30 Sep 2021 16:25:49 -0400 Subject: [PATCH 2/4] Fix compilation issue --- pandas/_libs/window/aggregations.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 4fdc9ad393470..29fe20090875b 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -20,14 +20,13 @@ from numpy cimport ( cnp.import_array() -cdef extern from "src/headers/cmath" namespace "std": +cdef extern from "../src/headers/cmath" namespace "std": bint isnan(float64_t) nogil bint notnan(float64_t) nogil int signbit(float64_t) nogil float64_t sqrt(float64_t x) nogil from pandas._libs.algos import is_monotonic - from pandas._libs.dtypes cimport numeric_t @@ -880,7 +879,7 @@ cdef inline void remove_mm(numeric_t aold, Py_ssize_t *nobs) nogil: cdef inline numeric_t calc_mm(int64_t minp, Py_ssize_t nobs, - numeric_t value) nogil: + numeric_t value) nogil: cdef: numeric_t result @@ -940,7 +939,7 @@ def roll_min(ndarray[float64_t] values, ndarray[int64_t] start, return _roll_min_max(values, start, end, minp, is_max=0) -cdef _roll_min_max(ndarray[numeric_t ] values, +cdef _roll_min_max(ndarray[numeric_t] values, ndarray[int64_t] starti, ndarray[int64_t] endi, int64_t minp, From 4302a3ccaa306d66b4be786328234d9a00d3ae78 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 30 Sep 2021 18:18:21 -0400 Subject: [PATCH 3/4] precommit fixup --- pandas/_libs/algos.pyx | 22 ++++++++++++++++------ pandas/_libs/groupby.pyx | 10 ++++++---- pandas/_libs/join.pyx | 11 +++++++++-- pandas/_libs/reshape.pyx | 2 +- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 7bb95ecf1ce35..82f9280870d59 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -45,7 +45,11 @@ from numpy cimport ( cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.dtypes cimport numeric_object_t, numeric_t, iu_64_floating_obj_t +from pandas._libs.dtypes cimport ( + iu_64_floating_obj_t, + numeric_object_t, + numeric_t, +) from pandas._libs.khash cimport ( kh_destroy_int64, kh_get_int64, @@ -55,9 +59,7 @@ from pandas._libs.khash cimport ( kh_resize_int64, khiter_t, ) -from pandas._libs.util cimport ( - get_nat, -) +from pandas._libs.util cimport get_nat import pandas._libs.missing as missing @@ -541,7 +543,11 @@ def validate_limit(nobs: int | None, limit=None) -> int: @cython.boundscheck(False) @cython.wraparound(False) -def pad(ndarray[numeric_object_t] old, ndarray[numeric_object_t] new, limit=None) -> ndarray: +def pad( + ndarray[numeric_object_t] old, + ndarray[numeric_object_t] new, + limit=None +) -> ndarray: # -> ndarray[intp_t, ndim=1] cdef: Py_ssize_t i, j, nleft, nright @@ -687,7 +693,11 @@ D @cython.boundscheck(False) @cython.wraparound(False) -def backfill(ndarray[numeric_object_t] old, ndarray[numeric_object_t] new, limit=None) -> ndarray: +def backfill( + ndarray[numeric_object_t] old, + ndarray[numeric_object_t] new, + limit=None +) -> ndarray: # -> ndarray[intp_t, ndim=1] cdef: Py_ssize_t i, j, nleft, nright diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ab47fbca3f0dc..1e05ef443d516 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -31,9 +31,7 @@ from numpy.math cimport NAN cnp.import_array() from pandas._libs.algos cimport kth_smallest_c -from pandas._libs.util cimport ( - get_nat, -) +from pandas._libs.util cimport get_nat from pandas._libs.algos import ( ensure_platform_int, @@ -42,7 +40,11 @@ from pandas._libs.algos import ( take_2d_axis1_float64_float64, ) -from pandas._libs.dtypes cimport iu_64_floating_obj_t, iu_64_floating_t, numeric_t +from pandas._libs.dtypes cimport ( + iu_64_floating_obj_t, + iu_64_floating_t, + numeric_t, +) from pandas._libs.missing cimport checknull diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 286a1a189db4c..c9a4b49f90037 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -13,7 +13,11 @@ from numpy cimport ( cnp.import_array() from pandas._libs.algos import groupsort_indexer -from pandas._libs.dtypes cimport numeric_object_t, numeric_t + +from pandas._libs.dtypes cimport ( + numeric_object_t, + numeric_t, +) @cython.wraparound(False) @@ -256,7 +260,10 @@ def ffill_indexer(const intp_t[:] indexer) -> np.ndarray: @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right): +def left_join_indexer_unique( + ndarray[numeric_object_t] left, + ndarray[numeric_object_t] right +): cdef: Py_ssize_t i, j, nleft, nright ndarray[intp_t] indexer diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 45e9da52c0663..9d3b80b321537 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -13,8 +13,8 @@ cimport numpy as cnp cnp.import_array() -from pandas._libs.lib cimport c_is_list_like from pandas._libs.dtypes cimport numeric_object_t +from pandas._libs.lib cimport c_is_list_like @cython.wraparound(False) From 5dd526a136b4575acea9aff5bc7fcf75c5efe10a Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 30 Sep 2021 22:08:30 -0400 Subject: [PATCH 4/4] Add comments explaining fused types --- pandas/_libs/dtypes.pxd | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd index ad579ada7417a..f87a1525b15fd 100644 --- a/pandas/_libs/dtypes.pxd +++ b/pandas/_libs/dtypes.pxd @@ -15,6 +15,7 @@ from numpy cimport ( uint64_t, ) +# All numeric types except complex ctypedef fused numeric_t: int8_t int16_t @@ -29,16 +30,19 @@ ctypedef fused numeric_t: float32_t float64_t +# All numeric types + object, doesn't include complex ctypedef fused numeric_object_t: numeric_t object +# i64 + u64 + all float types ctypedef fused iu_64_floating_t: float64_t float32_t int64_t uint64_t +# i64 + u64 + all float types + object ctypedef fused iu_64_floating_obj_t: iu_64_floating_t object