diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 6211734706e37..ea05c4afc8fce 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -24,6 +24,7 @@ def diff_2d(ndarray[diff_t, ndim=2] arr, Py_ssize_t periods, int axis): cdef: Py_ssize_t i, j, sx, sy, start, stop + bint f_contig = arr.flags.f_contiguous # Disable for unsupported dtype combinations, # see https://github.com/cython/cython/issues/2646 @@ -37,40 +38,41 @@ def diff_2d(ndarray[diff_t, ndim=2] arr, # We put this inside an indented else block to avoid cython build # warnings about unreachable code sx, sy = (arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx + with nogil: + if f_contig: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] else: - if periods >= 0: - start, stop = periods, sy + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] else: - start, stop = 0, sy + periods - for j in range(start, stop): + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] # ---------------------------------------------------------------------- diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 11c56f784d378..a3e1b9694f5d2 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -29,13 +29,14 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, left_sorter, left_count = groupsort_indexer(left, max_groups) right_sorter, right_count = groupsort_indexer(right, max_groups) - # First pass, determine size of result set, do not use the NA group - for i in range(1, max_groups + 1): - lc = left_count[i] - rc = right_count[i] + with nogil: + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] - if rc > 0 and lc > 0: - count += lc * rc + if rc > 0 and lc > 0: + count += lc * rc # exclude the NA group left_pos = left_count[0] @@ -44,19 +45,20 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, left_indexer = np.empty(count, dtype=np.int64) right_indexer = np.empty(count, dtype=np.int64) - for i in range(1, max_groups + 1): - lc = left_count[i] - rc = right_count[i] - - if rc > 0 and lc > 0: - for j in range(lc): - offset = position + j * rc - for k in range(rc): - left_indexer[offset + k] = left_pos + j - right_indexer[offset + k] = right_pos + k - position += lc * rc - left_pos += lc - right_pos += rc + with nogil: + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc > 0 and lc > 0: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + left_indexer[offset + k] = left_pos + j + right_indexer[offset + k] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc return (_get_result_indexer(left_sorter, left_indexer), _get_result_indexer(right_sorter, right_indexer)) @@ -79,12 +81,13 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, left_sorter, left_count = groupsort_indexer(left, max_groups) right_sorter, right_count = groupsort_indexer(right, max_groups) - # First pass, determine size of result set, do not use the NA group - for i in range(1, max_groups + 1): - if right_count[i] > 0: - count += left_count[i] * right_count[i] - else: - count += left_count[i] + with nogil: + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + if right_count[i] > 0: + count += left_count[i] * right_count[i] + else: + count += left_count[i] # exclude the NA group left_pos = left_count[0] @@ -93,24 +96,25 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, left_indexer = np.empty(count, dtype=np.int64) right_indexer = np.empty(count, dtype=np.int64) - for i in range(1, max_groups + 1): - lc = left_count[i] - rc = right_count[i] + with nogil: + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] - if rc == 0: - for j in range(lc): - left_indexer[position + j] = left_pos + j - right_indexer[position + j] = -1 - position += lc - else: - for j in range(lc): - offset = position + j * rc - for k in range(rc): - left_indexer[offset + k] = left_pos + j - right_indexer[offset + k] = right_pos + k - position += lc * rc - left_pos += lc - right_pos += rc + if rc == 0: + for j in range(lc): + left_indexer[position + j] = left_pos + j + right_indexer[position + j] = -1 + position += lc + else: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + left_indexer[offset + k] = left_pos + j + right_indexer[offset + k] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc left_indexer = _get_result_indexer(left_sorter, left_indexer) right_indexer = _get_result_indexer(right_sorter, right_indexer) @@ -149,15 +153,16 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, left_sorter, left_count = groupsort_indexer(left, max_groups) right_sorter, right_count = groupsort_indexer(right, max_groups) - # First pass, determine size of result set, do not use the NA group - for i in range(1, max_groups + 1): - lc = left_count[i] - rc = right_count[i] + with nogil: + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] - if rc > 0 and lc > 0: - count += lc * rc - else: - count += lc + rc + if rc > 0 and lc > 0: + count += lc * rc + else: + count += lc + rc # exclude the NA group left_pos = left_count[0] @@ -166,29 +171,30 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, left_indexer = np.empty(count, dtype=np.int64) right_indexer = np.empty(count, dtype=np.int64) - for i in range(1, max_groups + 1): - lc = left_count[i] - rc = right_count[i] - - if rc == 0: - for j in range(lc): - left_indexer[position + j] = left_pos + j - right_indexer[position + j] = -1 - position += lc - elif lc == 0: - for j in range(rc): - left_indexer[position + j] = -1 - right_indexer[position + j] = right_pos + j - position += rc - else: - for j in range(lc): - offset = position + j * rc - for k in range(rc): - left_indexer[offset + k] = left_pos + j - right_indexer[offset + k] = right_pos + k - position += lc * rc - left_pos += lc - right_pos += rc + with nogil: + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc == 0: + for j in range(lc): + left_indexer[position + j] = left_pos + j + right_indexer[position + j] = -1 + position += lc + elif lc == 0: + for j in range(rc): + left_indexer[position + j] = -1 + right_indexer[position + j] = right_pos + j + position += rc + else: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + left_indexer[offset + k] = left_pos + j + right_indexer[offset + k] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc return (_get_result_indexer(left_sorter, left_indexer), _get_result_indexer(right_sorter, right_indexer)) diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index a2096d389823f..d1adc7789a7a3 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -49,7 +49,7 @@ cdef inline int int_min(int a, int b): return a if a <= b else b # -def _check_minp(win, minp, N, floor=None): +def _check_minp(win, minp, N, floor=None) -> int: """ Parameters ---------- @@ -180,7 +180,8 @@ cdef class FixedWindowIndexer(WindowIndexer): def __init__(self, ndarray values, int64_t win, int64_t minp, bint left_closed, bint right_closed, object index=None, object floor=None): - cdef ndarray start_s, start_e, end_s, end_e + cdef: + ndarray[int64_t] start_s, start_e, end_s, end_e assert index is None self.is_variable = 0 @@ -298,7 +299,7 @@ cdef class VariableWindowIndexer(WindowIndexer): def get_window_indexer(values, win, minp, index, closed, floor=None, use_mock=True): """ - return the correct window indexer for the computation + Return the correct window indexer for the computation. Parameters ---------- @@ -319,7 +320,6 @@ def get_window_indexer(values, win, minp, index, closed, compat Indexer that allows us to use a standard code path with all of the indexers. - Returns ------- tuple of 1d int64 ndarrays of the offsets & data about the window @@ -365,7 +365,7 @@ def roll_count(ndarray[float64_t] values, int64_t win, int64_t minp, float64_t val, count_x = 0.0 int64_t s, e, nobs, N Py_ssize_t i, j - ndarray[int64_t] start, end + int64_t[:] start, end ndarray[float64_t] output start, end, N, win, minp, _ = get_window_indexer(values, win, @@ -413,8 +413,7 @@ def roll_count(ndarray[float64_t] values, int64_t win, int64_t minp, # Rolling sum -cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, - float64_t sum_x) nogil: +cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogil: cdef: float64_t result @@ -435,8 +434,7 @@ cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil: sum_x[0] = sum_x[0] + val -cdef inline void remove_sum(float64_t val, - int64_t *nobs, float64_t *sum_x) nogil: +cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil: """ remove a value from the sum calc """ if notnan(val): @@ -451,7 +449,7 @@ def roll_sum(ndarray[float64_t] values, int64_t win, int64_t minp, int64_t s, e, range_endpoint int64_t nobs = 0, i, j, N bint is_variable - ndarray[int64_t] start, end + int64_t[:] start, end ndarray[float64_t] output start, end, N, win, minp, is_variable = get_window_indexer(values, win, @@ -572,7 +570,7 @@ def roll_mean(ndarray[float64_t] values, int64_t win, int64_t minp, int64_t s, e bint is_variable Py_ssize_t nobs = 0, i, j, neg_ct = 0, N - ndarray[int64_t] start, end + int64_t[:] start, end ndarray[float64_t] output start, end, N, win, minp, is_variable = get_window_indexer(values, win, @@ -709,7 +707,7 @@ def roll_var(ndarray[float64_t] values, int64_t win, int64_t minp, int64_t s, e bint is_variable Py_ssize_t i, j, N - ndarray[int64_t] start, end + int64_t[:] start, end ndarray[float64_t] output start, end, N, win, minp, is_variable = get_window_indexer(values, win, @@ -871,7 +869,7 @@ def roll_skew(ndarray[float64_t] values, int64_t win, int64_t minp, int64_t nobs = 0, i, j, N int64_t s, e bint is_variable - ndarray[int64_t] start, end + int64_t[:] start, end ndarray[float64_t] output start, end, N, win, minp, is_variable = get_window_indexer(values, win, @@ -1015,7 +1013,7 @@ def roll_kurt(ndarray[float64_t] values, int64_t win, int64_t minp, int64_t nobs = 0, i, j, N int64_t s, e bint is_variable - ndarray[int64_t] start, end + int64_t[:] start, end ndarray[float64_t] output start, end, N, win, minp, is_variable = get_window_indexer(values, win, @@ -1088,7 +1086,7 @@ def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, Py_ssize_t i, j int64_t nobs = 0, N, s, e int midpoint - ndarray[int64_t] start, end + int64_t[:] start, end ndarray[float64_t] output # we use the Fixed/Variable Indexer here as the @@ -1471,7 +1469,7 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, int64_t nobs = 0, i, j, s, e, N Py_ssize_t idx bint is_variable - ndarray[int64_t] start, end + int64_t[:] start, end ndarray[float64_t] output float64_t vlow, vhigh InterpolationType interpolation_type @@ -1589,7 +1587,7 @@ def roll_generic(object obj, float64_t *oldbuf int64_t nobs = 0, i, j, s, e, N bint is_variable - ndarray[int64_t] start, end + int64_t[:] start, end n = len(obj) if n == 0: @@ -1679,18 +1677,17 @@ def roll_generic(object obj, # Rolling sum and mean for weighted window -def roll_weighted_sum(float64_t[:] values, float64_t[:] weights, - int minp): +def roll_weighted_sum(float64_t[:] values, float64_t[:] weights, int minp): return _roll_weighted_sum_mean(values, weights, minp, avg=0) -def roll_weighted_mean(float64_t[:] values, float64_t[:] weights, - int minp): +def roll_weighted_mean(float64_t[:] values, float64_t[:] weights, int minp): return _roll_weighted_sum_mean(values, weights, minp, avg=1) -def _roll_weighted_sum_mean(float64_t[:] values, float64_t[:] weights, - int minp, bint avg): +cdef ndarray[float64_t] _roll_weighted_sum_mean(float64_t[:] values, + float64_t[:] weights, + int minp, bint avg): """ Assume len(weights) << len(values) """ @@ -1702,64 +1699,64 @@ def _roll_weighted_sum_mean(float64_t[:] values, float64_t[:] weights, in_n = len(values) win_n = len(weights) - output = np.zeros(in_n, dtype=float) - counts = np.zeros(in_n, dtype=float) + output = np.zeros(in_n, dtype=np.float64) + counts = np.zeros(in_n, dtype=np.float64) if avg: - tot_wgt = np.zeros(in_n, dtype=float) + tot_wgt = np.zeros(in_n, dtype=np.float64) minp = _check_minp(len(weights), minp, in_n) - if avg: - for win_i in range(win_n): - val_win = weights[win_i] - if val_win != val_win: - continue - - for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: - val_in = values[in_i] - if val_in == val_in: - output[in_i + (win_n - win_i) - 1] += val_in * val_win - counts[in_i + (win_n - win_i) - 1] += 1 - tot_wgt[in_i + (win_n - win_i) - 1] += val_win - - for in_i in range(in_n): - c = counts[in_i] - if c < minp: - output[in_i] = NaN - else: - w = tot_wgt[in_i] - if w == 0: + with nogil: + if avg: + for win_i in range(win_n): + val_win = weights[win_i] + if val_win != val_win: + continue + + for in_i in range(in_n - (win_n - win_i) + 1): + val_in = values[in_i] + if val_in == val_in: + output[in_i + (win_n - win_i) - 1] += val_in * val_win + counts[in_i + (win_n - win_i) - 1] += 1 + tot_wgt[in_i + (win_n - win_i) - 1] += val_win + + for in_i in range(in_n): + c = counts[in_i] + if c < minp: output[in_i] = NaN else: - output[in_i] /= tot_wgt[in_i] + w = tot_wgt[in_i] + if w == 0: + output[in_i] = NaN + else: + output[in_i] /= tot_wgt[in_i] - else: - for win_i in range(win_n): - val_win = weights[win_i] - if val_win != val_win: - continue + else: + for win_i in range(win_n): + val_win = weights[win_i] + if val_win != val_win: + continue - for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: - val_in = values[in_i] + for in_i in range(in_n - (win_n - win_i) + 1): + val_in = values[in_i] - if val_in == val_in: - output[in_i + (win_n - win_i) - 1] += val_in * val_win - counts[in_i + (win_n - win_i) - 1] += 1 + if val_in == val_in: + output[in_i + (win_n - win_i) - 1] += val_in * val_win + counts[in_i + (win_n - win_i) - 1] += 1 - for in_i in range(in_n): - c = counts[in_i] - if c < minp: - output[in_i] = NaN + for in_i in range(in_n): + c = counts[in_i] + if c < minp: + output[in_i] = NaN - return output + return np.asarray(output) # ---------------------------------------------------------------------- # Exponentially weighted moving average -def ewma(float64_t[:] vals, float64_t com, - int adjust, int ignore_na, int minp): +def ewma(float64_t[:] vals, float64_t com, int adjust, bint ignore_na, int minp): """ Compute exponentially-weighted moving average using center-of-mass. @@ -1768,12 +1765,12 @@ def ewma(float64_t[:] vals, float64_t com, vals : ndarray (float64 type) com : float64 adjust: int - ignore_na: int + ignore_na: bool minp: int Returns ------- - y : ndarray + ndarray """ cdef: @@ -1781,6 +1778,7 @@ def ewma(float64_t[:] vals, float64_t com, ndarray[float64_t] output = np.empty(N, dtype=float) float64_t alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur Py_ssize_t i, nobs + bint is_observation if N == 0: return output @@ -1797,29 +1795,30 @@ def ewma(float64_t[:] vals, float64_t com, output[0] = weighted_avg if (nobs >= minp) else NaN old_wt = 1. - for i in range(1, N): - cur = vals[i] - is_observation = (cur == cur) - nobs += int(is_observation) - if weighted_avg == weighted_avg: - - if is_observation or (not ignore_na): - - old_wt *= old_wt_factor - if is_observation: - - # avoid numerical errors on constant series - if weighted_avg != cur: - weighted_avg = ((old_wt * weighted_avg) + - (new_wt * cur)) / (old_wt + new_wt) - if adjust: - old_wt += new_wt - else: - old_wt = 1. - elif is_observation: - weighted_avg = cur + with nogil: + for i in range(1, N): + cur = vals[i] + is_observation = (cur == cur) + nobs += is_observation + if weighted_avg == weighted_avg: + + if is_observation or (not ignore_na): + + old_wt *= old_wt_factor + if is_observation: + + # avoid numerical errors on constant series + if weighted_avg != cur: + weighted_avg = ((old_wt * weighted_avg) + + (new_wt * cur)) / (old_wt + new_wt) + if adjust: + old_wt += new_wt + else: + old_wt = 1. + elif is_observation: + weighted_avg = cur - output[i] = weighted_avg if (nobs >= minp) else NaN + output[i] = weighted_avg if (nobs >= minp) else NaN return output @@ -1829,7 +1828,7 @@ def ewma(float64_t[:] vals, float64_t com, def ewmcov(float64_t[:] input_x, float64_t[:] input_y, - float64_t com, int adjust, int ignore_na, int minp, int bias): + float64_t com, int adjust, bint ignore_na, int minp, int bias): """ Compute exponentially-weighted moving variance using center-of-mass. @@ -1839,21 +1838,23 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, input_y : ndarray (float64 type) com : float64 adjust: int - ignore_na: int + ignore_na: bool minp: int bias: int Returns ------- - y : ndarray + ndarray """ cdef: Py_ssize_t N = len(input_x) float64_t alpha, old_wt_factor, new_wt, mean_x, mean_y, cov float64_t sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y + float64_t numerator, denominator Py_ssize_t i, nobs ndarray[float64_t] output + bint is_observation if len(input_y) != N: raise ValueError("arrays are of different lengths " @@ -1882,55 +1883,57 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, sum_wt2 = 1. old_wt = 1. - for i in range(1, N): - cur_x = input_x[i] - cur_y = input_y[i] - is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) - nobs += int(is_observation) - if mean_x == mean_x: - if is_observation or (not ignore_na): - sum_wt *= old_wt_factor - sum_wt2 *= (old_wt_factor * old_wt_factor) - old_wt *= old_wt_factor - if is_observation: - old_mean_x = mean_x - old_mean_y = mean_y - - # avoid numerical errors on constant series - if mean_x != cur_x: - mean_x = ((old_wt * old_mean_x) + - (new_wt * cur_x)) / (old_wt + new_wt) - - # avoid numerical errors on constant series - if mean_y != cur_y: - mean_y = ((old_wt * old_mean_y) + - (new_wt * cur_y)) / (old_wt + new_wt) - cov = ((old_wt * (cov + ((old_mean_x - mean_x) * - (old_mean_y - mean_y)))) + - (new_wt * ((cur_x - mean_x) * - (cur_y - mean_y)))) / (old_wt + new_wt) - sum_wt += new_wt - sum_wt2 += (new_wt * new_wt) - old_wt += new_wt - if not adjust: - sum_wt /= old_wt - sum_wt2 /= (old_wt * old_wt) - old_wt = 1. - elif is_observation: - mean_x = cur_x - mean_y = cur_y + with nogil: - if nobs >= minp: - if not bias: - numerator = sum_wt * sum_wt - denominator = numerator - sum_wt2 - if (denominator > 0.): - output[i] = ((numerator / denominator) * cov) + for i in range(1, N): + cur_x = input_x[i] + cur_y = input_y[i] + is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) + nobs += is_observation + if mean_x == mean_x: + if is_observation or (not ignore_na): + sum_wt *= old_wt_factor + sum_wt2 *= (old_wt_factor * old_wt_factor) + old_wt *= old_wt_factor + if is_observation: + old_mean_x = mean_x + old_mean_y = mean_y + + # avoid numerical errors on constant series + if mean_x != cur_x: + mean_x = ((old_wt * old_mean_x) + + (new_wt * cur_x)) / (old_wt + new_wt) + + # avoid numerical errors on constant series + if mean_y != cur_y: + mean_y = ((old_wt * old_mean_y) + + (new_wt * cur_y)) / (old_wt + new_wt) + cov = ((old_wt * (cov + ((old_mean_x - mean_x) * + (old_mean_y - mean_y)))) + + (new_wt * ((cur_x - mean_x) * + (cur_y - mean_y)))) / (old_wt + new_wt) + sum_wt += new_wt + sum_wt2 += (new_wt * new_wt) + old_wt += new_wt + if not adjust: + sum_wt /= old_wt + sum_wt2 /= (old_wt * old_wt) + old_wt = 1. + elif is_observation: + mean_x = cur_x + mean_y = cur_y + + if nobs >= minp: + if not bias: + numerator = sum_wt * sum_wt + denominator = numerator - sum_wt2 + if (denominator > 0.): + output[i] = ((numerator / denominator) * cov) + else: + output[i] = NaN else: - output[i] = NaN + output[i] = cov else: - output[i] = cov - else: - output[i] = NaN + output[i] = NaN return output