From 2598ceaa753a146d23899d9e1ed48e7557ffce2d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 20 Dec 2016 16:23:46 -0500 Subject: [PATCH] BUG: Patch rank() uint64 behavior Adds uint64 ranking functions to algos.pyx to allow for proper ranking with uint64. Also introduces partial patch for factorize() by adding uint64 hashtables and vectors for usage. However, this patch is only partial because the larger bug of non-support for uint64 in Index has not been fixed. Also patches bug in UInt64HashTable that had an erroneous null condition that was caught during testing and was hence removed. --- pandas/algos.pyx | 610 +---------------------- pandas/core/algorithms.py | 64 ++- pandas/src/algos_rank_helper.pxi.in | 385 ++++++++++++++ pandas/src/algos_take_helper.pxi.in | 33 +- pandas/src/hashtable_class_helper.pxi.in | 2 +- pandas/tests/test_algos.py | 53 +- setup.py | 2 +- 7 files changed, 506 insertions(+), 643 deletions(-) create mode 100644 pandas/src/algos_rank_helper.pxi.in diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 04f3ac70bdf5c..32955fd0f465b 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -67,499 +67,6 @@ tiebreakers = { } -# ctypedef fused pvalue_t: -# float64_t -# int64_t -# object - -# from cython cimport floating, integral - -cdef _take_2d_float64(ndarray[float64_t, ndim=2] values, - object idx): - cdef: - Py_ssize_t i, j, N, K - ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx - ndarray[float64_t, ndim=2] result - object val - - N, K = ( values).shape - result = np.empty_like(values) - for i in range(N): - for j in range(K): - result[i, j] = values[i, indexer[i, j]] - return result - -cdef _take_2d_int64(ndarray[int64_t, ndim=2] values, - object idx): - cdef: - Py_ssize_t i, j, N, K - ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx - ndarray[int64_t, ndim=2] result - object val - - N, K = ( values).shape - result = np.empty_like(values) - for i in range(N): - for j in range(K): - result[i, j] = values[i, indexer[i, j]] - return 
result - -cdef _take_2d_object(ndarray[object, ndim=2] values, - object idx): - cdef: - Py_ssize_t i, j, N, K - ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx - ndarray[object, ndim=2] result - object val - - N, K = ( values).shape - result = values.copy() - for i in range(N): - for j in range(K): - result[i, j] = values[i, indexer[i, j]] - return result - - -def rank_1d_float64(object in_arr, ties_method='average', ascending=True, - na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 - ndarray[float64_t] sorted_data, ranks, values - ndarray[int64_t] argsorted - float64_t val, nan_value - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - float count = 0.0 - tiebreak = tiebreakers[ties_method] - - values = np.asarray(in_arr).copy() - - keep_na = na_option == 'keep' - - if ascending ^ (na_option == 'top'): - nan_value = np.inf - else: - nan_value = -np.inf - mask = np.isnan(values) - np.putmask(values, mask, nan_value) - - n = len(values) - ranks = np.empty(n, dtype='f8') - - # py2.5/win32 hack, can't pass i8 - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort() - - if not ascending: - _as = _as[::-1] - - sorted_data = values.take(_as) - argsorted = _as.astype('i8') - - for i in range(n): - sum_ranks += i + 1 - dups += 1 - val = sorted_data[i] - if (val == nan_value) and keep_na: - ranks[argsorted[i]] = nan - continue - count += 1.0 - if i == n - 1 or sorted_data[i + 1] != val: - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - 
elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - if pct: - return ranks / count - else: - return ranks - - -def rank_1d_int64(object in_arr, ties_method='average', ascending=True, - na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 - ndarray[int64_t] sorted_data, values - ndarray[float64_t] ranks - ndarray[int64_t] argsorted - int64_t val, nan_value - float64_t sum_ranks = 0 - bint keep_na - int tiebreak = 0 - float count = 0.0 - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - values = np.asarray(in_arr) - - if ascending ^ (na_option == 'top'): - nan_value = np.iinfo('int64').max - else: - nan_value = np.iinfo('int64').min - - # unlike floats, which have np.inf, -np.inf, and np.nan - # ints do not - mask = values == iNaT - np.putmask(values, mask, nan_value) - - n = len(values) - ranks = np.empty(n, dtype='f8') - - # py2.5/win32 hack, can't pass i8 - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort() - - if not ascending: - _as = _as[::-1] - - sorted_data = values.take(_as) - argsorted = _as.astype('i8') - - for i in range(n): - sum_ranks += i + 1 - dups += 1 - val = sorted_data[i] - if (val == nan_value) and keep_na: - ranks[argsorted[i]] = nan - continue - count += 1.0 - if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0: - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif 
tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - if pct: - return ranks / count - else: - return ranks - - -def rank_2d_float64(object in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - ndarray[float64_t, ndim=2] ranks, values - ndarray[int64_t, ndim=2] argsorted - float64_t val, nan_value - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - float count = 0.0 - - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - in_arr = np.asarray(in_arr) - - if axis == 0: - values = in_arr.T.copy() - else: - values = in_arr.copy() - - if ascending ^ (na_option == 'top'): - nan_value = np.inf - else: - nan_value = -np.inf - - np.putmask(values, np.isnan(values), nan_value) - - n, k = ( values).shape - ranks = np.empty((n, k), dtype='f8') - - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort(1) - - if not ascending: - _as = _as[:, ::-1] - - values = _take_2d_float64(values, _as) - argsorted = _as.astype('i8') - - for i in range(n): - dups = sum_ranks = 0 - total_tie_count = 0 - count = 0.0 - for j in range(k): - sum_ranks += j + 1 - dups += 1 - val = values[i, j] - if val == nan_value 
and keep_na: - ranks[i, argsorted[i, j]] = nan - continue - count += 1.0 - if j == k - 1 or values[i, j + 1] != val: - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = total_tie_count - sum_ranks = dups = 0 - if pct: - ranks[i, :] /= count - if axis == 0: - return ranks.T - else: - return ranks - - -def rank_2d_int64(object in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - ndarray[float64_t, ndim=2] ranks - ndarray[int64_t, ndim=2] argsorted - ndarray[int64_t, ndim=2, cast=True] values - int64_t val, nan_value - float64_t sum_ranks = 0 - bint keep_na = 0 - int tiebreak = 0 - float count = 0.0 - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - in_arr = np.asarray(in_arr) - - if axis == 0: - values = in_arr.T.copy() - else: - values = in_arr.copy() - - if ascending ^ (na_option == 'top'): - nan_value = np.iinfo('int64').max - else: - nan_value = np.iinfo('int64').min - - # unlike floats, which have np.inf, -np.inf, and np.nan - # ints do not - np.putmask(values, values == iNaT, nan_value) - - n, k = ( values).shape - ranks = np.empty((n, k), dtype='f8') - - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here 
- _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort(1) - - if not ascending: - _as = _as[:, ::-1] - - values = _take_2d_int64(values, _as) - argsorted = _as.astype('i8') - - for i in range(n): - dups = sum_ranks = 0 - total_tie_count = 0 - count = 0.0 - for j in range(k): - sum_ranks += j + 1 - dups += 1 - val = values[i, j] - if val == nan_value and keep_na: - ranks[i, argsorted[i, j]] = nan - continue - count += 1.0 - if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = total_tie_count - sum_ranks = dups = 0 - if pct: - ranks[i, :] /= count - if axis == 0: - return ranks.T - else: - return ranks - - -def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 - ndarray[float64_t] ranks - ndarray sorted_data, values - ndarray[int64_t] argsorted - object val, nan_value - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - float count = 0.0 - - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - values = np.array(in_arr, copy=True) - - if values.dtype != 
np.object_: - values = values.astype('O') - - if ascending ^ (na_option == 'top'): - # always greater than everything - nan_value = Infinity() - else: - nan_value = NegInfinity() - - mask = lib.isnullobj(values) - np.putmask(values, mask, nan_value) - - n = len(values) - ranks = np.empty(n, dtype='f8') - - # py2.5/win32 hack, can't pass i8 - try: - _as = values.argsort() - except TypeError: - if not retry: - raise - - valid_locs = (~mask).nonzero()[0] - ranks.put(valid_locs, rank_1d_generic(values.take(valid_locs), 0, - ties_method=ties_method, - ascending=ascending)) - np.putmask(ranks, mask, np.nan) - return ranks - - if not ascending: - _as = _as[::-1] - - sorted_data = values.take(_as) - argsorted = _as.astype('i8') - for i in range(n): - sum_ranks += i + 1 - dups += 1 - val = util.get_value_at(sorted_data, i) - if val is nan_value and keep_na: - ranks[argsorted[i]] = nan - continue - if (i == n - 1 or - are_diff(util.get_value_at(sorted_data, i + 1), val)): - count += 1.0 - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - raise ValueError('first not supported for non-numeric data') - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - if pct: - return ranks / count - else: - return ranks - cdef inline are_diff(object left, object right): try: return fabs(left - right) > FP_ERR @@ -589,122 +96,6 @@ class NegInfinity(object): __ge__ = lambda self, other: self is other -def rank_2d_generic(object in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - 
cdef: - Py_ssize_t i, j, z, k, n, infs, dups = 0 - Py_ssize_t total_tie_count = 0 - ndarray[float64_t, ndim=2] ranks - ndarray[object, ndim=2] values - ndarray[int64_t, ndim=2] argsorted - object val, nan_value - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - float count = 0.0 - - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - in_arr = np.asarray(in_arr) - - if axis == 0: - values = in_arr.T.copy() - else: - values = in_arr.copy() - - if values.dtype != np.object_: - values = values.astype('O') - - if ascending ^ (na_option == 'top'): - # always greater than everything - nan_value = Infinity() - else: - nan_value = NegInfinity() - - mask = lib.isnullobj2d(values) - np.putmask(values, mask, nan_value) - - n, k = ( values).shape - ranks = np.empty((n, k), dtype='f8') - - try: - _as = values.argsort(1) - except TypeError: - values = in_arr - for i in range(len(values)): - ranks[i] = rank_1d_generic(in_arr[i], - ties_method=ties_method, - ascending=ascending, - pct=pct) - if axis == 0: - return ranks.T - else: - return ranks - - if not ascending: - _as = _as[:, ::-1] - - values = _take_2d_object(values, _as) - argsorted = _as.astype('i8') - - for i in range(n): - dups = sum_ranks = infs = 0 - total_tie_count = 0 - count = 0.0 - for j in range(k): - val = values[i, j] - if val is nan_value and keep_na: - ranks[i, argsorted[i, j]] = nan - infs += 1 - continue - count += 1.0 - sum_ranks += (j - infs) + 1 - dups += 1 - if j == k - 1 or are_diff(values[i, j + 1], val): - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - raise ValueError('first not supported for ' - 'non-numeric data') - elif tiebreak == 
TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = total_tie_count - sum_ranks = dups = 0 - if pct: - ranks[i, :] /= count - if axis == 0: - return ranks.T - else: - return ranks - -# def _take_indexer_2d(ndarray[float64_t, ndim=2] values, -# ndarray[Py_ssize_t, ndim=2, cast=True] indexer): -# cdef: -# Py_ssize_t i, j, N, K -# ndarray[float64_t, ndim=2] result - -# N, K = ( values).shape -# result = np.empty_like(values) -# for i in range(N): -# for j in range(K): -# result[i, j] = values[i, indexer[i, j]] -# return result - - cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil except -1: cdef numeric t @@ -1269,4 +660,5 @@ cdef inline float64_t _median_linear(float64_t* a, int n): # generated from template include "algos_common_helper.pxi" include "algos_groupby_helper.pxi" +include "algos_rank_helper.pxi" include "algos_take_helper.pxi" diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9b7bf2bf058ef..b4a61b26aceb3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -68,7 +68,7 @@ def match(to_match, values, na_sentinel=-1): if issubclass(values.dtype.type, string_types): values = np.array(values, dtype='O') - f = lambda htype, caster: _match_generic(to_match, values, htype, caster) + f = lambda htype, caster: _match_object(to_match, values, htype, caster) result = _hashtable_algo(f, values, np.int64) if na_sentinel != -1: @@ -82,7 +82,7 @@ def match(to_match, values, na_sentinel=-1): return result -def _match_generic(values, index, table_type, type_caster): +def _match_object(values, index, table_type, type_caster): values = type_caster(values) index = type_caster(index) table = table_type(min(len(index), 1000000)) @@ -105,11 +105,11 @@ def unique(values): """ values = com._asarray_tuplesafe(values) - f = lambda htype, caster: _unique_generic(values, htype, caster) + f = lambda htype, caster: _unique_object(values, htype, caster) return _hashtable_algo(f, 
values) -def _unique_generic(values, table_type, type_caster): +def _unique_object(values, table_type, type_caster): values = type_caster(values) table = table_type(min(len(values), 1000000)) uniques = table.unique(values) @@ -366,6 +366,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if isinstance(values, Index): uniques = values._shallow_copy(uniques, name=None) elif isinstance(values, Series): + # TODO: This constructor is bugged for uint's, especially + # np.uint64 due to overflow. Test this for uint behavior + # once constructor has been fixed. uniques = Index(uniques) return labels, uniques @@ -595,7 +598,27 @@ def mode(values): def rank(values, axis=0, method='average', na_option='keep', ascending=True, pct=False): """ + Rank the values along a given axis. + Parameters + ---------- + values : array-like + Array whose values will be ranked. The number of dimensions in this + array must not exceed 2. + axis : int, default 0 + Axis over which to perform rankings. + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + The method by which ties are broken during the ranking. + na_option : {'keep', 'top'}, default 'keep' + The method by which NaNs are placed in the ranking. + - ``keep``: rank each NaN value with a NaN ranking + - ``top``: replace each NaN with either +/- inf so that they + are ranked at the top + ascending : boolean, default True + Whether or not the elements should be ranked in ascending order. + pct : boolean, default False + Whether or not to display the returned rankings in integer form + (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). 
""" if values.ndim == 1: f, values = _get_data_algo(values, _rank1d_functions) @@ -605,6 +628,8 @@ def rank(values, axis=0, method='average', na_option='keep', f, values = _get_data_algo(values, _rank2d_functions) ranks = f(values, axis=axis, ties_method=method, ascending=ascending, na_option=na_option, pct=pct) + else: + raise TypeError("Array with ndim > 2 are not supported.") return ranks @@ -700,13 +725,15 @@ def _broadcast(arr_or_scalar, shape): _rank1d_functions = { 'float64': algos.rank_1d_float64, 'int64': algos.rank_1d_int64, - 'generic': algos.rank_1d_generic + 'uint64': algos.rank_1d_uint64, + 'object': algos.rank_1d_object } _rank2d_functions = { 'float64': algos.rank_2d_float64, 'int64': algos.rank_2d_int64, - 'generic': algos.rank_2d_generic + 'uint64': algos.rank_2d_uint64, + 'object': algos.rank_2d_object } @@ -934,9 +961,10 @@ def _hashtable_algo(f, values, return_dtype=None): _hashtables = { 'float64': (htable.Float64HashTable, htable.Float64Vector), + 'uint64': (htable.UInt64HashTable, htable.UInt64Vector), 'int64': (htable.Int64HashTable, htable.Int64Vector), 'string': (htable.StringHashTable, htable.ObjectVector), - 'generic': (htable.PyObjectHashTable, htable.ObjectVector) + 'object': (htable.PyObjectHashTable, htable.ObjectVector) } @@ -951,11 +979,15 @@ def _get_data_algo(values, func_map): f = func_map['int64'] values = values.view('i8') - elif is_integer_dtype(values): + elif is_signed_integer_dtype(values): f = func_map['int64'] values = _ensure_int64(values) - else: + elif is_unsigned_integer_dtype(values): + f = func_map['uint64'] + values = _ensure_uint64(values) + + else: values = _ensure_object(values) # its cheaper to use a String Hash Table than Object @@ -966,7 +998,7 @@ def _get_data_algo(values, func_map): pass if f is None: - f = func_map['generic'] + f = func_map['object'] return f, values @@ -997,7 +1029,7 @@ def wrapper(arr, indexer, out, fill_value=np.nan): return wrapper -def _take_2d_multi_generic(arr, indexer, out, 
fill_value, mask_info): +def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info): # this is not ideal, performance-wise, but it's better than raising # an exception (best to optimize in Cython to avoid getting here) row_idx, col_idx = indexer @@ -1020,7 +1052,7 @@ def _take_2d_multi_generic(arr, indexer, out, fill_value, mask_info): out[i, j] = arr[u_, v] -def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info): +def _take_nd_object(arr, indexer, out, axis, fill_value, mask_info): if mask_info is not None: mask, needs_masking = mask_info else: @@ -1171,8 +1203,8 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None): def func(arr, indexer, out, fill_value=np.nan): indexer = _ensure_int64(indexer) - _take_nd_generic(arr, indexer, out, axis=axis, fill_value=fill_value, - mask_info=mask_info) + _take_nd_object(arr, indexer, out, axis=axis, fill_value=fill_value, + mask_info=mask_info) return func @@ -1343,8 +1375,8 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, if func is None: def func(arr, indexer, out, fill_value=np.nan): - _take_2d_multi_generic(arr, indexer, out, fill_value=fill_value, - mask_info=mask_info) + _take_2d_multi_object(arr, indexer, out, fill_value=fill_value, + mask_info=mask_info) func(arr, indexer, out=out, fill_value=fill_value) return out diff --git a/pandas/src/algos_rank_helper.pxi.in b/pandas/src/algos_rank_helper.pxi.in new file mode 100644 index 0000000000000..7e7f819c7515f --- /dev/null +++ b/pandas/src/algos_rank_helper.pxi.in @@ -0,0 +1,385 @@ +""" +Template for each `dtype` helper function for rank + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# rank_1d, rank_2d +#---------------------------------------------------------------------- + +{{py: + +# dtype ctype pos_nan_value neg_nan_value +dtypes = [('object', 'object', 'Infinity()', 
'NegInfinity()'), + ('float64', 'float64_t', 'np.inf', '-np.inf'), + ('uint64', 'uint64_t', '', ''), + ('int64', 'int64_t', 'np.iinfo(np.int64).max', + 'np.iinfo(np.int64).min')] + +}} + +{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +{{if dtype == 'object'}} + + +def rank_1d_{{dtype}}(object in_arr, bint retry=1, ties_method='average', + ascending=True, na_option='keep', pct=False): +{{else}} + + +def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, + na_option='keep', pct=False): +{{endif}} + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 + + {{if dtype == 'object'}} + ndarray sorted_data, values + {{else}} + ndarray[{{ctype}}] sorted_data, values + {{endif}} + + ndarray[float64_t] ranks + ndarray[int64_t] argsorted + + {{if dtype == 'uint64'}} + {{ctype}} val + {{else}} + {{ctype}} val, nan_value + {{endif}} + + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + float count = 0.0 + tiebreak = tiebreakers[ties_method] + + {{if dtype == 'float64'}} + values = np.asarray(in_arr).copy() + {{elif dtype == 'object'}} + values = np.array(in_arr, copy=True) + + if values.dtype != np.object_: + values = values.astype('O') + {{else}} + values = np.asarray(in_arr) + {{endif}} + + keep_na = na_option == 'keep' + + {{if dtype != 'uint64'}} + if ascending ^ (na_option == 'top'): + nan_value = {{pos_nan_value}} + else: + nan_value = {{neg_nan_value}} + + {{if dtype == 'object'}} + mask = lib.isnullobj(values) + {{elif dtype == 'float64'}} + mask = np.isnan(values) + {{elif dtype == 'int64'}} + mask = values == iNaT + {{endif}} + + np.putmask(values, mask, nan_value) + {{endif}} + + n = len(values) + ranks = np.empty(n, dtype='f8') + + {{if dtype == 'object'}} + try: + _as = values.argsort() + except TypeError: + if not retry: + raise + + valid_locs = (~mask).nonzero()[0] + 
ranks.put(valid_locs, rank_1d_object(values.take(valid_locs), 0, + ties_method=ties_method, + ascending=ascending)) + np.putmask(ranks, mask, np.nan) + return ranks + {{else}} + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort() + {{endif}} + + if not ascending: + _as = _as[::-1] + + sorted_data = values.take(_as) + argsorted = _as.astype('i8') + + {{if dtype == 'object'}} + for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = util.get_value_at(sorted_data, i) + + if (val is nan_value) and keep_na: + ranks[argsorted[i]] = nan + continue + + count += 1.0 + + if (i == n - 1 or + are_diff(util.get_value_at(sorted_data, i + 1), val)): + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + raise ValueError('first not supported for non-numeric data') + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + {{else}} + with nogil: + for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + {{if dtype != 'uint64'}} + if (val == nan_value) and keep_na: + ranks[argsorted[i]] = nan + continue + {{endif}} + + count += 1.0 + + {{if dtype == 'float64'}} + if i == n - 1 or sorted_data[i + 1] != val: + {{else}} + if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0: + {{endif}} + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 
sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + {{endif}} + if pct: + return ranks / count + else: + return ranks + + +def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 + + {{if dtype == 'object'}} + Py_ssize_t infs + {{endif}} + + ndarray[float64_t, ndim=2] ranks + {{if dtype == 'int64' or dtype == 'uint64'}} + ndarray[{{ctype}}, ndim=2, cast=True] values + {{else}} + ndarray[{{ctype}}, ndim=2] values + {{endif}} + + ndarray[int64_t, ndim=2] argsorted + + {{if dtype == 'uint64'}} + {{ctype}} val + {{else}} + {{ctype}} val, nan_value + {{endif}} + + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + float count = 0.0 + + tiebreak = tiebreakers[ties_method] + + keep_na = na_option == 'keep' + + in_arr = np.asarray(in_arr) + + if axis == 0: + values = in_arr.T.copy() + else: + values = in_arr.copy() + + {{if dtype == 'object'}} + if values.dtype != np.object_: + values = values.astype('O') + {{endif}} + + {{if dtype != 'uint64'}} + if ascending ^ (na_option == 'top'): + nan_value = {{pos_nan_value}} + else: + nan_value = {{neg_nan_value}} + + {{if dtype == 'object'}} + mask = lib.isnullobj2d(values) + {{elif dtype == 'float64'}} + mask = np.isnan(values) + {{elif dtype == 'int64'}} + mask = 
values == iNaT + {{endif}} + + np.putmask(values, mask, nan_value) + {{endif}} + + n, k = ( values).shape + ranks = np.empty((n, k), dtype='f8') + + {{if dtype == 'object'}} + try: + _as = values.argsort(1) + except TypeError: + values = in_arr + for i in range(len(values)): + ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, + ascending=ascending, pct=pct) + if axis == 0: + return ranks.T + else: + return ranks + {{else}} + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) + {{endif}} + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d_{{dtype}}(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + {{if dtype == 'object'}} + dups = sum_ranks = infs = 0 + {{else}} + dups = sum_ranks = 0 + {{endif}} + + total_tie_count = 0 + count = 0.0 + for j in range(k): + {{if dtype != 'object'}} + sum_ranks += j + 1 + dups += 1 + {{endif}} + + val = values[i, j] + + {{if dtype != 'uint64'}} + {{if dtype == 'object'}} + if (val is nan_value) and keep_na: + {{else}} + if (val == nan_value) and keep_na: + {{endif}} + ranks[i, argsorted[i, j]] = nan + + {{if dtype == 'object'}} + infs += 1 + {{endif}} + + continue + {{endif}} + + count += 1.0 + + {{if dtype == 'object'}} + sum_ranks += (j - infs) + 1 + dups += 1 + {{endif}} + + {{if dtype == 'object'}} + if j == k - 1 or are_diff(values[i, j + 1], val): + {{elif dtype == 'float64'}} + if j == k - 1 or values[i, j + 1] != val: + {{else}} + if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: + {{endif}} + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 
j + 1 + elif tiebreak == TIEBREAK_FIRST: + {{if dtype == 'object'}} + raise ValueError('first not supported ' + 'for non-numeric data') + {{else}} + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 + {{endif}} + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count + sum_ranks = dups = 0 + if pct: + ranks[i, :] /= count + if axis == 0: + return ranks.T + else: + return ranks + +{{endfor}} diff --git a/pandas/src/algos_take_helper.pxi.in b/pandas/src/algos_take_helper.pxi.in index e9abbcd13f499..71bb1bb4fe9be 100644 --- a/pandas/src/algos_take_helper.pxi.in +++ b/pandas/src/algos_take_helper.pxi.in @@ -258,4 +258,35 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, else: out[i, j] = {{preval}}values[idx, idx1[j]]{{postval}} -{{endfor}} \ No newline at end of file +{{endfor}} + +#---------------------------------------------------------------------- +# take_2d internal function +#---------------------------------------------------------------------- + +{{py: + +# dtype, ctype, init_result +dtypes = [('float64', 'float64_t', 'np.empty_like(values)'), + ('uint64', 'uint64_t', 'np.empty_like(values)'), + ('object', 'object', 'values.copy()'), + ('int64', 'int64_t', 'np.empty_like(values)')] +}} + +{{for dtype, ctype, init_result in dtypes}} + +cdef _take_2d_{{dtype}}(ndarray[{{ctype}}, ndim=2] values, object idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx + ndarray[{{ctype}}, ndim=2] result + object val + + N, K = ( values).shape + result = {{init_result}} + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result + +{{endfor}} diff --git a/pandas/src/hashtable_class_helper.pxi.in 
b/pandas/src/hashtable_class_helper.pxi.in index 55c840b20c78b..b26839599ef38 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -204,7 +204,7 @@ cdef class HashTable: # name, dtype, null_condition, float_group dtypes = [('Float64', 'float64', 'val != val', True), - ('UInt64', 'uint64', 'val == 0', False), + ('UInt64', 'uint64', 'False', False), ('Int64', 'int64', 'val == iNaT', False)] }} diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index e360089928000..75dd887c9d290 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -969,21 +969,44 @@ def test_unique_label_indices(): check_dtype=False) -def test_rank(): - tm._skip_if_no_scipy() - from scipy.stats import rankdata - - def _check(arr): - mask = ~np.isfinite(arr) - arr = arr.copy() - result = _algos.rank_1d_float64(arr) - arr[mask] = np.inf - exp = rankdata(arr) - exp[mask] = nan - assert_almost_equal(result, exp) - - _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) - _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) +class TestRank(tm.TestCase): + + def test_scipy_compat(self): + tm._skip_if_no_scipy() + from scipy.stats import rankdata + + def _check(arr): + mask = ~np.isfinite(arr) + arr = arr.copy() + result = _algos.rank_1d_float64(arr) + arr[mask] = np.inf + exp = rankdata(arr) + exp[mask] = nan + assert_almost_equal(result, exp) + + _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) + _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) + + def test_basic(self): + exp = np.array([1, 2], dtype=np.float64) + + for dtype in np.typecodes['AllInteger']: + s = Series([1, 100], dtype=dtype) + tm.assert_numpy_array_equal(algos.rank(s), exp) + + def test_uint64_overflow(self): + exp = np.array([1, 2], dtype=np.float64) + + for dtype in [np.float64, np.uint64]: + s = Series([1, 2**63], dtype=dtype) + tm.assert_numpy_array_equal(algos.rank(s), exp) + + def test_too_many_ndims(self): + 
arr = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]) + msg = "Array with ndim > 2 are not supported" + + with tm.assertRaisesRegexp(TypeError, msg): + algos.rank(arr) def test_pad_backfill_object_segfault(): diff --git a/setup.py b/setup.py index e3774d8e36ce9..0821a7d907e6c 100755 --- a/setup.py +++ b/setup.py @@ -112,7 +112,7 @@ def is_platform_mac(): _pxipath = pjoin('pandas', 'src') _pxi_dep_template = { 'algos': ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', - 'algos_take_helper.pxi.in'], + 'algos_take_helper.pxi.in', 'algos_rank_helper.pxi.in'], '_join': ['join_helper.pxi.in', 'joins_func_helper.pxi.in'], 'hashtable': ['hashtable_class_helper.pxi.in', 'hashtable_func_helper.pxi.in'],