diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 04f3ac70bdf5c..32955fd0f465b 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -67,499 +67,6 @@ tiebreakers = { } -# ctypedef fused pvalue_t: -# float64_t -# int64_t -# object - -# from cython cimport floating, integral - -cdef _take_2d_float64(ndarray[float64_t, ndim=2] values, - object idx): - cdef: - Py_ssize_t i, j, N, K - ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx - ndarray[float64_t, ndim=2] result - object val - - N, K = ( values).shape - result = np.empty_like(values) - for i in range(N): - for j in range(K): - result[i, j] = values[i, indexer[i, j]] - return result - -cdef _take_2d_int64(ndarray[int64_t, ndim=2] values, - object idx): - cdef: - Py_ssize_t i, j, N, K - ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx - ndarray[int64_t, ndim=2] result - object val - - N, K = ( values).shape - result = np.empty_like(values) - for i in range(N): - for j in range(K): - result[i, j] = values[i, indexer[i, j]] - return result - -cdef _take_2d_object(ndarray[object, ndim=2] values, - object idx): - cdef: - Py_ssize_t i, j, N, K - ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx - ndarray[object, ndim=2] result - object val - - N, K = ( values).shape - result = values.copy() - for i in range(N): - for j in range(K): - result[i, j] = values[i, indexer[i, j]] - return result - - -def rank_1d_float64(object in_arr, ties_method='average', ascending=True, - na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 - ndarray[float64_t] sorted_data, ranks, values - ndarray[int64_t] argsorted - float64_t val, nan_value - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - float count = 0.0 - tiebreak = tiebreakers[ties_method] - - values = np.asarray(in_arr).copy() - - keep_na = na_option == 'keep' - - if ascending ^ (na_option == 'top'): - nan_value = np.inf - else: - nan_value = -np.inf - mask = np.isnan(values) - np.putmask(values, mask, nan_value) - - n = len(values) - ranks = np.empty(n, dtype='f8') - - # py2.5/win32 hack, can't pass i8 - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort() - - if not ascending: - _as = _as[::-1] - - sorted_data = values.take(_as) - argsorted = _as.astype('i8') - - for i in range(n): - sum_ranks += i + 1 - dups += 1 - val = sorted_data[i] - if (val == nan_value) and keep_na: - ranks[argsorted[i]] = nan - continue - count += 1.0 - if i == n - 1 or sorted_data[i + 1] != val: - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - if pct: - return ranks / count - else: - return ranks - - -def rank_1d_int64(object in_arr, ties_method='average', ascending=True, - na_option='keep', 
pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 - ndarray[int64_t] sorted_data, values - ndarray[float64_t] ranks - ndarray[int64_t] argsorted - int64_t val, nan_value - float64_t sum_ranks = 0 - bint keep_na - int tiebreak = 0 - float count = 0.0 - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - values = np.asarray(in_arr) - - if ascending ^ (na_option == 'top'): - nan_value = np.iinfo('int64').max - else: - nan_value = np.iinfo('int64').min - - # unlike floats, which have np.inf, -np.inf, and np.nan - # ints do not - mask = values == iNaT - np.putmask(values, mask, nan_value) - - n = len(values) - ranks = np.empty(n, dtype='f8') - - # py2.5/win32 hack, can't pass i8 - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort() - - if not ascending: - _as = _as[::-1] - - sorted_data = values.take(_as) - argsorted = _as.astype('i8') - - for i in range(n): - sum_ranks += i + 1 - dups += 1 - val = sorted_data[i] - if (val == nan_value) and keep_na: - ranks[argsorted[i]] = nan - continue - count += 1.0 - if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0: - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - if pct: - return ranks / count - else: - return ranks - - -def rank_2d_float64(object in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - ndarray[float64_t, ndim=2] ranks, values - ndarray[int64_t, ndim=2] argsorted - float64_t val, nan_value - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - float count = 0.0 - - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - in_arr = np.asarray(in_arr) - - if axis == 0: - values = in_arr.T.copy() - else: - values = in_arr.copy() - - if ascending ^ (na_option == 'top'): - nan_value = np.inf - else: - nan_value = -np.inf - - np.putmask(values, np.isnan(values), nan_value) - - n, k = ( values).shape - ranks = np.empty((n, k), dtype='f8') - - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort(1) - - if not ascending: - _as = _as[:, ::-1] - - values = _take_2d_float64(values, _as) - argsorted = _as.astype('i8') - - for i in range(n): - dups = sum_ranks = 0 - total_tie_count = 0 - count = 0.0 - for j in range(k): - sum_ranks += j + 1 - dups += 1 - val = values[i, j] - if val == nan_value and keep_na: - ranks[i, argsorted[i, j]] = nan - continue - count += 1.0 - if j == k - 1 or values[i, j + 1] 
!= val: - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = total_tie_count - sum_ranks = dups = 0 - if pct: - ranks[i, :] /= count - if axis == 0: - return ranks.T - else: - return ranks - - -def rank_2d_int64(object in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - ndarray[float64_t, ndim=2] ranks - ndarray[int64_t, ndim=2] argsorted - ndarray[int64_t, ndim=2, cast=True] values - int64_t val, nan_value - float64_t sum_ranks = 0 - bint keep_na = 0 - int tiebreak = 0 - float count = 0.0 - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - in_arr = np.asarray(in_arr) - - if axis == 0: - values = in_arr.T.copy() - else: - values = in_arr.copy() - - if ascending ^ (na_option == 'top'): - nan_value = np.iinfo('int64').max - else: - nan_value = np.iinfo('int64').min - - # unlike floats, which have np.inf, -np.inf, and np.nan - # ints do not - np.putmask(values, values == iNaT, nan_value) - - n, k = ( values).shape - ranks = np.empty((n, k), dtype='f8') - - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort(1) - - if not ascending: - _as = _as[:, ::-1] - - values = _take_2d_int64(values, _as) - argsorted = _as.astype('i8') - - for i in range(n): - dups = sum_ranks = 0 - total_tie_count = 0 - count = 0.0 - for j in range(k): - sum_ranks += j + 1 - dups += 1 - val = values[i, j] - if val == nan_value and keep_na: - ranks[i, argsorted[i, j]] = nan - continue - count += 1.0 - if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = total_tie_count - sum_ranks = dups = 0 - if pct: - ranks[i, :] /= count - if axis == 0: - return ranks.T - else: - return ranks - - -def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 
- ndarray[float64_t] ranks - ndarray sorted_data, values - ndarray[int64_t] argsorted - object val, nan_value - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - float count = 0.0 - - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - values = np.array(in_arr, copy=True) - - if values.dtype != np.object_: - values = values.astype('O') - - if ascending ^ (na_option == 'top'): - # always greater than everything - nan_value = Infinity() - else: - nan_value = NegInfinity() - - mask = lib.isnullobj(values) - np.putmask(values, mask, nan_value) - - n = len(values) - ranks = np.empty(n, dtype='f8') - - # py2.5/win32 hack, can't pass i8 - try: - _as = values.argsort() - except TypeError: - if not retry: - raise - - valid_locs = (~mask).nonzero()[0] - ranks.put(valid_locs, rank_1d_generic(values.take(valid_locs), 0, - ties_method=ties_method, - ascending=ascending)) - np.putmask(ranks, mask, np.nan) - return ranks - - if not ascending: - _as = _as[::-1] - - sorted_data = values.take(_as) - argsorted = _as.astype('i8') - for i in range(n): - sum_ranks += i + 1 - dups += 1 - val = util.get_value_at(sorted_data, i) - if val is nan_value and keep_na: - ranks[argsorted[i]] = nan - continue - if (i == n - 1 or - are_diff(util.get_value_at(sorted_data, i + 1), val)): - count += 1.0 - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - raise ValueError('first not supported for non-numeric data') - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - if pct: - return ranks / count - else: - return ranks - cdef inline are_diff(object left, object right): try: return fabs(left - right) > FP_ERR @@ -589,122 +96,6 @@ class NegInfinity(object): __ge__ = lambda self, other: self is other -def rank_2d_generic(object in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, z, k, n, infs, dups = 0 - Py_ssize_t total_tie_count = 0 - ndarray[float64_t, ndim=2] ranks - ndarray[object, ndim=2] values - ndarray[int64_t, ndim=2] argsorted - object val, nan_value - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - float count = 0.0 - - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - in_arr = np.asarray(in_arr) - - if axis == 0: - values = in_arr.T.copy() - else: - values = in_arr.copy() - - if values.dtype != np.object_: - values = values.astype('O') - - if ascending ^ (na_option == 'top'): - # always greater than everything - nan_value = Infinity() - else: - nan_value = NegInfinity() - - mask = lib.isnullobj2d(values) - np.putmask(values, mask, nan_value) - - n, k = ( values).shape - ranks = np.empty((n, k), dtype='f8') - - try: - _as = values.argsort(1) - except TypeError: - values = in_arr - for i in range(len(values)): - ranks[i] = rank_1d_generic(in_arr[i], - ties_method=ties_method, - ascending=ascending, - pct=pct) - if axis == 0: - return ranks.T - else: - return ranks - - if not ascending: - _as = _as[:, ::-1] - - values = _take_2d_object(values, _as) - argsorted = _as.astype('i8') - - for i in range(n): - 
dups = sum_ranks = infs = 0 - total_tie_count = 0 - count = 0.0 - for j in range(k): - val = values[i, j] - if val is nan_value and keep_na: - ranks[i, argsorted[i, j]] = nan - infs += 1 - continue - count += 1.0 - sum_ranks += (j - infs) + 1 - dups += 1 - if j == k - 1 or are_diff(values[i, j + 1], val): - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - raise ValueError('first not supported for ' - 'non-numeric data') - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = total_tie_count - sum_ranks = dups = 0 - if pct: - ranks[i, :] /= count - if axis == 0: - return ranks.T - else: - return ranks - -# def _take_indexer_2d(ndarray[float64_t, ndim=2] values, -# ndarray[Py_ssize_t, ndim=2, cast=True] indexer): -# cdef: -# Py_ssize_t i, j, N, K -# ndarray[float64_t, ndim=2] result - -# N, K = ( values).shape -# result = np.empty_like(values) -# for i in range(N): -# for j in range(K): -# result[i, j] = values[i, indexer[i, j]] -# return result - - cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil except -1: cdef numeric t @@ -1269,4 +660,5 @@ cdef inline float64_t _median_linear(float64_t* a, int n): # generated from template include "algos_common_helper.pxi" include "algos_groupby_helper.pxi" +include "algos_rank_helper.pxi" include "algos_take_helper.pxi" diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9b7bf2bf058ef..b4a61b26aceb3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -68,7 +68,7 @@ def match(to_match, values, na_sentinel=-1): if issubclass(values.dtype.type, string_types): values = np.array(values, dtype='O') - f = lambda htype, caster: _match_generic(to_match, values, htype, caster) + f = lambda htype, caster: _match_object(to_match, values, htype, caster) result = _hashtable_algo(f, values, np.int64) if na_sentinel != -1: @@ -82,7 +82,7 @@ def match(to_match, values, na_sentinel=-1): return result -def _match_generic(values, index, table_type, type_caster): +def _match_object(values, index, table_type, type_caster): values = type_caster(values) index = type_caster(index) table = table_type(min(len(index), 1000000)) @@ -105,11 +105,11 @@ def unique(values): """ values = com._asarray_tuplesafe(values) - f = lambda htype, caster: _unique_generic(values, htype, caster) + f = lambda htype, caster: _unique_object(values, htype, caster) return _hashtable_algo(f, values) -def _unique_generic(values, table_type, type_caster): +def _unique_object(values, table_type, type_caster): values = type_caster(values) table = table_type(min(len(values), 1000000)) uniques = table.unique(values) @@ -366,6 +366,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if isinstance(values, Index): uniques = values._shallow_copy(uniques, name=None) elif isinstance(values, Series): + # TODO: This constructor is bugged for uint's, especially + # np.uint64 due to overflow. Test this for uint behavior + # once constructor has been fixed. 
uniques = Index(uniques) return labels, uniques @@ -595,7 +598,27 @@ def mode(values): def rank(values, axis=0, method='average', na_option='keep', ascending=True, pct=False): """ + Rank the values along a given axis. + Parameters + ---------- + values : array-like + Array whose values will be ranked. The number of dimensions in this + array must not exceed 2. + axis : int, default 0 + Axis over which to perform rankings. + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + The method by which ties are broken during the ranking. + na_option : {'keep', 'top'}, default 'keep' + The method by which NaNs are placed in the ranking. + - ``keep``: rank each NaN value with a NaN ranking + - ``top``: replace each NaN with either +/- inf so that they + are ranked at the top + ascending : boolean, default True + Whether or not the elements should be ranked in ascending order. + pct : boolean, default False + Whether or not to display the returned rankings in percentile form + (e.g. 0.333..., 0.666..., 1) instead of integer form (e.g. 1, 2, 3). """ if values.ndim == 1: f, values = _get_data_algo(values, _rank1d_functions) @@ -605,6 +628,8 @@ def rank(values, axis=0, method='average', na_option='keep', f, values = _get_data_algo(values, _rank2d_functions) ranks = f(values, axis=axis, ties_method=method, ascending=ascending, na_option=na_option, pct=pct) + else: + raise TypeError("Arrays with ndim > 2 are not supported.") return ranks @@ -700,13 +725,15 @@ def _broadcast(arr_or_scalar, shape): _rank1d_functions = { 'float64': algos.rank_1d_float64, 'int64': algos.rank_1d_int64, - 'generic': algos.rank_1d_generic + 'uint64': algos.rank_1d_uint64, + 'object': algos.rank_1d_object } _rank2d_functions = { 'float64': algos.rank_2d_float64, 'int64': algos.rank_2d_int64, - 'generic': algos.rank_2d_generic + 'uint64': algos.rank_2d_uint64, + 'object': algos.rank_2d_object } @@ -934,9 +961,10 @@ def _hashtable_algo(f, values, return_dtype=None): _hashtables = { 'float64': (htable.Float64HashTable, htable.Float64Vector), + 'uint64': (htable.UInt64HashTable, htable.UInt64Vector), 'int64': (htable.Int64HashTable, htable.Int64Vector), 'string': (htable.StringHashTable, htable.ObjectVector), - 'generic': (htable.PyObjectHashTable, htable.ObjectVector) + 'object': (htable.PyObjectHashTable, htable.ObjectVector) } @@ -951,11 +979,15 @@ def _get_data_algo(values, func_map): f = func_map['int64'] values = values.view('i8') - elif is_integer_dtype(values): + elif is_signed_integer_dtype(values): f = func_map['int64'] values = _ensure_int64(values) - else: + elif is_unsigned_integer_dtype(values): + f = func_map['uint64'] + values = _ensure_uint64(values) + + else: values = _ensure_object(values) # its cheaper to use a String Hash Table than Object @@ -966,7 +998,7 @@ def _get_data_algo(values, func_map): pass if f is None: - f = func_map['generic'] + f = func_map['object'] return f, values @@ -997,7 +1029,7 @@ def wrapper(arr, indexer, out, fill_value=np.nan): return wrapper -def _take_2d_multi_generic(arr, indexer, out, fill_value, mask_info): +def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info): # this is not ideal, performance-wise, but it's better than raising # an exception (best to optimize in Cython to avoid getting here) row_idx, col_idx = indexer @@ -1020,7 +1052,7 @@ def _take_2d_multi_generic(arr, indexer, out, fill_value, mask_info): out[i, j] = arr[u_, v] -def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info): +def _take_nd_object(arr, indexer,
out, axis, fill_value, mask_info): if mask_info is not None: mask, needs_masking = mask_info else: @@ -1171,8 +1203,8 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None): def func(arr, indexer, out, fill_value=np.nan): indexer = _ensure_int64(indexer) - _take_nd_generic(arr, indexer, out, axis=axis, fill_value=fill_value, - mask_info=mask_info) + _take_nd_object(arr, indexer, out, axis=axis, fill_value=fill_value, + mask_info=mask_info) return func @@ -1343,8 +1375,8 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, if func is None: def func(arr, indexer, out, fill_value=np.nan): - _take_2d_multi_generic(arr, indexer, out, fill_value=fill_value, - mask_info=mask_info) + _take_2d_multi_object(arr, indexer, out, fill_value=fill_value, + mask_info=mask_info) func(arr, indexer, out=out, fill_value=fill_value) return out diff --git a/pandas/src/algos_rank_helper.pxi.in b/pandas/src/algos_rank_helper.pxi.in new file mode 100644 index 0000000000000..7e7f819c7515f --- /dev/null +++ b/pandas/src/algos_rank_helper.pxi.in @@ -0,0 +1,385 @@ +""" +Template for each `dtype` helper function for rank + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# rank_1d, rank_2d +#---------------------------------------------------------------------- + +{{py: + +# dtype ctype pos_nan_value neg_nan_value +dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'), + ('float64', 'float64_t', 'np.inf', '-np.inf'), + ('uint64', 'uint64_t', '', ''), + ('int64', 'int64_t', 'np.iinfo(np.int64).max', + 'np.iinfo(np.int64).min')] + +}} + +{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +{{if dtype == 'object'}} + + +def rank_1d_{{dtype}}(object in_arr, bint retry=1, ties_method='average', + ascending=True, na_option='keep', pct=False): +{{else}} + + +def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, + na_option='keep', pct=False): +{{endif}} + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 + + {{if dtype == 'object'}} + ndarray sorted_data, values + {{else}} + ndarray[{{ctype}}] sorted_data, values + {{endif}} + + ndarray[float64_t] ranks + ndarray[int64_t] argsorted + + {{if dtype == 'uint64'}} + {{ctype}} val + {{else}} + {{ctype}} val, nan_value + {{endif}} + + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + float count = 0.0 + tiebreak = tiebreakers[ties_method] + + {{if dtype == 'float64'}} + values = np.asarray(in_arr).copy() + {{elif dtype == 'object'}} + values = np.array(in_arr, copy=True) + + if values.dtype != np.object_: + values = values.astype('O') + {{else}} + values = np.asarray(in_arr) + {{endif}} + + keep_na = na_option == 'keep' + + {{if dtype != 'uint64'}} + if ascending ^ (na_option == 'top'): + nan_value = {{pos_nan_value}} + else: + nan_value = {{neg_nan_value}} + + {{if dtype == 'object'}} + mask = lib.isnullobj(values) + {{elif dtype == 'float64'}} + mask = np.isnan(values) + {{elif dtype == 'int64'}} + mask = values == iNaT + {{endif}} + + np.putmask(values, mask, nan_value) + {{endif}} + + n = len(values) + ranks = np.empty(n, dtype='f8') + + {{if dtype == 'object'}} + try: + _as = values.argsort() + except TypeError: + if not retry: + raise + + valid_locs = (~mask).nonzero()[0] + ranks.put(valid_locs, rank_1d_object(values.take(valid_locs), 0, + 
ties_method=ties_method, + ascending=ascending)) + np.putmask(ranks, mask, np.nan) + return ranks + {{else}} + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort() + {{endif}} + + if not ascending: + _as = _as[::-1] + + sorted_data = values.take(_as) + argsorted = _as.astype('i8') + + {{if dtype == 'object'}} + for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = util.get_value_at(sorted_data, i) + + if (val is nan_value) and keep_na: + ranks[argsorted[i]] = nan + continue + + count += 1.0 + + if (i == n - 1 or + are_diff(util.get_value_at(sorted_data, i + 1), val)): + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + raise ValueError('first not supported for non-numeric data') + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + {{else}} + with nogil: + for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + {{if dtype != 'uint64'}} + if (val == nan_value) and keep_na: + ranks[argsorted[i]] = nan + continue + {{endif}} + + count += 1.0 + + {{if dtype == 'float64'}} + if i == n - 1 or sorted_data[i + 1] != val: + {{else}} + if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0: + {{endif}} + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + {{endif}} + if pct: + return ranks / count + else: + return ranks + + +def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 + + {{if dtype == 'object'}} + Py_ssize_t infs + {{endif}} + + ndarray[float64_t, ndim=2] ranks + {{if dtype == 'int64' or dtype == 'uint64'}} + ndarray[{{ctype}}, ndim=2, cast=True] values + {{else}} + ndarray[{{ctype}}, ndim=2] values + {{endif}} + + ndarray[int64_t, ndim=2] argsorted + + {{if dtype == 'uint64'}} + {{ctype}} val + {{else}} + {{ctype}} val, nan_value + {{endif}} + + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + float count = 0.0 + + tiebreak = tiebreakers[ties_method] + + keep_na = na_option == 'keep' + + in_arr = np.asarray(in_arr) + + if axis == 0: + values = in_arr.T.copy() + else: + values = in_arr.copy() + + {{if dtype == 
'object'}} + if values.dtype != np.object_: + values = values.astype('O') + {{endif}} + + {{if dtype != 'uint64'}} + if ascending ^ (na_option == 'top'): + nan_value = {{pos_nan_value}} + else: + nan_value = {{neg_nan_value}} + + {{if dtype == 'object'}} + mask = lib.isnullobj2d(values) + {{elif dtype == 'float64'}} + mask = np.isnan(values) + {{elif dtype == 'int64'}} + mask = values == iNaT + {{endif}} + + np.putmask(values, mask, nan_value) + {{endif}} + + n, k = ( values).shape + ranks = np.empty((n, k), dtype='f8') + + {{if dtype == 'object'}} + try: + _as = values.argsort(1) + except TypeError: + values = in_arr + for i in range(len(values)): + ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, + ascending=ascending, pct=pct) + if axis == 0: + return ranks.T + else: + return ranks + {{else}} + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) + {{endif}} + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d_{{dtype}}(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + {{if dtype == 'object'}} + dups = sum_ranks = infs = 0 + {{else}} + dups = sum_ranks = 0 + {{endif}} + + total_tie_count = 0 + count = 0.0 + for j in range(k): + {{if dtype != 'object'}} + sum_ranks += j + 1 + dups += 1 + {{endif}} + + val = values[i, j] + + {{if dtype != 'uint64'}} + {{if dtype == 'object'}} + if (val is nan_value) and keep_na: + {{else}} + if (val == nan_value) and keep_na: + {{endif}} + ranks[i, argsorted[i, j]] = nan + + {{if dtype == 'object'}} + infs += 1 + {{endif}} + + continue + {{endif}} + + count += 1.0 + + {{if dtype == 'object'}} + sum_ranks += (j - infs) + 1 + dups += 1 + {{endif}} + + {{if dtype == 'object'}} + if j == k - 1 or are_diff(values[i, j + 1], val): + {{elif dtype == 'float64'}} + if j == k - 1 or values[i, j + 1] != val: + {{else}} + if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: + {{endif}} + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + {{if dtype == 'object'}} + raise ValueError('first not supported ' + 'for non-numeric data') + {{else}} + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 + {{endif}} + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count + sum_ranks = dups = 0 + if pct: + ranks[i, :] /= count + if axis == 0: + return ranks.T + else: + return ranks + +{{endfor}} diff --git a/pandas/src/algos_take_helper.pxi.in b/pandas/src/algos_take_helper.pxi.in index e9abbcd13f499..71bb1bb4fe9be 100644 --- a/pandas/src/algos_take_helper.pxi.in +++ b/pandas/src/algos_take_helper.pxi.in @@ -258,4 +258,35 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, else: out[i, j] = {{preval}}values[idx, idx1[j]]{{postval}} -{{endfor}} \ No newline at end of file +{{endfor}} + +#---------------------------------------------------------------------- +# take_2d internal function 
+#---------------------------------------------------------------------- + +{{py: + +# dtype, ctype, init_result +dtypes = [('float64', 'float64_t', 'np.empty_like(values)'), + ('uint64', 'uint64_t', 'np.empty_like(values)'), + ('object', 'object', 'values.copy()'), + ('int64', 'int64_t', 'np.empty_like(values)')] +}} + +{{for dtype, ctype, init_result in dtypes}} + +cdef _take_2d_{{dtype}}(ndarray[{{ctype}}, ndim=2] values, object idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx + ndarray[{{ctype}}, ndim=2] result + object val + + N, K = ( values).shape + result = {{init_result}} + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result + +{{endfor}} diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index 55c840b20c78b..b26839599ef38 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -204,7 +204,7 @@ cdef class HashTable: # name, dtype, null_condition, float_group dtypes = [('Float64', 'float64', 'val != val', True), - ('UInt64', 'uint64', 'val == 0', False), + ('UInt64', 'uint64', 'False', False), ('Int64', 'int64', 'val == iNaT', False)] }} diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index e360089928000..75dd887c9d290 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -969,21 +969,44 @@ def test_unique_label_indices(): check_dtype=False) -def test_rank(): - tm._skip_if_no_scipy() - from scipy.stats import rankdata - - def _check(arr): - mask = ~np.isfinite(arr) - arr = arr.copy() - result = _algos.rank_1d_float64(arr) - arr[mask] = np.inf - exp = rankdata(arr) - exp[mask] = nan - assert_almost_equal(result, exp) - - _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) - _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) +class TestRank(tm.TestCase): + + def test_scipy_compat(self): + tm._skip_if_no_scipy() + from scipy.stats import rankdata + + def _check(arr): + mask = ~np.isfinite(arr) + arr = arr.copy() + result = _algos.rank_1d_float64(arr) + arr[mask] = np.inf + exp = rankdata(arr) + exp[mask] = nan + assert_almost_equal(result, exp) + + _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) + _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) + + def test_basic(self): + exp = np.array([1, 2], dtype=np.float64) + + for dtype in np.typecodes['AllInteger']: + s = Series([1, 100], dtype=dtype) + tm.assert_numpy_array_equal(algos.rank(s), exp) + + def test_uint64_overflow(self): + exp = np.array([1, 2], dtype=np.float64) + + for dtype in [np.float64, np.uint64]: + s = Series([1, 2**63], dtype=dtype) + tm.assert_numpy_array_equal(algos.rank(s), exp) + + def test_too_many_ndims(self): + arr = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]) + msg = "Arrays with ndim > 2 are not supported" + + with tm.assertRaisesRegexp(TypeError, msg): + algos.rank(arr) def test_pad_backfill_object_segfault(): diff --git a/setup.py b/setup.py index e3774d8e36ce9..0821a7d907e6c 100755 --- a/setup.py +++ b/setup.py @@ -112,7 +112,7 @@ def is_platform_mac(): _pxipath = pjoin('pandas', 'src') _pxi_dep_template = { 'algos': ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', - 'algos_take_helper.pxi.in'], + 'algos_take_helper.pxi.in', 'algos_rank_helper.pxi.in'], '_join': ['join_helper.pxi.in', 'joins_func_helper.pxi.in'], 'hashtable': ['hashtable_class_helper.pxi.in', 'hashtable_func_helper.pxi.in'],
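The new `rank()` docstring in `pandas/core/algorithms.py` documents the `method` (ties) options that the templated helpers implement via the TIEBREAK_* branches. The snippet below is an illustrative sketch, not part of the patch: it assumes a pandas build that includes this change and imports `pandas.core.algorithms` the same way the new tests do.

```python
# Illustration of the ties_method values documented in the new rank()
# docstring, on a small float array with one tied value (3.0 appears twice).
import numpy as np
from pandas.core import algorithms as algos

arr = np.array([3., 1., 3., 2.])
for method in ['average', 'min', 'max', 'first', 'dense']:
    print(method, algos.rank(arr, method=method))

# Expected rankings for the tied 3.0 values:
#   average -> [3.5, 1.0, 3.5, 2.0]   (mean of ranks 3 and 4)
#   min     -> [3.0, 1.0, 3.0, 2.0]
#   max     -> [4.0, 1.0, 4.0, 2.0]
#   first   -> [3.0, 1.0, 4.0, 2.0]   (ties broken by order of appearance)
#   dense   -> [3.0, 1.0, 3.0, 2.0]   (rank increases by 1 between groups)
```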
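For context, the following is a minimal sketch of the behavior the new `TestRank` cases exercise, assuming a build that includes this patch. The claim about overflow mirrors the patch itself: `_get_data_algo` previously sent unsigned integers through `_ensure_int64`, where a value such as `2**63` does not fit, whereas it now dispatches to the `uint64` helpers.

```python
# Hedged sketch of the new uint64 rank path and the ndim guard
# (assumes a pandas build containing this patch).
import numpy as np
import pandas as pd
from pandas.core import algorithms as algos

# uint64 values above the int64 range now go through rank_1d_uint64
# instead of being coerced to int64.
s = pd.Series([1, 2 ** 63], dtype=np.uint64)
print(algos.rank(s))  # -> [1., 2.]

# Inputs with more than two dimensions are rejected up front.
try:
    algos.rank(np.zeros((2, 2, 2)))
except TypeError as exc:
    print(exc)  # "Arrays with ndim > 2 are not supported."
```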