Skip to content

Commit d33a50f

Browse files
committed
Added percentage rank to DataFrame.rank
1 parent 1ff776a commit d33a50f

File tree

6 files changed

+42
-14
lines changed

6 files changed

+42
-14
lines changed

doc/source/release.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ Improvements to existing features
190190
- implement joining a single-level indexed DataFrame on a matching column of a multi-indexed DataFrame (:issue:`3662`)
191191
- Performance improvement in indexing into a multi-indexed Series (:issue:`5567`)
192192
- Testing statements updated to use specialized asserts (:issue:`6175`)
193+
- ``DataFrame.rank()`` now has a percentage rank option (:issue:`5971`)
193194
- ``Series.rank()`` now has a percentage rank option (:issue:`5971`)
194195
- ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`)
195196
- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when

pandas/algos.pyx

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
283283

284284

285285
def rank_2d_float64(object in_arr, axis=0, ties_method='average',
286-
ascending=True, na_option='keep'):
286+
ascending=True, na_option='keep', pct=False):
287287
"""
288288
Fast NaN-friendly version of scipy.stats.rankdata
289289
"""
@@ -296,6 +296,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
296296
float64_t sum_ranks = 0
297297
int tiebreak = 0
298298
bint keep_na = 0
299+
float count = 0.0
299300

300301
tiebreak = tiebreakers[ties_method]
301302

@@ -335,13 +336,15 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
335336
for i in range(n):
336337
dups = sum_ranks = 0
337338
total_tie_count = 0
339+
count = 0.0
338340
for j in range(k):
339341
sum_ranks += j + 1
340342
dups += 1
341343
val = values[i, j]
342344
if val == nan_value and keep_na:
343345
ranks[i, argsorted[i, j]] = nan
344346
continue
347+
count += 1.0
345348
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
346349
if tiebreak == TIEBREAK_AVERAGE:
347350
for z in range(j - dups + 1, j + 1):
@@ -363,15 +366,16 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
363366
for z in range(j - dups + 1, j + 1):
364367
ranks[i, argsorted[i, z]] = total_tie_count
365368
sum_ranks = dups = 0
366-
369+
if pct:
370+
ranks[i, :] /= count
367371
if axis == 0:
368372
return ranks.T
369373
else:
370374
return ranks
371375

372376

373377
def rank_2d_int64(object in_arr, axis=0, ties_method='average',
374-
ascending=True, na_option='keep'):
378+
ascending=True, na_option='keep', pct=False):
375379
"""
376380
Fast NaN-friendly version of scipy.stats.rankdata
377381
"""
@@ -384,6 +388,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
384388
int64_t val
385389
float64_t sum_ranks = 0
386390
int tiebreak = 0
391+
float count = 0.0
387392
tiebreak = tiebreakers[ties_method]
388393

389394
if axis == 0:
@@ -411,10 +416,12 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
411416
for i in range(n):
412417
dups = sum_ranks = 0
413418
total_tie_count = 0
419+
count = 0.0
414420
for j in range(k):
415421
sum_ranks += j + 1
416422
dups += 1
417423
val = values[i, j]
424+
count += 1.0
418425
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
419426
if tiebreak == TIEBREAK_AVERAGE:
420427
for z in range(j - dups + 1, j + 1):
@@ -436,7 +443,8 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
436443
for z in range(j - dups + 1, j + 1):
437444
ranks[i, argsorted[i, z]] = total_tie_count
438445
sum_ranks = dups = 0
439-
446+
if pct:
447+
ranks[i, :] /= count
440448
if axis == 0:
441449
return ranks.T
442450
else:
@@ -528,7 +536,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
528536
ranks[argsorted[j]] = total_tie_count
529537
sum_ranks = dups = 0
530538
if pct:
531-
ranks / count
539+
return ranks / count
532540
else:
533541
return ranks
534542

@@ -562,7 +570,7 @@ class NegInfinity(object):
562570
__cmp__ = _return_true
563571

564572
def rank_2d_generic(object in_arr, axis=0, ties_method='average',
565-
ascending=True, na_option='keep'):
573+
ascending=True, na_option='keep', pct=False):
566574
"""
567575
Fast NaN-friendly version of scipy.stats.rankdata
568576
"""
@@ -577,6 +585,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
577585
float64_t sum_ranks = 0
578586
int tiebreak = 0
579587
bint keep_na = 0
588+
float count = 0.0
580589

581590
tiebreak = tiebreakers[ties_method]
582591

@@ -611,7 +620,8 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
611620
for i in range(len(values)):
612621
ranks[i] = rank_1d_generic(in_arr[i],
613622
ties_method=ties_method,
614-
ascending=ascending)
623+
ascending=ascending,
624+
pct=pct)
615625
if axis == 0:
616626
return ranks.T
617627
else:
@@ -626,12 +636,14 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
626636
for i in range(n):
627637
dups = sum_ranks = infs = 0
628638
total_tie_count = 0
639+
count = 0.0
629640
for j in range(k):
630641
val = values[i, j]
631642
if val is nan_value and keep_na:
632643
ranks[i, argsorted[i, j]] = nan
633644
infs += 1
634645
continue
646+
count += 1.0
635647
sum_ranks += (j - infs) + 1
636648
dups += 1
637649
if j == k - 1 or are_diff(values[i, j + 1], val):
@@ -652,7 +664,8 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
652664
for z in range(j - dups + 1, j + 1):
653665
ranks[i, argsorted[i, z]] = total_tie_count
654666
sum_ranks = dups = 0
655-
667+
if pct:
668+
ranks[i, :] /= count
656669
if axis == 0:
657670
return ranks.T
658671
else:

pandas/core/algorithms.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ def rank(values, axis=0, method='average', na_option='keep',
277277
elif values.ndim == 2:
278278
f, values = _get_data_algo(values, _rank2d_functions)
279279
ranks = f(values, axis=axis, ties_method=method,
280-
ascending=ascending, na_option=na_option)
280+
ascending=ascending, na_option=na_option, pct=pct)
281281

282282
return ranks
283283

pandas/core/frame.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4182,7 +4182,7 @@ def f(arr):
41824182
return data.apply(f, axis=axis)
41834183

41844184
def rank(self, axis=0, numeric_only=None, method='average',
4185-
na_option='keep', ascending=True):
4185+
na_option='keep', ascending=True, pct=False):
41864186
"""
41874187
Compute numerical data ranks (1 through n) along axis. Equal values are
41884188
assigned a rank that is the average of the ranks of those values
@@ -4205,6 +4205,8 @@ def rank(self, axis=0, numeric_only=None, method='average',
42054205
* bottom: smallest rank if descending
42064206
ascending : boolean, default True
42074207
False for ranks by high (1) to low (N)
4208+
pct : boolean, default False
4209+
Computes percentage rank of data
42084210
42094211
Returns
42104212
-------
@@ -4214,18 +4216,18 @@ def rank(self, axis=0, numeric_only=None, method='average',
42144216
if numeric_only is None:
42154217
try:
42164218
ranks = algos.rank(self.values, axis=axis, method=method,
4217-
ascending=ascending, na_option=na_option)
4219+
ascending=ascending, na_option=na_option,
4220+
pct=pct)
42184221
return self._constructor(ranks, index=self.index,
42194222
columns=self.columns)
42204223
except TypeError:
42214224
numeric_only = True
4222-
42234225
if numeric_only:
42244226
data = self._get_numeric_data()
42254227
else:
42264228
data = self
42274229
ranks = algos.rank(data.values, axis=axis, method=method,
4228-
ascending=ascending, na_option=na_option)
4230+
ascending=ascending, na_option=na_option, pct=pct)
42294231
return self._constructor(ranks, index=data.index, columns=data.columns)
42304232

42314233
def to_timestamp(self, freq=None, how='start', axis=0, copy=True):

pandas/core/series.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1732,7 +1732,7 @@ def rank(self, method='average', na_option='keep', ascending=True,
17321732
keep: leave NA values where they are
17331733
ascending : boolean, default True
17341734
False for ranks by high (1) to low (N)
1735-
pct : boolean, defeault False
1735+
pct : boolean, default False
17361736
Computes percentage rank of data
17371737
17381738
Returns

pandas/tests/test_frame.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10933,12 +10933,24 @@ def test_rank(self):
1093310933

1093410934
def test_rank2(self):
1093510935
from datetime import datetime
10936+
df = DataFrame([[1, 3, 2], [1, 2, 3]])
10937+
expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
10938+
result = df.rank(1, pct=True)
10939+
assert_frame_equal(result, expected)
10940+
10941+
df = DataFrame([[1, 3, 2], [1, 2, 3]])
10942+
expected = df.rank(0) / 2.0
10943+
result = df.rank(0, pct=True)
10944+
assert_frame_equal(result, expected)
10945+
10946+
1093610947

1093710948
df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
1093810949
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
1093910950
result = df.rank(1, numeric_only=False)
1094010951
assert_frame_equal(result, expected)
1094110952

10953+
1094210954
expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
1094310955
result = df.rank(0, numeric_only=False)
1094410956
assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)