Add percentage rank (``pct``) option to DataFrame.rank

MichaelWS · MichaelWS · commit d33a50f0290b · 2014-03-28T10:41:42.000-04:00
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -190,6 +190,7 @@ Improvements to existing features
 - implement joining a single-level indexed DataFrame on a matching column of a multi-indexed DataFrame (:issue:`3662`)
 - Performance improvement in indexing into a multi-indexed Series (:issue:`5567`)
 - Testing statements updated to use specialized asserts (:issue:`6175`)
+- ``DataFrame.rank()`` now has a percentage rank option (:issue:`5971`)
 - ``Series.rank()`` now has a percentage rank option (:issue:`5971`)
 - ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`)
 - ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
@@ -283,7 +283,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
 
 
 def rank_2d_float64(object in_arr, axis=0, ties_method='average',
-                    ascending=True, na_option='keep'):
+                    ascending=True, na_option='keep', pct=False):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
     """
@@ -296,6 +296,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
         float64_t sum_ranks = 0
         int tiebreak = 0
         bint keep_na = 0
+        float count = 0.0
 
     tiebreak = tiebreakers[ties_method]
 
@@ -335,13 +336,15 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
     for i in range(n):
         dups = sum_ranks = 0
         total_tie_count = 0
+        count = 0.0
         for j in range(k):
             sum_ranks += j + 1
             dups += 1
             val = values[i, j]
             if val == nan_value and keep_na:
                 ranks[i, argsorted[i, j]] = nan
                 continue
+            count += 1.0
             if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
                 if tiebreak == TIEBREAK_AVERAGE:
                     for z in range(j - dups + 1, j + 1):
@@ -363,15 +366,16 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
                     for z in range(j - dups + 1, j + 1):
                         ranks[i, argsorted[i, z]] = total_tie_count
                 sum_ranks = dups = 0
-
+        if pct:
+            ranks[i, :] /= count
     if axis == 0:
         return ranks.T
     else:
         return ranks
 
 
 def rank_2d_int64(object in_arr, axis=0, ties_method='average',
-                    ascending=True, na_option='keep'):
+                    ascending=True, na_option='keep', pct=False):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
     """
@@ -384,6 +388,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
         int64_t val
         float64_t sum_ranks = 0
         int tiebreak = 0
+        float count = 0.0
     tiebreak = tiebreakers[ties_method]
 
     if axis == 0:
@@ -411,10 +416,12 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
     for i in range(n):
         dups = sum_ranks = 0
         total_tie_count = 0
+        count = 0.0
         for j in range(k):
             sum_ranks += j + 1
             dups += 1
             val = values[i, j]
+            count += 1.0
             if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
                 if tiebreak == TIEBREAK_AVERAGE:
                     for z in range(j - dups + 1, j + 1):
@@ -436,7 +443,8 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
                     for z in range(j - dups + 1, j + 1):
                         ranks[i, argsorted[i, z]] = total_tie_count
                 sum_ranks = dups = 0
-
+        if pct:
+            ranks[i, :] /= count
     if axis == 0:
         return ranks.T
     else:
@@ -528,7 +536,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
                     ranks[argsorted[j]] = total_tie_count
             sum_ranks = dups = 0
     if pct:
-        ranks / count
+        return ranks / count
     else:
         return ranks
 
@@ -562,7 +570,7 @@ class NegInfinity(object):
     __cmp__ = _return_true
 
 def rank_2d_generic(object in_arr, axis=0, ties_method='average',
-                    ascending=True, na_option='keep'):
+                    ascending=True, na_option='keep', pct=False):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
     """
@@ -577,6 +585,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
         float64_t sum_ranks = 0
         int tiebreak = 0
         bint keep_na = 0
+        float count = 0.0
 
     tiebreak = tiebreakers[ties_method]
 
@@ -611,7 +620,8 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
         for i in range(len(values)):
             ranks[i] = rank_1d_generic(in_arr[i],
                                        ties_method=ties_method,
-                                       ascending=ascending)
+                                       ascending=ascending,
+                                       pct=pct)
         if axis == 0:
             return ranks.T
         else:
@@ -626,12 +636,14 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
     for i in range(n):
         dups = sum_ranks = infs = 0
         total_tie_count = 0
+        count = 0.0
         for j in range(k):
             val = values[i, j]
             if val is nan_value and keep_na:
                 ranks[i, argsorted[i, j]] = nan
                 infs += 1
                 continue
+            count += 1.0
             sum_ranks += (j - infs) + 1
             dups += 1
             if j == k - 1 or are_diff(values[i, j + 1], val):
@@ -652,7 +664,8 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
                     for z in range(j - dups + 1, j + 1):
                         ranks[i, argsorted[i, z]] = total_tie_count
                 sum_ranks = dups = 0
-
+        if pct:
+            ranks[i, :] /= count
     if axis == 0:
         return ranks.T
     else:
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -277,7 +277,7 @@ def rank(values, axis=0, method='average', na_option='keep',
     elif values.ndim == 2:
         f, values = _get_data_algo(values, _rank2d_functions)
         ranks = f(values, axis=axis, ties_method=method,
-                  ascending=ascending, na_option=na_option)
+                  ascending=ascending, na_option=na_option, pct=pct)
 
     return ranks
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4182,7 +4182,7 @@ def f(arr):
         return data.apply(f, axis=axis)
 
     def rank(self, axis=0, numeric_only=None, method='average',
-             na_option='keep', ascending=True):
+             na_option='keep', ascending=True, pct=False):
         """
         Compute numerical data ranks (1 through n) along axis. Equal values are
         assigned a rank that is the average of the ranks of those values
@@ -4205,6 +4205,8 @@ def rank(self, axis=0, numeric_only=None, method='average',
             * bottom: smallest rank if descending
         ascending : boolean, default True
             False for ranks by high (1) to low (N)
+        pct : boolean, default False
+            Computes percentage rank of data
 
         Returns
         -------
@@ -4214,18 +4216,18 @@ def rank(self, axis=0, numeric_only=None, method='average',
         if numeric_only is None:
             try:
                 ranks = algos.rank(self.values, axis=axis, method=method,
-                                   ascending=ascending, na_option=na_option)
+                                   ascending=ascending, na_option=na_option,
+                                   pct=pct)
                 return self._constructor(ranks, index=self.index,
                                          columns=self.columns)
             except TypeError:
                 numeric_only = True
-
         if numeric_only:
             data = self._get_numeric_data()
         else:
             data = self
         ranks = algos.rank(data.values, axis=axis, method=method,
-                           ascending=ascending, na_option=na_option)
+                           ascending=ascending, na_option=na_option, pct=pct)
         return self._constructor(ranks, index=data.index, columns=data.columns)
 
     def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1732,7 +1732,7 @@ def rank(self, method='average', na_option='keep', ascending=True,
             keep: leave NA values where they are
         ascending : boolean, default True
             False for ranks by high (1) to low (N)
-        pct : boolean, defeault False
+        pct : boolean, default False
             Computes percentage rank of data
 
         Returns
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -10933,12 +10933,24 @@ def test_rank(self):
 
     def test_rank2(self):
         from datetime import datetime
+        df = DataFrame([[1, 3, 2], [1, 2, 3]])
+        expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
+        result = df.rank(1, pct=True)
+        assert_frame_equal(result, expected)
+        
+        df = DataFrame([[1, 3, 2], [1, 2, 3]])
+        expected = df.rank(0) / 2.0
+        result = df.rank(0, pct=True)
+        assert_frame_equal(result, expected)
+
+
 
         df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
         expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
         result = df.rank(1, numeric_only=False)
         assert_frame_equal(result, expected)
 
+        
         expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
         result = df.rank(0, numeric_only=False)
         assert_frame_equal(result, expected)