134
134
135
135
from fractions import Fraction
136
136
from decimal import Decimal
137
- from itertools import groupby , repeat
137
+ from itertools import count , groupby , repeat
138
138
from bisect import bisect_left , bisect_right
139
139
from math import hypot , sqrt , fabs , exp , erf , tau , log , fsum
140
140
from functools import reduce
141
- from operator import mul
141
+ from operator import mul , itemgetter
142
142
from collections import Counter , namedtuple , defaultdict
143
143
144
144
_SQRT2 = sqrt (2.0 )
@@ -355,6 +355,50 @@ def _fail_neg(values, errmsg='negative value'):
355
355
raise StatisticsError (errmsg )
356
356
yield x
357
357
358
def _rank(data, /, *, key=None, reverse=False, ties='average', start=1) -> list[float]:
    """Rank order a dataset.  By default the lowest value has rank 1.

    Ties are averaged so that equal values receive the same rank:

        >>> data = [31, 56, 31, 25, 75, 18]
        >>> _rank(data)
        [3.5, 5.0, 3.5, 2.0, 6.0, 1.0]

    The operation is idempotent:

        >>> _rank([3.5, 5.0, 3.5, 2.0, 6.0, 1.0])
        [3.5, 5.0, 3.5, 2.0, 6.0, 1.0]

    It is possible to rank the data in reverse order so that
    the highest value has rank 1.  Also, a key-function can
    extract the field to be ranked:

        >>> goals = [('eagles', 45), ('bears', 48), ('lions', 44)]
        >>> _rank(goals, key=itemgetter(1), reverse=True)
        [2.0, 1.0, 3.0]

    Ranks can start at a value other than 1 by setting *start*:

        >>> _rank([31, 56, 31, 25, 75, 18], start=0)
        [2.5, 4.0, 2.5, 1.0, 5.0, 0.0]

    Parameters:
        data:    iterable of values that can be sorted against each other.
        key:     optional one-argument function applied to each element
                 to extract the value that is ranked.
        reverse: if true, the largest value receives the first rank.
        ties:    tie resolution method; only 'average' is accepted.
        start:   rank assigned to the first position (default 1).

    Returns a list of ranks in the same order as the input elements.
    An empty input produces an empty list.

    Raises ValueError if *ties* is anything other than 'average'.

    """
    # If this function becomes public at some point, more thought
    # needs to be given to the signature.  A list of ints is
    # plausible when ties is "min" or "max".  When ties is "average",
    # either list[float] or list[Fraction] is plausible.

    # Default handling of ties matches scipy.stats.mstats.spearmanr.
    if ties != 'average':
        raise ValueError(f'Unknown tie resolution method: {ties!r}')
    if key is not None:
        data = map(key, data)
    # Pair every value with its original index so that, after sorting,
    # each rank can be written back to the position it came from.
    val_pos = sorted(zip(data, count()), reverse=reverse)
    i = start - 1  # Number of rank slots consumed so far, offset by start.
    result = [0] * len(val_pos)
    for _, g in groupby(val_pos, key=itemgetter(0)):
        group = list(g)
        size = len(group)
        # A run of equal values occupies ranks i+1 .. i+size; every member
        # receives the mean of those ranks.
        rank = i + (size + 1) / 2
        for _, orig_pos in group:
            result[orig_pos] = rank
        i += size
    return result
358
402
359
403
def _integer_sqrt_of_frac_rto (n : int , m : int ) -> int :
360
404
"""Square root of n/m, rounded to the nearest integer using round-to-odd."""
@@ -988,14 +1032,12 @@ def covariance(x, y, /):
988
1032
return sxy / (n - 1 )
989
1033
990
1034
991
- def correlation (x , y , / ):
1035
+ def correlation (x , y , / , * , method = 'linear' ):
992
1036
"""Pearson's correlation coefficient
993
1037
994
1038
Return the Pearson's correlation coefficient for two inputs. Pearson's
995
- correlation coefficient *r* takes values between -1 and +1. It measures the
996
- strength and direction of the linear relationship, where +1 means very
997
- strong, positive linear relationship, -1 very strong, negative linear
998
- relationship, and 0 no linear relationship.
1039
+ correlation coefficient *r* takes values between -1 and +1. It measures
1040
+ the strength and direction of a linear relationship.
999
1041
1000
1042
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
1001
1043
>>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
@@ -1004,12 +1046,25 @@ def correlation(x, y, /):
1004
1046
>>> correlation(x, y)
1005
1047
-1.0
1006
1048
1049
+ If *method* is "ranked", computes Spearman's rank correlation coefficient
1050
+ for two inputs. The data is replaced by ranks. Ties are averaged
1051
+ so that equal values receive the same rank. The resulting coefficient
1052
+ measures the strength of a monotonic relationship.
1053
+
1054
+ Spearman's rank correlation coefficient is appropriate for ordinal
1055
+ data or for continuous data that doesn't meet the linear proportion
1056
+ requirement for Pearson's correlation coefficient.
1007
1057
"""
1008
1058
n = len (x )
1009
1059
if len (y ) != n :
1010
1060
raise StatisticsError ('correlation requires that both inputs have same number of data points' )
1011
1061
if n < 2 :
1012
1062
raise StatisticsError ('correlation requires at least two data points' )
1063
+ if method not in {'linear' , 'ranked' }:
1064
+ raise ValueError (f'Unknown method: { method !r} ' )
1065
+ if method == 'ranked' :
1066
+ x = _rank (x )
1067
+ y = _rank (y )
1013
1068
xbar = fsum (x ) / n
1014
1069
ybar = fsum (y ) / n
1015
1070
sxy = fsum ((xi - xbar ) * (yi - ybar ) for xi , yi in zip (x , y ))
0 commit comments