Skip to content

Commit 29c8f80

Browse files
authored
GH-95861: Add support for Spearman's rank correlation coefficient (GH-95863)
1 parent 91afe66 commit 29c8f80

File tree

4 files changed

+117
-18
lines changed

4 files changed

+117
-18
lines changed

Doc/library/statistics.rst

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ These functions calculate statistics regarding relations between two inputs.
104104

105105
========================= =====================================================
106106
:func:`covariance` Sample covariance for two variables.
107-
:func:`correlation` Pearson's correlation coefficient for two variables.
107+
:func:`correlation` Pearson and Spearman's correlation coefficients.
108108
:func:`linear_regression` Slope and intercept for simple linear regression.
109109
========================= =====================================================
110110

@@ -648,31 +648,57 @@ However, for reading convenience, most of the examples show sorted sequences.
648648

649649
.. versionadded:: 3.10
650650

651-
.. function:: correlation(x, y, /)
651+
.. function:: correlation(x, y, /, *, method='linear')
652652

653653
Return the `Pearson's correlation coefficient
654654
<https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
655655
for two inputs. Pearson's correlation coefficient *r* takes values
656-
between -1 and +1. It measures the strength and direction of the linear
657-
relationship, where +1 means very strong, positive linear relationship,
658-
-1 very strong, negative linear relationship, and 0 no linear relationship.
656+
between -1 and +1. It measures the strength and direction of a linear
657+
relationship.
658+
659+
If *method* is "ranked", computes `Spearman's rank correlation coefficient
660+
<https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
661+
for two inputs. The data is replaced by ranks. Ties are averaged so that
662+
equal values receive the same rank. The resulting coefficient measures the
663+
strength of a monotonic relationship.
664+
665+
Spearman's correlation coefficient is appropriate for ordinal data or for
666+
continuous data that doesn't meet the linear proportion requirement for
667+
Pearson's correlation coefficient.
659668

660669
Both inputs must be of the same length (no less than two), and need
661670
not to be constant, otherwise :exc:`StatisticsError` is raised.
662671

663-
Examples:
672+
Example with `Kepler's laws of planetary motion
673+
<https://en.wikipedia.org/wiki/Kepler's_laws_of_planetary_motion>`_:
664674

665675
.. doctest::
666676

667-
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
668-
>>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
669-
>>> correlation(x, x)
677+
>>> # Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune
678+
>>> orbital_period = [88, 225, 365, 687, 4331, 10_756, 30_687, 60_190] # days
679+
>>> dist_from_sun = [58, 108, 150, 228, 778, 1_400, 2_900, 4_500] # million km
680+
681+
>>> # Show that a perfect monotonic relationship exists
682+
>>> correlation(orbital_period, dist_from_sun, method='ranked')
683+
1.0
684+
685+
>>> # Observe that a linear relationship is imperfect
686+
>>> round(correlation(orbital_period, dist_from_sun), 4)
687+
0.9882
688+
689+
>>> # Demonstrate Kepler's third law: There is a linear correlation
690+
>>> # between the square of the orbital period and the cube of the
691+
>>> # distance from the sun.
692+
>>> period_squared = [p * p for p in orbital_period]
693+
>>> dist_cubed = [d * d * d for d in dist_from_sun]
694+
>>> round(correlation(period_squared, dist_cubed), 4)
670695
1.0
671-
>>> correlation(x, y)
672-
-1.0
673696

674697
.. versionadded:: 3.10
675698

699+
.. versionchanged:: 3.12
700+
Added support for Spearman's rank correlation coefficient.
701+
676702
.. function:: linear_regression(x, y, /, *, proportional=False)
677703

678704
Return the slope and intercept of `simple linear regression

Lib/statistics.py

Lines changed: 62 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -134,11 +134,11 @@
134134

135135
from fractions import Fraction
136136
from decimal import Decimal
137-
from itertools import groupby, repeat
137+
from itertools import count, groupby, repeat
138138
from bisect import bisect_left, bisect_right
139139
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
140140
from functools import reduce
141-
from operator import mul
141+
from operator import mul, itemgetter
142142
from collections import Counter, namedtuple, defaultdict
143143

144144
_SQRT2 = sqrt(2.0)
@@ -355,6 +355,50 @@ def _fail_neg(values, errmsg='negative value'):
355355
raise StatisticsError(errmsg)
356356
yield x
357357

358+
def _rank(data, /, *, key=None, reverse=False, ties='average') -> list[float]:
359+
"""Rank order a dataset. The lowest value has rank 1.
360+
361+
Ties are averaged so that equal values receive the same rank:
362+
363+
>>> data = [31, 56, 31, 25, 75, 18]
364+
>>> _rank(data)
365+
[3.5, 5.0, 3.5, 2.0, 6.0, 1.0]
366+
367+
The operation is idempotent:
368+
369+
>>> _rank([3.5, 5.0, 3.5, 2.0, 6.0, 1.0])
370+
[3.5, 5.0, 3.5, 2.0, 6.0, 1.0]
371+
372+
It is possible to rank the data in reverse order so that
373+
the highest value has rank 1. Also, a key-function can
374+
extract the field to be ranked:
375+
376+
>>> goals = [('eagles', 45), ('bears', 48), ('lions', 44)]
377+
>>> _rank(goals, key=itemgetter(1), reverse=True)
378+
[2.0, 1.0, 3.0]
379+
380+
"""
381+
# If this function becomes public at some point, more thought
382+
# needs to be given to the signature. A list of ints is
383+
# plausible when ties is "min" or "max". When ties is "average",
384+
# either list[float] or list[Fraction] is plausible.
385+
386+
# Default handling of ties matches scipy.stats.mstats.spearmanr.
387+
if ties != 'average':
388+
raise ValueError(f'Unknown tie resolution method: {ties!r}')
389+
if key is not None:
390+
data = map(key, data)
391+
val_pos = sorted(zip(data, count()), reverse=reverse)
392+
i = 0 # To rank starting at 0 instead of 1, set i = -1.
393+
result = [0] * len(val_pos)
394+
for _, g in groupby(val_pos, key=itemgetter(0)):
395+
group = list(g)
396+
size = len(group)
397+
rank = i + (size + 1) / 2
398+
for value, orig_pos in group:
399+
result[orig_pos] = rank
400+
i += size
401+
return result
358402

359403
def _integer_sqrt_of_frac_rto(n: int, m: int) -> int:
360404
"""Square root of n/m, rounded to the nearest integer using round-to-odd."""
@@ -988,14 +1032,12 @@ def covariance(x, y, /):
9881032
return sxy / (n - 1)
9891033

9901034

991-
def correlation(x, y, /):
1035+
def correlation(x, y, /, *, method='linear'):
9921036
"""Pearson's correlation coefficient
9931037
9941038
Return the Pearson's correlation coefficient for two inputs. Pearson's
995-
correlation coefficient *r* takes values between -1 and +1. It measures the
996-
strength and direction of the linear relationship, where +1 means very
997-
strong, positive linear relationship, -1 very strong, negative linear
998-
relationship, and 0 no linear relationship.
1039+
correlation coefficient *r* takes values between -1 and +1. It measures
1040+
the strength and direction of a linear relationship.
9991041
10001042
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
10011043
>>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
@@ -1004,12 +1046,25 @@ def correlation(x, y, /):
10041046
>>> correlation(x, y)
10051047
-1.0
10061048
1049+
If *method* is "ranked", computes Spearman's rank correlation coefficient
1050+
for two inputs. The data is replaced by ranks. Ties are averaged
1051+
so that equal values receive the same rank. The resulting coefficient
1052+
measures the strength of a monotonic relationship.
1053+
1054+
Spearman's rank correlation coefficient is appropriate for ordinal
1055+
data or for continuous data that doesn't meet the linear proportion
1056+
requirement for Pearson's correlation coefficient.
10071057
"""
10081058
n = len(x)
10091059
if len(y) != n:
10101060
raise StatisticsError('correlation requires that both inputs have same number of data points')
10111061
if n < 2:
10121062
raise StatisticsError('correlation requires at least two data points')
1063+
if method not in {'linear', 'ranked'}:
1064+
raise ValueError(f'Unknown method: {method!r}')
1065+
if method == 'ranked':
1066+
x = _rank(x)
1067+
y = _rank(y)
10131068
xbar = fsum(x) / n
10141069
ybar = fsum(y) / n
10151070
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))

Lib/test/test_statistics.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2565,6 +2565,22 @@ def test_different_scales(self):
25652565
self.assertAlmostEqual(statistics.covariance(x, y), 0.1)
25662566

25672567

2568+
def test_correlation_spearman(self):
2569+
# https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php
2570+
# Compare with:
2571+
# >>> import scipy.stats.mstats
2572+
# >>> scipy.stats.mstats.spearmanr(reading, mathematics)
2573+
# SpearmanrResult(correlation=0.6686960980480712, pvalue=0.03450954165178532)
2574+
# And Wolfram Alpha gives: 0.668696
2575+
# https://www.wolframalpha.com/input?i=SpearmanRho%5B%7B56%2C+75%2C+45%2C+71%2C+61%2C+64%2C+58%2C+80%2C+76%2C+61%7D%2C+%7B66%2C+70%2C+40%2C+60%2C+65%2C+56%2C+59%2C+77%2C+67%2C+63%7D%5D
2576+
reading = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61]
2577+
mathematics = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63]
2578+
self.assertAlmostEqual(statistics.correlation(reading, mathematics, method='ranked'),
2579+
0.6686960980480712)
2580+
2581+
with self.assertRaises(ValueError):
2582+
statistics.correlation(reading, mathematics, method='bad_method')
2583+
25682584
class TestLinearRegression(unittest.TestCase):
25692585

25702586
def test_constant_input_error(self):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Add support for computing Spearman's correlation coefficient to the existing
2+
statistics.correlation() function.

0 commit comments

Comments
 (0)