diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index bf869903c0f84e..c3f9c1f5239e8b 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -104,7 +104,7 @@ These functions calculate statistics regarding relations between two inputs. ========================= ===================================================== :func:`covariance` Sample covariance for two variables. -:func:`correlation` Pearson's correlation coefficient for two variables. +:func:`correlation` Pearson and Spearman's correlation coefficients. :func:`linear_regression` Slope and intercept for simple linear regression. ========================= ===================================================== @@ -648,31 +648,57 @@ However, for reading convenience, most of the examples show sorted sequences. .. versionadded:: 3.10 -.. function:: correlation(x, y, /) +.. function:: correlation(x, y, /, *, method='linear') Return the `Pearson's correlation coefficient `_ for two inputs. Pearson's correlation coefficient *r* takes values - between -1 and +1. It measures the strength and direction of the linear - relationship, where +1 means very strong, positive linear relationship, - -1 very strong, negative linear relationship, and 0 no linear relationship. + between -1 and +1. It measures the strength and direction of a linear + relationship. + + If *method* is "ranked", computes `Spearman's rank correlation coefficient + `_ + for two inputs. The data is replaced by ranks. Ties are averaged so that + equal values receive the same rank. The resulting coefficient measures the + strength of a monotonic relationship. + + Spearman's correlation coefficient is appropriate for ordinal data or for + continuous data that doesn't meet the linear proportion requirement for + Pearson's correlation coefficient. Both inputs must be of the same length (no less than two), and need not to be constant, otherwise :exc:`StatisticsError` is raised. - Examples: + Example with `Kepler's laws of planetary motion + `_: .. doctest:: - >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] - >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1] - >>> correlation(x, x) + >>> # Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune + >>> orbital_period = [88, 225, 365, 687, 4331, 10_756, 30_687, 60_190] # days + >>> dist_from_sun = [58, 108, 150, 228, 778, 1_400, 2_900, 4_500] # million km + + >>> # Show that a perfect monotonic relationship exists + >>> correlation(orbital_period, dist_from_sun, method='ranked') + 1.0 + + >>> # Observe that a linear relationship is imperfect + >>> round(correlation(orbital_period, dist_from_sun), 4) + 0.9882 + + >>> # Demonstrate Kepler's third law: There is a linear correlation + >>> # between the square of the orbital period and the cube of the + >>> # distance from the sun. + >>> period_squared = [p * p for p in orbital_period] + >>> dist_cubed = [d * d * d for d in dist_from_sun] + >>> round(correlation(period_squared, dist_cubed), 4) 1.0 - >>> correlation(x, y) - -1.0 .. versionadded:: 3.10 + .. versionchanged:: 3.12 + Added support for Spearman's rank correlation coefficient. + .. function:: linear_regression(x, y, /, *, proportional=False) Return the slope and intercept of `simple linear regression diff --git a/Lib/statistics.py b/Lib/statistics.py index c78d64518853e7..a3f915c130220e 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -134,11 +134,11 @@ from fractions import Fraction from decimal import Decimal -from itertools import groupby, repeat +from itertools import count, groupby, repeat from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum from functools import reduce -from operator import mul +from operator import mul, itemgetter from collections import Counter, namedtuple, defaultdict _SQRT2 = sqrt(2.0) @@ -355,6 +355,50 @@ def _fail_neg(values, errmsg='negative value'): raise StatisticsError(errmsg) yield x +def _rank(data, /, *, key=None, reverse=False, ties='average') -> list[float]: + """Rank order a dataset. The lowest value has rank 1. + + Ties are averaged so that equal values receive the same rank: + + >>> data = [31, 56, 31, 25, 75, 18] + >>> _rank(data) + [3.5, 5.0, 3.5, 2.0, 6.0, 1.0] + + The operation is idempotent: + + >>> _rank([3.5, 5.0, 3.5, 2.0, 6.0, 1.0]) + [3.5, 5.0, 3.5, 2.0, 6.0, 1.0] + + It is possible to rank the data in reverse order so that + the highest value has rank 1. Also, a key-function can + extract the field to be ranked: + + >>> goals = [('eagles', 45), ('bears', 48), ('lions', 44)] + >>> _rank(goals, key=itemgetter(1), reverse=True) + [2.0, 1.0, 3.0] + + """ + # If this function becomes public at some point, more thought + # needs to be given to the signature. A list of ints is + # plausible when ties is "min" or "max". When ties is "average", + # either list[float] or list[Fraction] is plausible. + + # Default handling of ties matches scipy.stats.mstats.spearmanr. + if ties != 'average': + raise ValueError(f'Unknown tie resolution method: {ties!r}') + if key is not None: + data = map(key, data) + val_pos = sorted(zip(data, count()), reverse=reverse) + i = 0 # To rank starting at 0 instead of 1, set i = -1. + result = [0] * len(val_pos) + for _, g in groupby(val_pos, key=itemgetter(0)): + group = list(g) + size = len(group) + rank = i + (size + 1) / 2 + for value, orig_pos in group: + result[orig_pos] = rank + i += size + return result def _integer_sqrt_of_frac_rto(n: int, m: int) -> int: """Square root of n/m, rounded to the nearest integer using round-to-odd.""" @@ -988,14 +1032,12 @@ def covariance(x, y, /): return sxy / (n - 1) -def correlation(x, y, /): +def correlation(x, y, /, *, method='linear'): """Pearson's correlation coefficient Return the Pearson's correlation coefficient for two inputs. Pearson's - correlation coefficient *r* takes values between -1 and +1. It measures the - strength and direction of the linear relationship, where +1 means very - strong, positive linear relationship, -1 very strong, negative linear - relationship, and 0 no linear relationship. + correlation coefficient *r* takes values between -1 and +1. It measures + the strength and direction of a linear relationship. >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1] @@ -1004,12 +1046,25 @@ def correlation(x, y, /): >>> correlation(x, y) -1.0 + If *method* is "ranked", computes Spearman's rank correlation coefficient + for two inputs. The data is replaced by ranks. Ties are averaged + so that equal values receive the same rank. The resulting coefficient + measures the strength of a monotonic relationship. + + Spearman's rank correlation coefficient is appropriate for ordinal + data or for continuous data that doesn't meet the linear proportion + requirement for Pearson's correlation coefficient. """ n = len(x) if len(y) != n: raise StatisticsError('correlation requires that both inputs have same number of data points') if n < 2: raise StatisticsError('correlation requires at least two data points') + if method not in {'linear', 'ranked'}: + raise ValueError(f'Unknown method: {method!r}') + if method == 'ranked': + x = _rank(x) + y = _rank(y) xbar = fsum(x) / n ybar = fsum(y) / n sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index bf85525dd129a5..05ce79f126590a 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2565,6 +2565,22 @@ def test_different_scales(self): self.assertAlmostEqual(statistics.covariance(x, y), 0.1) + def test_correlation_spearman(self): + # https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php + # Compare with: + # >>> import scipy.stats.mstats + # >>> scipy.stats.mstats.spearmanr(reading, mathematics) + # SpearmanrResult(correlation=0.6686960980480712, pvalue=0.03450954165178532) + # And Wolfram Alpha gives: 0.668696 + # https://www.wolframalpha.com/input?i=SpearmanRho%5B%7B56%2C+75%2C+45%2C+71%2C+61%2C+64%2C+58%2C+80%2C+76%2C+61%7D%2C+%7B66%2C+70%2C+40%2C+60%2C+65%2C+56%2C+59%2C+77%2C+67%2C+63%7D%5D + reading = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61] + mathematics = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63] + self.assertAlmostEqual(statistics.correlation(reading, mathematics, method='ranked'), + 0.6686960980480712) + + with self.assertRaises(ValueError): + statistics.correlation(reading, mathematics, method='bad_method') + class TestLinearRegression(unittest.TestCase): def test_constant_input_error(self): diff --git a/Misc/NEWS.d/next/Library/2022-08-10-17-34-07.gh-issue-95861.qv-T5s.rst b/Misc/NEWS.d/next/Library/2022-08-10-17-34-07.gh-issue-95861.qv-T5s.rst new file mode 100644 index 00000000000000..aae76c74e2fd7d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-08-10-17-34-07.gh-issue-95861.qv-T5s.rst @@ -0,0 +1,2 @@ +Add support for computing Spearman's correlation coefficient to the existing +statistics.correlation() function.