From 8750d9c1b021cfd092f9f535c820963619df320a Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 10 Aug 2022 17:34:50 -0500 Subject: [PATCH 1/9] Add support for earson's correlation coefficient --- Lib/statistics.py | 35 +++++++++++++++++-- ...2-08-10-17-34-07.gh-issue-95861.qv-T5s.rst | 2 ++ 2 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-08-10-17-34-07.gh-issue-95861.qv-T5s.rst diff --git a/Lib/statistics.py b/Lib/statistics.py index c78d64518853e7..60cdfe34833e7d 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -134,11 +134,11 @@ from fractions import Fraction from decimal import Decimal -from itertools import groupby, repeat +from itertools import count, groupby, repeat from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum from functools import reduce -from operator import mul +from operator import mul, itemgetter from collections import Counter, namedtuple, defaultdict _SQRT2 = sqrt(2.0) @@ -355,6 +355,25 @@ def _fail_neg(values, errmsg='negative value'): raise StatisticsError(errmsg) yield x +def _rank(data, /, *, reverse=False) -> list[float]: + """Rank order a dataset. + + By default, the lowest value has rank 1. + Ties are averaged so that equal values receive the same rank. + + """ + # Handling of ties matches scipy.stats.mstats.spearmanr + val_pos = sorted(zip(data, count()), reverse=reverse) + i = 0 + result = [0] * len(val_pos) + for _, g in groupby(val_pos, key=itemgetter(0)): + group = list(g) + size = len(group) + rank = i + (size + 1) / 2 + for value, orig_pos in group: + result[orig_pos] = rank + i += size + return result def _integer_sqrt_of_frac_rto(n: int, m: int) -> int: """Square root of n/m, rounded to the nearest integer using round-to-odd.""" @@ -988,7 +1007,7 @@ def covariance(x, y, /): return sxy / (n - 1) -def correlation(x, y, /): +def correlation(x, y, /, *, ranked=False): """Pearson's correlation coefficient Return the Pearson's correlation coefficient for two inputs. Pearson's @@ -1004,12 +1023,22 @@ def correlation(x, y, /): >>> correlation(x, y) -1.0 + If *ranked* is true, computes Spearman's correlation coefficient for + two inputs. The data is replaced by ranks. Ties are averaged so + that equal values receive the same rank. + + Spearman's correlation coefficient is appropriate for ordinal data + or for continuous data that doesn't meet the linear proportion + requirement for Pearson's correlation coefficient. """ n = len(x) if len(y) != n: raise StatisticsError('correlation requires that both inputs have same number of data points') if n < 2: raise StatisticsError('correlation requires at least two data points') + if ranked: + x = _rank(x) + y = _rank(y) xbar = fsum(x) / n ybar = fsum(y) / n sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) diff --git a/Misc/NEWS.d/next/Library/2022-08-10-17-34-07.gh-issue-95861.qv-T5s.rst b/Misc/NEWS.d/next/Library/2022-08-10-17-34-07.gh-issue-95861.qv-T5s.rst new file mode 100644 index 00000000000000..aae76c74e2fd7d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-08-10-17-34-07.gh-issue-95861.qv-T5s.rst @@ -0,0 +1,2 @@ +Add support for computing Spearman's correlation coefficient to the existing +statistics.correlation() function. From cb525248a70c88c54b06ae3e994e1dc19ead71e4 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 12 Aug 2022 16:16:44 -0500 Subject: [PATCH 2/9] Add doctest. Remove unused "reversed" argument. Rename "ranked" to "by_rank". --- Lib/statistics.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 60cdfe34833e7d..66d6ab74b2b286 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -355,15 +355,21 @@ def _fail_neg(values, errmsg='negative value'): raise StatisticsError(errmsg) yield x -def _rank(data, /, *, reverse=False) -> list[float]: - """Rank order a dataset. +def _rank(data, /) -> list[float]: + """Rank order a dataset. The lowest value has rank 1. - By default, the lowest value has rank 1. Ties are averaged so that equal values receive the same rank. + The operation is idempotent. + + >>> data = [31, 56, 31, 25, 75, 18] + >>> _rank(data) + [3.5, 5.0, 3.5, 2.0, 6.0, 1.0] + >>> _rank(_) + [3.5, 5.0, 3.5, 2.0, 6.0, 1.0] """ # Handling of ties matches scipy.stats.mstats.spearmanr - val_pos = sorted(zip(data, count()), reverse=reverse) + val_pos = sorted(zip(data, count())) i = 0 result = [0] * len(val_pos) for _, g in groupby(val_pos, key=itemgetter(0)): @@ -1007,7 +1013,7 @@ def covariance(x, y, /): return sxy / (n - 1) -def correlation(x, y, /, *, ranked=False): +def correlation(x, y, /, *, by_rank=False): """Pearson's correlation coefficient Return the Pearson's correlation coefficient for two inputs. Pearson's @@ -1023,9 +1029,9 @@ def correlation(x, y, /, *, ranked=False): >>> correlation(x, y) -1.0 - If *ranked* is true, computes Spearman's correlation coefficient for - two inputs. The data is replaced by ranks. Ties are averaged so - that equal values receive the same rank. + If *by_rank* is true, computes Spearman's correlation coefficient + for two inputs. The data is replaced by ranks. Ties are averaged + so that equal values receive the same rank. Spearman's correlation coefficient is appropriate for ordinal data or for continuous data that doesn't meet the linear proportion From 102a4c5a6ad30931ad0cfd0d8f1e0a8db9bbebe6 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 12 Aug 2022 17:11:32 -0500 Subject: [PATCH 3/9] Add docs --- Doc/library/statistics.rst | 46 +++++++++++++++++++++++++++++--------- Lib/statistics.py | 15 ++++++------- 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index bf869903c0f84e..77984d1fcda77a 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -648,31 +648,57 @@ However, for reading convenience, most of the examples show sorted sequences. .. versionadded:: 3.10 -.. function:: correlation(x, y, /) +.. function:: correlation(x, y, /, *, by_rank=False) Return the `Pearson's correlation coefficient `_ for two inputs. Pearson's correlation coefficient *r* takes values - between -1 and +1. It measures the strength and direction of the linear - relationship, where +1 means very strong, positive linear relationship, - -1 very strong, negative linear relationship, and 0 no linear relationship. + between -1 and +1. It measures the strength and direction of a linear + relationship. + + If *by_rank* is true, computes `Spearman's rank correlation coefficient + `_ + for two inputs. The data is replaced by ranks. Ties are averaged so that + equal values receive the same rank. The resulting coefficient measures the + strength of a monotonic relationship. + + Spearman's correlation coefficient is appropriate for ordinal data or for + continuous data that doesn't meet the linear proportion requirement for + Pearson's correlation coefficient. Both inputs must be of the same length (no less than two), and need not to be constant, otherwise :exc:`StatisticsError` is raised. - Examples: + Example with `Kepler's laws of planetary motion + `_: .. doctest:: - >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] - >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1] - >>> correlation(x, x) + >>> # Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune + >>> orbital_period = [88, 225, 365, 687, 4331, 10_756, 30_687, 60_190] # days + >>> dist_from_sun = [58, 108, 150, 228, 778, 1_400, 2_900, 4_500] # million km + + >>> # Show that a perfect monotonic relationship exists + >>> correlation(orbital_period, dist_from_sun, by_rank=True) + 1.0 + + >>> # Observe that a linear relationship is imperfect + >>> round(correlation(orbital_period, dist_from_sun), 4) + 0.9882 + + >>> # Demonstrate Kepler's third law: There is a linear correlation + >>> # between the square of the orbital period and the cube of the + >>> # distance from the sun + >>> period_squared = [p * p for p in orbital_period] + >>> dist_cubed = [d * d * d for d in dist_from_sun] + >>> round(correlation(period_squared, dist_cubed), 4) 1.0 - >>> correlation(x, y) - -1.0 .. versionadded:: 3.10 + .. versionchanged:: 3.12 + Added support for Spearman's rank correlation coefficient. + .. function:: linear_regression(x, y, /, *, proportional=False) Return the slope and intercept of `simple linear regression diff --git a/Lib/statistics.py b/Lib/statistics.py index 66d6ab74b2b286..7a779ea96659d4 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1017,10 +1017,8 @@ def correlation(x, y, /, *, by_rank=False): """Pearson's correlation coefficient Return the Pearson's correlation coefficient for two inputs. Pearson's - correlation coefficient *r* takes values between -1 and +1. It measures the - strength and direction of the linear relationship, where +1 means very - strong, positive linear relationship, -1 very strong, negative linear - relationship, and 0 no linear relationship. + correlation coefficient *r* takes values between -1 and +1. It measures + the strength and direction of a linear relationship. >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1] @@ -1029,12 +1027,13 @@ def correlation(x, y, /, *, by_rank=False): >>> correlation(x, y) -1.0 - If *by_rank* is true, computes Spearman's correlation coefficient + If *by_rank* is true, computes Spearman's rank correlation coefficient for two inputs. The data is replaced by ranks. Ties are averaged - so that equal values receive the same rank. + so that equal values receive the same rank. The resulting coefficient measures + the strength of a monotonic relationship. - Spearman's correlation coefficient is appropriate for ordinal data - or for continuous data that doesn't meet the linear proportion + Spearman's rank correlation coefficient is appropriate for ordinal + data or for continuous data that doesn't meet the linear proportion requirement for Pearson's correlation coefficient. """ n = len(x) From 97bfe04afb6f88837f2dfd2f5e4e2daf73c4ef25 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 12 Aug 2022 17:42:06 -0500 Subject: [PATCH 4/9] Add test --- Lib/statistics.py | 2 +- Lib/test/test_statistics.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 7a779ea96659d4..fe2e05b10eac71 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1041,7 +1041,7 @@ def correlation(x, y, /, *, by_rank=False): raise StatisticsError('correlation requires that both inputs have same number of data points') if n < 2: raise StatisticsError('correlation requires at least two data points') - if ranked: + if by_rank: x = _rank(x) y = _rank(y) xbar = fsum(x) / n diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index bf85525dd129a5..c4d0921d771ec7 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2565,6 +2565,19 @@ def test_different_scales(self): self.assertAlmostEqual(statistics.covariance(x, y), 0.1) + def test_correlation_spearman(self): + # https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php + # Compare with: + # >>> import scipy.stats.mstats + # >>> scipy.stats.mstats.spearmanr(reading, mathematics) + # SpearmanrResult(correlation=0.6686960980480712, pvalue=0.03450954165178532) + # And Wolfram Alpha gives: 0.668696 + # https://www.wolframalpha.com/input?i=SpearmanRho%5B%7B56%2C+75%2C+45%2C+71%2C+61%2C+64%2C+58%2C+80%2C+76%2C+61%7D%2C+%7B66%2C+70%2C+40%2C+60%2C+65%2C+56%2C+59%2C+77%2C+67%2C+63%7D%5D + reading = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61] + mathematics = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63] + self.assertAlmostEqual(statistics.correlation(reading, mathematics, by_rank=True), + 0.6686960980480712) + class TestLinearRegression(unittest.TestCase): def test_constant_input_error(self): From bd3da1ca26ae8d372511e4e5f93aa4cea5c899d6 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 13 Aug 2022 01:38:53 -0500 Subject: [PATCH 5/9] Remove use of underscorde in doctest. --- Lib/statistics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index fe2e05b10eac71..7aee207150c8f8 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -359,12 +359,14 @@ def _rank(data, /) -> list[float]: """Rank order a dataset. The lowest value has rank 1. Ties are averaged so that equal values receive the same rank. - The operation is idempotent. >>> data = [31, 56, 31, 25, 75, 18] >>> _rank(data) [3.5, 5.0, 3.5, 2.0, 6.0, 1.0] - >>> _rank(_) + + The operation is idempotent. + + >>> _rank([3.5, 5.0, 3.5, 2.0, 6.0, 1.0]) [3.5, 5.0, 3.5, 2.0, 6.0, 1.0] """ From d0260bf454ab741c97d0af88c0fb3a99240f69f9 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 13 Aug 2022 20:18:29 -0500 Subject: [PATCH 6/9] Update the dispatch table --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 77984d1fcda77a..b2b9ce96fe64ba 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -104,7 +104,7 @@ These functions calculate statistics regarding relations between two inputs. ========================= ===================================================== :func:`covariance` Sample covariance for two variables. -:func:`correlation` Pearson's correlation coefficient for two variables. +:func:`correlation` Pearson and Spearman's correlation coefficients. :func:`linear_regression` Slope and intercept for simple linear regression. ========================= ===================================================== From 95a217652d911a3c882610b8432eb41824284b26 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 18 Aug 2022 11:24:06 -0500 Subject: [PATCH 7/9] Prepare _rank() to be made public in the future. --- Lib/statistics.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 7aee207150c8f8..f2655feec17d2b 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -355,7 +355,7 @@ def _fail_neg(values, errmsg='negative value'): raise StatisticsError(errmsg) yield x -def _rank(data, /) -> list[float]: +def _rank(data, /, *, key=None, reverse=False, ties='average') -> list[float]: """Rank order a dataset. The lowest value has rank 1. Ties are averaged so that equal values receive the same rank. @@ -369,10 +369,27 @@ def _rank(data, /) -> list[float]: >>> _rank([3.5, 5.0, 3.5, 2.0, 6.0, 1.0]) [3.5, 5.0, 3.5, 2.0, 6.0, 1.0] + It is possible to rank the data in reverse order so that + the highest value has rank 1. Also, a key-function can + extract the field to be ranked: + + >>> goals = [('eagles', 45), ('bears', 48), ('lions', 44)] + >>> _rank(goals, key=itemgetter(1), reverse=True) + [2.0, 1.0, 3.0] + """ - # Handling of ties matches scipy.stats.mstats.spearmanr - val_pos = sorted(zip(data, count())) - i = 0 + # If this function becomes public at some point, more thought + # needs to be given to the signature. A list of ints is + # plausible when ties is "min" or "max". When ties is "average", + # either list[float] or list[Fraction] is plausible. + + # Default handling of ties matches scipy.stats.mstats.spearmanr. + if ties != 'average': + raise ValueError(f'Unknown tie resolution method: {ties!r}') + if key is not None: + data = map(key, data) + val_pos = sorted(zip(data, count()), reverse=reverse) + i = 0 # To rank starting at 0 instead of 1, set i = -1. result = [0] * len(val_pos) for _, g in groupby(val_pos, key=itemgetter(0)): group = list(g) From 222699ee0696ebc6da9c3cdcd8fef8394a185ff7 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 18 Aug 2022 12:07:13 -0500 Subject: [PATCH 8/9] Replace flag with a "method" keyword argument for future proofing. --- Doc/library/statistics.rst | 8 ++++---- Lib/statistics.py | 12 +++++++----- Lib/test/test_statistics.py | 5 ++++- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index b2b9ce96fe64ba..c3f9c1f5239e8b 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -648,7 +648,7 @@ However, for reading convenience, most of the examples show sorted sequences. .. versionadded:: 3.10 -.. function:: correlation(x, y, /, *, by_rank=False) +.. function:: correlation(x, y, /, *, method='linear') Return the `Pearson's correlation coefficient `_ @@ -656,7 +656,7 @@ However, for reading convenience, most of the examples show sorted sequences. between -1 and +1. It measures the strength and direction of a linear relationship. - If *by_rank* is true, computes `Spearman's rank correlation coefficient + If *method* is "ranked", computes `Spearman's rank correlation coefficient `_ for two inputs. The data is replaced by ranks. Ties are averaged so that equal values receive the same rank. The resulting coefficient measures the @@ -679,7 +679,7 @@ However, for reading convenience, most of the examples show sorted sequences. >>> dist_from_sun = [58, 108, 150, 228, 778, 1_400, 2_900, 4_500] # million km >>> # Show that a perfect monotonic relationship exists - >>> correlation(orbital_period, dist_from_sun, by_rank=True) + >>> correlation(orbital_period, dist_from_sun, method='ranked') 1.0 >>> # Observe that a linear relationship is imperfect @@ -688,7 +688,7 @@ However, for reading convenience, most of the examples show sorted sequences. >>> # Demonstrate Kepler's third law: There is a linear correlation >>> # between the square of the orbital period and the cube of the - >>> # distance from the sun + >>> # distance from the sun. >>> period_squared = [p * p for p in orbital_period] >>> dist_cubed = [d * d * d for d in dist_from_sun] >>> round(correlation(period_squared, dist_cubed), 4) diff --git a/Lib/statistics.py b/Lib/statistics.py index f2655feec17d2b..55a76a903b2f59 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1032,7 +1032,7 @@ def covariance(x, y, /): return sxy / (n - 1) -def correlation(x, y, /, *, by_rank=False): +def correlation(x, y, /, *, method='linear'): """Pearson's correlation coefficient Return the Pearson's correlation coefficient for two inputs. Pearson's @@ -1046,10 +1046,10 @@ def correlation(x, y, /, *, by_rank=False): >>> correlation(x, y) -1.0 - If *by_rank* is true, computes Spearman's rank correlation coefficient + If *method* is "ranked", computes Spearman's rank correlation coefficient for two inputs. The data is replaced by ranks. Ties are averaged - so that equal values receive the same rank. The resulting coefficient measures - the strength of a monotonic relationship. + so that equal values receive the same rank. The resulting coefficient + measures the strength of a monotonic relationship. Spearman's rank correlation coefficient is appropriate for ordinal data or for continuous data that doesn't meet the linear proportion @@ -1060,7 +1060,9 @@ def correlation(x, y, /, *, by_rank=False): raise StatisticsError('correlation requires that both inputs have same number of data points') if n < 2: raise StatisticsError('correlation requires at least two data points') - if by_rank: + if method not in {'linear', 'ranked'}: + raise ValueError(f'Unknown method: {method!r}') + if method == 'ranked': x = _rank(x) y = _rank(y) xbar = fsum(x) / n diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index c4d0921d771ec7..05ce79f126590a 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2575,9 +2575,12 @@ def test_correlation_spearman(self): # https://www.wolframalpha.com/input?i=SpearmanRho%5B%7B56%2C+75%2C+45%2C+71%2C+61%2C+64%2C+58%2C+80%2C+76%2C+61%7D%2C+%7B66%2C+70%2C+40%2C+60%2C+65%2C+56%2C+59%2C+77%2C+67%2C+63%7D%5D reading = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61] mathematics = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63] - self.assertAlmostEqual(statistics.correlation(reading, mathematics, by_rank=True), + self.assertAlmostEqual(statistics.correlation(reading, mathematics, method='ranked'), 0.6686960980480712) + with self.assertRaises(ValueError): + statistics.correlation(reading, mathematics, method='bad_method') + class TestLinearRegression(unittest.TestCase): def test_constant_input_error(self): From 873e24017c8d0b714dab869490b62f7f0bb5fe88 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 18 Aug 2022 12:40:28 -0500 Subject: [PATCH 9/9] Neaten up the docstring for _rank(). --- Lib/statistics.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 55a76a903b2f59..a3f915c130220e 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -358,24 +358,24 @@ def _fail_neg(values, errmsg='negative value'): def _rank(data, /, *, key=None, reverse=False, ties='average') -> list[float]: """Rank order a dataset. The lowest value has rank 1. - Ties are averaged so that equal values receive the same rank. + Ties are averaged so that equal values receive the same rank: - >>> data = [31, 56, 31, 25, 75, 18] - >>> _rank(data) - [3.5, 5.0, 3.5, 2.0, 6.0, 1.0] + >>> data = [31, 56, 31, 25, 75, 18] + >>> _rank(data) + [3.5, 5.0, 3.5, 2.0, 6.0, 1.0] - The operation is idempotent. + The operation is idempotent: - >>> _rank([3.5, 5.0, 3.5, 2.0, 6.0, 1.0]) - [3.5, 5.0, 3.5, 2.0, 6.0, 1.0] + >>> _rank([3.5, 5.0, 3.5, 2.0, 6.0, 1.0]) + [3.5, 5.0, 3.5, 2.0, 6.0, 1.0] It is possible to rank the data in reverse order so that the highest value has rank 1. Also, a key-function can extract the field to be ranked: - >>> goals = [('eagles', 45), ('bears', 48), ('lions', 44)] - >>> _rank(goals, key=itemgetter(1), reverse=True) - [2.0, 1.0, 3.0] + >>> goals = [('eagles', 45), ('bears', 48), ('lions', 44)] + >>> _rank(goals, key=itemgetter(1), reverse=True) + [2.0, 1.0, 3.0] """ # If this function becomes public at some point, more thought