From 86836584a22f41858a235d121ad757d79b201d38 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith [Google LLC]"
Date: Tue, 29 Aug 2023 14:37:39 -0700
Subject: [PATCH] gh-108322: preserve backwards compatibility in
 `NormalDist.samples()`

https://github.com/python/cpython/pull/108324 switched to a faster
implementation, but a caveat is that it changes the specific outputs.

This preserves backwards compatibility when the user has asked for specific
outputs by supplying a seed value, with an option to request the new
algorithm implementation if desired.  When returning random values anyway
(no seed), the new, faster algorithm is used by default.
---
 Doc/library/statistics.rst                    |  8 ++++++--
 Lib/statistics.py                             | 13 +++++++++----
 Lib/test/test_statistics.py                   | 19 +++++++++++++++++++
 ...-08-22-12-05-47.gh-issue-108322.kf3NJX.rst |  9 +++++++--
 4 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index a8a79012565321..c778a46a7144db 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -830,8 +830,12 @@ of applications in statistics.
 
    .. versionchanged:: 3.13
 
-      Switched to a faster algorithm.  To reproduce samples from previous
-      versions, use :func:`random.seed` and :func:`random.gauss`.
+      The *use_gauss* keyword argument was added to facilitate a switch to a
+      faster algorithm.  The faster algorithm is used by default when no
+      *seed* is supplied.  The previous, slower algorithm based on
+      :func:`random.gauss` is used when a *seed* is provided, in order to
+      preserve reproducibility between Python versions.  To always use the
+      faster algorithm even when supplying a *seed*, pass ``use_gauss=False``.
 
 .. method:: NormalDist.pdf(x)
 
diff --git a/Lib/statistics.py b/Lib/statistics.py
index 96c803483057e7..a3f62bce658213 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -1135,7 +1135,7 @@ def linear_regression(x, y, /, *, proportional=False):
     >>> noise = NormalDist().samples(5, seed=42)
     >>> y = [3 * x[i] + 2 + noise[i] for i in range(5)]
     >>> linear_regression(x, y)  #doctest: +ELLIPSIS
-    LinearRegression(slope=3.17495..., intercept=1.00925...)
+    LinearRegression(slope=3.09078914170..., intercept=1.75684970486...)
 
     If *proportional* is true, the independent variable *x* and the
     dependent variable *y* are assumed to be directly proportional.
@@ -1148,7 +1148,7 @@ def linear_regression(x, y, /, *, proportional=False):
 
     >>> y = [3 * x[i] + noise[i] for i in range(5)]
     >>> linear_regression(x, y, proportional=True)  #doctest: +ELLIPSIS
-    LinearRegression(slope=2.90475..., intercept=0.0)
+    LinearRegression(slope=3.02447542484..., intercept=0.0)
 
     """
     n = len(x)
@@ -1277,8 +1277,13 @@ def from_samples(cls, data):
         "Make a normal distribution instance from sample data."
         return cls(*_mean_stdev(data))
 
-    def samples(self, n, *, seed=None):
-        "Generate *n* samples for a given mean and standard deviation."
+    def samples(self, n, *, seed=None, use_gauss=None):
+        """Generate *n* samples for a given mean and standard deviation."""
+        if (seed is not None and use_gauss is None) or use_gauss:
+            # This is the Python <= 3.12 behavior (slower, different results).
+            gauss = random.gauss if seed is None else random.Random(seed).gauss
+            mu, sigma = self._mu, self._sigma
+            return [gauss(mu, sigma) for _ in repeat(None, n)]
         rnd = random.random if seed is None else random.Random(seed).random
         inv_cdf = _normal_dist_inv_cdf
         mu = self._mu
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index aa2cf2b1edc584..18443b5629f587 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2769,6 +2769,14 @@ def test_sample_generation(self):
         xbar = self.module.mean(data)
         self.assertTrue(mu - sigma*8 <= xbar <= mu + sigma*8)
 
+        # Ensure the <=3.12 legacy implementation continues working as well.
+        data = X.samples(n, use_gauss=True)
+        self.assertEqual(len(data), n)
+        self.assertEqual(set(map(type, data)), {float})
+        # mean(data) expected to fall within 8 standard deviations
+        xbar = self.module.mean(data)
+        self.assertTrue(mu - sigma*8 <= xbar <= mu + sigma*8)
+
         # verify that seeding makes reproducible sequences
         n = 100
         data1 = X.samples(n, seed='happiness and joy')
@@ -2779,6 +2787,17 @@ def test_sample_generation(self):
         self.assertEqual(data2, data4)
         self.assertNotEqual(data1, data2)
 
+        # Verify that seeding makes reproducible sequences with the faster
+        # 3.13+ implementation as well.
+        n = 100
+        data1 = X.samples(n, seed='happiness and joy', use_gauss=False)
+        data2 = X.samples(n, seed='trouble and despair', use_gauss=False)
+        data3 = X.samples(n, seed='happiness and joy', use_gauss=False)
+        data4 = X.samples(n, seed='trouble and despair', use_gauss=False)
+        self.assertEqual(data1, data3)
+        self.assertEqual(data2, data4)
+        self.assertNotEqual(data1, data2)
+
     def test_pdf(self):
         NormalDist = self.module.NormalDist
         X = NormalDist(100, 15)
diff --git a/Misc/NEWS.d/next/Library/2023-08-22-12-05-47.gh-issue-108322.kf3NJX.rst b/Misc/NEWS.d/next/Library/2023-08-22-12-05-47.gh-issue-108322.kf3NJX.rst
index 5416c01a43f113..03251d4760e61c 100644
--- a/Misc/NEWS.d/next/Library/2023-08-22-12-05-47.gh-issue-108322.kf3NJX.rst
+++ b/Misc/NEWS.d/next/Library/2023-08-22-12-05-47.gh-issue-108322.kf3NJX.rst
@@ -1,2 +1,7 @@
-Speed-up NormalDist.samples() by using the inverse CDF method instead of
-calling random.gauss().
+Speed-up :meth:`statistics.NormalDist.samples` by using the inverse CDF method
+instead of calling :func:`random.gauss`.  When an explicit ``seed=`` is
+specified, the original, slower results based on :func:`random.gauss` remain
+the default, to avoid introducing behavior differences between Python versions
+for users who expect a consistent, unchanging set of results.  Pass the new
+``use_gauss=False`` parameter along with ``seed=`` for better performance when
+using a fixed seed.
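
A usage sketch for reviewers (not part of the patch itself): how
NormalDist.samples() is expected to behave once the change above is applied.
The use_gauss keyword and the seed-dependent default are taken from the diff;
the variable names are only illustrative.

    from statistics import NormalDist

    X = NormalDist(mu=100, sigma=15)

    # No seed given: the faster inverse CDF algorithm is used by default.
    fast = X.samples(10)

    # Seed given: falls back to the Python <= 3.12 random.gauss() path so
    # the results match earlier releases.
    legacy_seeded = X.samples(10, seed=42)

    # Seed given, but explicitly opting in to the faster algorithm.
    fast_seeded = X.samples(10, seed=42, use_gauss=False)

    # Forcing the legacy gauss path even without a seed.
    legacy_unseeded = X.samples(10, use_gauss=True)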