Skip to content

bpo-39218: Improve accuracy of variance calculation #27960

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Aug 31, 2021
33 changes: 19 additions & 14 deletions Lib/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,15 +728,19 @@ def _ss(data, c=None):
lead to garbage results.
"""
if c is not None:
T, total, count = _sum((x-c)**2 for x in data)
T, total, count = _sum((d := x - c) * d for x in data)
return (T, total)
# Compute the mean accurate to within 1/2 ulp
c = mean(data)
T, total, count = _sum((x-c)**2 for x in data)
# The following sum should mathematically equal zero, but due to rounding
# error may not.
U, total2, count2 = _sum((x - c) for x in data)
assert T == U and count == count2
total -= total2 ** 2 / len(data)
# Initial computation for the sum of square deviations
T, total, count = _sum((d := x - c) * d for x in data)
# Correct any remaining inaccuracy in the mean c.
# The following sum should mathematically equal zero,
# but due to the final rounding of the mean, it may not.
U, error, count2 = _sum((x - c) for x in data)
assert count == count2
correction = error * error / len(data)
total -= correction
assert not total < 0, 'negative sum of square deviations: %f' % total
return (T, total)

Expand Down Expand Up @@ -924,8 +928,8 @@ def correlation(x, y, /):
xbar = fsum(x) / n
ybar = fsum(y) / n
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
sxx = fsum((xi - xbar) ** 2.0 for xi in x)
syy = fsum((yi - ybar) ** 2.0 for yi in y)
sxx = fsum((d := xi - xbar) * d for xi in x)
syy = fsum((d := yi - ybar) * d for yi in y)
try:
return sxy / sqrt(sxx * syy)
except ZeroDivisionError:
Expand Down Expand Up @@ -968,7 +972,7 @@ def linear_regression(x, y, /):
xbar = fsum(x) / n
ybar = fsum(y) / n
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
sxx = fsum((xi - xbar) ** 2.0 for xi in x)
sxx = fsum((d := xi - xbar) * d for xi in x)
try:
slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x)
except ZeroDivisionError:
Expand Down Expand Up @@ -1094,10 +1098,11 @@ def samples(self, n, *, seed=None):

def pdf(self, x):
"Probability density function. P(x <= X < x+dx) / dx"
variance = self._sigma ** 2.0
variance = self._sigma * self._sigma
if not variance:
raise StatisticsError('pdf() not defined when sigma is zero')
return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)
diff = x - self._mu
return exp(diff * diff / (-2.0 * variance)) / sqrt(tau * variance)

def cdf(self, x):
"Cumulative distribution function. P(X <= x)"
Expand Down Expand Up @@ -1161,7 +1166,7 @@ def overlap(self, other):
if not dv:
return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
a = X._mu * Y_var - Y._mu * X_var
b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
b = X._sigma * Y._sigma * sqrt(dm * dm + dv * log(Y_var / X_var))
x1 = (a + b) / dv
x2 = (a - b) / dv
return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
Expand Down Expand Up @@ -1204,7 +1209,7 @@ def stdev(self):
@property
def variance(self):
"Square of the standard deviation."
return self._sigma ** 2.0
return self._sigma * self._sigma

def __add__(x1, x2):
"""Add a constant or another NormalDist instance.
Expand Down
3 changes: 3 additions & 0 deletions Lib/test/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1210,6 +1210,9 @@ def __pow__(self, other):
def __add__(self, other):
return type(self)(super().__add__(other))
__radd__ = __add__
def __mul__(self, other):
return type(self)(super().__mul__(other))
__rmul__ = __mul__
return (float, Decimal, Fraction, MyFloat)

def test_types_conserved(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve accuracy of variance calculations by using x*x instead of x**2.