From b0b50f9240802a5d5e8735415ac983af445a6850 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 18 Aug 2020 00:31:42 -0700 Subject: [PATCH 01/12] First draft. Tests fail. --- Modules/mathmodule.c | 47 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 489802cc367450..d3931f4be51fcd 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2446,10 +2446,13 @@ even if not otherwise needed to prevent overflow or loss of precision. */ +static const double T27 = 134217729.0; /* T27=ldexp(1.0, 27)+1.0) */ + static inline double vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) { double x, csum = 1.0, oldcsum, frac = 0.0, scale; + double t, hi, lo, result; int max_e; Py_ssize_t i; @@ -2474,11 +2477,53 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) x = x*x; assert(x <= 1.0); assert(csum >= x); + + t = x * T27; + hi = t - (t - x); + lo = x - hi; + assert (hi + lo == x); + + x = lo * lo; + oldcsum = csum; + csum += x; + frac += (oldcsum - csum) + x; + + x = 2.0 * hi * lo; + oldcsum = csum; + csum += x; + frac += (oldcsum - csum) + x; + + x = hi * hi; oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; } - return sqrt(csum - 1.0 + frac) / scale; + result = sqrt(csum - 1.0 + frac) / scale; + + x = result; + t = x * T27; + hi = t - (t - x); + lo = x - hi; + assert (hi + lo == x); + + x = -lo * lo; + oldcsum = csum; + csum += x; + frac += (oldcsum - csum) + x; + + x = -2.0 * hi * lo; + oldcsum = csum; + csum += x; + frac += (oldcsum - csum) + x; + + x = -hi * hi; + oldcsum = csum; + csum += x; + frac += (oldcsum - csum) + x; + + x = csum - 1.0 + frac; + result += x / (2.0 * result); + return result / scale; } /* When max_e < -1023, ldexp(1.0, -max_e) overflows. So instead of multiplying by a scale, we just divide by *max*. 
From 3447a73577255fb8a2011e3848e5f39ed06155b0 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 18 Aug 2020 00:56:41 -0700 Subject: [PATCH 02/12] Was squaring twice --- Modules/mathmodule.c | 1 - 1 file changed, 1 deletion(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index d3931f4be51fcd..5773d0353ad0a4 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2474,7 +2474,6 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) x = vec[i]; assert(Py_IS_FINITE(x) && fabs(x) <= max); x *= scale; - x = x*x; assert(x <= 1.0); assert(csum >= x); From b8ba65d3ae6cd0413160b8a55562781d4fd02e90 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 18 Aug 2020 01:01:39 -0700 Subject: [PATCH 03/12] Divided by scale too early --- Modules/mathmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 5773d0353ad0a4..1464c81e1b66b2 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2497,7 +2497,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) csum += x; frac += (oldcsum - csum) + x; } - result = sqrt(csum - 1.0 + frac) / scale; + result = sqrt(csum - 1.0 + frac); x = result; t = x * T27; From df0109f9e2a4619add5635f617da908798e41d51 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 18 Aug 2020 01:25:19 -0700 Subject: [PATCH 04/12] Fix order of application. 
Add more assertions --- Modules/mathmodule.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 1464c81e1b66b2..88ee80c2485350 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2475,24 +2475,26 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) assert(Py_IS_FINITE(x) && fabs(x) <= max); x *= scale; assert(x <= 1.0); - assert(csum >= x); t = x * T27; hi = t - (t - x); lo = x - hi; assert (hi + lo == x); - x = lo * lo; + x = hi * hi; + assert(csum >= x); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = 2.0 * hi * lo; + assert(csum >= x); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; - x = hi * hi; + x = lo * lo; + assert(csum >= x); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; @@ -2506,16 +2508,19 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) assert (hi + lo == x); x = -lo * lo; + assert(csum >= x); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = -2.0 * hi * lo; + assert(csum >= x); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = -hi * hi; + assert(csum >= x); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; From 1fd8643a097412cf4ec11614b69d8326f22c6693 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 02:02:33 -0700 Subject: [PATCH 05/12] Start to refine assertions --- Modules/mathmodule.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 88ee80c2485350..4cf69368e0355f 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2472,9 +2472,9 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) assert(max * scale < 1.0); for (i=0 ; i < n ; i++) { x = vec[i]; - assert(Py_IS_FINITE(x) && fabs(x) <= max); + assert(Py_IS_FINITE(x) && x >= 0.0 && fabs(x) <= max); x *= scale; - assert(x <= 1.0); + assert(x < 1.0); t = x * T27; hi = t - (t - x); From 
26ba86e3c99cf8799398a4e35880fe62bd4d24aa Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 12:31:59 -0700 Subject: [PATCH 06/12] The actual summation invariant is expressed in magnitudes --- Modules/mathmodule.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 4cf69368e0355f..fe53a1622abd92 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2479,22 +2479,23 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) t = x * T27; hi = t - (t - x); lo = x - hi; - assert (hi + lo == x); + assert(hi + lo == x); x = hi * hi; - assert(csum >= x); + assert(x <= 1.0); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = 2.0 * hi * lo; - assert(csum >= x); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = lo * lo; - assert(csum >= x); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; @@ -2508,19 +2509,19 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) assert (hi + lo == x); x = -lo * lo; - assert(csum >= x); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = -2.0 * hi * lo; - assert(csum >= x); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = -hi * hi; - assert(csum >= x); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; @@ -2538,7 +2539,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) x /= max; x = x*x; assert(x <= 1.0); - assert(csum >= x); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; From ef0c4b00b43fdd21eb0e09bdfaa6f542167f98ad Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 14:09:34 -0700 Subject: [PATCH 07/12] Explain the algorithm in the comments. Provide references. 
Improve variable names. --- Modules/mathmodule.c | 104 +++++++++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 38 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index fe53a1622abd92..e49356edda4740 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2404,45 +2404,73 @@ math_fmod_impl(PyObject *module, double x, double y) } /* -Given an *n* length *vec* of values and a value *max*, compute: +Given an *n* length *vec* of values, compute the vector norm: - sqrt(sum((x * scale) ** 2 for x in vec)) / scale + sqrt(sum(x ** 2 for x in vec) - where scale is the first power of two - greater than max. - -or compute: - - max * sqrt(sum((x / max) ** 2 for x in vec)) - -The value of the *max* variable must be non-negative and -equal to the absolute value of the largest magnitude -entry in the vector. If n==0, then *max* should be 0.0. +The *vec* values should be non-negative. +The *max* variable should be equal to the largest *x*. +The *n* variable is the length of *vec*. +If n==0, then *max* should be 0.0. If an infinity is present in the vec, *max* should be INF. - The *found_nan* variable indicates whether some member of the *vec* is a NaN. -To improve accuracy and to increase the number of cases where -vector_norm() is commutative, we use a variant of Neumaier -summation specialized to exploit that we always know that -|csum| >= |x|. - -The *csum* variable tracks the cumulative sum and *frac* tracks -the cumulative fractional errors at each step. Since this -variant assumes that |csum| >= |x| at each step, we establish -the precondition by starting the accumulation from 1.0 which -represents the largest possible value of (x*scale)**2 or (x/max)**2. - -After the loop is finished, the initial 1.0 is subtracted out -for a net zero effect on the final sum. Since *csum* will be -greater than 1.0, the subtraction of 1.0 will not cause -fractional digits to be dropped from *csum*. 
- -To get the full benefit from compensated summation, the -largest addend should be in the range: 0.5 <= x <= 1.0. -Accordingly, scaling or division by *max* should not be skipped -even if not otherwise needed to prevent overflow or loss of precision. +To achieve high accuracy, where results are almost always correctly +rounded, and to avoid overflow/underflow, four techniques are used: + +* lossless scaling using a power-of-two scaling factor +* accurate squaring using Veltkamp-Dekker splitting +* compensated summation using a variant of the Neumaier algorithm +* differential correction of the square root + +The usual presentation of the Neumaier summation algorithm has an +expensive branch depending on which operand has the larger +magnitude. We avoid this cost by arranging the calculation so that +fabs(csum) is always as large as fabs(x). + +To establish the invariant, *csum* is initialized to 1.0 which is +always larger than x**2 after scaling or division by *max*. +After the loop is finished, the initial 1.0 is subtracted out for a +net zero effect on the final sum. Since *csum* will be greater than +1.0, the subtraction of 1.0 will not cause fractional digits to be +dropped from *csum*. + +To get the full benefit from compensated summation, the largest +addend should be in the range: 0.5 <= x <= 1.0. Accordingly, +scaling or division by *max* should not be skipped even if not +otherwise needed to prevent overflow or loss of precision. + +The assertion that hi*hi >= 1.0 is a bit subtle. Each vector +element gets scaled to be below 1.0. The Veltkamp-Dekker splitting +algorithm gives a *hi* value that is correctly rounded to half +precision. When a value at or below 1.0 is correctly rounded, it +never goes above 1.0. And when values at or below 1.0 are squared, +they remain at or below 1.0, thus preserving the summation +invariant. 
+ +The square root differential correction is needed because a +correctly rounded square root of the correctly rounded sum of +squares can still be off by as much as one ulp. + +The differential correction starts with a value *x* that is +the difference between the square of *h*, the possibly inaccurately +rounded square root, and the accurate computed sum of squares. +The correction is the first order term of the Maclaurin series +expansion of sqrt(h**2 - x) == h - x/(2*h) + O(x**2). + +Essentially, this differential correction is equivalent to one +refinement step in the Newton divide-and-average square root +algorithm, effectively doubling the number of accurate bits. + +This technique is used in Dekker's SQRT2 algorithm and again in +Borges' ALGORITHM 4 and 5. + +References: + +1. Veltkamp-Dekker splitting: http://csclub.uwaterloo.ca/~pbarfuss/dekker1971.pdf +2. Compensated summation: http://www.ti3.tu-harburg.de/paper/rump/Ru08b.pdf +3. Square root differential correction: https://arxiv.org/pdf/1904.09481.pdf */ @@ -2452,7 +2480,7 @@ static inline double vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) { double x, csum = 1.0, oldcsum, frac = 0.0, scale; - double t, hi, lo, result; + double t, hi, lo, h; int max_e; Py_ssize_t i; @@ -2473,6 +2501,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) for (i=0 ; i < n ; i++) { x = vec[i]; assert(Py_IS_FINITE(x) && x >= 0.0 && fabs(x) <= max); + x *= scale; assert(x < 1.0); @@ -2500,9 +2529,9 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) csum += x; frac += (oldcsum - csum) + x; } - result = sqrt(csum - 1.0 + frac); + h = sqrt(csum - 1.0 + frac); - x = result; + x = h; t = x * T27; hi = t - (t - x); lo = x - hi; @@ -2527,8 +2556,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) frac += (oldcsum - csum) + x; x = csum - 1.0 + frac; - result += x / (2.0 * result); - return result / scale; + return (h + x / (2.0 * h)) / scale; } /* When max_e < 
-1023, ldexp(1.0, -max_e) overflows. So instead of multiplying by a scale, we just divide by *max*. From e17ca4cb2ffd96ebecc16c766a62e0286085ec8b Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 14:23:27 -0700 Subject: [PATCH 08/12] Add blurb --- .../next/Library/2020-08-23-14-23-18.bpo-41513.DGqc_I.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2020-08-23-14-23-18.bpo-41513.DGqc_I.rst diff --git a/Misc/NEWS.d/next/Library/2020-08-23-14-23-18.bpo-41513.DGqc_I.rst b/Misc/NEWS.d/next/Library/2020-08-23-14-23-18.bpo-41513.DGqc_I.rst new file mode 100644 index 00000000000000..b4d0d9b63cf87d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-08-23-14-23-18.bpo-41513.DGqc_I.rst @@ -0,0 +1,3 @@ +Improved the accuracy of math.hypot(). Internally, each step is computed +with extra precision so that the result is now almost always correctly +rounded. From c0134374d1f705713aaaf3ea00a56b89227af2d4 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 14:48:52 -0700 Subject: [PATCH 09/12] Move the splitting constant into the function that uses it. --- Modules/mathmodule.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index e49356edda4740..917c4e587c095d 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2474,11 +2474,10 @@ Borges' ALGORITHM 4 and 5. */ -static const double T27 = 134217729.0; /* T27=ldexp(1.0, 27)+1.0) */ - static inline double vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) { + const double T27 = 134217729.0; /* ldexp(1.0, 27)+1.0) */ double x, csum = 1.0, oldcsum, frac = 0.0, scale; double t, hi, lo, h; int max_e; From f2c325427da850c27b1e7f2739c62d768091e2c6 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 17:18:56 -0700 Subject: [PATCH 10/12] Drop the requirement that input elements are non-negative. 
--- Modules/mathmodule.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 917c4e587c095d..b6b46e3c0292e8 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2404,20 +2404,19 @@ math_fmod_impl(PyObject *module, double x, double y) } /* -Given an *n* length *vec* of values, compute the vector norm: +Given a *vec* of values, compute the vector norm: - sqrt(sum(x ** 2 for x in vec) + sqrt(sum(x ** 2 for x in vec)) -The *vec* values should be non-negative. -The *max* variable should be equal to the largest *x*. +The *max* variable should be equal to the largest fabs(x). The *n* variable is the length of *vec*. If n==0, then *max* should be 0.0. If an infinity is present in the vec, *max* should be INF. The *found_nan* variable indicates whether some member of the *vec* is a NaN. -To achieve high accuracy, where results are almost always correctly -rounded, and to avoid overflow/underflow, four techniques are used: +To avoid overflow/underflow and to achieve high accuracy giving results +that are almost always correctly rounded, four techniques are used: * lossless scaling using a power-of-two scaling factor * accurate squaring using Veltkamp-Dekker splitting @@ -2437,32 +2436,30 @@ net zero effect on the final sum. Since *csum* will be greater than dropped from *csum*. To get the full benefit from compensated summation, the largest -addend should be in the range: 0.5 <= x <= 1.0. Accordingly, +addend should be in the range: 0.5 <= |x| <= 1.0. Accordingly, scaling or division by *max* should not be skipped even if not otherwise needed to prevent overflow or loss of precision. -The assertion that hi*hi >= 1.0 is a bit subtle. Each vector -element gets scaled to be below 1.0. The Veltkamp-Dekker splitting +The assertion that hi*hi >= 1.0 is a bit subtle. Each vector element +gets scaled to a magnitude below 1.0. 
The Veltkamp-Dekker splitting algorithm gives a *hi* value that is correctly rounded to half precision. When a value at or below 1.0 is correctly rounded, it never goes above 1.0. And when values at or below 1.0 are squared, -they remain at or below 1.0, thus preserving the summation -invariant. +they remain at or below 1.0, thus preserving the summation invariant. The square root differential correction is needed because a -correctly rounded square root of the correctly rounded sum of +correctly rounded square root of a correctly rounded sum of squares can still be off by as much as one ulp. The differential correction starts with a value *x* that is the difference between the square of *h*, the possibly inaccurately -rounded square root, and the accurate computed sum of squares. +rounded square root, and the accurately computed sum of squares. The correction is the first order term of the Maclaurin series expansion of sqrt(h**2 - x) == h - x/(2*h) + O(x**2). Essentially, this differential correction is equivalent to one refinement step in the Newton divide-and-average square root algorithm, effectively doubling the number of accurate bits. - This technique is used in Dekker's SQRT2 algorithm and again in Borges' ALGORITHM 4 and 5. 
@@ -2499,10 +2496,10 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) assert(max * scale < 1.0); for (i=0 ; i < n ; i++) { x = vec[i]; - assert(Py_IS_FINITE(x) && x >= 0.0 && fabs(x) <= max); + assert(Py_IS_FINITE(x) && fabs(x) <= max); x *= scale; - assert(x < 1.0); + assert(fabs(x) < 1.0); t = x * T27; hi = t - (t - x); From 106bec215d3fdf673747edd480267eabee8a7f60 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 18:47:47 -0700 Subject: [PATCH 11/12] Update Modules/mathmodule.c Co-authored-by: Tim Peters --- Modules/mathmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index b6b46e3c0292e8..aaa2b83763e471 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2455,7 +2455,7 @@ The differential correction starts with a value *x* that is the difference between the square of *h*, the possibly inaccurately rounded square root, and the accurately computed sum of squares. The correction is the first order term of the Maclaurin series -expansion of sqrt(h**2 - x) == h - x/(2*h) + O(x**2). +expansion of sqrt(h**2 + x) == h + x/(2*h) + O(x**2). 
Essentially, this differential correction is equivalent to one refinement step in the Newton divide-and-average square root From 39667b53b069c110dbbc258aa3c7f7daaef9c42d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 19:07:30 -0700 Subject: [PATCH 12/12] Keep the summation order consistent between sections --- Modules/mathmodule.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index aaa2b83763e471..1d6174132814b2 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2533,7 +2533,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) lo = x - hi; assert (hi + lo == x); - x = -lo * lo; + x = -hi * hi; assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; @@ -2545,7 +2545,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) csum += x; frac += (oldcsum - csum) + x; - x = -hi * hi; + x = -lo * lo; assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x;