From b0b50f9240802a5d5e8735415ac983af445a6850 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 18 Aug 2020 00:31:42 -0700 Subject: [PATCH 01/12] First draft. Tests fail. --- Modules/mathmodule.c | 47 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 489802cc367450..d3931f4be51fcd 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2446,10 +2446,13 @@ even if not otherwise needed to prevent overflow or loss of precision. */ +static const double T27 = 134217729.0; /* T27=ldexp(1.0, 27)+1.0) */ + static inline double vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) { double x, csum = 1.0, oldcsum, frac = 0.0, scale; + double t, hi, lo, result; int max_e; Py_ssize_t i; @@ -2474,11 +2477,53 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) x = x*x; assert(x <= 1.0); assert(csum >= x); + + t = x * T27; + hi = t - (t - x); + lo = x - hi; + assert (hi + lo == x); + + x = lo * lo; + oldcsum = csum; + csum += x; + frac += (oldcsum - csum) + x; + + x = 2.0 * hi * lo; + oldcsum = csum; + csum += x; + frac += (oldcsum - csum) + x; + + x = hi * hi; oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; } - return sqrt(csum - 1.0 + frac) / scale; + result = sqrt(csum - 1.0 + frac) / scale; + + x = result; + t = x * T27; + hi = t - (t - x); + lo = x - hi; + assert (hi + lo == x); + + x = -lo * lo; + oldcsum = csum; + csum += x; + frac += (oldcsum - csum) + x; + + x = -2.0 * hi * lo; + oldcsum = csum; + csum += x; + frac += (oldcsum - csum) + x; + + x = -hi * hi; + oldcsum = csum; + csum += x; + frac += (oldcsum - csum) + x; + + x = csum - 1.0 + frac; + result += x / (2.0 * result); + return result / scale; } /* When max_e < -1023, ldexp(1.0, -max_e) overflows. So instead of multiplying by a scale, we just divide by *max*. 
From 3447a73577255fb8a2011e3848e5f39ed06155b0 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 18 Aug 2020 00:56:41 -0700 Subject: [PATCH 02/12] Was squaring twice --- Modules/mathmodule.c | 1 - 1 file changed, 1 deletion(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index d3931f4be51fcd..5773d0353ad0a4 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2474,7 +2474,6 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) x = vec[i]; assert(Py_IS_FINITE(x) && fabs(x) <= max); x *= scale; - x = x*x; assert(x <= 1.0); assert(csum >= x); From b8ba65d3ae6cd0413160b8a55562781d4fd02e90 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 18 Aug 2020 01:01:39 -0700 Subject: [PATCH 03/12] Divided by scale too early --- Modules/mathmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 5773d0353ad0a4..1464c81e1b66b2 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2497,7 +2497,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) csum += x; frac += (oldcsum - csum) + x; } - result = sqrt(csum - 1.0 + frac) / scale; + result = sqrt(csum - 1.0 + frac); x = result; t = x * T27; From df0109f9e2a4619add5635f617da908798e41d51 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 18 Aug 2020 01:25:19 -0700 Subject: [PATCH 04/12] Fix order of application. 
Add more assertions --- Modules/mathmodule.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 1464c81e1b66b2..88ee80c2485350 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2475,24 +2475,26 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) assert(Py_IS_FINITE(x) && fabs(x) <= max); x *= scale; assert(x <= 1.0); - assert(csum >= x); t = x * T27; hi = t - (t - x); lo = x - hi; assert (hi + lo == x); - x = lo * lo; + x = hi * hi; + assert(csum >= x); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = 2.0 * hi * lo; + assert(csum >= x); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; - x = hi * hi; + x = lo * lo; + assert(csum >= x); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; @@ -2506,16 +2508,19 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) assert (hi + lo == x); x = -lo * lo; + assert(csum >= x); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = -2.0 * hi * lo; + assert(csum >= x); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = -hi * hi; + assert(csum >= x); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; From 1fd8643a097412cf4ec11614b69d8326f22c6693 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 02:02:33 -0700 Subject: [PATCH 05/12] Start to refine assertions --- Modules/mathmodule.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 88ee80c2485350..4cf69368e0355f 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2472,9 +2472,9 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) assert(max * scale < 1.0); for (i=0 ; i < n ; i++) { x = vec[i]; - assert(Py_IS_FINITE(x) && fabs(x) <= max); + assert(Py_IS_FINITE(x) && x >= 0.0 && fabs(x) <= max); x *= scale; - assert(x <= 1.0); + assert(x < 1.0); t = x * T27; hi = t - (t - x); From 
26ba86e3c99cf8799398a4e35880fe62bd4d24aa Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 12:31:59 -0700 Subject: [PATCH 06/12] The actual summation invariant is expressed in magnitudes --- Modules/mathmodule.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 4cf69368e0355f..fe53a1622abd92 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2479,22 +2479,23 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) t = x * T27; hi = t - (t - x); lo = x - hi; - assert (hi + lo == x); + assert(hi + lo == x); x = hi * hi; - assert(csum >= x); + assert(x <= 1.0); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = 2.0 * hi * lo; - assert(csum >= x); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = lo * lo; - assert(csum >= x); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; @@ -2508,19 +2509,19 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) assert (hi + lo == x); x = -lo * lo; - assert(csum >= x); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = -2.0 * hi * lo; - assert(csum >= x); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; x = -hi * hi; - assert(csum >= x); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; @@ -2538,7 +2539,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) x /= max; x = x*x; assert(x <= 1.0); - assert(csum >= x); + assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; frac += (oldcsum - csum) + x; From ef0c4b00b43fdd21eb0e09bdfaa6f542167f98ad Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 14:09:34 -0700 Subject: [PATCH 07/12] Explain the algorithm in the comments. Provide references. 
Improve variable names. --- Modules/mathmodule.c | 104 +++++++++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 38 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index fe53a1622abd92..e49356edda4740 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2404,45 +2404,73 @@ math_fmod_impl(PyObject *module, double x, double y) } /* -Given an *n* length *vec* of values and a value *max*, compute: +Given an *n* length *vec* of values, compute the vector norm: - sqrt(sum((x * scale) ** 2 for x in vec)) / scale + sqrt(sum(x ** 2 for x in vec) - where scale is the first power of two - greater than max. - -or compute: - - max * sqrt(sum((x / max) ** 2 for x in vec)) - -The value of the *max* variable must be non-negative and -equal to the absolute value of the largest magnitude -entry in the vector. If n==0, then *max* should be 0.0. +The *vec* values should be non-negative. +The *max* variable should be equal to the largest *x*. +The *n* variable is the length of *vec*. +If n==0, then *max* should be 0.0. If an infinity is present in the vec, *max* should be INF. - The *found_nan* variable indicates whether some member of the *vec* is a NaN. -To improve accuracy and to increase the number of cases where -vector_norm() is commutative, we use a variant of Neumaier -summation specialized to exploit that we always know that -|csum| >= |x|. - -The *csum* variable tracks the cumulative sum and *frac* tracks -the cumulative fractional errors at each step. Since this -variant assumes that |csum| >= |x| at each step, we establish -the precondition by starting the accumulation from 1.0 which -represents the largest possible value of (x*scale)**2 or (x/max)**2. - -After the loop is finished, the initial 1.0 is subtracted out -for a net zero effect on the final sum. Since *csum* will be -greater than 1.0, the subtraction of 1.0 will not cause -fractional digits to be dropped from *csum*. 
- -To get the full benefit from compensated summation, the -largest addend should be in the range: 0.5 <= x <= 1.0. -Accordingly, scaling or division by *max* should not be skipped -even if not otherwise needed to prevent overflow or loss of precision. +To achieve high accuracy, where results are almost always correctly +rounded, and to avoid overflow/underflow, four techniques are used: + +* lossless scaling using a power-of-two scaling factor +* accurate squaring using Veltkamp-Dekker splitting +* compensated summation using a variant of the Neumaier algorithm +* differential correction of the square root + +The usual presentation of the Neumaier summation algorithm has an +expensive branch depending on which operand has the larger +magnitude. We avoid this cost by arranging the calculation so that +fabs(csum) is always as large as fabs(x). + +To establish the invariant, *csum* is initialized to 1.0 which is +always larger than x**2 after scaling or division by *max*. +After the loop is finished, the initial 1.0 is subtracted out for a +net zero effect on the final sum. Since *csum* will be greater than +1.0, the subtraction of 1.0 will not cause fractional digits to be +dropped from *csum*. + +To get the full benefit from compensated summation, the largest +addend should be in the range: 0.5 <= x <= 1.0. Accordingly, +scaling or division by *max* should not be skipped even if not +otherwise needed to prevent overflow or loss of precision. + +The assertion that hi*hi >= 1.0 is a bit subtle. Each vector +element gets scaled to be below 1.0. The Veltkamp-Dekker splitting +algorithm gives a *hi* value that is correctly rounded to half +precision. When a value at or below 1.0 is correctly rounded, it +never goes above 1.0. And when values at or below 1.0 are squared, +they remain at or below 1.0, thus preserving the summation +invariant. 
+ +The square root differential correction is needed because a +correctly rounded square root of the correctly rounded sum of +squares can still be off by as much as one ulp. + +The differential correction starts with a value *x* that is +the difference between the square of *h*, the possibly inaccurately +rounded square root, and the accurate computed sum of squares. +The correction is the first order term of the Maclaurin series +expansion of sqrt(h**2 - x) == h - x/(2*h) + O(x**2). + +Essentially, this differential correction is equivalent to one +refinement step in the Newton divide-and-average square root +algorithm, effectively doubling the number of accurate bits. + +This technique is used in Dekker's SQRT2 algorithm and again in +Borges' ALGORITHM 4 and 5. + +References: + +1. Veltkamp-Dekker splitting: http://csclub.uwaterloo.ca/~pbarfuss/dekker1971.pdf +2. Compensated summation: http://www.ti3.tu-harburg.de/paper/rump/Ru08b.pdf +3. Square root differential correction: https://arxiv.org/pdf/1904.09481.pdf */ @@ -2452,7 +2480,7 @@ static inline double vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) { double x, csum = 1.0, oldcsum, frac = 0.0, scale; - double t, hi, lo, result; + double t, hi, lo, h; int max_e; Py_ssize_t i; @@ -2473,6 +2501,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) for (i=0 ; i < n ; i++) { x = vec[i]; assert(Py_IS_FINITE(x) && x >= 0.0 && fabs(x) <= max); + x *= scale; assert(x < 1.0); @@ -2500,9 +2529,9 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) csum += x; frac += (oldcsum - csum) + x; } - result = sqrt(csum - 1.0 + frac); + h = sqrt(csum - 1.0 + frac); - x = result; + x = h; t = x * T27; hi = t - (t - x); lo = x - hi; @@ -2527,8 +2556,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) frac += (oldcsum - csum) + x; x = csum - 1.0 + frac; - result += x / (2.0 * result); - return result / scale; + return (h + x / (2.0 * h)) / scale; } /* When max_e < 
-1023, ldexp(1.0, -max_e) overflows. So instead of multiplying by a scale, we just divide by *max*. From e17ca4cb2ffd96ebecc16c766a62e0286085ec8b Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 14:23:27 -0700 Subject: [PATCH 08/12] Add blurb --- .../next/Library/2020-08-23-14-23-18.bpo-41513.DGqc_I.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2020-08-23-14-23-18.bpo-41513.DGqc_I.rst diff --git a/Misc/NEWS.d/next/Library/2020-08-23-14-23-18.bpo-41513.DGqc_I.rst b/Misc/NEWS.d/next/Library/2020-08-23-14-23-18.bpo-41513.DGqc_I.rst new file mode 100644 index 00000000000000..b4d0d9b63cf87d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-08-23-14-23-18.bpo-41513.DGqc_I.rst @@ -0,0 +1,3 @@ +Improved the accuracy of math.hypot(). Internally, each step is computed +with extra precision so that the result is now almost always correctly +rounded. From c0134374d1f705713aaaf3ea00a56b89227af2d4 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 14:48:52 -0700 Subject: [PATCH 09/12] Move the splitting constant into the function that uses it. --- Modules/mathmodule.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index e49356edda4740..917c4e587c095d 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2474,11 +2474,10 @@ Borges' ALGORITHM 4 and 5. */ -static const double T27 = 134217729.0; /* T27=ldexp(1.0, 27)+1.0) */ - static inline double vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) { + const double T27 = 134217729.0; /* ldexp(1.0, 27)+1.0) */ double x, csum = 1.0, oldcsum, frac = 0.0, scale; double t, hi, lo, h; int max_e; From f2c325427da850c27b1e7f2739c62d768091e2c6 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 17:18:56 -0700 Subject: [PATCH 10/12] Drop the requirement that input elements are non-negative. 
--- Modules/mathmodule.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 917c4e587c095d..b6b46e3c0292e8 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2404,20 +2404,19 @@ math_fmod_impl(PyObject *module, double x, double y) } /* -Given an *n* length *vec* of values, compute the vector norm: +Given a *vec* of values, compute the vector norm: - sqrt(sum(x ** 2 for x in vec) + sqrt(sum(x ** 2 for x in vec)) -The *vec* values should be non-negative. -The *max* variable should be equal to the largest *x*. +The *max* variable should be equal to the largest fabs(x). The *n* variable is the length of *vec*. If n==0, then *max* should be 0.0. If an infinity is present in the vec, *max* should be INF. The *found_nan* variable indicates whether some member of the *vec* is a NaN. -To achieve high accuracy, where results are almost always correctly -rounded, and to avoid overflow/underflow, four techniques are used: +To avoid overflow/underflow and to achieve high accuracy giving results +that are almost always correctly rounded, four techniques are used: * lossless scaling using a power-of-two scaling factor * accurate squaring using Veltkamp-Dekker splitting @@ -2437,32 +2436,30 @@ net zero effect on the final sum. Since *csum* will be greater than dropped from *csum*. To get the full benefit from compensated summation, the largest -addend should be in the range: 0.5 <= x <= 1.0. Accordingly, +addend should be in the range: 0.5 <= |x| <= 1.0. Accordingly, scaling or division by *max* should not be skipped even if not otherwise needed to prevent overflow or loss of precision. -The assertion that hi*hi >= 1.0 is a bit subtle. Each vector -element gets scaled to be below 1.0. The Veltkamp-Dekker splitting +The assertion that hi*hi >= 1.0 is a bit subtle. Each vector element +gets scaled to a magnitude below 1.0. 
The Veltkamp-Dekker splitting algorithm gives a *hi* value that is correctly rounded to half precision. When a value at or below 1.0 is correctly rounded, it never goes above 1.0. And when values at or below 1.0 are squared, -they remain at or below 1.0, thus preserving the summation -invariant. +they remain at or below 1.0, thus preserving the summation invariant. The square root differential correction is needed because a -correctly rounded square root of the correctly rounded sum of +correctly rounded square root of a correctly rounded sum of squares can still be off by as much as one ulp. The differential correction starts with a value *x* that is the difference between the square of *h*, the possibly inaccurately -rounded square root, and the accurate computed sum of squares. +rounded square root, and the accurately computed sum of squares. The correction is the first order term of the Maclaurin series expansion of sqrt(h**2 - x) == h - x/(2*h) + O(x**2). Essentially, this differential correction is equivalent to one refinement step in the Newton divide-and-average square root algorithm, effectively doubling the number of accurate bits. - This technique is used in Dekker's SQRT2 algorithm and again in Borges' ALGORITHM 4 and 5. 
@@ -2499,10 +2496,10 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) assert(max * scale < 1.0); for (i=0 ; i < n ; i++) { x = vec[i]; - assert(Py_IS_FINITE(x) && x >= 0.0 && fabs(x) <= max); + assert(Py_IS_FINITE(x) && fabs(x) <= max); x *= scale; - assert(x < 1.0); + assert(fabs(x) < 1.0); t = x * T27; hi = t - (t - x); From 106bec215d3fdf673747edd480267eabee8a7f60 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 18:47:47 -0700 Subject: [PATCH 11/12] Update Modules/mathmodule.c Co-authored-by: Tim Peters --- Modules/mathmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index b6b46e3c0292e8..aaa2b83763e471 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2455,7 +2455,7 @@ The differential correction starts with a value *x* that is the difference between the square of *h*, the possibly inaccurately rounded square root, and the accurately computed sum of squares. The correction is the first order term of the Maclaurin series -expansion of sqrt(h**2 - x) == h - x/(2*h) + O(x**2). +expansion of sqrt(h**2 + x) == h + x/(2*h) + O(x**2). 
Essentially, this differential correction is equivalent to one refinement step in the Newton divide-and-average square root From 39667b53b069c110dbbc258aa3c7f7daaef9c42d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 23 Aug 2020 19:07:30 -0700 Subject: [PATCH 12/12] Keep the summation order consistent between sections --- Modules/mathmodule.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index aaa2b83763e471..1d6174132814b2 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2533,7 +2533,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) lo = x - hi; assert (hi + lo == x); - x = -lo * lo; + x = -hi * hi; assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x; @@ -2545,7 +2545,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) csum += x; frac += (oldcsum - csum) + x; - x = -hi * hi; + x = -lo * lo; assert(fabs(csum) >= fabs(x)); oldcsum = csum; csum += x;