@@ -2404,52 +2404,79 @@ math_fmod_impl(PyObject *module, double x, double y)
 }
 
 /*
-   Given an *n* length *vec* of values and a value *max*, compute:
+   Given a *vec* of values, compute the vector norm:
 
-       sqrt(sum((x * scale) ** 2 for x in vec)) / scale
+       sqrt(sum(x ** 2 for x in vec))
 
-   where scale is the first power of two
-   greater than max.
-
-   or compute:
-
-       max * sqrt(sum((x / max) ** 2 for x in vec))
-
-   The value of the *max* variable must be non-negative and
-   equal to the absolute value of the largest magnitude
-   entry in the vector. If n==0, then *max* should be 0.0.
+   The *max* variable should be equal to the largest fabs(x).
+   The *n* variable is the length of *vec*.
+   If n==0, then *max* should be 0.0.
    If an infinity is present in the vec, *max* should be INF.
-
    The *found_nan* variable indicates whether some member of
    the *vec* is a NaN.
 
-   To improve accuracy and to increase the number of cases where
-   vector_norm() is commutative, we use a variant of Neumaier
-   summation specialized to exploit that we always know that
-   |csum| >= |x|.
-
-   The *csum* variable tracks the cumulative sum and *frac* tracks
-   the cumulative fractional errors at each step. Since this
-   variant assumes that |csum| >= |x| at each step, we establish
-   the precondition by starting the accumulation from 1.0 which
-   represents the largest possible value of (x*scale)**2 or (x/max)**2.
-
-   After the loop is finished, the initial 1.0 is subtracted out
-   for a net zero effect on the final sum. Since *csum* will be
-   greater than 1.0, the subtraction of 1.0 will not cause
-   fractional digits to be dropped from *csum*.
-
-   To get the full benefit from compensated summation, the
-   largest addend should be in the range: 0.5 <= x <= 1.0.
-   Accordingly, scaling or division by *max* should not be skipped
-   even if not otherwise needed to prevent overflow or loss of precision.
+   To avoid overflow/underflow and to achieve high accuracy giving results
+   that are almost always correctly rounded, four techniques are used:
+
+   * lossless scaling using a power-of-two scaling factor
+   * accurate squaring using Veltkamp-Dekker splitting
+   * compensated summation using a variant of the Neumaier algorithm
+   * differential correction of the square root
+
+   The usual presentation of the Neumaier summation algorithm has an
+   expensive branch depending on which operand has the larger
+   magnitude. We avoid this cost by arranging the calculation so that
+   fabs(csum) is always as large as fabs(x).
+
+   To establish the invariant, *csum* is initialized to 1.0 which is
+   always larger than x**2 after scaling or division by *max*.
+   After the loop is finished, the initial 1.0 is subtracted out for a
+   net zero effect on the final sum. Since *csum* will be greater than
+   1.0, the subtraction of 1.0 will not cause fractional digits to be
+   dropped from *csum*.
+
+   To get the full benefit from compensated summation, the largest
+   addend should be in the range: 0.5 <= |x| <= 1.0. Accordingly,
+   scaling or division by *max* should not be skipped even if not
+   otherwise needed to prevent overflow or loss of precision.
+
+   The assertion that hi*hi <= 1.0 is a bit subtle. Each vector element
+   gets scaled to a magnitude below 1.0. The Veltkamp-Dekker splitting
+   algorithm gives a *hi* value that is correctly rounded to half
+   precision. When a value at or below 1.0 is correctly rounded, it
+   never goes above 1.0. And when values at or below 1.0 are squared,
+   they remain at or below 1.0, thus preserving the summation invariant.
+
+   The square root differential correction is needed because a
+   correctly rounded square root of a correctly rounded sum of
+   squares can still be off by as much as one ulp.
+
+   The differential correction starts with a value *x* that is
+   the difference between the square of *h*, the possibly inaccurately
+   rounded square root, and the accurately computed sum of squares.
+   The correction is the first order term of the Maclaurin series
+   expansion of sqrt(h**2 + x) == h + x/(2*h) + O(x**2).
+
+   Essentially, this differential correction is equivalent to one
+   refinement step in the Newton divide-and-average square root
+   algorithm, effectively doubling the number of accurate bits.
+   This technique is used in Dekker's SQRT2 algorithm and again in
+   Borges' ALGORITHM 4 and 5.
+
+   References:
+
+   1. Veltkamp-Dekker splitting: http://csclub.uwaterloo.ca/~pbarfuss/dekker1971.pdf
+   2. Compensated summation: http://www.ti3.tu-harburg.de/paper/rump/Ru08b.pdf
+   3. Square root differential correction: https://arxiv.org/pdf/1904.09481.pdf
 
 */
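
A minimal standalone sketch (not part of the patch) of the Veltkamp-Dekker split described in the comment above. The sample value is arbitrary; the constant matches the patch's T27:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    /* Split x into hi + lo where each half has few enough significand
       bits that hi*hi, 2.0*hi*lo, and lo*lo are each computed exactly. */
    const double T27 = 134217729.0;    /* 2**27 + 1 */
    double x = 0.7853981633974483;     /* arbitrary value with |x| < 1.0 */
    double t = x * T27;
    double hi = t - (t - x);           /* upper bits of x, correctly rounded */
    double lo = x - hi;                /* the remaining lower bits */
    assert(hi + lo == x);              /* the split itself is exact */
    printf("x*x     = %.17g\n", x * x);
    printf("hi*hi   = %.17g\n", hi * hi);
    printf("2*hi*lo = %.17g\n", 2.0 * hi * lo);
    printf("lo*lo   = %.17g\n", lo * lo);
    return 0;
}

The three partial products represent x*x with no rounding error in the multiplications, which is what lets the loop below feed x*x into the compensated sum one exact piece at a time.
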

 static inline double
 vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
 {
+    const double T27 = 134217729.0;     /* ldexp(1.0, 27) + 1.0 */
     double x, csum = 1.0, oldcsum, frac = 0.0, scale;
+    double t, hi, lo, h;
     int max_e;
     Py_ssize_t i;

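
A standalone sketch (illustrative values, not part of the patch) of why the power-of-two *scale* declared above is lossless and prevents overflow; the surrounding function derives it from *max* via frexp() and ldexp():

#include <math.h>
#include <stdio.h>

int main(void)
{
    double x = 3.0e200, y = 4.0e200;        /* x*x and y*y would overflow */
    int max_e;
    frexp(fmax(fabs(x), fabs(y)), &max_e);  /* binary exponent of the larger input */
    double scale = ldexp(1.0, -max_e);      /* power of two: multiplying is exact */
    double xs = x * scale, ys = y * scale;  /* largest magnitude now in [0.5, 1.0) */
    printf("naive:  %g\n", sqrt(x * x + y * y));              /* inf */
    printf("scaled: %g\n", sqrt(xs * xs + ys * ys) / scale);  /* 5e+200 */
    return 0;
}
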
@@ -2470,15 +2497,62 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
         for (i=0 ; i < n ; i++) {
             x = vec[i];
             assert(Py_IS_FINITE(x) && fabs(x) <= max);
+
             x *= scale;
-            x = x*x;
+            assert(fabs(x) < 1.0);
+
+            t = x * T27;
+            hi = t - (t - x);
+            lo = x - hi;
+            assert(hi + lo == x);
+
+            x = hi * hi;
             assert(x <= 1.0);
-            assert(csum >= x);
+            assert(fabs(csum) >= fabs(x));
+            oldcsum = csum;
+            csum += x;
+            frac += (oldcsum - csum) + x;
+
+            x = 2.0 * hi * lo;
+            assert(fabs(csum) >= fabs(x));
+            oldcsum = csum;
+            csum += x;
+            frac += (oldcsum - csum) + x;
+
+            x = lo * lo;
+            assert(fabs(csum) >= fabs(x));
             oldcsum = csum;
             csum += x;
             frac += (oldcsum - csum) + x;
         }
-        return sqrt(csum - 1.0 + frac) / scale;
+        h = sqrt(csum - 1.0 + frac);
+
+        x = h;
+        t = x * T27;
+        hi = t - (t - x);
+        lo = x - hi;
+        assert(hi + lo == x);
+
+        x = -hi * hi;
+        assert(fabs(csum) >= fabs(x));
+        oldcsum = csum;
+        csum += x;
+        frac += (oldcsum - csum) + x;
+
+        x = -2.0 * hi * lo;
+        assert(fabs(csum) >= fabs(x));
+        oldcsum = csum;
+        csum += x;
+        frac += (oldcsum - csum) + x;
+
+        x = -lo * lo;
+        assert(fabs(csum) >= fabs(x));
+        oldcsum = csum;
+        csum += x;
+        frac += (oldcsum - csum) + x;
+
+        x = csum - 1.0 + frac;
+        return (h + x / (2.0 * h)) / scale;
     }
     /* When max_e < -1023, ldexp(1.0, -max_e) overflows.
        So instead of multiplying by a scale, we just divide by *max*.
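
A standalone sketch of the branch-free Neumaier update used in the loop above. Because csum starts at 1.0 and every addend has magnitude at most 1.0, fabs(csum) >= fabs(x) always holds and the magnitude test of the textbook algorithm is unnecessary:

#include <stdio.h>

int main(void)
{
    double csum = 1.0, frac = 0.0, oldcsum;
    double naive = 1.0;
    for (int i = 0; i < 10; i++) {
        double x = 1e-17;               /* well below one ulp of csum */
        naive += x;                     /* rounds away each time: stays 1.0 */
        oldcsum = csum;
        csum += x;
        frac += (oldcsum - csum) + x;   /* captures the rounding error */
    }
    printf("naive sum       - 1.0 = %g\n", naive - 1.0);        /* 0 */
    printf("compensated sum - 1.0 = %g\n", csum - 1.0 + frac);  /* ~1e-16 */
    return 0;
}
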
@@ -2489,7 +2563,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
         x /= max;
         x = x*x;
         assert(x <= 1.0);
-        assert(csum >= x);
+        assert(fabs(csum) >= fabs(x));
         oldcsum = csum;
         csum += x;
         frac += (oldcsum - csum) + x;
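
Finally, a standalone sketch of the square-root differential correction. For brevity it uses C99 fma() to get the residual s - h*h in one rounding, where the patch instead reuses the Dekker split and the same compensated accumulator; the value of s is arbitrary:

#include <math.h>
#include <stdio.h>

int main(void)
{
    double s = 2.0;                        /* stands in for csum - 1.0 + frac */
    double h = sqrt(s);                    /* can be off by up to one ulp */
    double d = fma(-h, h, s);              /* s - h*h, residual of the sqrt */
    double corrected = h + d / (2.0 * h);  /* sqrt(h*h + d) ~= h + d/(2*h) */
    printf("h         = %.17g\n", h);
    printf("corrected = %.17g\n", corrected);
    return 0;
}

This is algebraically one Newton divide-and-average step, since (h + s/h) / 2 == h + (s - h*h) / (2*h), which is why a single correction roughly doubles the number of accurate bits.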