@@ -2404,52 +2404,79 @@ math_fmod_impl(PyObject *module, double x, double y)
 }
 
 /*
-Given an *n* length *vec* of values and a value *max*, compute:
+Given a *vec* of values, compute the vector norm:
 
-    sqrt(sum((x * scale) ** 2 for x in vec)) / scale
+    sqrt(sum(x ** 2 for x in vec))
 
-where scale is the first power of two
-greater than max.
-
-or compute:
-
-    max * sqrt(sum((x / max) ** 2 for x in vec))
-
-The value of the *max* variable must be non-negative and
-equal to the absolute value of the largest magnitude
-entry in the vector. If n==0, then *max* should be 0.0.
+The *max* variable should be equal to the largest fabs(x).
+The *n* variable is the length of *vec*.
+If n==0, then *max* should be 0.0.
 If an infinity is present in the vec, *max* should be INF.
-
 The *found_nan* variable indicates whether some member of
 the *vec* is a NaN.
 
-To improve accuracy and to increase the number of cases where
-vector_norm() is commutative, we use a variant of Neumaier
-summation specialized to exploit that we always know that
-|csum| >= |x|.
-
-The *csum* variable tracks the cumulative sum and *frac* tracks
-the cumulative fractional errors at each step. Since this
-variant assumes that |csum| >= |x| at each step, we establish
-the precondition by starting the accumulation from 1.0 which
-represents the largest possible value of (x*scale)**2 or (x/max)**2.
-
-After the loop is finished, the initial 1.0 is subtracted out
-for a net zero effect on the final sum. Since *csum* will be
-greater than 1.0, the subtraction of 1.0 will not cause
-fractional digits to be dropped from *csum*.
-
-To get the full benefit from compensated summation, the
-largest addend should be in the range: 0.5 <= x <= 1.0.
-Accordingly, scaling or division by *max* should not be skipped
-even if not otherwise needed to prevent overflow or loss of precision.
+To avoid overflow/underflow and to achieve high accuracy giving results
+that are almost always correctly rounded, four techniques are used:
+
+* lossless scaling using a power-of-two scaling factor
+* accurate squaring using Veltkamp-Dekker splitting
+* compensated summation using a variant of the Neumaier algorithm
+* differential correction of the square root
+
+The usual presentation of the Neumaier summation algorithm has an
+expensive branch depending on which operand has the larger
+magnitude. We avoid this cost by arranging the calculation so that
+fabs(csum) is always as large as fabs(x).
+
+To establish the invariant, *csum* is initialized to 1.0 which is
+always larger than x**2 after scaling or division by *max*.
+After the loop is finished, the initial 1.0 is subtracted out for a
+net zero effect on the final sum. Since *csum* will be greater than
+1.0, the subtraction of 1.0 will not cause fractional digits to be
+dropped from *csum*.
+
+To get the full benefit from compensated summation, the largest
+addend should be in the range: 0.5 <= |x| <= 1.0. Accordingly,
+scaling or division by *max* should not be skipped even if not
+otherwise needed to prevent overflow or loss of precision.
+
+The assertion that hi*hi <= 1.0 is a bit subtle. Each vector element
+gets scaled to a magnitude below 1.0. The Veltkamp-Dekker splitting
+algorithm gives a *hi* value that is correctly rounded to half
+precision. When a value at or below 1.0 is correctly rounded, it
+never goes above 1.0. And when values at or below 1.0 are squared,
+they remain at or below 1.0, thus preserving the summation invariant.
+
+The square root differential correction is needed because a
+correctly rounded square root of a correctly rounded sum of
+squares can still be off by as much as one ulp.
+
+The differential correction starts with a value *x* that is
+the difference between the square of *h*, the possibly inaccurately
+rounded square root, and the accurately computed sum of squares.
+The correction is the first order term of the Maclaurin series
+expansion of sqrt(h**2 + x) == h + x/(2*h) + O(x**2).
+
+Essentially, this differential correction is equivalent to one
+refinement step in the Newton divide-and-average square root
+algorithm, effectively doubling the number of accurate bits.
+This technique is used in Dekker's SQRT2 algorithm and again in
+Borges' ALGORITHM 4 and 5.
+
+References:
+
+1. Veltkamp-Dekker splitting: http://csclub.uwaterloo.ca/~pbarfuss/dekker1971.pdf
+2. Compensated summation: http://www.ti3.tu-harburg.de/paper/rump/Ru08b.pdf
+3. Square root differential correction: https://arxiv.org/pdf/1904.09481.pdf
 
 */
 
 static inline double
 vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
 {
+    const double T27 = 134217729.0;     /* ldexp(1.0, 27) + 1.0 */
     double x, csum = 1.0, oldcsum, frac = 0.0, scale;
+    double t, hi, lo, h;
     int max_e;
     Py_ssize_t i;
 
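
The comment block above names two techniques that are easy to exercise in isolation: the Veltkamp-Dekker split and the branchless Neumaier update. Below is a minimal standalone C sketch (illustrative only, not part of the patch; the input value is arbitrary). It assumes strict IEEE-754 double arithmetic, so compile without -ffast-math:

    #include <assert.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const double T27 = 134217729.0;    /* 2**27 + 1, the splitting constant */
        double x = 0.8414709848078965;     /* arbitrary value with fabs(x) < 1.0 */

        /* Veltkamp-Dekker split: *hi* keeps the upper half of the mantissa,
           *lo* keeps the remainder, and the split itself is exact. */
        double t  = x * T27;
        double hi = t - (t - x);
        double lo = x - hi;
        assert(hi + lo == x);

        /* hi*hi, 2*hi*lo, and lo*lo are each exact products and together
           represent x**2 exactly, so no squaring error enters the sum. */
        double pieces[3] = {hi * hi, 2.0 * hi * lo, lo * lo};

        /* Branchless Neumaier update: starting csum at 1.0 guarantees
           fabs(csum) >= fabs(addend), so no magnitude test is needed. */
        double csum = 1.0, oldcsum, frac = 0.0;
        for (int i = 0; i < 3; i++) {
            double p = pieces[i];
            oldcsum = csum;
            csum += p;
            frac += (oldcsum - csum) + p;  /* low-order bits lost by the add */
        }
        printf("naive x*x        = %.17g\n", x * x);
        printf("csum - 1.0       = %.17g\n", csum - 1.0);
        printf("frac (lost bits) = %.9g\n", frac);
        return 0;
    }
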
@@ -2470,15 +2497,62 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
         for (i=0 ; i < n ; i++) {
             x = vec[i];
             assert(Py_IS_FINITE(x) && fabs(x) <= max);
+
             x *= scale;
-            x = x*x;
+            assert(fabs(x) < 1.0);
+
+            t = x * T27;
+            hi = t - (t - x);
+            lo = x - hi;
+            assert(hi + lo == x);
+
+            x = hi * hi;
             assert(x <= 1.0);
-            assert(csum >= x);
+            assert(fabs(csum) >= fabs(x));
+            oldcsum = csum;
+            csum += x;
+            frac += (oldcsum - csum) + x;
+
+            x = 2.0 * hi * lo;
+            assert(fabs(csum) >= fabs(x));
+            oldcsum = csum;
+            csum += x;
+            frac += (oldcsum - csum) + x;
+
+            x = lo * lo;
+            assert(fabs(csum) >= fabs(x));
             oldcsum = csum;
             csum += x;
             frac += (oldcsum - csum) + x;
         }
-        return sqrt(csum - 1.0 + frac) / scale;
+        h = sqrt(csum - 1.0 + frac);
+
+        x = h;
+        t = x * T27;
+        hi = t - (t - x);
+        lo = x - hi;
+        assert(hi + lo == x);
+
+        x = -hi * hi;
+        assert(fabs(csum) >= fabs(x));
+        oldcsum = csum;
+        csum += x;
+        frac += (oldcsum - csum) + x;
+
+        x = -2.0 * hi * lo;
+        assert(fabs(csum) >= fabs(x));
+        oldcsum = csum;
+        csum += x;
+        frac += (oldcsum - csum) + x;
+
+        x = -lo * lo;
+        assert(fabs(csum) >= fabs(x));
+        oldcsum = csum;
+        csum += x;
+        frac += (oldcsum - csum) + x;
+
+        x = csum - 1.0 + frac;
+        return (h + x / (2.0 * h)) / scale;
     }
     /* When max_e < -1023, ldexp(1.0, -max_e) overflows.
        So instead of multiplying by a scale, we just divide by *max*.
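
The trailing comment describes the fallback taken when power-of-two scaling would overflow. As a complement, here is a small self-contained C sketch (illustrative only, not part of the patch; the value of max is an arbitrary large input) showing why the scaling is lossless in the normal case:

    #include <assert.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double max = 3.7e200;    /* squaring this directly would overflow */
        int max_e;

        /* frexp() reports max == m * 2**max_e with 0.5 <= m < 1.0, so
           scaling by 2**-max_e moves max into [0.5, 1.0) without changing
           any mantissa bits. */
        frexp(max, &max_e);
        assert(max_e >= -1023);  /* otherwise ldexp(1.0, -max_e) overflows
                                    and the code divides by *max* instead */
        double scale = ldexp(1.0, -max_e);
        assert(max * scale >= 0.5 && max * scale < 1.0);

        /* Multiplying and then dividing by a power of two round-trips
           exactly, which is what makes the scaling lossless. */
        printf("scaled     = %.17g\n", max * scale);
        printf("round trip = %.17g\n", (max * scale) / scale);
        return 0;
    }
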
@@ -2489,7 +2563,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
         x /= max;
         x = x*x;
         assert(x <= 1.0);
-        assert(csum >= x);
+        assert(fabs(csum) >= fabs(x));
         oldcsum = csum;
         csum += x;
         frac += (oldcsum - csum) + x;
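
Finally, the square-root differential correction introduced in the second hunk can be demonstrated on its own. This sketch (illustrative only, not part of the patch; the value of s is arbitrary, and fma() stands in for the patch's exact hi/lo accumulation of the residual) applies the first-order term h + x/(2*h):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double s = 5.000000000000001;   /* stand-in for an exact sum of squares */
        double h = sqrt(s);             /* may be off by as much as one ulp */

        /* Residual x = s - h*h with a single rounding, courtesy of fma();
           the patch instead feeds -hi*hi, -2*hi*lo, and -lo*lo back into
           the csum/frac accumulator to get the same quantity. */
        double x = fma(-h, h, s);

        /* First-order Maclaurin term: sqrt(h**2 + x) ~= h + x/(2*h).
           When sqrt() was already correctly rounded, the correction is
           below one ulp and leaves h unchanged; when it was off by one
           ulp, the correction repairs it. */
        printf("h           = %.17g\n", h);
        printf("residual    = %.9g\n", x);
        printf("h + x/(2*h) = %.17g\n", h + x / (2.0 * h));
        return 0;
    }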