@@ -101,6 +101,65 @@ inline constexpr Int128 prod_hi<Int128, UInt128>(Int128 x, UInt128 y) {
101
101
return static_cast <Int128>(prod - negative_part);
102
102
}
103
103
104
+ // Newton-Raphson first order step to improve accuracy of the result.
105
+ // For the initial approximation r0 ~ 1/sqrt(x), let
106
+ // h = r0^2 * x - 1
107
+ // be its scaled error. Then the first-order Newton-Raphson iteration is:
108
+ // r1 = r0 - r0 * h / 2
109
+ // which has error bounded by:
110
+ // |r1 - 1/sqrt(x)| < h^2 / 2.
111
+ LIBC_INLINE uint64_t rsqrt_newton_raphson (uint64_t m, uint64_t r) {
112
+ uint64_t r2 = prod_hi (r, r);
113
+ // h = r0^2*x - 1.
114
+ int64_t h = static_cast <int64_t >(prod_hi (m, r2) + r2);
115
+ // hr = r * h / 2
116
+ int64_t hr = prod_hi (h, static_cast <int64_t >(r >> 1 ));
117
+ return r - hr;
118
+ }
119
+
120
+ #ifdef LIBC_MATH_HAS_SMALL_TABLES
121
+ // Degree-12 minimax polynomials for 1/sqrt(x) on [1, 2].
122
+ constexpr uint32_t RSQRT_COEFFS[12 ] = {
123
+ 0xb5947a4a , 0x2d651e32 , 0x9ad50532 , 0x2d28d093 , 0x0d8be653 , 0x04239014 ,
124
+ 0x01492449 , 0x0066ff7d , 0x001e74a1 , 0x000984cc , 0x00049abc , 0x00018340 ,
125
+ };
126
+
127
+ LIBC_INLINE uint64_t rsqrt_approx (uint64_t m) {
128
+ int64_t x = static_cast <uint64_t >(m) ^ (uint64_t (1 ) << 63 );
129
+ int64_t x_26 = x >> 2 ;
130
+ int64_t z = x >> 31 ;
131
+
132
+ if (LIBC_UNLIKELY (z <= -4294967296 ))
133
+ return ~(m >> 1 );
134
+
135
+ uint64_t x2 = static_cast <uint64_t >(z) * static_cast <uint64_t >(z);
136
+ uint64_t x2_26 = x2 >> 5 ;
137
+ x2 >>= 32 ;
138
+ // Calculate the odd part of the polynomial using Horner's method.
139
+ uint64_t c0 = RSQRT_COEFFS[8 ] + ((x2 * RSQRT_COEFFS[10 ]) >> 32 );
140
+ uint64_t c1 = RSQRT_COEFFS[6 ] + ((x2 * c0) >> 32 );
141
+ uint64_t c2 = RSQRT_COEFFS[4 ] + ((x2 * c1) >> 32 );
142
+ uint64_t c3 = RSQRT_COEFFS[2 ] + ((x2 * c2) >> 32 );
143
+ uint64_t c4 = RSQRT_COEFFS[0 ] + ((x2 * c3) >> 32 );
144
+ uint64_t odd =
145
+ static_cast <uint64_t >((x >> 34 ) * static_cast <int64_t >(c4 >> 3 )) + x_26;
146
+ // Calculate the even part of the polynomial using Horner's method.
147
+ uint64_t d0 = RSQRT_COEFFS[9 ] + ((x2 * RSQRT_COEFFS[11 ]) >> 32 );
148
+ uint64_t d1 = RSQRT_COEFFS[7 ] + ((x2 * d0) >> 32 );
149
+ uint64_t d2 = RSQRT_COEFFS[5 ] + ((x2 * d1) >> 32 );
150
+ uint64_t d3 = RSQRT_COEFFS[3 ] + ((x2 * d2) >> 32 );
151
+ uint64_t d4 = RSQRT_COEFFS[1 ] + ((x2 * d3) >> 32 );
152
+ uint64_t even = 0xd105eb806655d608ul + ((x2 * d4) >> 6 ) + x2_26;
153
+
154
+ uint64_t r = even - odd; // error < 1.5e-10
155
+ // Newton-Raphson first order step to improve accuracy of the result to almost
156
+ // 64 bits.
157
+ return rsqrt_newton_raphson (m, r);
158
+ }
159
+
160
+ #else
161
+ // Cubic minimax polynomials for 1/sqrt(x) on [1 + k/64, 1 + (k + 1)/64]
162
+ // for k = 0..63.
104
163
constexpr uint32_t RSQRT_COEFFS[64 ][4 ] = {
105
164
{0xffffffff , 0xfffff780 , 0xbff55815 , 0x9bb5b6e7 },
106
165
{0xfc0bd889 , 0xfa1d6e7d , 0xb8a95a89 , 0x938bf8f0 },
@@ -196,26 +255,15 @@ LIBC_INLINE uint64_t rsqrt_approx(uint64_t m) {
196
255
uint64_t ro = d * ((c1 + ((d2 * c3) >> 19 )) >> 26 ) >>
197
256
6 ; // odd part of the polynomial (negative)
198
257
uint64_t r = re - ro; // maximal error < 1.55e-10 and it is less than 2^-32
199
-
200
258
// Newton-Raphson first order step to improve accuracy of the result to almost
201
- // 64 bits:
202
- // For the initial approximation r0 ~ 1/sqrt(x), let
203
- // h = r0^2 * x - 1
204
- // be its scaled error. Then the first-order Newton-Raphson iteration is:
205
- // r1 = r0 - r0 * h / 2
206
- // which has error bounded by:
207
- // |r1 - 1/sqrt(x)| < h^2 / 2.
208
- uint64_t r2 = prod_hi (r, r);
209
- // h = r0^2*x - 1.
210
- int64_t h = static_cast <int64_t >(prod_hi (m, r2) + r2);
211
- // hr = r * h / 2
212
- int64_t hr = prod_hi (h, static_cast <int64_t >(r >> 1 ));
213
- r -= hr;
259
+ // 64 bits.
260
+ r = rsqrt_newton_raphson (m, r);
214
261
// Adjust in the unlucky case x~1;
215
262
if (LIBC_UNLIKELY (!r))
216
263
--r;
217
264
return r;
218
265
}
266
+ #endif // LIBC_MATH_HAS_SMALL_TABLES
219
267
220
268
} // anonymous namespace
221
269
@@ -254,7 +302,7 @@ LLVM_LIBC_FUNCTION(float128, sqrtf128, (float128 x)) {
254
302
// Now x is subnormal or x = +0.
255
303
256
304
// x is +0.
257
- if (x_u == 0 )
305
+ if (x_frac == 0 )
258
306
return x;
259
307
260
308
// Normalize subnormal inputs.
0 commit comments