Skip to content

Commit 18a9a58

Browse files
committed
Add small table option.
1 parent b4d2045 commit 18a9a58

File tree

1 file changed

+63
-15
lines changed

1 file changed

+63
-15
lines changed

libc/src/math/generic/sqrtf128.cpp

Lines changed: 63 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,65 @@ inline constexpr Int128 prod_hi<Int128, UInt128>(Int128 x, UInt128 y) {
101101
return static_cast<Int128>(prod - negative_part);
102102
}
103103

104+
// Newton-Raphson first order step to improve accuracy of the result.
105+
// For the initial approximation r0 ~ 1/sqrt(x), let
106+
// h = r0^2 * x - 1
107+
// be its scaled error. Then the first-order Newton-Raphson iteration is:
108+
// r1 = r0 - r0 * h / 2
109+
// which has error bounded by:
110+
// |r1 - 1/sqrt(x)| < h^2 / 2.
111+
LIBC_INLINE uint64_t rsqrt_newton_raphson(uint64_t m, uint64_t r) {
112+
uint64_t r2 = prod_hi(r, r);
113+
// h = r0^2*x - 1.
114+
int64_t h = static_cast<int64_t>(prod_hi(m, r2) + r2);
115+
// hr = r * h / 2
116+
int64_t hr = prod_hi(h, static_cast<int64_t>(r >> 1));
117+
return r - hr;
118+
}
119+
120+
#ifdef LIBC_MATH_HAS_SMALL_TABLES
121+
// Degree-12 minimax polynomials for 1/sqrt(x) on [1, 2].
122+
constexpr uint32_t RSQRT_COEFFS[12] = {
123+
0xb5947a4a, 0x2d651e32, 0x9ad50532, 0x2d28d093, 0x0d8be653, 0x04239014,
124+
0x01492449, 0x0066ff7d, 0x001e74a1, 0x000984cc, 0x00049abc, 0x00018340,
125+
};
126+
127+
LIBC_INLINE uint64_t rsqrt_approx(uint64_t m) {
128+
int64_t x = static_cast<uint64_t>(m) ^ (uint64_t(1) << 63);
129+
int64_t x_26 = x >> 2;
130+
int64_t z = x >> 31;
131+
132+
if (LIBC_UNLIKELY(z <= -4294967296))
133+
return ~(m >> 1);
134+
135+
uint64_t x2 = static_cast<uint64_t>(z) * static_cast<uint64_t>(z);
136+
uint64_t x2_26 = x2 >> 5;
137+
x2 >>= 32;
138+
// Calculate the odd part of the polynomial using Horner's method.
139+
uint64_t c0 = RSQRT_COEFFS[8] + ((x2 * RSQRT_COEFFS[10]) >> 32);
140+
uint64_t c1 = RSQRT_COEFFS[6] + ((x2 * c0) >> 32);
141+
uint64_t c2 = RSQRT_COEFFS[4] + ((x2 * c1) >> 32);
142+
uint64_t c3 = RSQRT_COEFFS[2] + ((x2 * c2) >> 32);
143+
uint64_t c4 = RSQRT_COEFFS[0] + ((x2 * c3) >> 32);
144+
uint64_t odd =
145+
static_cast<uint64_t>((x >> 34) * static_cast<int64_t>(c4 >> 3)) + x_26;
146+
// Calculate the even part of the polynomial using Horner's method.
147+
uint64_t d0 = RSQRT_COEFFS[9] + ((x2 * RSQRT_COEFFS[11]) >> 32);
148+
uint64_t d1 = RSQRT_COEFFS[7] + ((x2 * d0) >> 32);
149+
uint64_t d2 = RSQRT_COEFFS[5] + ((x2 * d1) >> 32);
150+
uint64_t d3 = RSQRT_COEFFS[3] + ((x2 * d2) >> 32);
151+
uint64_t d4 = RSQRT_COEFFS[1] + ((x2 * d3) >> 32);
152+
uint64_t even = 0xd105eb806655d608ul + ((x2 * d4) >> 6) + x2_26;
153+
154+
uint64_t r = even - odd; // error < 1.5e-10
155+
// Newton-Raphson first order step to improve accuracy of the result to almost
156+
// 64 bits.
157+
return rsqrt_newton_raphson(m, r);
158+
}
159+
160+
#else
161+
// Cubic minimax polynomials for 1/sqrt(x) on [1 + k/64, 1 + (k + 1)/64]
162+
// for k = 0..63.
104163
constexpr uint32_t RSQRT_COEFFS[64][4] = {
105164
{0xffffffff, 0xfffff780, 0xbff55815, 0x9bb5b6e7},
106165
{0xfc0bd889, 0xfa1d6e7d, 0xb8a95a89, 0x938bf8f0},
@@ -196,26 +255,15 @@ LIBC_INLINE uint64_t rsqrt_approx(uint64_t m) {
196255
uint64_t ro = d * ((c1 + ((d2 * c3) >> 19)) >> 26) >>
197256
6; // odd part of the polynomial (negative)
198257
uint64_t r = re - ro; // maximal error < 1.55e-10 and it is less than 2^-32
199-
200258
// Newton-Raphson first order step to improve accuracy of the result to almost
201-
// 64 bits:
202-
// For the initial approximation r0 ~ 1/sqrt(x), let
203-
// h = r0^2 * x - 1
204-
// be its scaled error. Then the first-order Newton-Raphson iteration is:
205-
// r1 = r0 - r0 * h / 2
206-
// which has error bounded by:
207-
// |r1 - 1/sqrt(x)| < h^2 / 2.
208-
uint64_t r2 = prod_hi(r, r);
209-
// h = r0^2*x - 1.
210-
int64_t h = static_cast<int64_t>(prod_hi(m, r2) + r2);
211-
// hr = r * h / 2
212-
int64_t hr = prod_hi(h, static_cast<int64_t>(r >> 1));
213-
r -= hr;
259+
// 64 bits.
260+
r = rsqrt_newton_raphson(m, r);
214261
// Adjust in the unlucky case x~1;
215262
if (LIBC_UNLIKELY(!r))
216263
--r;
217264
return r;
218265
}
266+
#endif // LIBC_MATH_HAS_SMALL_TABLES
219267

220268
} // anonymous namespace
221269

@@ -254,7 +302,7 @@ LLVM_LIBC_FUNCTION(float128, sqrtf128, (float128 x)) {
254302
// Now x is subnormal or x = +0.
255303

256304
// x is +0.
257-
if (x_u == 0)
305+
if (x_frac == 0)
258306
return x;
259307

260308
// Normalize subnormal inputs.

0 commit comments

Comments
 (0)