-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[libc][math] Improve the performance of sqrtf128. #122578
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-libc Author: None (lntue) Changes: Use a combination of polynomial approximation and Newton-Raphson iterations in 64-bit and 128-bit integers to improve the performance of sqrtf128. The correct rounding is provided by squaring the result and comparing it with the argument. Performance improvement using the newly added perf test:
Patch is 34.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122578.diff 17 Files Affected:
diff --git a/libc/src/__support/big_int.h b/libc/src/__support/big_int.h
index a95ab4ff8e1abf..e922e30cb1d952 100644
--- a/libc/src/__support/big_int.h
+++ b/libc/src/__support/big_int.h
@@ -241,7 +241,7 @@ LIBC_INLINE constexpr void quick_mul_hi(cpp::array<word, N> &dst,
}
template <typename word, size_t N>
-LIBC_INLINE constexpr bool is_negative(cpp::array<word, N> &array) {
+LIBC_INLINE constexpr bool is_negative(const cpp::array<word, N> &array) {
using signed_word = cpp::make_signed_t<word>;
return cpp::bit_cast<signed_word>(array.back()) < 0;
}
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 382f5b362e2eb4..3c810795d027da 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -3379,8 +3379,12 @@ add_entrypoint_object(
HDRS
../sqrtf128.h
DEPENDS
+ libc.src.__support.CPP.bit
+ libc.src.__support.FPUtil.fenv_impl
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.macros.optimization
libc.src.__support.macros.properties.types
- libc.src.__support.FPUtil.sqrt
COMPILE_OPTIONS
${libc_opt_high_flag}
)
diff --git a/libc/src/math/generic/sqrtf128.cpp b/libc/src/math/generic/sqrtf128.cpp
index f87066b6f6403e..79f1681a9e871d 100644
--- a/libc/src/math/generic/sqrtf128.cpp
+++ b/libc/src/math/generic/sqrtf128.cpp
@@ -1,5 +1,7 @@
//===-- Implementation of sqrtf128 function -------------------------------===//
//
+// Copyright (c) 2024 Alexei Sibidanov <[email protected]>
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -7,14 +9,347 @@
//===----------------------------------------------------------------------===//
#include "src/math/sqrtf128.h"
-#include "src/__support/FPUtil/sqrt.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/rounding_mode.h"
#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/uint128.h"
namespace LIBC_NAMESPACE_DECL {
+using FPBits = fputil::FPBits<float128>;
+
+namespace {
+
+template <typename T, typename U = T> static inline constexpr T prod_hi(T, U);
+
+// Get high part of integer multiplications.
+// Use template to prevent implicit conversion.
+template <>
+inline constexpr uint64_t prod_hi<uint64_t>(uint64_t x, uint64_t y) {
+ return static_cast<uint64_t>(
+ (static_cast<UInt128>(x) * static_cast<UInt128>(y)) >> 64);
+}
+
+// Get high part of unsigned 128x64 bit multiplication.
+template <>
+inline constexpr UInt128 prod_hi<UInt128, uint64_t>(UInt128 y, uint64_t x) {
+ uint64_t y_lo = static_cast<uint64_t>(y);
+ uint64_t y_hi = static_cast<uint64_t>(y >> 64);
+ UInt128 xyl = static_cast<UInt128>(x) * static_cast<UInt128>(y_lo);
+ UInt128 xyh = static_cast<UInt128>(x) * static_cast<UInt128>(y_hi);
+ return xyh + (xyl >> 64);
+}
+
+// Get high part of signed 64x64 bit multiplication.
+template <> inline constexpr int64_t prod_hi<int64_t>(int64_t x, int64_t y) {
+ return static_cast<int64_t>(
+ (static_cast<Int128>(x) * static_cast<Int128>(y)) >> 64);
+}
+
+// Get high 128-bit part of unsigned 128x128 bit multiplication.
+template <> inline constexpr UInt128 prod_hi<UInt128>(UInt128 x, UInt128 y) {
+ uint64_t x_lo = static_cast<uint64_t>(x);
+ uint64_t x_hi = static_cast<uint64_t>(x >> 64);
+ uint64_t y_lo = static_cast<uint64_t>(y);
+ uint64_t y_hi = static_cast<uint64_t>(y >> 64);
+
+ UInt128 xh_yh = static_cast<UInt128>(x_hi) * static_cast<UInt128>(y_hi);
+ UInt128 xh_yl = static_cast<UInt128>(x_hi) * static_cast<UInt128>(y_lo);
+ UInt128 xl_yh = static_cast<UInt128>(x_lo) * static_cast<UInt128>(y_hi);
+
+ xh_yh += xh_yl >> 64;
+
+ return xh_yh + (xl_yh >> 64);
+}
+
+// Get high 128-bit part of mixed sign 128x128 bit multiplication.
+template <>
+inline constexpr Int128 prod_hi<Int128, UInt128>(Int128 x, UInt128 y) {
+ UInt128 mask = static_cast<UInt128>(x >> 127);
+ UInt128 negative_part = y & mask;
+ UInt128 prod = prod_hi(static_cast<UInt128>(x), y);
+ return static_cast<Int128>(prod - negative_part);
+}
+
+constexpr uint32_t RSQRT_COEFFS[64][4] = {
+ {0xffffffff, 0xfffff780, 0xbff55815, 0x9bb5b6e7},
+ {0xfc0bd889, 0xfa1d6e7d, 0xb8a95a89, 0x938bf8f0},
+ {0xf82ec882, 0xf473bea9, 0xb1bf4705, 0x8bed0079},
+ {0xf467f280, 0xeefff2a1, 0xab309d4a, 0x84cdb431},
+ {0xf0b6848c, 0xe9bf46f4, 0xa4f76232, 0x7e24037b},
+ {0xed19b75e, 0xe4af2628, 0x9f0e1340, 0x77e6ca62},
+ {0xe990cdad, 0xdfcd2521, 0x996f9b96, 0x720db8df},
+ {0xe61b138e, 0xdb16ffde, 0x94174a00, 0x6c913cff},
+ {0xe2b7dddf, 0xd68a967b, 0x8f00c812, 0x676a6f92},
+ {0xdf6689b7, 0xd225ea80, 0x8a281226, 0x62930308},
+ {0xdc267bea, 0xcde71c63, 0x8589702c, 0x5e05343e},
+ {0xd8f7208e, 0xc9cc6948, 0x81216f2e, 0x59bbbcf8},
+ {0xd5d7ea91, 0xc5d428ee, 0x7cecdb76, 0x55b1c7d6},
+ {0xd2c8534e, 0xc1fccbc9, 0x78e8bb45, 0x51e2e592},
+ {0xcfc7da32, 0xbe44d94a, 0x75124a0a, 0x4e4b0369},
+ {0xccd6045f, 0xbaaaee41, 0x7166f40f, 0x4ae66284},
+ {0xc9f25c5c, 0xb72dbb69, 0x6de45288, 0x47b19045},
+ {0xc71c71c7, 0xb3cc040f, 0x6a882804, 0x44a95f5f},
+ {0xc453d90f, 0xb0849cd4, 0x67505d2a, 0x41cae1a0},
+ {0xc1982b2e, 0xad566a85, 0x643afdc8, 0x3f13625c},
+ {0xbee9056f, 0xaa406113, 0x6146361f, 0x3c806169},
+ {0xbc46092e, 0xa7418293, 0x5e70506d, 0x3a0f8e8e},
+ {0xb9aedba5, 0xa458de58, 0x5bb7b2b1, 0x37bec572},
+ {0xb72325b7, 0xa1859022, 0x591adc9a, 0x358c09e2},
+ {0xb4a293c2, 0x9ec6bf52, 0x569865a7, 0x33758476},
+ {0xb22cd56d, 0x9c1b9e36, 0x542efb6a, 0x31797f8a},
+ {0xafc19d86, 0x9983695c, 0x51dd5ffb, 0x2f96647a},
+ {0xad60a1d1, 0x96fd66f7, 0x4fa2687c, 0x2dcab91f},
+ {0xab099ae9, 0x9488e64b, 0x4d7cfbc9, 0x2c151d8a},
+ {0xa8bc441a, 0x92253f20, 0x4b6c1139, 0x2a7449ef},
+ {0xa6785b42, 0x8fd1d14a, 0x496eaf82, 0x28e70cc3},
+ {0xa43da0ae, 0x8d8e042a, 0x4783eba7, 0x276c4900},
+ {0xa20bd701, 0x8b594648, 0x45aae80a, 0x2602f493},
+ {0x9fe2c315, 0x89330ce4, 0x43e2d382, 0x24aa16ec},
+ {0x9dc22be4, 0x871ad399, 0x422ae88c, 0x2360c7af},
+ {0x9ba9da6c, 0x85101c05, 0x40826c88, 0x22262d7b},
+ {0x99999999, 0x83126d70, 0x3ee8af07, 0x20f97cd2},
+ {0x97913630, 0x81215480, 0x3d5d0922, 0x1fd9f714},
+ {0x95907eb8, 0x7f3c62ef, 0x3bdedce0, 0x1ec6e994},
+ {0x93974369, 0x7d632f45, 0x3a6d94a9, 0x1dbfacbb},
+ {0x91a55615, 0x7b955498, 0x3908a2be, 0x1cc3a33b},
+ {0x8fba8a1c, 0x79d2724e, 0x37af80bf, 0x1bd23960},
+ {0x8dd6b456, 0x781a2be4, 0x3661af39, 0x1aeae458},
+ {0x8bf9ab07, 0x766c28ba, 0x351eb539, 0x1a0d21a2},
+ {0x8a2345cc, 0x74c813dd, 0x33e61feb, 0x19387676},
+ {0x88535d90, 0x732d9bdc, 0x32b7823a, 0x186c6f3e},
+ {0x8689cc7e, 0x719c7297, 0x3192747d, 0x17a89f21},
+ {0x84c66df1, 0x70144d19, 0x30769424, 0x16ec9f89},
+ {0x83091e6a, 0x6e94e36c, 0x2f63836f, 0x16380fbf},
+ {0x8151bb87, 0x6d1df079, 0x2e58e925, 0x158a9484},
+ {0x7fa023f1, 0x6baf31de, 0x2d567053, 0x14e3d7ba},
+ {0x7df43758, 0x6a4867d3, 0x2c5bc811, 0x1443880e},
+ {0x7c4dd664, 0x68e95508, 0x2b68a346, 0x13a958ab},
+ {0x7aace2b0, 0x6791be86, 0x2a7cb871, 0x131500ee},
+ {0x79113ebc, 0x66416b95, 0x2997c17a, 0x12863c29},
+ {0x777acde8, 0x64f825a1, 0x28b97b82, 0x11fcc95c},
+ {0x75e9746a, 0x63b5b822, 0x27e1a6b4, 0x11786b03},
+ {0x745d1746, 0x6279f081, 0x2710061d, 0x10f8e6da},
+ {0x72d59c46, 0x61449e06, 0x26445f86, 0x107e05ac},
+ {0x7152e9f4, 0x601591be, 0x257e7b4d, 0x10079327},
+ {0x6fd4e793, 0x5eec9e6b, 0x24be2445, 0x0f955da9},
+ {0x6e5b7d16, 0x5dc9986e, 0x24032795, 0x0f273620},
+ {0x6ce6931d, 0x5cac55b7, 0x234d5496, 0x0ebcefdb},
+ {0x6b7612ec, 0x5b94adb2, 0x229c7cbc, 0x0e56606e},
+};
+
+// Approximate rsqrt with cubic polynomials.
+// The range [1,2] is split into 64 equal sub-ranges and the reciprocal
+// square root is approximated by a cubic polynomial by the minimax method in
+// each subrange. The approximation accuracy fits into 32-33 bits and thus it is
+// natural to round coefficients into 32 bit. The constant coefficient can be
+// rounded to 33 bits since the most significant bit is always 1 and implicitly
+// assumed in the table.
+LIBC_INLINE uint64_t rsqrt_approx(uint64_t m) {
+ // ULP(m) = 2^-64.
+ // Use the top 6 bits as index for looking up polynomial coeffs.
+ uint64_t indx = m >> 58;
+
+ uint64_t c0 = static_cast<uint64_t>(RSQRT_COEFFS[indx][0]);
+ c0 <<= 31; // to 64 bit with the space for the implicit bit
+ c0 |= 1ull << 63; // add implicit bit
+
+ uint64_t c1 = static_cast<uint64_t>(RSQRT_COEFFS[indx][1]);
+ c1 <<= 25; // to 64 bit format
+
+ uint64_t c2 = static_cast<uint64_t>(RSQRT_COEFFS[indx][2]);
+ uint64_t c3 = static_cast<uint64_t>(RSQRT_COEFFS[indx][3]);
+
+ uint64_t d = (m << 6) >> 32; // local coordinate in the subrange [0, 2^32]
+ uint64_t d2 = (d * d) >> 32; // square of the local coordinate
+ uint64_t re = c0 + (d2 * c2 >> 13); // even part of the polynomial (positive)
+ uint64_t ro = d * ((c1 + ((d2 * c3) >> 19)) >> 26) >>
+ 6; // odd part of the polynomial (negative)
+ uint64_t r = re - ro; // maximal error < 1.55e-10 and it is less than 2^-32
+
+ // Newton-Raphson first order step to improve accuracy of the result to almost
+ // 64 bits:
+ // For the initial approximation r0 ~ 1/sqrt(x), let
+ // h = r0^2 * x - 1
+ // be its scaled error. Then the first-order Newton-Raphson iteration is:
+ // r1 = r0 - r0 * h / 2
+ // which has error bounded by:
+ // |r1 - 1/sqrt(x)| < h^2 / 2.
+ uint64_t r2 = prod_hi<uint64_t>(r, r);
+ // h = r0^2*x - 1.
+ int64_t h = static_cast<int64_t>(prod_hi<uint64_t>(m, r2) + r2);
+ // hr = r * h / 2
+ int64_t hr = prod_hi<int64_t>(h, static_cast<int64_t>(r >> 1));
+ r -= hr;
+  // Adjust in the unlucky case x ~ 1.
+ if (LIBC_UNLIKELY(!r))
+ --r;
+ return r;
+}
+
+} // anonymous namespace
+
LLVM_LIBC_FUNCTION(float128, sqrtf128, (float128 x)) {
- return fputil::sqrt<float128>(x);
+ using FPBits = fputil::FPBits<float128>;
+ // Get rounding mode.
+ uint32_t rm = fputil::get_round();
+
+ FPBits xbits(x);
+ UInt128 x_u = xbits.uintval();
+ // Bring leading bit of the mantissa to the highest bit.
+ // ulp(x_frac) = 2^-128.
+ UInt128 x_frac = xbits.get_mantissa() << (FPBits::EXP_LEN + 1);
+
+ int sign_exp = static_cast<int>(x_u >> FPBits::FRACTION_LEN);
+
+ if (LIBC_UNLIKELY(sign_exp == 0 || sign_exp >= 0x7fff)) {
+ // Special cases: NAN, inf, negative numbers
+ if (sign_exp >= 0x7fff) {
+ // x = -0 or x = inf
+ if (xbits.is_zero() || xbits == xbits.inf())
+ return x;
+ // x is nan
+ if (xbits.is_nan()) {
+ // pass through quiet nan
+ if (xbits.is_quiet_nan())
+ return x;
+ // transform signaling nan to quiet and return
+ return xbits.quiet_nan().get_val();
+ }
+ // x < 0 or x = -inf
+ fputil::set_errno_if_required(EDOM);
+ fputil::raise_except_if_required(FE_INVALID);
+ return xbits.quiet_nan().get_val();
+ }
+ // x is subnormal or x=+0
+ if (x == 0)
+ return x;
+
+ // Normalize subnormal inputs.
+ sign_exp = -cpp::countl_zero(x_frac);
+ int normal_shifts = 1 - sign_exp;
+ x_frac <<= normal_shifts;
+ }
+
+ // For sign_exp = biased exponent of x = real_exponent + 16383,
+ // let f be the real exponent of the output:
+ // f = floor(real_exponent / 2)
+ // Then:
+ // floor((sign_exp + 1) / 2) = f + 8192
+ // Hence, the biased exponent of the final result is:
+ // f + 16383 = floor((sign_exp + 1) / 2) + 8191.
+ // Since the output mantissa will include the hidden bit, we can define the
+ // output exponent part:
+ // e2 = floor((sign_exp + 1) / 2) + 8190
+ unsigned i = static_cast<unsigned>(1 - (sign_exp & 1));
+ uint32_t q2 = (sign_exp + 1) >> 1;
+ // Exponent of the final result
+ uint32_t e2 = q2 + 8190;
+
+ constexpr uint64_t RSQRT_2[2] = {~0ull,
+ 0xb504f333f9de6484 /* 2^64/sqrt(2) */};
+
+ // Approximate 1/sqrt(1 + x_frac)
+ // Error: |r_1 - 1/sqrt(x)| < 2^-63.
+ uint64_t r1 = rsqrt_approx(static_cast<uint64_t>(x_frac >> 64));
+ // Adjust for the even/odd exponent.
+ uint64_t r2 = prod_hi(r1, RSQRT_2[i]);
+ unsigned shift = 2 - i;
+
+ // Normalized input:
+ // 1 <= x_reduced < 4
+ UInt128 x_reduced = (x_frac >> shift) | (UInt128(1) << (126 + i));
+ // With r2 ~ 1/sqrt(x) up to 2^-63, we perform another round of Newton-Raphson
+ // iteration:
+ // r3 = r2 - r2 * h / 2,
+ // for h = r2^2 * x - 1.
+ // Then:
+ // sqrt(x) = x * (1 / sqrt(x))
+ // ~ x * r3
+ // = x * (r2 - r2 * h / 2)
+ // = (x * r2) - (x * r2) * h / 2
+ UInt128 sx = prod_hi(x_reduced, r2);
+ UInt128 h = prod_hi(sx, r2) << 2;
+ UInt128 ds = static_cast<UInt128>(prod_hi(static_cast<Int128>(h), sx));
+ UInt128 v = (sx << 1) - ds;
+
+ uint32_t nrst = rm == FE_TONEAREST;
+ // The result lies within (-2,5) of true square root so we now
+ // test that we can correctly round the result taking into account
+ // the rounding mode
+ // check the lowest 14 bits.
+ int dd = (static_cast<int>(v) << 18) >> 18;
+
+ if (LIBC_UNLIKELY(dd < 4 && dd >= -8)) { // can round correctly?
+ // m is almost the final result it can be only 1 ulp off so we
+ // just need to test both possibilities. We square it and
+ // compare with the initial argument.
+ UInt128 m = v >> 15;
+ UInt128 m2 = m * m;
+ Int128 t0, t1;
+ // The difference of the squared result and the argument
+ t0 = static_cast<Int128>(m2 - (x_reduced << 98));
+ if (t0 == 0) {
+ // the square root is exact
+ v = m << 15;
+ } else {
+      // Add +-1 ulp to m depending on the sign of the difference. Here
+ // we do not need to square again since (m+1)^2 = m^2 + 2*m +
+ // 1 so just need to add shifted m and 1.
+ t1 = t0;
+ Int128 sgn = t0 >> 127; // sign of the difference
+ t1 -= (m << 1) ^ sgn;
+ t1 += 1 + sgn;
+
+ Int128 sgn1 = t1 >> 127;
+ if (LIBC_UNLIKELY(sgn == sgn1)) {
+ t0 = t1;
+ v -= sgn << 15;
+ t1 -= (m << 1) ^ sgn;
+ t1 += 1 + sgn;
+ }
+
+ if (t1 == 0) {
+ // 1 ulp offset brings again an exact root
+ v = (m - (2 * sgn + 1)) << 15;
+ } else {
+ t1 += t0;
+ Int128 side = t1 >> 127; // select what is closer m or m+-1
+ v &= ~UInt128(0) << 15; // wipe the fractional bits
+ v -= ((sgn & side) | (~sgn & 1)) << (15 + side);
+ v |= 1; // add sticky bit since we cannot have an exact mid-point
+ // situation
+ }
+ }
+ }
+
+ unsigned frac = static_cast<unsigned>(v) & 0x7fff; // fractional part
+ unsigned rnd; // round bit
+ if (LIBC_LIKELY(nrst != 0)) {
+ rnd = frac >> 14; // round to nearest tie to even
+ } else if (rm == FE_UPWARD) {
+ rnd = !!frac; // round up
+ } else if (rm == FE_DOWNWARD) {
+ rnd = 0; // round down
+ } else {
+ rnd = 0; // round to zero
+ }
+
+ v >>= 15; // position mantissa
+ v += rnd; // round
+
+ // // Set inexact flag only if square root is inexact
+ // // TODO: We will have to raise FE_INEXACT most of the time, but this
+ // // operation is very costly, especially in x86-64, since technically, it
+ // // needs to synchronize both SSE and x87 flags. Need to investigate
+ // // further to see how we can make this performant.
+ // if(frac) fputil::raise_except_if_required(FE_INEXACT);
+
+ v += static_cast<UInt128>(e2) << FPBits::FRACTION_LEN; // place exponent
+ return cpp::bit_cast<float128>(v);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index b8e240bf328ce1..f82a7b1588c300 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -331,27 +331,6 @@ struct ModifyMXCSR {
EXPECT_FP_EXCEPTION(expected_except); \
} while (0)
-#define EXPECT_FP_EQ_ALL_ROUNDING(expected, actual) \
- do { \
- using namespace LIBC_NAMESPACE::fputil::testing; \
- ForceRoundingMode __r1(RoundingMode::Nearest); \
- if (__r1.success) { \
- EXPECT_FP_EQ((expected), (actual)); \
- } \
- ForceRoundingMode __r2(RoundingMode::Upward); \
- if (__r2.success) { \
- EXPECT_FP_EQ((expected), (actual)); \
- } \
- ForceRoundingMode __r3(RoundingMode::Downward); \
- if (__r3.success) { \
- EXPECT_FP_EQ((expected), (actual)); \
- } \
- ForceRoundingMode __r4(RoundingMode::TowardZero); \
- if (__r4.success) { \
- EXPECT_FP_EQ((expected), (actual)); \
- } \
- } while (0)
-
#define EXPECT_FP_EQ_ROUNDING_MODE(expected, actual, rounding_mode) \
do { \
using namespace LIBC_NAMESPACE::fputil::testing; \
@@ -373,6 +352,61 @@ struct ModifyMXCSR {
#define EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(expected, actual) \
EXPECT_FP_EQ_ROUNDING_MODE((expected), (actual), RoundingMode::TowardZero)
+#define EXPECT_FP_EQ_ALL_ROUNDING_1(expected, actual) \
+ do { \
+ EXPECT_FP_EQ_ROUNDING_NEAREST((expected), (actual)); \
+ EXPECT_FP_EQ_ROUNDING_UPWARD((expected), (actual)); \
+ EXPECT_FP_EQ_ROUNDING_DOWNWARD((expected), (actual)); \
+ EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO((expected), (actual)); \
+ } while (0)
+
+#define EXPECT_FP_EQ_ALL_ROUNDING_4(expected_nearest, expected_upward, \
+ expected_downward, expected_toward_zero, \
+ actual) \
+ do { \
+ EXPECT_FP_EQ_ROUNDING_NEAREST((expected_nearest), (actual)); \
+ EXPECT_FP_EQ_ROUNDING_UPWARD((expected_upward), (actual)); \
+ EXPECT_FP_EQ_ROUNDING_DOWNWARD((expected_downward), (actual)); \
+ EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO((expected_toward_zero), (actual)); \
+ } while (0)
+
+#define EXPECT_FP_EQ_ALL_ROUNDING_UNSUPPORTED(...) \
+ static_assert(false, "Unsupported number of arguments")
+
+#define EXPECT_FP_EQ_ALL_ROUNDING_GET_6TH_ARG(ARG1, ARG2, ARG3, ARG4, ARG5, \
+ ARG6, ...) \
+ ARG6
+
+#define EXPECT_FP_EQ_ALL_ROUNDING_SELECTION(...) \
+ EXPECT_FP_EQ_ALL_ROUNDING_GET_6TH_ARG( \
+ __VA_ARGS__, EXPECT_FP_EQ_ALL_ROUNDING_4, \
+ EXPECT_FP_EQ_ALL_ROUNDING_UNSUPPORTED, \
+ EXPECT_FP_EQ_ALL_ROUNDING_UNSUPPORTED, EXPECT_FP_EQ_ALL_ROUNDING_1)
+
+#define EXPECT_FP_EQ_ALL_ROUNDING(...) \
+ EXPECT_FP_EQ_ALL_ROUNDING_SELECTION(__VA_ARGS__)(__VA_ARGS__)
+
+#define ASSERT_FP_EQ_ROUNDING_MODE(expected, actual, rounding_mode) \
+ do { ...
[truncated]
|
libc/src/math/generic/sqrtf128.cpp
Outdated
@@ -1,20 +1,355 @@ | |||
//===-- Implementation of sqrtf128 function -------------------------------===// | |||
// | |||
// Copyright (c) 2024 Alexei Sibidanov <[email protected]> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These are generally frowned upon, since they cannot be removed.
https://llvm.org/docs/DeveloperPolicy.html#new-llvm-project-license-framework
You cannot strip the copyright headers off or replace them with your own.
I've spoken with company lawyers about these copyright assignments previously (not in regards to this patch), and they've echo'd the above sentiment.
It also makes it very difficult to refactor code in the future. Let's say we split this file in two. Do we copy over the attribution or not? It gets complicated. Please do not introduce such complication into libc/.
I think https://llvm.org/docs/DeveloperPolicy.html#copyright also addresses this:
the copyright for the code in the project is held by the respective contributors.
I'm going to throw a "minus one" on this patch until this is dropped. If necessary, I can reach out to LLVM's board to get more precise language around copyright attribution in comments added to https://llvm.org/docs/DeveloperPolicy.html.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is my intention to have this line. I want to have my name in the code and never be removed as a proper acknowledgment of my contribution.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've emailed LLVM's board for clarification around copyright attribution notices.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed the copyright notice and add attribution to the author in the comments below.
libc/src/math/generic/sqrtf128.cpp
Outdated
UInt128 xyl = static_cast<UInt128>(x) * static_cast<UInt128>(y_lo); | ||
UInt128 xyh = static_cast<UInt128>(x) * static_cast<UInt128>(y_hi); | ||
return xyh + (xyl >> 64); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It smells like we might be able to reuse some of these prod_hi
overloads. Here and possibly more below.
UInt128 xyl = static_cast<UInt128>(x) * static_cast<UInt128>(y_lo); | |
UInt128 xyh = static_cast<UInt128>(x) * static_cast<UInt128>(y_hi); | |
return xyh + (xyl >> 64); | |
UInt128 xyh = static_cast<UInt128>(x) * static_cast<UInt128>(y_hi); | |
return xyh + prod_hi(x, y_lo); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The types of those 2 are not the same, prod_hi(x, y_lo)
is a uint64_t
, so to prevent implicit conversion warnings, we still need to convert that back and forth between uint64_t
and UInt128
. I just keep it in current form just in case the compiler fails to optimize the back and forth conversion, especially when the builtin uint128
is not available, and UInt128
is of BigInt
type.
|
||
// Get high 128-bit part of mixed sign 128x128 bit multiplication. | ||
template <> | ||
inline constexpr Int128 prod_hi<Int128, UInt128>(Int128 x, UInt128 y) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So this overload is for mixed signedness of parameters...and the parameter order matters. Seems error prone; I hope we can't implicitly convert from UInt128
to Int128
by accident.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we don't use the specialized template explicitly and the parameters do not match exactly, we should get ambiguous template resolution errors for matching with either prod_hi(UInt128, UInt128)
or prod_hi(Int128, UInt128)
with implicit conversions, or undefined / failed to match without implicit conversions. So I think it should be safe.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The issue of copyright notices in source files has come up a few times over the years. This is the first instance I'm aware of where someone adding one is insistent on keeping it, which is probably why we've gone so long without officially codifying our stance in the developer policy.
It has been the stance of the LLVM Project that we do not accept contributions with copyright notices in the source unless the source is part of a separate open source project that is being included in the project in source form as a vendored dependency where the copyright statement is required to abide by the license terms (see LLVM's regex and MD5 implementations or libcxx's Unicode sources).
Due to the longevity of LLVM's codebase and the continual evolution of the code, embedded copyright notices rapidly become out of date and are cumbersome to maintain. Attempting to keep accurate per-file copyright notices could easily result in some files having hundreds if not thousands of copyright notices in the file headers, which is an extremely undesirable state. As a result the best way to track the copyright of contributions is through LLVM's revision control history.
As of #123463, this is now codified in the developer policy.
If you wish to contribute this PR to LLVM, the embedded copyright statement needs to be removed to comply with the clarified developer policy.
Use a combination of polynomial approximation and Newton-Raphson iterations in 64-bit and 128-bit integers to improve the performance of sqrtf128. The correct rounding is provided by squaring the result and comparing it with the argument. Co-authored-by: Alexei Sibidanov <[email protected]>
f698dee
to
18a9a58
Compare
@llvm-beanz Can you take a look to see if the updated version is ok for you? Thanks, |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/196/builds/4895 Here is the relevant piece of the build log for the reference
|
Use a combination of polynomial approximation and Newton-Raphson iterations in 64-bit and 128-bit integers to improve the performance of sqrtf128. The correct rounding is provided by squaring the result and comparing it with the argument. Performance improvement using the newly added perf test: - My function = the improved implementation from this PR - Other function = current implementation using `libc/src/__support/FPUtil/generic/sqrt.h` ``` Performance tests with inputs in denormal range: -- My function -- Total time : 1260765265 ns Average runtime : 125.951 ns/op Ops per second : 7939623 op/s -- Other function -- Total time : 7160726518 ns Average runtime : 715.357 ns/op Ops per second : 1397902 op/s -- Average runtime ratio -- Mine / Other's : 0.176067 Performance tests with inputs in normal range: -- My function -- Total time : 373003808 ns Average runtime : 37.2631 ns/op Ops per second : 26836189 op/s -- Other function -- Total time : 7353398916 ns Average runtime : 734.605 ns/op Ops per second : 1361275 op/s -- Average runtime ratio -- Mine / Other's : 0.0507254 ``` --------- Co-authored-by: Alexei Sibidanov <[email protected]>
This fixes rv32 buildbot failure from #122578
This fixes rv32 buildbot failure from llvm/llvm-project#122578
…cit conversions. (#127154) This fixes rv32 buildbot failure from llvm/llvm-project#122578 GitOrigin-RevId: d4029539f5227b44d2a0db928c8ee37aa8075b9c Original-Revision: 6883a65d2f6b09b7e6314ee3cfd8a8a955ff2ced Roller-URL: https://cr-buildbucket.appspot.com/build/8722866281070587137 CQ-Do-Not-Cancel-Tryjobs: true Change-Id: I2e018ac2e2ff69903cd601d4686267cd46784a6c Reviewed-on: https://fuchsia-review.googlesource.com/c/fuchsia/+/1207844
Use a combination of polynomial approximation and Newton-Raphson iterations in 64-bit and 128-bit integers to improve the performance of sqrtf128. The correct rounding is provided by squaring the result and comparing it with the argument. Performance improvement using the newly added perf test: - My function = the improved implementation from this PR - Other function = current implementation using `libc/src/__support/FPUtil/generic/sqrt.h` ``` Performance tests with inputs in denormal range: -- My function -- Total time : 1260765265 ns Average runtime : 125.951 ns/op Ops per second : 7939623 op/s -- Other function -- Total time : 7160726518 ns Average runtime : 715.357 ns/op Ops per second : 1397902 op/s -- Average runtime ratio -- Mine / Other's : 0.176067 Performance tests with inputs in normal range: -- My function -- Total time : 373003808 ns Average runtime : 37.2631 ns/op Ops per second : 26836189 op/s -- Other function -- Total time : 7353398916 ns Average runtime : 734.605 ns/op Ops per second : 1361275 op/s -- Average runtime ratio -- Mine / Other's : 0.0507254 ``` --------- Co-authored-by: Alexei Sibidanov <[email protected]>
This fixes rv32 buildbot failure from llvm#122578
Use a combination of polynomial approximation and Newton-Raphson iterations in 64-bit and 128-bit integers to improve the performance of sqrtf128. The correct rounding is provided by squaring the result and comparing it with the argument.
Performance improvement using the newly added perf test:
libc/src/__support/FPUtil/generic/sqrt.h