From 7ecf234491de46dfca1bc431638dcafb60c86bc2 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Tue, 7 May 2024 20:54:11 -0500 Subject: [PATCH 01/79] Adding new, but unused, `_dec_str_to_int_inner()`, + discussion. --- Lib/_pylong.py | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 4970eb3fa67b2b..ee7051ee569d34 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -211,6 +211,55 @@ def inner(a, b): return inner(0, len(s)) +if 0: + import math + _LOG_10_BASE_256 = math.log(10, 256) + del math + # Asymptotically faster version, using the C decimal module. Unused + # for now. See extensive comments at the end of the file. This + # basically uses decimal arithmetic to convert from base 10 to base + # 256. The latter is just a string of bytes, which CPython can + # convert very efficiently to a Python int. + def _dec_str_to_int_inner(s): + BYTELIM = 200 + D = decimal.Decimal + D2to8 = D(256) + D5to8 = D(390625) + result = bytearray() + + def inner(n, w): + if w <= BYTELIM: + result.extend(int(n).to_bytes(w)) # default big-endian + return + w2 = w >> 1 + if 0: + # This is maximally clear, but "too slow". `decimal` + # division is asymptotically fast, but we have no way to + # tell it to reuse the high-precision reciprocal + # approximations it computes for pow2to5[w2], so it has + # to recompute them over & over & over again :-( + hi, lo = divmod(n, pow2to8[w2]) + else: + # The only division in this alternative is by a power of + # 10, which comes nearly "for free" in decimal. + hi = n.scaleb(-8 * w2) # exactly n/10**(8*w2) + hi *= pow5to8[w2] # n/10**(8*w2) * 5**(8*w2( = n/2**(8*w2) + hi = hi.to_integral_value() # lose the fractional digits + lo = n - hi * pow2to8[w2] + # The assert should always succeed, but way too slow to + # keep enabled. + #assert hi, lo == divmod(n, pow2to8[w2]) + inner(hi, w - w2) + inner(lo, w2) + + w = int(len(s) * _LOG_10_BASE_256) + 1 + with decimal.localcontext(_unbounded_dec_context) as ctx: + ctx.rounding = decimal.ROUND_DOWN # so to_integral_value() chops + pow2to8 = compute_powers(w, D2to8, BYTELIM) + pow5to8 = compute_powers(w, D5to8, BYTELIM) + inner(D(s), w) + return int.from_bytes(result) + def int_from_string(s): """Asymptotically fast version of PyLong_FromString(), conversion of a string of decimal digits into an 'int'.""" @@ -361,3 +410,36 @@ def int_divmod(a, b): return ~q, b + ~r else: return _divmod_pos(a, b) + + +# Notes on _dec_str_to_int_inner: +# +# Stefan Pochmann worked up a str->int function that used the decimal +# module to, in effect, convert from base 10 to base 256. This is +# "unnatural", in that it requires multiplying and dividing by large +# powers of 2, which `decimal` isn't naturally suited to. But +# `decimal`'s `*` and `/` are asymptotically superior to CPython's, so +# at _some_ point it could be expected to win. +# +# Alas, the crossover point was too high to be of much real interest. I +# (Tim) then worked on ways to replace its division with multiplication +# by a cached reciprocal approximation instead, fixing up (usually +# small) errors afterwards. This reduced the crossover point +# significantly, but still too high to be of much real interest. And the +# code grew increasingly complex. +# +# Then I had a different idea: there's no actual need to divide by +# powers of 2 at all.
Since `2*5 = 10`, division by `2**k` is the same +# as multiplying by `5**k` (exact) and then dividing by `10**k` (also +# exact in `decimal`, and dirt cheap: the module's `scaleb()` function, +# which just adjusts the internal exponent). +# +# This yields short and reasonably clear code with a much lower +# crossover value, at about 26 million bits. That's still too high to be +# compelling, though. +# +# If someone wants to look into speeding it more, I suggest focusing on +# the multiplication by the cacned `5**(8**w2)`. We only need the +# integer part of the result, so are generally computing many more +# digits than are needed for that. But we do need the exact ("as if to +# infinite precsion") integer part. From 6abc9cc735dafbff68ad7f73f16e316e94f4cb8c Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Tue, 7 May 2024 21:05:42 -0500 Subject: [PATCH 02/79] Correction: `decimal` in this context computes reciprocals of powers --- Lib/_pylong.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index ee7051ee569d34..7fa312d62af602 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -235,9 +235,9 @@ def inner(n, w): if 0: # This is maximally clear, but "too slow". `decimal` # division is asymptotically fast, but we have no way to - # tell it to reuse the high-precision reciprocal - # approximations it computes for pow2to5[w2], so it has - # to recompute them over & over & over again :-( + # tell it to reuse the high-precision reciprocala it + # computes for pow2to5[w2], so it has to recompute them + # over & over & over again :-( hi, lo = divmod(n, pow2to8[w2]) From f4c15cef6187b85fed9e7af6de75f3ec6ccb71f4 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Tue, 7 May 2024 21:09:47 -0500 Subject: [PATCH 03/79] Clarify precedence in assert. --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 7fa312d62af602..81a308c82f9aed 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -248,7 +248,7 @@ def inner(n, w): lo = n - hi * pow2to8[w2] # The assert should always succeed, but way too slow to # keep enabled. - #assert hi, lo == divmod(n, pow2to8[w2]) + #assert (hi, lo) == divmod(n, pow2to8[w2]) inner(hi, w - w2) inner(lo, w2) From 8673dcb5bf643f18748b955a05e35e883d3cb1b2 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Tue, 7 May 2024 23:36:32 -0500 Subject: [PATCH 04/79] Update Lib/_pylong.py Co-authored-by: Jelle Zijlstra --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 81a308c82f9aed..813aa28c379cee 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -235,7 +235,7 @@ def inner(n, w): if 0: # This is maximally clear, but "too slow".
`decimal` # division is asymptotically fast, but we have no way to - # tell it to reuse the high-precision reciprocala it + # tell it to reuse the high-precision reciprocal it # computes for pow2to5[w2], so it has to recompute them # over & over & over again :-( hi, lo = divmod(n, pow2to8[w2]) From b1aa3161011d099ca5a5af9fa0591159174cde1a Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 8 May 2024 12:25:16 -0500 Subject: [PATCH 05/79] Update Lib/_pylong.py Co-authored-by: sstandre <43125375+sstandre@users.noreply.github.com> --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 813aa28c379cee..fe560745a908df 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -439,7 +439,7 @@ def int_divmod(a, b): # compelling, though. # # If someone wants to look into speeding it more, I suggest focusing on -# the multiplication by the cacned `5**(8**w2)`. We only need the +# the multiplication by the cached `5**(8**w2)`. We only need the # integer part of the result, so are generally computing many more # digits than are needed for that. But we do need the exact ("as if to # infinite precsion") integer part. From d9490d001772ac8b81fba5edefd8dbabd3e10c99 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 8 May 2024 12:25:35 -0500 Subject: [PATCH 06/79] Update Lib/_pylong.py Co-authored-by: sstandre <43125375+sstandre@users.noreply.github.com> --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index fe560745a908df..9134148310b286 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -243,7 +243,7 @@ def inner(n, w): # The only division in this alternative is by a power of # 10, which comes nearly "for free" in decimal. hi = n.scaleb(-8 * w2) # exactly n/10**(8*w2) - hi *= pow5to8[w2] # n/10**(8*w2) * 5**(8*w2( = n/2**(8*w2) + hi *= pow5to8[w2] # n/10**(8*w2) * 5**(8*w2) = n/2**(8*w2) hi = hi.to_integral_value() # lose the fractional digits lo = n - hi * pow2to8[w2] # The assert should always succeed, but way too slow to From b2734c2cc30f28b86fc359eedd5e6858b9259029 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 8 May 2024 15:09:45 -0500 Subject: [PATCH 07/79] My memory was wrong: using explicit, cached reciprocal approximations did very much better than I recalled. So moving to that instead. The crossover point is "only" about 3.4 million digits now, far smaller. --- Lib/_pylong.py | 102 ++++++++++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 39 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 9134148310b286..547f664d090bd9 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -10,7 +10,7 @@ "micro-optimizations". This module will only be imported and used for integers with a huge number of digits. Saving a few microseconds with tricky or non-obvious code is not worth it. For people looking for -maximum performance, they should use something like gmpy2.""" +maximum performance, they should use somethig like gmpy2.""" import re import decimal @@ -211,25 +211,36 @@ def inner(a, b): return inner(0, len(s)) -if 0: +if 1: import math _LOG_10_BASE_256 = math.log(10, 256) del math # Asymptotically faster version, using the C decimal module. Unused - # for now. See extensive comments at the end of the file. This - # basically uses decimal arithmetic to convert from base 10 to base - # 256. The latter is just a string of bytes, which CPython can - # convert very efficiently to a Python int. + # for now. 
See comments at the end of the file. This basically uses + # decimal arithmetic to convert from base 10 to base 256. The latter + # is just a string of bytes, which CPython can convert very + # efficiently to a Python int. + + # _spread is for testing, mapping how often `n` quotient correction + # steps are needed t0 a count of how many times that was needed. + # Usually no corrections are needed, and I haven't seen it need more + # than 1. + from collections import defaultdict + _spread = defaultdict(int) + del defaultdict + def _dec_str_to_int_inner(s): BYTELIM = 200 D = decimal.Decimal - D2to8 = D(256) - D5to8 = D(390625) result = bytearray() def inner(n, w): if w <= BYTELIM: - result.extend(int(n).to_bytes(w)) # default big-endian + # XXX Stefan Pochmann discovered that, for 1024-bit + # ints, `int(Decimal)` took 2.5x longer than + # `int(str(Decimal))`. So simplify this code to the + # former if/when that gets repaired. + result.extend(int(str(n)).to_bytes(w)) return w2 = w >> 1 if 0: @@ -238,25 +249,53 @@ def inner(n, w): # tell it to reuse the high-precision reciprocal it # computes for pow2to5[w2], so it has to recompute them # over & over & over again :-( - hi, lo = divmod(n, pow2to8[w2]) + hi, lo = divmod(n, pow256[w2][0]) else: - # The only division in this alternative is by a power of - # 10, which comes nearly "for free" in decimal. - hi = n.scaleb(-8 * w2) # exactly n/10**(8*w2) - hi *= pow5to8[w2] # n/10**(8*w2) * 5**(8*w2) = n/2**(8*w2) + p256, recip = pow256[w2] + # The integer part will have about half the digits of n. + # So only need that much precision, plus some guard digits. + ctx.prec = (n.adjusted() >> 1) + 8 + hi = (+ n) * recip hi = hi.to_integral_value() # lose the fractional digits - lo = n - hi * pow2to8[w2] + ctx.prec = decimal.MAX_PREC + lo = n - hi * p256 + assert lo >= 0 + count = 0 + while lo >= p256: + count += 1 + lo -= p256 + hi += 1 + _spread[count] += 1 + if count >= 2: + print("HUH! count", count) + input("MORE") # The assert should always succeed, but way too slow to # keep enabled. - #assert (hi, lo) == divmod(n, pow2to8[w2]) + #assert hi, lo == divmod(n, pow256[w2][0]) inner(hi, w - w2) inner(lo, w2) w = int(len(s) * _LOG_10_BASE_256) + 1 with decimal.localcontext(_unbounded_dec_context) as ctx: - ctx.rounding = decimal.ROUND_DOWN # so to_integral_value() chops - pow2to8 = compute_powers(w, D2to8, BYTELIM) - pow5to8 = compute_powers(w, D5to8, BYTELIM) + D256 = D(256) + pow256 = compute_powers(w, D256, BYTELIM) + rpow256 = compute_powers(w, 1 / D256, BYTELIM) + # We're going to do inexact, chopped arithmetic, multiplying + # by an approximation to the reciprocal of 256**i. We chop + # to get a lower bound on the true integer quotient. Our + # approximation is a lower bound, the multiply is chopped + # too, and to_integral_value() is also chopped. + ctx.traps[decimal.Inexact] = 0 + ctx.rounding = decimal.ROUND_DOWN + for k, v in pow256.items(): + # No need to save more precision in the reciprocal than + # the power of 256 has, plus some guard digits to absorb + # most relevant rounding errors. + ctx.prec = v.adjusted() + 8 + # The unary "+" chope the reciprocal back to that precision. + pow256[k] = v, +rpow256[k] + del rpow256 # exact reciprocals no longer needed + ctx.prec = decimal.MAX_PREC inner(D(s), w) return int.from_bytes(result) @@ -419,27 +458,12 @@ def int_divmod(a, b): # "unnatural", in that it requires multiplying and dividing by large # powers of 2, which `decimal` isn't naturally suited to. 
But # `decimal`'s `*` and `/` are asymptotically superior to CPython's, so -# at _some_ point it could be expected to win. +# at _some_ point it could be expected to win. # # Alas, the crossover point was too high to be of much real interest. I # (Tim) then worked on ways to replace its division with multiplication -# by a cached reciprocal approximation instead, fixing up (usually -# small) errors afterwards. This reduced the crossover point -# significantly, but still too high to be of much real interest. And the -# code grew increasingly complex. -# -# Then I had a different idea: there's no actual need to divide by -# powers of 2 at all. Since `2*5 = 10`, division by `2**k` is the same -# as multiplying by `5**k` (exact) and then dividing by `10**k` (also -# exact in `decimal`, and dirt cheap: the module's `scaleb()` function, -# which just adjusts the internal exponent). -# -# This yields short and reasonably clear code with a much lower -# crossover value, at about 26 million bits. That's still too high to be -# compelling, though. +# by a cached reciprocal approximation instead, fixing up errors +# afterwards. This reduced the crossover point significantly, # -# If someone wants to look into speeding it more, I suggest focusing on -# the multiplication by the cached `5**(8**w2)`. We only need the -# integer part of the result, so are generally computing many more -# digits than are needed for that. But we do need the exact ("as if to -# infinite precsion") integer part. +# I revisited tha code, and found ways to improve and simplify it. The +# crossover point is at about 3.4 million digits now. From 9906417095796ad9f80ab3b27cc852082dc1344b Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 8 May 2024 15:16:44 -0500 Subject: [PATCH 08/79] Repair mysterious damage to comment. --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 547f664d090bd9..9f1b01afdd93a2 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -10,7 +10,7 @@ "micro-optimizations". This module will only be imported and used for integers with a huge number of digits. Saving a few microseconds with tricky or non-obvious code is not worth it. For people looking for -maximum performance, they should use somethig like gmpy2.""" +maximum performance, they should use something like gmpy2.""" import re import decimal From 27b00ea389d81c37cf07cd3167c294a5635064fe Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 8 May 2024 18:37:22 -0500 Subject: [PATCH 09/79] Cut the precision of the reciprocal too before multiplying. --- Lib/_pylong.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 9f1b01afdd93a2..0ef836550f5ab3 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -247,15 +247,16 @@ def inner(n, w): # This is maximally clear, but "too slow". `decimal` # division is asymptotically fast, but we have no way to # tell it to reuse the high-precision reciprocal it - # computes for pow2to5[w2], so it has to recompute them + # computes for pow256[w2], so it has to recompute it # over & over & over again :-( hi, lo = divmod(n, pow256[w2][0]) else: p256, recip = pow256[w2] # The integer part will have about half the digits of n. - # So only need that much precision, plus some guard digits. + # So only need that much precision, plus some guard + # digits. 
ctx.prec = (n.adjusted() >> 1) + 8 - hi = (+ n) * recip + hi = +n * +recip hi = hi.to_integral_value() # lose the fractional digits ctx.prec = decimal.MAX_PREC lo = n - hi * p256 @@ -290,9 +291,13 @@ def inner(n, w): for k, v in pow256.items(): # No need to save more precision in the reciprocal than # the power of 256 has, plus some guard digits to absorb - # most relevant rounding errors. + # most relevant rounding errors. This is highly + # signficant: 1/2**i has the same number of significant + # decimal digits as 5**i, generally over twice the + # number in 2**i, ctx.prec = v.adjusted() + 8 - # The unary "+" chope the reciprocal back to that precision. + # The unary "+" chope the reciprocal back to that + # precision. pow256[k] = v, +rpow256[k] del rpow256 # exact reciprocals no longer needed ctx.prec = decimal.MAX_PREC From 1e1b26d9d418d1dc8e0c1a816efa86974630490d Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 8 May 2024 19:46:25 -0500 Subject: [PATCH 10/79] New comments. --- Lib/_pylong.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 0ef836550f5ab3..e43bba3b86b816 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -240,7 +240,7 @@ def inner(n, w): # ints, `int(Decimal)` took 2.5x longer than # `int(str(Decimal))`. So simplify this code to the # former if/when that gets repaired. - result.extend(int(str(n)).to_bytes(w)) + result.extend(int(str(n)).to_bytes(w)) # big-endian default return w2 = w >> 1 if 0: @@ -256,14 +256,25 @@ def inner(n, w): # So only need that much precision, plus some guard # digits. ctx.prec = (n.adjusted() >> 1) + 8 - hi = +n * +recip - hi = hi.to_integral_value() # lose the fractional digits ctx.prec = decimal.MAX_PREC + hi = +n * +recip # unary `+` chops back to ctx.prec digits ctx.prec = decimal.MAX_PREC + hi = hi.to_integral_value() # lose the fractional digits lo = n - hi * p256 + # Because we've been uniformly rounding down, `hi` is a + # lower bound on the correct quotient. assert lo >= 0 + # Adjust quotient up if needed. It usually isn't. In + # random testing, the loop body entered about one in 100 + # thousand cases. I never saw it need more than one + # iteration. count = 0 while lo >= p256: count += 1 + # If the assert fails, chances are decent we're + # sooooo far off it may seem to run forever + # otherwise - the error analysis was fatally flawed + # in this case.. assert count < 10, (count, w, str(n)) lo -= p256 hi += 1 _spread[count] += 1 From 2f3b6b89670b380a1580b8e1b10f82b4c5abca32 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 8 May 2024 19:49:56 -0500 Subject: [PATCH 11/79] Typo repair. --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index e43bba3b86b816..4d8b5d3b0e2c24 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -273,7 +273,7 @@ def inner(n, w): # If the assert fails, chances are decent we're # sooooo far off it may seem to run forever # otherwise - the error analysis was fatally flawed - # in this case.. + # in this case. assert count < 10, (count, w, str(n)) lo -= p256 hi += 1 From fc2f646078efde4a37a27a4992784b095c49fa85 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 8 May 2024 20:56:49 -0500 Subject: [PATCH 12/79] Raise BYTELIM. Add an int<->str test for a truly large int (10 million digits), which isn't currently tested. But regrtest will skip it unless the "cpu" resource is enabled (e.g., via "-ucpu" on the cmdline).
--- Lib/_pylong.py | 2 +- Lib/test/test_int.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 4d8b5d3b0e2c24..aa5f7098f1910d 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -230,7 +230,7 @@ def inner(a, b): del defaultdict def _dec_str_to_int_inner(s): - BYTELIM = 200 + BYTELIM = 512 D = decimal.Decimal result = bytearray() diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py index 8959ffb6dcc236..28dddeae3c62da 100644 --- a/Lib/test/test_int.py +++ b/Lib/test/test_int.py @@ -919,5 +919,15 @@ def test_pylong_roundtrip(self): self.assertEqual(n, int(sn)) bits <<= 1 + @support.requires_resource('cpu') + def test_pylong_roundtrip_huge(self): + # k blocks of 1234567890 + k = 1_000_000 # so 10 million digits in all + tentoten = 10**10 + n = 1234567890 * ((tentoten**k - 1) // (tentoten - 1)) + sn = "1234567890" * k + self.assertEqual(n, int(sn)) + self.assertEqual(sn, str(n)) + if __name__ == "__main__": unittest.main() From 0a1daab413e566548c7bc96807eef568a91bb755 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 8 May 2024 21:30:13 -0500 Subject: [PATCH 13/79] For long strings, switch to the new implementation if `_decimal` is present. --- Lib/_pylong.py | 190 ++++++++++++++++++++++--------------------------- 1 file changed, 87 insertions(+), 103 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index aa5f7098f1910d..402daeed1b9670 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -210,110 +210,91 @@ def inner(a, b): w5pow = compute_powers(len(s), 5, DIGLIM) return inner(0, len(s)) +# Asymptotically faster version, using the C decimal module. See +# comments at the end of the file. This uses decimal arithmetic to +# convert from base 10 to base 256. The latter is just a string of +# bytes, which CPython can convert very efficiently to a Python int. -if 1: - import math - _LOG_10_BASE_256 = math.log(10, 256) - del math - # Asymptotically faster version, using the C decimal module. Unused - # for now. See comments at the end of the file. This basically uses - # decimal arithmetic to convert from base 10 to base 256. The latter - # is just a string of bytes, which CPython can convert very - # efficiently to a Python int. - - # _spread is for testing, mapping how often `n` quotient correction - # steps are needed t0 a count of how many times that was needed. - # Usually no corrections are needed, and I haven't seen it need more - # than 1. - from collections import defaultdict - _spread = defaultdict(int) - del defaultdict - - def _dec_str_to_int_inner(s): - BYTELIM = 512 - D = decimal.Decimal - result = bytearray() - - def inner(n, w): - if w <= BYTELIM: - # XXX Stefan Pochmann discovered that, for 1024-bit - # ints, `int(Decimal)` took 2.5x longer than - # `int(str(Decimal))`. So simplify this code to the - # former if/when that gets repaired. - result.extend(int(str(n)).to_bytes(w)) # big-endian default - return - w2 = w >> 1 - if 0: - # This is maximally clear, but "too slow". `decimal` - # division is asymptotically fast, but we have no way to - # tell it to reuse the high-precision reciprocal it - # computes for pow256[w2], so it has to recompute it - # over & over & over again :-( - hi, lo = divmod(n, pow256[w2][0]) - else: - p256, recip = pow256[w2] - # The integer part will have about half the digits of n. - # So only need that much precision, plus some guard - # digits. 
- ctx.prec = (n.adjusted() >> 1) + 8 - hi = +n * +recip # unary `+` chops back to ctx.prec digits - ctx.prec = decimal.MAX_PREC - hi = hi.to_integral_value() # lose the fractional digits - lo = n - hi * p256 - # Because we've been uniformly rounding down, `hi` is a - # lower bound on the correct quotient. - assert lo >= 0 - # Adjust quotient up if needed. It usually isn't. In - # random testing, the loop body entered about one in 100 - # thousand cases. I never saw it need more than one - # iteration. - count = 0 - while lo >= p256: - count += 1 - # If the assert fails, chances are decent we're - # sooooo far off it may seem to run forever - # otherwise - the error analysis was fatally flawed - # in this case. - assert count < 10, (count, w, str(n)) - lo -= p256 - hi += 1 - _spread[count] += 1 - if count >= 2: - print("HUH! count", count) - input("MORE") - # The assert should always succeed, but way too slow to - # keep enabled. - #assert hi, lo == divmod(n, pow256[w2][0]) - inner(hi, w - w2) - inner(lo, w2) - - w = int(len(s) * _LOG_10_BASE_256) + 1 - with decimal.localcontext(_unbounded_dec_context) as ctx: - D256 = D(256) - pow256 = compute_powers(w, D256, BYTELIM) - rpow256 = compute_powers(w, 1 / D256, BYTELIM) - # We're going to do inexact, chopped arithmetic, multiplying - # by an approximation to the reciprocal of 256**i. We chop - # to get a lower bound on the true integer quotient. Our - # approximation is a lower bound, the multiply is chopped - # too, and to_integral_value() is also chopped. - ctx.traps[decimal.Inexact] = 0 - ctx.rounding = decimal.ROUND_DOWN - for k, v in pow256.items(): - # No need to save more precision in the reciprocal than - # the power of 256 has, plus some guard digits to absorb - # most relevant rounding errors. This is highly - # signficant: 1/2**i has the same number of significant - # decimal digits as 5**i, generally over twice the - # number in 2**i, - ctx.prec = v.adjusted() + 8 - # The unary "+" chope the reciprocal back to that - # precision. - pow256[k] = v, +rpow256[k] - del rpow256 # exact reciprocals no longer needed +import math +_LOG_10_BASE_256 = math.log(10, 256) +del math + +def _dec_str_to_int_inner(s): + BYTELIM = 512 + D = decimal.Decimal + result = bytearray() + + def inner(n, w): + if w <= BYTELIM: + # XXX Stefan Pochmann discovered that, for 1024-bit ints, + # `int(Decimal)` took 2.5x longer than `int(str(Decimal))`. + # So simplify this code to the former if/when that gets + # repaired. + result.extend(int(str(n)).to_bytes(w)) # big-endian default + return + w2 = w >> 1 + if 0: + # This is maximally clear, but "too slow". `decimal` + # division is asymptotically fast, but we have no way to + # tell it to reuse the high-precision reciprocal it computes + # for pow256[w2], so it has to recompute it over & over & + # over again :-( + hi, lo = divmod(n, pow256[w2][0]) + else: + p256, recip = pow256[w2] + # The integer part will have about half the digits of n. So + # only need that much precision, plus some guard digits. + ctx.prec = (n.adjusted() >> 1) + 8 + hi = +n * +recip # unary `+` chops back to ctx.prec digits ctx.prec = decimal.MAX_PREC - inner(D(s), w) - return int.from_bytes(result) + hi = hi.to_integral_value() # lose the fractional digits + lo = n - hi * p256 + # Because we've been uniformly rounding down, `hi` is a + # lower bound on the correct quotient. + assert lo >= 0 + # Adjust quotient up if needed. It usually isn't. In random + # testing, the loop body entered about one in 100 thousand + # cases. 
I never saw it need more than one iteration. + count = 0 + while lo >= p256: + count += 1 + # If the assert fails, chances are decent we're sooooo + # far off it may seem to run forever otherwise - the + # error analysis was fatally flawed in this case. + assert count < 10, (count, w, str(n)) + lo -= p256 + hi += 1 + # The assert should always succeed, but way too slow to keep + # enabled. + #assert hi, lo == divmod(n, pow256[w2][0]) + inner(hi, w - w2) + inner(lo, w2) + + w = int(len(s) * _LOG_10_BASE_256) + 1 + with decimal.localcontext(_unbounded_dec_context) as ctx: + D256 = D(256) + pow256 = compute_powers(w, D256, BYTELIM) + rpow256 = compute_powers(w, 1 / D256, BYTELIM) + # We're going to do inexact, chopped arithmetic, multiplying by + # an approximation to the reciprocal of 256**i. We chop to get a + # lower bound on the true integer quotient. Our approximation is + # a lower bound, the multiply is chopped too, and + # to_integral_value() is also chopped. + ctx.traps[decimal.Inexact] = 0 + ctx.rounding = decimal.ROUND_DOWN + for k, v in pow256.items(): + # No need to save more precision in the reciprocal than the + # power of 256 has, plus some guard digits to absorb most + # relevant rounding errors. This is highly signficant: + # 1/2**i has the same number of significant decimal digits + # as 5**i, generally over twice the number in 2**i, + ctx.prec = v.adjusted() + 8 + # The unary "+" chope the reciprocal back to that precision. + pow256[k] = v, +rpow256[k] + del rpow256 # exact reciprocals no longer needed + ctx.prec = decimal.MAX_PREC + inner(D(s), w) + return int.from_bytes(result) def int_from_string(s): """Asymptotically fast version of PyLong_FromString(), conversion @@ -323,7 +304,10 @@ def int_from_string(s): # and underscores, and stripped leading whitespace. The input can still # contain underscores and have trailing whitespace. s = s.rstrip().replace('_', '') - return _str_to_int_inner(s) + func = _str_to_int_inner + if len(s) >= 3_500_000 and _decimal is not None: + func = _dec_str_to_int_inner + return func(s) def str_to_int(s): """Asymptotically fast version of decimal string to 'int' conversion.""" From 1977774f9478c182046a536aaca4ce52b088d091 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Thu, 9 May 2024 02:37:31 +0000 Subject: [PATCH 14/79] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst new file mode 100644 index 00000000000000..edcb1d18523591 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst @@ -0,0 +1 @@ +If the C version of the ``decimal`` module is available, ``str(int)`` now uses it to supply an asymptotically much faster conversion. However, this only applies if the integer requires over about 3.5 million digits. From 5ce49aeb2bff3272d5c94747eaf55e2f5d15b48f Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 8 May 2024 21:40:55 -0500 Subject: [PATCH 15/79] Update 2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst Fix the news. 
--- .../2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst index edcb1d18523591..e56580ba83eba5 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst @@ -1 +1 @@ -If the C version of the ``decimal`` module is available, ``str(int)`` now uses it to supply an asymptotically much faster conversion. However, this only applies if the integer requires over about 3.5 million digits. +If the C version of the ``decimal`` module is available, ``int(str)`` now uses it to supply an asymptotically much faster conversion. However, this only applies if the string contains over about 3.5 million digits. From 998b8714daabf83608428f38ca0ac24e9a5a627d Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 9 May 2024 03:59:27 -0500 Subject: [PATCH 16/79] Do a better job of picking working precision for the multiply. And do a much better ;-) job of explaining it. --- Lib/_pylong.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 402daeed1b9670..c66077874da3e7 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -242,9 +242,13 @@ def inner(n, w): hi, lo = divmod(n, pow256[w2][0]) else: p256, recip = pow256[w2] - # The integer part will have about half the digits of n. So - # only need that much precision, plus some guard digits. - ctx.prec = (n.adjusted() >> 1) + 8 + # The integer part will have a number of digits about equal + # to the difference between the log10s of `n` and `pow256` + # (which, since these are integers, is roughly approximated + # by `.adjusted()`). That's the working precision we need, + # but add some guard digits to protect against the "about" + # and "roughly" uncertainties. + ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + 8 hi = +n * +recip # unary `+` chops back to ctx.prec digits ctx.prec = decimal.MAX_PREC hi = hi.to_integral_value() # lose the fractional digits From 53b4fa8befeea0b8fcc78981e2f0a5cb654063b4 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 9 May 2024 12:59:22 -0500 Subject: [PATCH 17/79] Force the initial `w` to be an upper bound on the true value. This is needed so the error analysis makes sense. Also give up if `w` requires at least 43 bits, 10 less than the precision of an IEEE-754 double. That's an extremely cautious guard against low-order bit rot in log256(10) and *. But I can't test that. Don't have 21 terabytes of RAM to build a string long enough to trigger it. Nobody will ever, in good faith, pass a string that large. --- Lib/_pylong.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index c66077874da3e7..376fc07f942fc2 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -225,6 +225,7 @@ def _dec_str_to_int_inner(s): result = bytearray() def inner(n, w): + #assert n < D256 ** w # required, but too expensive to check if w <= BYTELIM: # XXX Stefan Pochmann discovered that, for 1024-bit ints, # `int(Decimal)` took 2.5x longer than `int(str(Decimal))`. @@ -274,7 +275,28 @@ def inner(n, w): inner(hi, w - w2) inner(lo, w2) - w = int(len(s) * _LOG_10_BASE_256) + 1 + # How many base 256 digits are needed?. Mathematically, exactly + # floor(log256(int(s))) + 1.
There is no cheap way to compute this. + # But we can get an upper buond, and that's necessary for our error + # analysis to make sense. int(s) < 10**len(s), so the log needed is + # < log256(10**len(s)) = len(s) * log256(10). However, using + # finite-precision floating point for this, it's possible that the + # computed value is a little less than the true value. If the true + # value is at - or a little higher than - an integer, we can get an + # off-by-1 error too low. So we add 2 instead of 1 if chopping lost + # a fraction > 0.9. log_ub = len(s) * _LOG_10_BASE_256 log_ub_as_int = int(log_ub) w = log_ub_as_int + 1 + (log_ub - log_ub_as_int > 9.9) + # And what if we'vv plain exhausted the limits of HW floats? We + # could compute the log to any desired precision using `decimal`, + # but it's not plausible that anyone will pass a string requiring + # trillions of bytes (unles they're just trying to "break things"). if w.bit_length() >= 43: + # "Only" had < 53 - 43 = 10 bits to spare in IEEE-754 double. + # XXX I can't test this - don't have 21 terabytes of RAM to + # build a string long enough to trigger this. raise ValueError(f"cannot convert string of len {len(s)} to int") with decimal.localcontext(_unbounded_dec_context) as ctx: D256 = D(256) pow256 = compute_powers(w, D256, BYTELIM) From 1a78510dabc7c168e2fe1ea6c361e674ed77a094 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 9 May 2024 16:12:53 -0500 Subject: [PATCH 18/79] Serhiy wore me out ;-) Add best possible log(10, 256) constant. Since this eliminates the (by far) largest possible source of error in computing the number of bytes needed, boost the "string too large" limit to a `w` of 47 bits. Now we'd need a 339 terabyte input string to trigger it. Pure fantasy. --- Lib/_pylong.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 376fc07f942fc2..9664f1991f2ef2 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -215,9 +215,12 @@ def inner(a, b): # convert from base 10 to base 256. The latter is just a string of # bytes, which CPython can convert very efficiently to a Python int. -import math -_LOG_10_BASE_256 = math.log(10, 256) -del math +# log of 10 to base 256 with best-possible 53-bit precision. Obtained +# via: +# from mpmath import mp +# mp.prec = 1000 +# print(float(mp.log(10, 256)).hex()) +_LOG_10_BASE_256 = float.fromhex('0x1.a934f0979a371p-2') # about 0.415 def _dec_str_to_int_inner(s): BYTELIM = 512 D = decimal.Decimal @@ -292,9 +295,9 @@ def inner(n, w): # could compute the log to any desired precision using `decimal`, # but it's not plausible that anyone will pass a string requiring # trillions of bytes (unles they're just trying to "break things"). - if w.bit_length() >= 43: - # "Only" had < 53 - 43 = 10 bits to spare in IEEE-754 double. - # XXX I can't test this - don't have 21 terabytes of RAM to + if w.bit_length() >= 46: + # "Only" had < 53 - 46 = 7 bits to spare in IEEE-754 double. + # XXX I can't test this - don't have 339 terabytes of RAM to # build a string long enough to trigger this. raise ValueError(f"cannot convert string of len {len(s)} to int") From b766ebfedca1432f9bc60cf009cb95de064adf1c Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 9 May 2024 16:24:30 -0500 Subject: [PATCH 19/79] Repair comment.
--- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 9664f1991f2ef2..c4377abd0ae603 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -297,7 +297,7 @@ def inner(n, w): # trillions of bytes (unles they're just trying to "break things"). if w.bit_length() >= 46: # "Only" had < 53 - 46 = 7 bits to spare in IEEE-754 double. - # XXX I can't test this - don't have 339 terabytes of RAM to + # XXX I can't test this - don't have 169 terabytes of RAM to # build a string long enough to trigger this. raise ValueError(f"cannot convert string of len {len(s)} to int") From 9cccb8dc072cd42f75d4d859799a8051251daafb Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 9 May 2024 19:54:20 -0500 Subject: [PATCH 20/79] Repair test for "close to maybe wrong" initial computation of `w`. Change the correction-loop assert to put useful stuff in its detail. Nobody was ever going to save such a message that includes the full megabytes (or gigabytes, or ...) of the input string. Instead give short, critical info about the byte and digit offsets in play. That should suffice to reverse-engineer what went wrong in the error analysis. _All_ input strings with the same length go through exactly the same analysis. Although, no, I still don't expect it to ever trigger. --- Lib/_pylong.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index c4377abd0ae603..bd74bc6834df13 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -269,7 +269,8 @@ def inner(n, w): # If the assert fails, chances are decent we're sooooo # far off it may seem to run forever otherwise - the # error analysis was fatally flawed in this case. - assert count < 10, (count, w, str(n)) + assert count < 10, (count, w, len(s), + n.adjusted(), p256.adjusted()) lo -= p256 hi += 1 @@ -290,7 +291,7 @@ def inner(n, w): # a fraction > 0.9. log_ub = len(s) * _LOG_10_BASE_256 log_ub_as_int = int(log_ub) - w = log_ub_as_int + 1 + (log_ub - log_ub_as_int > 9.9) + w = log_ub_as_int + 1 + (log_ub - log_ub_as_int > 0.9) From a241843e4f22f96ce0ccb3548fbfef054a1b1294 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 10 May 2024 16:12:52 -0500 Subject: [PATCH 21/79] Added exact quotient correction for cases adding 1 once isn't enough. We have no test case that takes that path, although they're easy to provoke by reducing the new GUARD parameter & trying input strings of the form `"9" * n`. --- Lib/_pylong.py | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index bd74bc6834df13..73f587fc323fb3 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -226,6 +226,19 @@ def inner(n, w): BYTELIM = 512 D = decimal.Decimal result = bytearray() + # We only want the integer part of divisions, so don't need to build + # the full multiplication tree. But using _just_ the number of + # digits expected in the integer part ignores too much. What's left + # out can have a very significant effect on the quotient. So we use + # GUARD additional digits. Exact analysis of this eluded me.
8 is + # more than enough so no more than 1 correction step was ever needed + # for all inputs tried through 2.5 billion digita. So, + # empirically, "good enough". Toward this end, testing on inputs + # with just "9" digits is most effective. If this is too small for + # some input much larger than ever tried, a different path detects + # that and usss divmod() instead (which ignores nothing, so is exact + # from the start - but also much slower). + GUARD = 8 # must be > 0 to avoid setting `decimal` precision to 9 def inner(n, w): #assert n < D256 ** w # required, but too expensive to check @@ -250,9 +263,7 @@ def inner(n, w): # to the difference between the log10s of `n` and `pow256` # (which, since these are integers, is roughly approximated # by `.adjusted()`). That's the working precision we need, - # but add some guard digits to protect against the "about" - # and "roughly" uncertainties. - ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + 8 + ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + GUARD hi = +n * +recip # unary `+` chops back to ctx.prec digits ctx.prec = decimal.MAX_PREC hi = hi.to_integral_value() # lose the fractional digits @@ -261,18 +272,17 @@ def inner(n, w): # lower bound on the correct quotient. assert lo >= 0 # Adjust quotient up if needed. It usually isn't. In random - # testing, the loop body entered about one in 100 thousand - # cases. I never saw it need more than one iteration. - count = 0 - while lo >= p256: - count += 1 - # If the assert fails, chances are decent we're sooooo - # far off it may seem to run forever otherwise - the - # error analysis was fatally flawed in this case. - assert count < 10, (count, w, len(s), - n.adjusted(), p256.adjusted()) + # testing on inputs through 2.5 billion digit strings, the + # test triggered about one in 100 thousand cases. + if lo >= p256: lo -= p256 hi += 1 + if lo >= p256: + # Complete correction via an exact computation. + # XXX we have no input that takes this branch + # It's only been tested by reducing GUARD a lot. + hi2, lo = divmod(lo, p256) + hi += hi2 # The assert should always succeed, but way too slow to keep # enabled. #assert hi, lo == divmod(n, pow256[w2][0]) @@ -318,7 +328,7 @@ def inner(n, w): # relevant rounding errors. This is highly signficant: # 1/2**i has the same number of significant decimal digits # as 5**i, generally over twice the number in 2**i, - ctx.prec = v.adjusted() + 8 + ctx.prec = v.adjusted() + GUARD # The unary "+" chope the reciprocal back to that precision. pow256[k] = v, +rpow256[k] del rpow256 # exact reciprocals no longer needed From 45708dbc6eca4459d8ae5671a68cf9e1696de717 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 10 May 2024 20:30:27 -0500 Subject: [PATCH 22/79] Check in testing helper I kept ripping out before earlier commits. It's cheap, so leave it in. And this allows adding a test using it to exercise the new "failsafe" logic that guards against having too few guard digits. --- Lib/_pylong.py | 20 +++++++++++++++++--- Lib/test/test_int.py | 20 ++++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 73f587fc323fb3..391b265e06ea7e 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -210,6 +210,7 @@ def inner(a, b): w5pow = compute_powers(len(s), 5, DIGLIM) return inner(0, len(s)) + # Asymptotically faster version, using the C decimal module. See # comments at the end of the file. This uses decimal arithmetic to # convert from base 10 to base 256. 
The latter is just a string of + # bytes, which CPython can convert very + # efficiently to a Python int. + + # _spread is for testing, mapping how often `n` quotient correction + # steps are needed t0 a count of how many times that was needed. + # Usually no corrections are needed, and I haven't seen it need more + # than 1. + from collections import defaultdict + _spread = defaultdict(int) + del defaultdict + + def _dec_str_to_int_inner(s): + BYTELIM = 512 + D = decimal.Decimal + result = bytearray() + + def inner(n, w): + if w <= BYTELIM: + # XXX Stefan Pochmann discovered that, for 1024-bit + # ints, `int(Decimal)` took 2.5x longer than + # `int(str(Decimal))`. So simplify this code to the + # former if/when that gets repaired. + result.extend(int(str(n)).to_bytes(w)) + return + w2 = w >> 1 + if 0: + # This is maximally clear, but "too slow". `decimal` + # division is asymptotically fast, but we have no way to + # tell it to reuse the high-precision reciprocal it + # computes for pow256[w2], so it has to recompute them + # over & over & over again :-( + hi, lo = divmod(n, pow256[w2][0]) + else: + p256, recip = pow256[w2] + # The integer part will have about half the digits of n. + # So only need that much precision, plus some guard + # digits. + ctx.prec = (n.adjusted() >> 1) + 8 + hi = (+ n) * recip + hi = hi.to_integral_value() # lose the fractional digits + ctx.prec = decimal.MAX_PREC + lo = n - hi * p256 + assert lo >= 0 + count = 0 + while lo >= p256: + count += 1 + lo -= p256 + hi += 1 + _spread[count] += 1 + if count >= 2: + print("HUH! count", count) + input("MORE") + # The assert should always succeed, but way too slow to + # keep enabled. + #assert hi, lo == divmod(n, pow256[w2][0]) + inner(hi, w - w2) + inner(lo, w2) + + w = int(len(s) * _LOG_10_BASE_256) + 1 + with decimal.localcontext(_unbounded_dec_context) as ctx: + D256 = D(256) + pow256 = compute_powers(w, D256, BYTELIM) + rpow256 = compute_powers(w, 1 / D256, BYTELIM) + # We're going to do inexact, chopped arithmetic, multiplying + # by an approximation to the reciprocal of 256**i. We chop + # to get a lower bound on the true integer quotient. Our + # approximation is a lower bound, the multiply is chopped + # too, and to_integral_value() is also chopped. + ctx.traps[decimal.Inexact] = 0 + ctx.rounding = decimal.ROUND_DOWN + for k, v in pow256.items(): + # No need to save more precision in the reciprocal than + # the power of 256 has, plus some guard digits to absorb + # most relevant rounding errors. + ctx.prec = v.adjusted() + 8 + # The unary "+" chope the reciprocal back to that precision. + pow256[k] = v, +rpow256[k] + del rpow256 # exact reciprocals no longer needed + ctx.prec = decimal.MAX_PREC + inner(D(s), w) + return int.from_bytes(result) + def int_from_string(s): """Asymptotically fast version of PyLong_FromString(), conversion of a string of decimal digits into an 'int'.""" @@ -361,3 +410,36 @@ def int_divmod(a, b): return ~q, b + ~r else: return _divmod_pos(a, b) From 1977774f9478c182046a536aaca4ce52b088d091 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Thu, 9 May 2024 02:37:31 +0000 Subject: [PATCH 14/79] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst new file mode 100644 index 00000000000000..edcb1d18523591 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst @@ -0,0 +1 @@ +If the C version of the ``decimal`` module is available, ``str(int)`` now uses it to supply an asymptotically much faster conversion. However, this only applies if the integer requires over about 3.5 million digits. From 5ce49aeb2bff3272d5c94747eaf55e2f5d15b48f Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 8 May 2024 21:40:55 -0500 Subject: [PATCH 15/79] Update 2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst Fix the news.
if w.bit_length() >= 46: # "Only" had < 53 - 46 = 7 bits to spare in IEEE-754 double. - # XXX I can't test this - don't have 169 terabytes of RAM to - # build a string long enough to trigger this. raise ValueError(f"cannot convert string of len {len(s)} to int") with decimal.localcontext(_unbounded_dec_context) as ctx: D256 = D(256) diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py index 69b16c8d2d4478..03c4be159e115f 100644 --- a/Lib/test/test_int.py +++ b/Lib/test/test_int.py @@ -949,5 +949,23 @@ def test_whitebox_dec_str_to_int_inner_failsafe(self): _pylong._spread.clear() _pylong._spread.update(orig_spread) + @unittest.skipUnless(_pylong, "pylong module required") + def test_whitebox_dec_str_to_int_inner_monster(self): + # I don't think anyone has enough RAM o build a string long enough + # for this function to complain. So lie about the string length. + + class LyingStr(str): + def __len__(self): + return int((1 << 47) / _pylong._LOG_10_BASE_256) + + liar = LyingStr("42") + # We have to pass the liar directly to the complaining function. If we + # just try `int(liar)`, earlier layers will replace it with plain old + # "43". + self.assertRaisesRegex(ValueError, + f"^cannot convert string of len {len(liar)} to int$", + _pylong._dec_str_to_int_inner, + liar) + if __name__ == "__main__": unittest.main() From ae41a106bc4fff8240d933730e8dd70c26a301e2 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 10 May 2024 22:52:38 -0500 Subject: [PATCH 24/79] Trying to repair new test that failed on some test platforms. --- Lib/_pylong.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 9931cddddc9bf0..08546db69829dc 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -322,7 +322,10 @@ def inner(n, w): # trillions of bytes (unles they're just trying to "break things"). if w.bit_length() >= 46: # "Only" had < 53 - 46 = 7 bits to spare in IEEE-754 double. - raise ValueError(f"cannot convert string of len {len(s)} to int") + # embedding `len(s)` in the f-sring failed on some test + # platforms - don't know why + L = len(s) + raise ValueError(f"cannot convert string of len {L} to int") with decimal.localcontext(_unbounded_dec_context) as ctx: D256 = D(256) pow256 = compute_powers(w, D256, BYTELIM) From 3779a9291319c9970fbb333b161b1d05a42d1d53 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 10 May 2024 23:11:22 -0500 Subject: [PATCH 25/79] I don't know what the "WASI" testbot is, except that it's annoying ;-) --- Lib/test/test_int.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py index 03c4be159e115f..956abedec35a6c 100644 --- a/Lib/test/test_int.py +++ b/Lib/test/test_int.py @@ -962,6 +962,11 @@ def __len__(self): # We have to pass the liar directly to the complaining function. If we # just try `int(liar)`, earlier layers will replace it with plain old # "43". + # Embedding `len(liar)` into the f-string failed on the WASI testbot + # (don't know what that is): + # OverflowError: cannot fit 'int' into an index-sized integer + # So a random stab at worming around that. + L = len(liar) self.assertRaisesRegex(ValueError, f"^cannot convert string of len {len(liar)} to int$", _pylong._dec_str_to_int_inner, From 5140c231eda391d265622554811633dfa70b5cac Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 10 May 2024 23:13:40 -0500 Subject: [PATCH 26/79] And more random thrashing. 
--- Lib/test/test_int.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py index 956abedec35a6c..524546429a5115 100644 --- a/Lib/test/test_int.py +++ b/Lib/test/test_int.py @@ -968,7 +968,7 @@ def __len__(self): # So a random stab at worming around that. L = len(liar) self.assertRaisesRegex(ValueError, - f"^cannot convert string of len {len(liar)} to int$", + f"^cannot convert string of len {L} to int$", _pylong._dec_str_to_int_inner, liar) From 45c1da05c7d0fcd941ed5b824337a014607151f9 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 10 May 2024 23:34:10 -0500 Subject: [PATCH 27/79] Another random stab at making WASI happy. --- Lib/_pylong.py | 5 ++--- Lib/test/test_int.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 08546db69829dc..b53fe51eb6712d 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -323,9 +323,8 @@ def inner(n, w): if w.bit_length() >= 46: # "Only" had < 53 - 46 = 7 bits to spare in IEEE-754 double. # embedding `len(s)` in the f-sring failed on some test - # platforms - don't know why - L = len(s) - raise ValueError(f"cannot convert string of len {L} to int") + # platform (WASI) + raise ValueError(f"cannot convert string of len {s.__len__()} to int") with decimal.localcontext(_unbounded_dec_context) as ctx: D256 = D(256) pow256 = compute_powers(w, D256, BYTELIM) diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py index 524546429a5115..fcb6ca18797f4c 100644 --- a/Lib/test/test_int.py +++ b/Lib/test/test_int.py @@ -966,9 +966,8 @@ def __len__(self): # (don't know what that is): # OverflowError: cannot fit 'int' into an index-sized integer # So a random stab at worming around that. - L = len(liar) self.assertRaisesRegex(ValueError, - f"^cannot convert string of len {L} to int$", + f"^cannot convert string of len {liar.__len__()} to int$", _pylong._dec_str_to_int_inner, liar) From ee17e541c2fe830722023c107fee791b5d40538d Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 10 May 2024 23:53:24 -0500 Subject: [PATCH 28/79] More random thrashing to try to shut up WASI :-( --- Lib/_pylong.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index b53fe51eb6712d..bdfc796ef45a5d 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -313,7 +313,11 @@ def inner(n, w): # value is at - or a little higher than - an integer, we can get an # off-by-1 error too low. So we add 2 instead of 1 if chopping lost # a fraction > 0.9. - log_ub = len(s) * _LOG_10_BASE_256 + + # The "WASI" test platfrom can complain about `len(s)` if it's too + # large to fit in its idea of "an index-sized integer". + lenS = s.__len__() + log_ub = lenS * _LOG_10_BASE_256 log_ub_as_int = int(log_ub) w = log_ub_as_int + 1 + (log_ub - log_ub_as_int > 0.9) # And what if we'vv plain exhausted the limits of HW floats? We @@ -322,9 +326,7 @@ def inner(n, w): # trillions of bytes (unles they're just trying to "break things"). if w.bit_length() >= 46: # "Only" had < 53 - 46 = 7 bits to spare in IEEE-754 double. 
- # embedding `len(s)` in the f-sring failed on some test - # platform (WASI) - raise ValueError(f"cannot convert string of len {s.__len__()} to int") + raise ValueError(f"cannot convert string of len {lenS} to int") From ed18fdccd2811ee9265bbb6c813608fa0e9c4ace Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 11 May 2024 11:02:33 -0500 Subject: [PATCH 29/79] Make GUARD a keyword-only argument. Plus many comment changes. Explain GUARD in much more detail at the end of the file. I believe 5 is actually large enough, but there's no simple way to explain why even a million would always be large enough. --- Lib/_pylong.py | 81 +++++++++++++++++++++++++++++++++++--------- Lib/test/test_int.py | 12 +++---- 2 files changed, 71 insertions(+), 22 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index bdfc796ef45a5d..8c1ed9e4635ff3 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -232,22 +232,11 @@ def inner(a, b): _spread = defaultdict(int) del defaultdict -def _dec_str_to_int_inner(s, GUARD=8): +def _dec_str_to_int_inner(s, *, GUARD=8): BYTELIM = 512 D = decimal.Decimal result = bytearray() - # We only want the integer part of divisions, so don't need to build - # the full multiplication tree. But using _just_ the number of - # digits expected in the integer part ignores too much. What's left - # out can have a very significant effect on the quotient. So we use - # GUARD additional digits. Exact analysis of this eluded me. 8 is - # more than enough so no more than 1 correction step was ever needed - # for all inputs tried through 2.5 billion digita. So, - # empirically, "good enough". Toward this end, testing on inputs - # with just "9" digits is most effective. If this is too small for - # some input much larger than ever tried, a different path detects - # that and usss divmod() instead (which ignores nothing, so is exact - # from the start - but also much slower). + # See notes at end of file for discussion of GUARD. assert GUARD > 0 # if 0, `decimal` can blow up - .prec 0 not allowed def inner(n, w): if lo >= p256: + # Complete correction via an exact computation. I + # believe it's not possible to get here provided + # GUARD >= 5. It's tested by reducing GUARD below + # that. count = 999 - # Complete correction via an exact computation. - # XXX we have no input that takes this branch - # It's only tested by reducing GUARD a lot. hi2, lo = divmod(lo, p256) hi += hi2 _spread[count] += 1 # The assert should always succeed, but way too slow to keep # enabled. #assert hi, lo == divmod(n, pow256[w2][0]) @@ -523,3 +513,62 @@ def int_divmod(a, b): # # I revisited tha code, and found ways to improve and simplify it. The # crossover point is at about 3.4 million digits now. +# +# GUARD digits +# ------------ +# We only want the integer part of divisions, so don't need to build +# the full multiplication tree. But using _just_ the number of +# digits expected in the integer part ignores too much. What's left +# out can have a very significant effect on the quotient. So we use +# GUARD additional digits. +# +# The default 8 is more than enough so no more than 1 correction step +# was ever needed for all inputs tried through 2.5 billion digita. In +# fact, I believe 5 guard digis are always enougn - but the proof is +# very involved, so better safe than sorry.
+# +# Short course: +# +# If prec is the decimal precision in effect, and we're rounding down, +# the result of an operation is exactly equal to the infinitely precise +# result times 1-e for some real e with 0 <= e < 10**(1-prec). We have +# 3 operations: chopping n back to prec digits, likewise for 1/256**w2, +# and also for their product. +# +# So the computed product is exactly equal to the true product times +# (1-e1)*(1-e2)*(1-e3); since the e's are all very small, an excellent +# approximation to the second factor is 1-(e1+e2+e3) (the 2nd and 3rd +# order terms in the expanded product are too tiny to matter). If +# they're all as large as possible, that's 1 - 3*10**(1-prec). This, +# BTW, is all bog-standard FP error analysis. +# +# That implies the computed product is within 1 of the true product +# provided prec >= log10(true_product) + 1.47712125. +# +# Here are telegraphic details, rephrasing the initial condition in +# equivalent ways, step by step: +# +# prod - prod * (1 - 3*10**(1-prec)) <= 1 +# prod - prod + prod * 3*10**(1-prec)) <= 1 +# prod * 3*10**(1-prec)) <= 1 +# 10**(log10(prod)) * 3*10**(1-prec)) <= 1 +# 3*10**(1-prec+log10(prod))) <= 1 +# 10**(1-prec+log10(prod))) <= 1/3 +# 1-prec+log10(prod) <= log10(1/3) = -0.47712125 +# -prec <= -1.47712125 - log10(prod) +# prec >= log10(prod) + 1.47712125 +# +# n.adjusted() - p256.adjusted() is s crude integer approximation to +# log10(true_product) - but prec is necessarily an int too, so adding a +# few guard digita is always enough to compensate for that. +# +# Also skipping why cutting the reciprocal to p256.adjusted() + GUARD +# digits to begin with is good enough. The precondition n < 256**w is +# needed to establish that the true product can't be too large for the +# reciprocal approximation to be too narrow. +# +# But since this is just a sketch of a proof ;-), the code uses the +# emprically tested 8 instead of 5. 3 digits more or less makes no +# practical difference to speed - these ints are huge. And while +# increasing GUARD above 5 may not be necessary, every increase cuts the +# percentage of cases that need a correction at all. diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py index fcb6ca18797f4c..e0268a081a596c 100644 --- a/Lib/test/test_int.py +++ b/Lib/test/test_int.py @@ -932,12 +932,12 @@ def test_pylong_roundtrip_huge(self): @support.requires_resource('cpu') @unittest.skipUnless(_pylong, "_pylong module required") def test_whitebox_dec_str_to_int_inner_failsafe(self): - # The number of "guard digits" used in _dec_str_to_int_inner may not - # be adequate for very large inputs, but we haven't yet found an - # input for which the default of 8 isn't. So normal tests can't reach - # its "oops! not good enough" block. Here we test a contrived input - # which _does_ reach that block, provided the number of guard digits - # is reduced to 3. + # While I believe the number of GUARD digits in this function is + # always enough so that no more than one correction step is ever + # needed, the code has a "failsafe" path that takes over if I'm + # wrong about that. We have no input that reaches that block. + # Here we test a contrived input that _does_ reach that block, + # provided the number of guard digits is reduced to 3. 
sn = "9" * 4000000 n = 10**len(sn) - 1 orig_spread = _pylong._spread.copy() From a5def42dc5816e6fa0b276ef6ca3b3bc4beb96c6 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 11 May 2024 11:36:12 -0500 Subject: [PATCH 30/79] repair spelling mistake in comment --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 8c1ed9e4635ff3..a443b3d30a6f25 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -568,7 +568,7 @@ def int_divmod(a, b): # reciprocal approximation to be too narrow. # # But since this is just a sketch of a proof ;-), the code uses the -# emprically tested 8 instead of 5. 3 digits more or less makes no +# empirically tested 8 instead of 5. 3 digits more or less makes no # practical difference to speed - these ints are huge. And while # increasing GUARD above 5 may not be necessary, every increase cuts the # percentage of cases that need a correction at all. From 727d39663e9cabb6007dbd3924dde3223f337267 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 11 May 2024 12:59:48 -0500 Subject: [PATCH 31/79] Update Lib/_pylong.py Co-authored-by: Pieter Eendebak --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index a443b3d30a6f25..2d5b6ebe6aba51 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -524,7 +524,7 @@ def int_divmod(a, b): # # The default 8 is more than enough so no more than 1 correction step # was ever needed for all inputs tried through 2.5 billion digita. In -# fact, I believe 5 guard digis are always enougn - but the proof is +# fact, I believe 5 guard digits are always enough - but the proof is # very involved, so better safe than sorry. # # Short course: From 505012c140a09e3a834f4069d9a76ed21b82d08b Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 11 May 2024 12:59:54 -0500 Subject: [PATCH 32/79] Update Lib/_pylong.py Co-authored-by: Pieter Eendebak --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 2d5b6ebe6aba51..302770bd55180f 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -511,7 +511,7 @@ def int_divmod(a, b): # by a cached reciprocal approximation instead, fixing up errors # afterwards. This reduced the crossover point significantly, # -# I revisited tha code, and found ways to improve and simplify it. The +# I revisited the code, and found ways to improve and simplify it. The # crossover point is at about 3.4 million digits now. # # GUARD digits From 83ea16189e0ec23a4c53be5bae795ce1ff8729d5 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 11 May 2024 16:04:57 -0500 Subject: [PATCH 33/79] I now believe GUARD=3(!) is sufficient. And that's the smallest value the error analysis concluded _could_ work. The analysis may actually be "crisp" now :-) The problem wasn't ever in the "crude approximation" `.adjusted()` gives for log10. Turns out that's always an upper bound on what's needed. The problem was that the precommputed reciprocal appromixations didn't save quite enough digits to handle the worst cases. I got burned by that before (and the fix _then_ was to force the guess for the number of bytes needed to be an upper bound). This one was subtler. 
--- Lib/_pylong.py | 32 ++++++++++++++++++++++++-------- Lib/test/test_int.py | 8 ++++---- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 302770bd55180f..6048d8ac2e7ff3 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -281,7 +281,7 @@ def inner(n, w): if lo >= p256: # Complete correction via an exact computation. I # believe it's not possible to get here provided - # GUARD >= 5. It's tested by reducing GUARD below + # GUARD >= 3. It's tested by reducing GUARD below # that. count = 999 hi2, lo = divmod(lo, p256) @@ -329,12 +329,12 @@ def inner(n, w): ctx.traps[decimal.Inexact] = 0 ctx.rounding = decimal.ROUND_DOWN for k, v in pow256.items(): - # No need to save more precision in the reciprocal than the - # power of 256 has, plus some guard digits to absorb most - # relevant rounding errors. This is highly signficant: + # No need to save much more precision in the reciprocal than + # the power of 256 has, plus some guard digits to absorb + # most relevant rounding errors. This is highly signficant: # 1/2**i has the same number of significant decimal digits # as 5**i, generally over twice the number in 2**i, - ctx.prec = v.adjusted() + GUARD + ctx.prec = v.adjusted() + GUARD + 2 # The unary "+" chope the reciprocal back to that precision. pow256[k] = v, +rpow256[k] del rpow256 # exact reciprocals no longer needed @@ -559,16 +559,32 @@ def int_divmod(a, b): # prec >= log10(prod) + 1.47712125 # # n.adjusted() - p256.adjusted() is s crude integer approximation to -# log10(true_product) - but prec is necessarily an int too, so adding a -# few guard digita is always enough to compensate for that. +# log10(true_product) - but prec is necessarily an int too, and via +# tedious case analysis it can be shown that the "crude xpproaximation" +# is always an upper bouns on what's needed (it's either spot on, or +# one larger than necessary). # # Also skipping why cutting the reciprocal to p256.adjusted() + GUARD # digits to begin with is good enough. The precondition n < 256**w is # needed to establish that the true product can't be too large for the -# reciprocal approximation to be too narrow. +# reciprocal approximation to be too narrow. But read on for more ;-) # # But since this is just a sketch of a proof ;-), the code uses the # empirically tested 8 instead of 5. 3 digits more or less makes no # practical difference to speed - these ints are huge. And while # increasing GUARD above 5 may not be necessary, every increase cuts the # percentage of cases that need a correction at all. +# +# LATER: doing this analysis pointed out an error: our division isn't +# exactly "balanced", in that when `w` is odd the integer part of +# n/256**w2 can be larger than 256**w2. The code used enough working +# precision in the multiply then, but the precommputed reciprocal +# approximation didn't have that many good digits to give. This was +# repaired by retaining 2 more digits in the reciprocal. +# +# After that, I believe GUARD=3 should be enough. Which was "the +# obvious" conclusion I leaped to after deriving `prec >= log10(prod) + +# 1.47712125` (adding the fractional part of the log to 1.47 ... could +# push that over 2, and then the ceiling is needed to get an integer >= +# to that). But, at that time, I knew GUARDs of 3 and 4 "didn't work". 
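The "LATER" paragraph just added is easy to check numerically; a toy fragment (illustrative values only, not the patch's code):

    # With w = 5 and w2 = w >> 1 = 2, the high half of the largest
    # possible n spans w - w2 = 3 bytes, so it can exceed 256**w2:
    w = 5
    w2 = w >> 1
    n = 256**w - 1
    hi = n // 256**w2
    assert hi > 256**w2              # the quotient outgrew the divisor
    assert hi == 256**(w - w2) - 1
    # This is why the reciprocal needed a couple more digits than
    # p256.adjusted() + GUARD alone would suggest.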
+

diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py
index e0268a081a596c..593c5ba7ce7b5c 100644
--- a/Lib/test/test_int.py
+++ b/Lib/test/test_int.py
@@ -937,13 +937,13 @@ def test_whitebox_dec_str_to_int_inner_failsafe(self):
         # needed, the code has a "failsafe" path that takes over if I'm
         # wrong about that. We have no input that reaches that block.
         # Here we test a contrived input that _does_ reach that block,
-        # provided the number of guard digits is reduced to 3.
-        sn = "9" * 4000000
-        n = 10**len(sn) - 1
+        # provided the number of guard digits is reduced to 1.
+        sn = "6" * (4000000 - 1)
+        n = (10**len(sn) - 1) // 9 * 6
         orig_spread = _pylong._spread.copy()
         _pylong._spread.clear()
         try:
-            self.assertEqual(n, _pylong._dec_str_to_int_inner(sn, GUARD=3))
+            self.assertEqual(n, _pylong._dec_str_to_int_inner(sn, GUARD=1))
             self.assertIn(999, _pylong._spread)
         finally:
             _pylong._spread.clear()

From 07c70bdaaa5d7f070a76267b025d0ba518a4a86a Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Sat, 11 May 2024 16:19:56 -0500
Subject: [PATCH 34/79] Apply suggestions from code review

Co-authored-by: Pieter Eendebak
---
 Lib/_pylong.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Lib/_pylong.py b/Lib/_pylong.py
index 6048d8ac2e7ff3..4b22b940c3b874 100644
--- a/Lib/_pylong.py
+++ b/Lib/_pylong.py
@@ -223,7 +223,7 @@ def inner(a, b):
 #     print(float(mp.log(10, 256)).hex())
 _LOG_10_BASE_256 = float.fromhex('0x1.a934f0979a371p-2') # about 0.415

-# _apread is for internal testing. It maps a key to the number of times
+# _spread is for internal testing. It maps a key to the number of times
 # that condition obtained in _dec_str_to_int_inner:
 # key 0 - quotient guess was right
 # key 1 - quotient had to be boosted by 1, one time
@@ -295,7 +295,7 @@ def inner(n, w):
     # How many base 256 digits are needed?. Mathematically, exactly
     # floor(log256(int(s))) + 1. There is no cheap way to compute this.
-    # But we can get an upper buond, and that's necessary for our error
+    # But we can get an upper bound, and that's necessary for our error
     # analysis to make sense. int(s) < 10**len(s), so the log needed is
     # < log256(10**len(s)) = len(s) * log256(10). However, using
     # finite-precision floating point for this, it's possible that the
@@ -310,7 +310,7 @@ def inner(n, w):
     log_ub = lenS * _LOG_10_BASE_256
     log_ub_as_int = int(log_ub)
     w = log_ub_as_int + 1 + (log_ub - log_ub_as_int > 0.9)
-    # And what if we'vv plain exhausted the limits of HW floats? We
+    # And what if we have plain exhausted the limits of HW floats? We
     # could compute the log to any desired precision using `decimal`,
     # but it's not plausible that anyone will pass a string requiring
     # trillions of bytes (unles they're just trying to "break things").

From fcfa5aa5adaaceb34cf1e1f1b6e6b908a95f41ed Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Sat, 11 May 2024 16:30:48 -0500
Subject: [PATCH 35/79] The lint checker whined about a blank line at the
 end of the file.

And, for whatever reason, GitHub appeared to go on to toss code review
changes committed after that but before this.
---
 Lib/_pylong.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Lib/_pylong.py b/Lib/_pylong.py
index 4b22b940c3b874..6810448643e660 100644
--- a/Lib/_pylong.py
+++ b/Lib/_pylong.py
@@ -223,7 +223,7 @@ def inner(a, b):
 #     print(float(mp.log(10, 256)).hex())
 _LOG_10_BASE_256 = float.fromhex('0x1.a934f0979a371p-2') # about 0.415

-# _spread is for internal testing.
It maps a key to the number of times # that condition obtained in _dec_str_to_int_inner: # key 0 - quotient guess was right # key 1 - quotient had to be boosted by 1, one time @@ -310,7 +310,7 @@ def inner(n, w): log_ub = lenS * _LOG_10_BASE_256 log_ub_as_int = int(log_ub) w = log_ub_as_int + 1 + (log_ub - log_ub_as_int > 0.9) - # And what if we have plain exhausted the limits of HW floats? We + # And what if we've plain exhausted the limits of HW floats? We # could compute the log to any desired precision using `decimal`, # but it's not plausible that anyone will pass a string requiring # trillions of bytes (unles they're just trying to "break things"). @@ -587,4 +587,3 @@ def int_divmod(a, b): # 1.47712125` (adding the fractional part of the log to 1.47 ... could # push that over 2, and then the ceiling is needed to get an integer >= # to that). But, at that time, I knew GUARDs of 3 and 4 "didn't work". - From 7763ab28d5fef5b8d8cb732a4c1e3ac72ac46f3b Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 11 May 2024 16:35:41 -0500 Subject: [PATCH 36/79] And restoring another code review change. --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 6810448643e660..d2302aa42ac45e 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -223,7 +223,7 @@ def inner(a, b): # print(float(mp.log(10, 256)).hex()) _LOG_10_BASE_256 = float.fromhex('0x1.a934f0979a371p-2') # about 0.415 -# _apread is for internal testing. It maps a key to the number of times +# _spread is for internal testing. It maps a key to the number of times # that condition obtained in _dec_str_to_int_inner: # key 0 - quotient guess was right # key 1 - quotient had to be boosted by 1, one time From b1443eff5c8f2885c051c4a0578bc390a786c5a1 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 11 May 2024 16:47:57 -0500 Subject: [PATCH 37/79] Update Lib/test/test_int.py Co-authored-by: Pieter Eendebak --- Lib/test/test_int.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py index 593c5ba7ce7b5c..e9d0f3bb4e9ace 100644 --- a/Lib/test/test_int.py +++ b/Lib/test/test_int.py @@ -951,7 +951,7 @@ def test_whitebox_dec_str_to_int_inner_failsafe(self): @unittest.skipUnless(_pylong, "pylong module required") def test_whitebox_dec_str_to_int_inner_monster(self): - # I don't think anyone has enough RAM o build a string long enough + # I don't think anyone has enough RAM to build a string long enough # for this function to complain. So lie about the string length. class LyingStr(str): From 44c85ac0783610b4b694f235cce17df12435cb61 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 11 May 2024 19:44:06 -0500 Subject: [PATCH 38/79] Made suggested word change. --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index d2302aa42ac45e..98eb8137c1612e 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -324,7 +324,7 @@ def inner(n, w): # We're going to do inexact, chopped arithmetic, multiplying by # an approximation to the reciprocal of 256**i. We chop to get a # lower bound on the true integer quotient. Our approximation is - # a lower bound, the multiply is chopped too, and + # a lower bound, the multiplication is chopped too, and # to_integral_value() is also chopped. 
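A tiny demonstration of the chopping this comment describes (editorial; any similar numbers would do): with ROUND_DOWN in effect, rounding can only shrink a positive value, never grow it, which is what makes "guess low, then correct upward" safe.

    import decimal

    with decimal.localcontext() as ctx:
        ctx.rounding = decimal.ROUND_DOWN
        ctx.prec = 6
        exact = decimal.Decimal(123456789)
        chopped = +exact                  # unary "+" rounds to ctx.prec
        assert chopped == decimal.Decimal("1.23456E+8")
        assert chopped <= exact           # a lower bound, as promised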
        ctx.traps[decimal.Inexact] = 0
        ctx.rounding = decimal.ROUND_DOWN

From 1a90eb6e8b57f557d15b1d1d3ea911f472a4b26c Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Sun, 12 May 2024 01:38:07 -0500
Subject: [PATCH 39/79] Just comments: be more careful about explaining how
 bad the approximation to log10(true_product) can be.

It's actually surprising how good it is, given how bad it is ;-)
---
 Lib/_pylong.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/Lib/_pylong.py b/Lib/_pylong.py
index 98eb8137c1612e..9ecd55b3261965 100644
--- a/Lib/_pylong.py
+++ b/Lib/_pylong.py
@@ -560,9 +560,15 @@ def int_divmod(a, b):
 #
 # n.adjusted() - p256.adjusted() is s crude integer approximation to
 # log10(true_product) - but prec is necessarily an int too, and via
-# tedious case analysis it can be shown that the "crude xpproaximation"
-# is always an upper bouns on what's needed (it's either spot on, or
-# one larger than necessary).
+# tedious case analysis it can be shown that the "crude xpproximation"
+# is never smaller than the floor of the true ratio's log10. For
+# exxmple, in 8E20 / 1E20, it gives 20 - 20 = 0, which is the floor of
+# log10(9), It also giver 0 for 1E20/9E20 (`.adjusted()` doesn't look at
+# the digits at all - it just gives the power-of-10 exponent of the most
+# significnt digit, whatever it may be). But in that case it's the
+# ceiling of the true log 10 (which is a bit larger than -1). So "it's
+# close", but since it may be as bad as (but no worse than) 1 too
+# small, we have to assume the worst: 1 too small.

From cd07da348437432034c6a866ba6d8301840e805e Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Sun, 12 May 2024 01:46:52 -0500
Subject: [PATCH 40/79] Typo repair.

---
 Lib/_pylong.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Lib/_pylong.py b/Lib/_pylong.py
index 9ecd55b3261965..148fc8c6a28ac4 100644
--- a/Lib/_pylong.py
+++ b/Lib/_pylong.py
@@ -566,9 +566,9 @@ def int_divmod(a, b):
 # log10(9), It also giver 0 for 1E20/9E20 (`.adjusted()` doesn't look at
 # the digits at all - it just gives the power-of-10 exponent of the most
 # significnt digit, whatever it may be). But in that case it's the
-# ceiling of the true log 10 (which is a bit larger than -1). So "it's
-# close", but since it may be as bad as (but no worse than) 1 too
-# small, we have to assume the worst: 1 too small.
+# ceiling of the true log10 (which is a bit larger than -1). So "it's
+# close", but since it may be as bad as (but no worse than) 1 too small,
+# we have to assume the worst: 1 too small.
 #
 # Also skipping why cutting the reciprocal to p256.adjusted() + GUARD
 # digits to begin with is good enough.
The precondition n < 256**w is From a985009775b853f6aefcc7b7829fe9c200e5c76b Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sun, 12 May 2024 01:54:31 -0500 Subject: [PATCH 41/79] Someone should contribute a spell-checker to IDLE ;-) --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 148fc8c6a28ac4..fc319a9a4295bd 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -565,7 +565,7 @@ def int_divmod(a, b): # exxmple, in 8E20 / 1E20, it gives 20 - 20 = 0, which is the floor of # log10(9), It also giver 0 for 1E20/9E20 (`.adjusted()` doesn't look at # the digits at all - it just gives the power-of-10 exponent of the most -# significnt digit, whatever it may be). But in that case it's the +# significant digit, whatever it may be). But in that case it's the # ceiling of the true log10 (which is a bit larger than -1). So "it's # close", but since it may be as bad as (but no worse than) 1 too small, # we have to assume the worst: 1 too small. From ec326b34706bd251d3ea5853fd13f58d5b603685 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sun, 12 May 2024 12:06:23 -0500 Subject: [PATCH 42/79] Free the memory for `hi` as soon as we're done with it. This helps my testing on multi-gigabyte inputs, by reducing peak RAM use. --- Lib/_pylong.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index fc319a9a4295bd..f778256dacc2bd 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -291,6 +291,7 @@ def inner(n, w): # enabled. #assert hi, lo == divmod(n, pow256[w2][0]) inner(hi, w - w2) + del hi # at top levels, can free a lot of RAM "early" inner(lo, w2) # How many base 256 digits are needed?. Mathematically, exactly From 922dca147d7d575552f678bf38c213f62283cc3d Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sun, 12 May 2024 14:09:23 -0500 Subject: [PATCH 43/79] Add notes about the details of how reciprocals are computed. --- Lib/_pylong.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index f778256dacc2bd..bf474a3f66ec0c 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -594,3 +594,26 @@ def int_divmod(a, b): # 1.47712125` (adding the fractional part of the log to 1.47 ... could # push that over 2, and then the ceiling is needed to get an integer >= # to that). But, at that time, I knew GUARDs of 3 and 4 "didn't work". +# +# On Computing Reciprocals +# ------------------------ +# The code computes all the powers of 256 needed, and all those of +# 1/256. These are exact, but the reciprocals have over twice as many +# significant digts as needed. So in another pass we cut the exact +# reciprocals back, and then throw away the exact valuea. +# +# This "wastes" a lot of RAM for the duration. We could instead not +# compute any exact reciprocals, and simply precompute 1/pow256 with the +# desired final precisions directly. +# +# But it's a real tradeoff: turns out explicit division, despite that +# it's to smaller precision, is significantly slower than computing +# exact powers of 1/256 (which requires no division, except for the +# initial 1/256) and then chopping them back. +# +# I judged that's more desirable for smaller inputs to run faster than +# to incease the size of the largest string a given box's RAM can +# handle, so picked the division-free method. +# +# Note this is all about startup cost - it doesn't affect the speed of +# `inner()` either way. Computing the powers needed isn't cheap. 
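The startup-cost tradeoff described in these notes can be sketched in miniature (a hedged illustration with invented helper names, not the module's actual setup): both routes produce identical chopped reciprocals, and differ only in how much exact work happens first.

    import decimal

    D = decimal.Decimal

    def recips_by_powering(ks, prec):
        # One exact division (1/256 is exact: 0.00390625), then exact
        # powers, chopped back at the end - the division-free route.
        with decimal.localcontext() as ctx:
            ctx.prec = decimal.MAX_PREC
            r = 1 / D(256)
            exact = {k: r ** k for k in ks}
            ctx.prec = prec
            ctx.rounding = decimal.ROUND_DOWN
            return {k: +v for k, v in exact.items()}

    def recips_by_division(ks, prec):
        # A high-precision division per power, directly at target prec.
        with decimal.localcontext() as ctx:
            ctx.prec = prec
            ctx.rounding = decimal.ROUND_DOWN
            return {k: 1 / D(256) ** k for k in ks}

    ks = (1, 2, 4, 8)
    assert recips_by_powering(ks, 20) == recips_by_division(ks, 20)

The exact route holds wide intermediate values (the "wasted" RAM the notes mention) but avoids the per-power divisions, which is why the module's author picked it.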
From 9fe92f3b07c2368ba2ef5905a1097f285dbef8b7 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Sun, 12 May 2024 14:14:52 -0500
Subject: [PATCH 44/79] Repair grammar.

---
 Lib/_pylong.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Lib/_pylong.py b/Lib/_pylong.py
index bf474a3f66ec0c..12114c23c451fb 100644
--- a/Lib/_pylong.py
+++ b/Lib/_pylong.py
@@ -611,8 +611,8 @@ def int_divmod(a, b):
 # exact powers of 1/256 (which requires no division, except for the
 # initial 1/256) and then chopping them back.
 #
-# I judged that's more desirable for smaller inputs to run faster than
-# to incease the size of the largest string a given box's RAM can
+# I judged that it's more desirable for smaller inputs to run faster
+# than to incease the size of the largest string a given box's RAM can
 # handle, so picked the division-free method.
 #
 # Note this is all about startup cost - it doesn't affect the speed of

From 4eaccfc8b8af132356e3bd71b75767c92474a01f Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Sun, 12 May 2024 18:29:30 -0500
Subject: [PATCH 45/79] Apply suggestions from code review

Co-authored-by: Pieter Eendebak
---
 Lib/_pylong.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Lib/_pylong.py b/Lib/_pylong.py
index 12114c23c451fb..0d9714ae4e998c 100644
--- a/Lib/_pylong.py
+++ b/Lib/_pylong.py
@@ -585,7 +585,7 @@ def int_divmod(a, b):
 # LATER: doing this analysis pointed out an error: our division isn't
 # exactly "balanced", in that when `w` is odd the integer part of
 # n/256**w2 can be larger than 256**w2. The code used enough working
-# precision in the multiply then, but the precommputed reciprocal
+# precision in the multiply then, but the precomputed reciprocal
 # approximation didn't have that many good digits to give. This was
 # repaired by retaining 2 more digits in the reciprocal.
 #
@@ -599,7 +599,7 @@ def int_divmod(a, b):
 # ------------------------
 # The code computes all the powers of 256 needed, and all those of
 # 1/256. These are exact, but the reciprocals have over twice as many
-# significant digts as needed. So in another pass we cut the exact
+# significant digits as needed. So in another pass we cut the exact
 # reciprocals back, and then throw away the exact valuea.

From 5178e0943d44fa0d19da4209adcd1a6a8bc85d27 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Sun, 12 May 2024 19:28:06 -0500
Subject: [PATCH 46/79] Big change: keeping the exact reciprocals now.

No longer trying to truncate them "early". That was another possible
source of error that the error analysis didn't properly account for,
and one that bit me multiple times. So screw it :-)

It doesn't affect the speed, since the recip is chopped back to `prec`
digits before use anyway. The difference now is that chopping will
always retain as many "good" digits as needed.
---
 Lib/_pylong.py | 79 ++++++++++++++++++--------------------------
 1 file changed, 29 insertions(+), 50 deletions(-)

diff --git a/Lib/_pylong.py b/Lib/_pylong.py
index 0d9714ae4e998c..3446745e480d18 100644
--- a/Lib/_pylong.py
+++ b/Lib/_pylong.py
@@ -257,13 +257,13 @@ def inner(n, w):
             # over again :-(
             hi, lo = divmod(n, pow256[w2][0])
         else:
-            p256, recip = pow256[w2]
+            p256 = pow256[w2]
             # The integer part will have a number of digits about equal
             # to the difference between the log10s of `n` and `pow256`
             # (which, since these are integers, is roughly approximated
             # by `.adjusted()`).
That's the working precision we need, ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + GUARD - hi = +n * +recip # unary `+` chops back to ctx.prec digits + hi = +n * +recip[w2] # unary `+` chops to ctx.prec digits ctx.prec = decimal.MAX_PREC hi = hi.to_integral_value() # lose the fractional digits lo = n - hi * p256 @@ -321,7 +321,7 @@ def inner(n, w): with decimal.localcontext(_unbounded_dec_context) as ctx: D256 = D(256) pow256 = compute_powers(w, D256, BYTELIM) - rpow256 = compute_powers(w, 1 / D256, BYTELIM) + recip = compute_powers(w, 1 / D256, BYTELIM) # We're going to do inexact, chopped arithmetic, multiplying by # an approximation to the reciprocal of 256**i. We chop to get a # lower bound on the true integer quotient. Our approximation is @@ -329,16 +329,6 @@ def inner(n, w): # to_integral_value() is also chopped. ctx.traps[decimal.Inexact] = 0 ctx.rounding = decimal.ROUND_DOWN - for k, v in pow256.items(): - # No need to save much more precision in the reciprocal than - # the power of 256 has, plus some guard digits to absorb - # most relevant rounding errors. This is highly signficant: - # 1/2**i has the same number of significant decimal digits - # as 5**i, generally over twice the number in 2**i, - ctx.prec = v.adjusted() + GUARD + 2 - # The unary "+" chope the reciprocal back to that precision. - pow256[k] = v, +rpow256[k] - del rpow256 # exact reciprocals no longer needed ctx.prec = decimal.MAX_PREC inner(D(s), w) return int.from_bytes(result) @@ -525,7 +515,7 @@ def int_divmod(a, b): # # The default 8 is more than enough so no more than 1 correction step # was ever needed for all inputs tried through 2.5 billion digita. In -# fact, I believe 5 guard digits are always enough - but the proof is +# fact, I believe 3 guard digits are always enough - but the proof is # very involved, so better safe than sorry. # # Short course: @@ -571,49 +561,38 @@ def int_divmod(a, b): # close", but since it may be as bad as (but no worse than) 1 too small, # we have to assume the worst: 1 too small. # -# Also skipping why cutting the reciprocal to p256.adjusted() + GUARD -# digits to begin with is good enough. The precondition n < 256**w is -# needed to establish that the true product can't be too large for the -# reciprocal approximation to be too narrow. But read on for more ;-) -# # But since this is just a sketch of a proof ;-), the code uses the -# empirically tested 8 instead of 5. 3 digits more or less makes no +# empirically tested 8 instead of 3. 5 digits more or less makes no # practical difference to speed - these ints are huge. And while -# increasing GUARD above 5 may not be necessary, every increase cuts the +# increasing GUARD above 3 may not be necessary, every increase cuts the # percentage of cases that need a correction at all. # -# LATER: doing this analysis pointed out an error: our division isn't -# exactly "balanced", in that when `w` is odd the integer part of -# n/256**w2 can be larger than 256**w2. The code used enough working -# precision in the multiply then, but the precomputed reciprocal -# approximation didn't have that many good digits to give. This was -# repaired by retaining 2 more digits in the reciprocal. -# -# After that, I believe GUARD=3 should be enough. Which was "the -# obvious" conclusion I leaped to after deriving `prec >= log10(prod) + -# 1.47712125` (adding the fractional part of the log to 1.47 ... could -# push that over 2, and then the ceiling is needed to get an integer >= -# to that). 
But, at that time, I knew GUARDs of 3 and 4 "didn't work". -# # On Computing Reciprocals # ------------------------ -# The code computes all the powers of 256 needed, and all those of -# 1/256. These are exact, but the reciprocals have over twice as many -# significant digits as needed. So in another pass we cut the exact -# reciprocals back, and then throw away the exact valuea. +# In general, the exact reciprocals we compute have over twice as many +# significant digts as needed. 1/256**i has the same number of +# significant decimal digits as 5**i. It's a significant waste of RAM +# to store all those unneded digits. +# +# So earlier versions of this code rounded them back to shorter values, +# based on the corresponding p256.adjusted(). But this burned me +# multiple times - rarer and rarer cases kept poppigg up where not +# enough digits were retained. In the end, I ran out of bad cases, but +# was unable to complete a proof, with reasonable effort, that +# `p256.adjusted() + GUARD + 2" was in fact always enough. # -# This "wastes" a lot of RAM for the duration. We could instead not -# compute any exact reciprocals, and simply precompute 1/pow256 with the -# desired final precisions directly. +# Since it proved hard to out-think, and was a bug magnet, I tossed that +# code. # -# But it's a real tradeoff: turns out explicit division, despite that -# it's to smaller precision, is significantly slower than computing -# exact powers of 1/256 (which requires no division, except for the -# initial 1/256) and then chopping them back. +# If someone wants to improve this, have at it ;-) Then in the main +# result, we have 4 chopped operations instead of 3, so change the "3" +# near the start to "4". In the end, the constant addend gets a litle +# bigger (1 - log10(1/4)), but not enough to matter to any conclusion. # -# I judged that it's more desirable for smaller inputs to run faster -# than to incease the size of the largest string a given box's RAM can -# handle, so picked the division-free method. +# The hard part is that chopping back to a shorter width then first +# occurs _outside_ of `inner`. We can't know then what `prec` `inner()` +# will need. You have to pick, for each value of `w2`, the largest +# possible value `prec` can become when `inner()` is working on `w2`. # -# Note this is all about startup cost - it doesn't affect the speed of -# `inner()` either way. Computing the powers needed isn't cheap. +# Here's a hint: x.adjusted() is the exact mathematical value of +# floor(log10(abs(x))). And, no, that's not much of a help ;-) From 5a0a574f20b249bef62c73fd1751344efdb9c45f Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sun, 12 May 2024 19:47:54 -0500 Subject: [PATCH 47/79] Remove no-longer-useful reset of ctx.prec. --- Lib/_pylong.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 3446745e480d18..1970fa3e7e73d1 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -329,7 +329,6 @@ def inner(n, w): # to_integral_value() is also chopped. ctx.traps[decimal.Inexact] = 0 ctx.rounding = decimal.ROUND_DOWN - ctx.prec = decimal.MAX_PREC inner(D(s), w) return int.from_bytes(result) From 1ee272eeeee81836a47a2c9f3925627514afb64c Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sun, 12 May 2024 23:50:38 -0500 Subject: [PATCH 48/79] Reciprocal approximations are back - you knew it was coming ;-) Found a rigorous upper bound on the number of extra digits needed (4), and reworked the error analysis to account for this too. 
I have no _empirical_ evidence that 4 is needed, though, or even 3. --- Lib/_pylong.py | 123 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 39 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 1970fa3e7e73d1..63f475c6278044 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -257,13 +257,13 @@ def inner(n, w): # over again :-( hi, lo = divmod(n, pow256[w2][0]) else: - p256 = pow256[w2] + p256, recip = pow256[w2] # The integer part will have a number of digits about equal # to the difference between the log10s of `n` and `pow256` # (which, since these are integers, is roughly approximated # by `.adjusted()`). That's the working precision we need, ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + GUARD - hi = +n * +recip[w2] # unary `+` chops to ctx.prec digits + hi = +n * +recip # unary `+` chops back to ctx.prec digits ctx.prec = decimal.MAX_PREC hi = hi.to_integral_value() # lose the fractional digits lo = n - hi * p256 @@ -321,7 +321,7 @@ def inner(n, w): with decimal.localcontext(_unbounded_dec_context) as ctx: D256 = D(256) pow256 = compute_powers(w, D256, BYTELIM) - recip = compute_powers(w, 1 / D256, BYTELIM) + rpow256 = compute_powers(w, 1 / D256, BYTELIM) # We're going to do inexact, chopped arithmetic, multiplying by # an approximation to the reciprocal of 256**i. We chop to get a # lower bound on the true integer quotient. Our approximation is @@ -329,6 +329,17 @@ def inner(n, w): # to_integral_value() is also chopped. ctx.traps[decimal.Inexact] = 0 ctx.rounding = decimal.ROUND_DOWN + for k, v in pow256.items(): + # No need to save much more precision in the reciprocal than + # the power of 256 has, plus some guard digits to absorb + # most relevant rounding errors. This is highly signficant: + # 1/2**i has the same number of significant decimal digits + # as 5**i, generally over twice the number in 2**i, + ctx.prec = v.adjusted() + GUARD + 4 + # The unary "+" chope the reciprocal back to that precision. + pow256[k] = v, +rpow256[k] + del rpow256 # exact reciprocals no longer needed + ctx.prec = decimal.MAX_PREC inner(D(s), w) return int.from_bytes(result) @@ -521,32 +532,38 @@ def int_divmod(a, b): # # If prec is the decimal precision in effect, and we're rounding down, # the result of an operation is exactly equal to the infinitely precise -# result times 1-e for some real e with 0 <= e < 10**(1-prec). We have -# 3 operations: chopping n back to prec digits, likewise for 1/256**w2, -# and also for their product. +# result times 1-e for some real e with 0 <= e < 10**(1-prec). In +# +# ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + GUARD +# hi = +n * +recip # unary `+` chops to ctx.prec digits +# +# we have 3 visible chopped operationa, but there's also a 4th: +# precomuting `recip`, chopped back from the exact reciprocal as part of +# setup. # # So the computed product is exactly equal to the true product times -# (1-e1)*(1-e2)*(1-e3); since the e's are all very small, an excellent -# approximation to the second factor is 1-(e1+e2+e3) (the 2nd and 3rd -# order terms in the expanded product are too tiny to matter). If -# they're all as large as possible, that's 1 - 3*10**(1-prec). This, -# BTW, is all bog-standard FP error analysis. +# (1-e1)*(1-e2)*(1-e3)*(1-e4); since the e's are all very small, an +# excellent approximation to the second factor is 1-(e1+e2+e3+e4) (the +# 2nd and higher order terms 3rd in the expanded product are too tiny to +# matter). 
If they're all as large as possible, that's +# +# 1 - 4*10**(1-prec). This, BTW, is all bog-standard FP error analysis. # # That implies the computed product is within 1 of the true product -# provided prec >= log10(true_product) + 1.47712125. +# provided prec >= log10(true_product) + 1.602. # # Here are telegraphic details, rephrasing the initial condition in # equivalent ways, step by step: # -# prod - prod * (1 - 3*10**(1-prec)) <= 1 -# prod - prod + prod * 3*10**(1-prec)) <= 1 -# prod * 3*10**(1-prec)) <= 1 -# 10**(log10(prod)) * 3*10**(1-prec)) <= 1 -# 3*10**(1-prec+log10(prod))) <= 1 -# 10**(1-prec+log10(prod))) <= 1/3 -# 1-prec+log10(prod) <= log10(1/3) = -0.47712125 -# -prec <= -1.47712125 - log10(prod) -# prec >= log10(prod) + 1.47712125 +# prod - prod * (1 - 4*10**(1-prec)) <= 1 +# prod - prod + prod * 4*10**(1-prec)) <= 1 +# prod * 4*10**(1-prec)) <= 1 +# 10**(log10(prod)) * 4*10**(1-prec)) <= 1 +# 4*10**(1-prec+log10(prod))) <= 1 +# 10**(1-prec+log10(prod))) <= 1/4 +# 1-prec+log10(prod) <= log10(1/4) = -0.602 +# -prec <= -1.602 - log10(prod) +# prec >= log10(prod) + 1.602 # # n.adjusted() - p256.adjusted() is s crude integer approximation to # log10(true_product) - but prec is necessarily an int too, and via @@ -573,25 +590,53 @@ def int_divmod(a, b): # significant decimal digits as 5**i. It's a significant waste of RAM # to store all those unneded digits. # -# So earlier versions of this code rounded them back to shorter values, -# based on the corresponding p256.adjusted(). But this burned me -# multiple times - rarer and rarer cases kept poppigg up where not -# enough digits were retained. In the end, I ran out of bad cases, but -# was unable to complete a proof, with reasonable effort, that -# `p256.adjusted() + GUARD + 2" was in fact always enough. +# So we cut exact repicroals back to the least precision that can +# be needed so that the error analysis above is valid, +# +# [Note: turns out it's very significantly faster to do it this way than +# to compute 1 / 256**i directly to the desired precision, because the +# power method doesn't require division.] +# +# The hard part is that chopping back to a shorter width occurs +# _outside_ of `inner`. We can't know then what `prec` `inner()` will +# need. We have to pick, for each value of `w2`, the largest possible +# value `prec` can become when `inner()` is working on `w2`. +# +# This is the `prec` inner() uses: +# max(n.adjusted() - p256.adjusted(), 0) + GUARD +# and what setup uses (renaming its `v` to `p256` - same thing): +# p256.adjusted() + GUARD + 4 +# +# We need that the second is always at least as large as the first, +# which is the same as requiring +# +# n.adjusted() - 2 * p256.adjusted() <= 4 +# +# Which is true, but tedious to show. At a high level, the worst cases +# are when `w` is odd and `n` large. Then `w2` is less than half `w`. +# p234 ia 256**w2, and the largest `n` can be ia 256**w - 1. Forget the +# -1, call it 256**w. Which is in turn p256**2 * 256. If it weren't for +# the "* 256", n and p256**2 would be the same, but multiplying by 256 +# tacks on another 2 or digits. That's the heart of why the "+4" is +# needed. # -# Since it proved hard to out-think, and was a bug magnet, I tossed that -# code. +# If you want to work out the remaining details, start with the smallest +# "worst case" `w`, which is 5: # -# If someone wants to improve this, have at it ;-) Then in the main -# result, we have 4 chopped operations instead of 3, so change the "3" -# near the start to "4". 
In the end, the constant addend gets a litle -# bigger (1 - log10(1/4)), but not enough to matter to any conclusion. +# >>> n = D256**5 +# >>< n +# Decimal('1099511627776') +# >>> n.adjusted() +# 12 +# >>> p = D256**2 +# >>> p +# Decimal('65536') +# >>> p.adjusted() +# 4 # -# The hard part is that chopping back to a shorter width then first -# occurs _outside_ of `inner`. We can't know then what `prec` `inner()` -# will need. You have to pick, for each value of `w2`, the largest -# possible value `prec` can become when `inner()` is working on `w2`. +# n.adjusted() - 2 * p256.adjusted() is then 12 - 2*4 = 4, hugging the +# bound. # -# Here's a hint: x.adjusted() is the exact mathematical value of -# floor(log10(abs(x))). And, no, that's not much of a help ;-) +# Note: 4 is a rigorous upper bound on what's needed, but even more +# tedious analysis may show that 3, or even 2, are enough. I never +# saw a failure at 2, but did at 1. From 3129d5f21bae58ef2e545de1a1291bb62de3957c Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sun, 12 May 2024 23:59:25 -0500 Subject: [PATCH 49/79] Comment repair. --- Lib/_pylong.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 63f475c6278044..af4bce45278f43 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -544,7 +544,7 @@ def int_divmod(a, b): # So the computed product is exactly equal to the true product times # (1-e1)*(1-e2)*(1-e3)*(1-e4); since the e's are all very small, an # excellent approximation to the second factor is 1-(e1+e2+e3+e4) (the -# 2nd and higher order terms 3rd in the expanded product are too tiny to +# 2nd and higher order terms in the expanded product are too tiny to # matter). If they're all as large as possible, that's # # 1 - 4*10**(1-prec). This, BTW, is all bog-standard FP error analysis. @@ -617,7 +617,7 @@ def int_divmod(a, b): # p234 ia 256**w2, and the largest `n` can be ia 256**w - 1. Forget the # -1, call it 256**w. Which is in turn p256**2 * 256. If it weren't for # the "* 256", n and p256**2 would be the same, but multiplying by 256 -# tacks on another 2 or digits. That's the heart of why the "+4" is +# tacks on another 2 or 3 digits. That's the heart of why the "+4" is # needed. # # If you want to work out the remaining details, start with the smallest From bfd6247312eeff40bf94ba60489ba8a8cb9bf5ef Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Mon, 13 May 2024 13:44:18 -0500 Subject: [PATCH 50/79] Finished the proof - no more hand-waving on any point :-) --- Lib/_pylong.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index af4bce45278f43..5845dd5f5542f5 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -612,21 +612,33 @@ def int_divmod(a, b): # # n.adjusted() - 2 * p256.adjusted() <= 4 # -# Which is true, but tedious to show. At a high level, the worst cases -# are when `w` is odd and `n` large. Then `w2` is less than half `w`. -# p234 ia 256**w2, and the largest `n` can be ia 256**w - 1. Forget the -# -1, call it 256**w. Which is in turn p256**2 * 256. If it weren't for -# the "* 256", n and p256**2 would be the same, but multiplying by 256 -# tacks on another 2 or 3 digits. That's the heart of why the "+4" is -# needed. +# Now x.adjusted() (for x > 0) is the exact mathematical value of +# floor(log10(x)). 
So x.adjusted() = a if and only if # -# If you want to work out the remaining details, start with the smallest -# "worst case" `w`, which is 5: +# x = f * 10**a for some real f, 1 <= f < 10. +# +# Express p256 as f * 10**pa, 1 <= f < 10, So p256.adjusted() is pa. +# +# What's the largest n can be? n < 255**w = 256**(w2 + (w - w2)). The +# worst case in this context is when w ix odd. and then w-w2 = w2+1. So +# n < 256**(2*w2 + 1) = +# (256**w2)**2 * 256 = +# p259**2 * 256 = +# (f * 10**pa)**2 * 256 = +# f**2 * 10**(2*pa) * 255 < +# 10**2 * 10**(2*pa) * 255 = +# 25600 * 10**(2*pa) = +# 2,56 * 10**(2*pa + 4) +# +# So n.adusted() is at most `2*pa + 4` ao +# n.adjusted() - 2 * p256.adjusted() is at most 2*pa + 4 - 2*pa = 4. QED +# +# For concreteness, the smallest "worst case" `w` is 5, and so `w2` is 2: # # >>> n = D256**5 # >>< n # Decimal('1099511627776') -# >>> n.adjusted() +# >>> n.adjusted() @ just barely 12 - n close to a power of 2 # 12 # >>> p = D256**2 # >>> p @@ -634,8 +646,7 @@ def int_divmod(a, b): # >>> p.adjusted() # 4 # -# n.adjusted() - 2 * p256.adjusted() is then 12 - 2*4 = 4, hugging the -# bound. +# n.adjusted() - 2 * p256.adjusted() is then 12 - 2*4 = 4, at the bound. # # Note: 4 is a rigorous upper bound on what's needed, but even more # tedious analysis may show that 3, or even 2, are enough. I never From 81ef2876bdc0ba1e5613c473d56038424abd8e00 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Mon, 13 May 2024 15:25:27 -0500 Subject: [PATCH 51/79] Fix tiny typos in comments. --- Lib/_pylong.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 5845dd5f5542f5..214f14450b48ef 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -625,10 +625,10 @@ def int_divmod(a, b): # (256**w2)**2 * 256 = # p259**2 * 256 = # (f * 10**pa)**2 * 256 = -# f**2 * 10**(2*pa) * 255 < -# 10**2 * 10**(2*pa) * 255 = +# f**2 * 10**(2*pa) * 256 < +# 10**2 * 10**(2*pa) * = # 25600 * 10**(2*pa) = -# 2,56 * 10**(2*pa + 4) +# 2.56 * 10**(2*pa + 4) # # So n.adusted() is at most `2*pa + 4` ao # n.adjusted() - 2 * p256.adjusted() is at most 2*pa + 4 - 2*pa = 4. QED From d5818df7f5b6d59c618ea588d61d75e7a8fb983f Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Mon, 13 May 2024 15:28:16 -0500 Subject: [PATCH 52/79] Fudge. Typo repair is adding new typos too :-( --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 214f14450b48ef..9d685b7525866e 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -626,7 +626,7 @@ def int_divmod(a, b): # p259**2 * 256 = # (f * 10**pa)**2 * 256 = # f**2 * 10**(2*pa) * 256 < -# 10**2 * 10**(2*pa) * = +# 10**2 * 10**(2*pa) * 256 = # 25600 * 10**(2*pa) = # 2.56 * 10**(2*pa + 4) # From ca621e2b14d06087e350d16c9f79c5459555b6bd Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Mon, 13 May 2024 19:25:37 -0500 Subject: [PATCH 53/79] Consolidate all the seemingly random observations aboud .adjusted(), in a self-contained new section that spells out everything needed. --- Lib/_pylong.py | 62 +++++++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 9d685b7525866e..5a276ce594d20b 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -515,6 +515,32 @@ def int_divmod(a, b): # I revisited the code, and found ways to improve and simplify it. The # crossover point is at about 3.4 million digits now. # +# About .adjusted() +# ----------------- +# Restrict to Decimal values x > 0. 
We don't use negative numbers in the +# code, and I don't want to have to keep typing, e.g., "absolute value". +# +# For convenience, I'll use `x.a` to mean `x.adjusted()`. x.a doesn't +# look at the digits of x, but instead returns an integer giving x's +# order of magnitude. These are equivalent: +# +# - x.a is the power-of-10 exponent of x's most significant digit. +# - x.a = the infinitely precise floor(log10(x)) +# - x can be written in this form, where f is a real with 1 <= f < 10: +# x = f * 10**x.a +# +# Observation; if x is an integer, len(str(x)) = x.a + 1. +# +# Lemma 1: ceiling(log10(x/y)) <= x.a - y.a + 1 +# +# To see that, write x = f * 10**x.a and y = g * 10**y.a, where f and g +# are in [1, 10). Then x/y = f/g * 10**(x.a - y.a), where 1/10 < f/g < +# 10. If 1 <= f/g, (x/y).a is x.a-y.a. Else multiply f/g by 10 to bring +# it back into [1, 10], and subtract 1 from the exponent to compensate. +# Then (x/y).a is x.a-y.a-1. So the largest (x/y).a can be is x.a-y.a. +# Since that's the floor of log10(x/y). the ceiling is at most 1 larger +# (with equalify iff f/g = 1 exactly). +# # GUARD digits # ------------ # We only want the integer part of divisions, so don't need to build @@ -565,17 +591,11 @@ def int_divmod(a, b): # -prec <= -1.602 - log10(prod) # prec >= log10(prod) + 1.602 # -# n.adjusted() - p256.adjusted() is s crude integer approximation to -# log10(true_product) - but prec is necessarily an int too, and via -# tedious case analysis it can be shown that the "crude xpproximation" -# is never smaller than the floor of the true ratio's log10. For -# exxmple, in 8E20 / 1E20, it gives 20 - 20 = 0, which is the floor of -# log10(9), It also giver 0 for 1E20/9E20 (`.adjusted()` doesn't look at -# the digits at all - it just gives the power-of-10 exponent of the most -# significant digit, whatever it may be). But in that case it's the -# ceiling of the true log10 (which is a bit larger than -1). So "it's -# close", but since it may be as bad as (but no worse than) 1 too small, -# we have to assume the worst: 1 too small. +# The true product is the same as the true ratio n/p256. By Lemma 1 +# above, n.a - p256.a + 1 is an upper bound on the ceiling of +# log10(prod). Then 2 is the ceiling of 1.602. so n.a - p256.a + 3 is an +# upper bound on the right hand side of the inequality. Any prec >= that +# will work. # # But since this is just a sketch of a proof ;-), the code uses the # empirically tested 8 instead of 3. 5 digits more or less makes no @@ -603,21 +623,16 @@ def int_divmod(a, b): # value `prec` can become when `inner()` is working on `w2`. # # This is the `prec` inner() uses: -# max(n.adjusted() - p256.adjusted(), 0) + GUARD +# max(n.a - p256.a, 0) + GUARD # and what setup uses (renaming its `v` to `p256` - same thing): -# p256.adjusted() + GUARD + 4 +# p256.a + GUARD + 4 # # We need that the second is always at least as large as the first, # which is the same as requiring # -# n.adjusted() - 2 * p256.adjusted() <= 4 -# -# Now x.adjusted() (for x > 0) is the exact mathematical value of -# floor(log10(x)). So x.adjusted() = a if and only if -# -# x = f * 10**a for some real f, 1 <= f < 10. +# n.a - 2 * p256.a <= 4 # -# Express p256 as f * 10**pa, 1 <= f < 10, So p256.adjusted() is pa. +# Express p256 as f * 10**pa, 1 <= f < 10, and pa = p256.a. # # What's the largest n can be? n < 255**w = 256**(w2 + (w - w2)). The # worst case in this context is when w ix odd. and then w-w2 = w2+1. 
So @@ -630,10 +645,11 @@ def int_divmod(a, b): # 25600 * 10**(2*pa) = # 2.56 * 10**(2*pa + 4) # -# So n.adusted() is at most `2*pa + 4` ao -# n.adjusted() - 2 * p256.adjusted() is at most 2*pa + 4 - 2*pa = 4. QED +# So n.a is at most `2*pa + 4`, ao n.a - 2 * p256.a is at most 2*pa + 4 +# - 2*pa = 4. QED # -# For concreteness, the smallest "worst case" `w` is 5, and so `w2` is 2: +# For concreteness, the smallest "worst case" `w` is 5, and so w2 is +# 2: # # >>> n = D256**5 # >>< n From c8b69b8ca2e45cc03a2ec19feb444c33423e0b7e Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Mon, 13 May 2024 23:22:03 -0500 Subject: [PATCH 54/79] typo --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 5a276ce594d20b..9c73640415d078 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -539,7 +539,7 @@ def int_divmod(a, b): # it back into [1, 10], and subtract 1 from the exponent to compensate. # Then (x/y).a is x.a-y.a-1. So the largest (x/y).a can be is x.a-y.a. # Since that's the floor of log10(x/y). the ceiling is at most 1 larger -# (with equalify iff f/g = 1 exactly). +# (with equality iff f/g = 1 exactly). # # GUARD digits # ------------ From 5cff910dbed2df8b9ad7780de1515d1c2edb46b6 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Tue, 14 May 2024 01:17:18 -0500 Subject: [PATCH 55/79] And we don't ever need exact reicrocals after all! Unsure about why my protracted brain freeze about this. --- Lib/_pylong.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 9c73640415d078..672a9ebf0ef2a6 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -321,7 +321,6 @@ def inner(n, w): with decimal.localcontext(_unbounded_dec_context) as ctx: D256 = D(256) pow256 = compute_powers(w, D256, BYTELIM) - rpow256 = compute_powers(w, 1 / D256, BYTELIM) # We're going to do inexact, chopped arithmetic, multiplying by # an approximation to the reciprocal of 256**i. We chop to get a # lower bound on the true integer quotient. Our approximation is @@ -329,6 +328,7 @@ def inner(n, w): # to_integral_value() is also chopped. ctx.traps[decimal.Inexact] = 0 ctx.rounding = decimal.ROUND_DOWN + rD256 = 1 / D256 for k, v in pow256.items(): # No need to save much more precision in the reciprocal than # the power of 256 has, plus some guard digits to absorb @@ -336,9 +336,7 @@ def inner(n, w): # 1/2**i has the same number of significant decimal digits # as 5**i, generally over twice the number in 2**i, ctx.prec = v.adjusted() + GUARD + 4 - # The unary "+" chope the reciprocal back to that precision. - pow256[k] = v, +rpow256[k] - del rpow256 # exact reciprocals no longer needed + pow256[k] = v, rD256**k ctx.prec = decimal.MAX_PREC inner(D(s), w) return int.from_bytes(result) @@ -605,18 +603,6 @@ def int_divmod(a, b): # # On Computing Reciprocals # ------------------------ -# In general, the exact reciprocals we compute have over twice as many -# significant digts as needed. 1/256**i has the same number of -# significant decimal digits as 5**i. It's a significant waste of RAM -# to store all those unneded digits. -# -# So we cut exact repicroals back to the least precision that can -# be needed so that the error analysis above is valid, -# -# [Note: turns out it's very significantly faster to do it this way than -# to compute 1 / 256**i directly to the desired precision, because the -# power method doesn't require division.] 
-# # The hard part is that chopping back to a shorter width occurs # _outside_ of `inner`. We can't know then what `prec` `inner()` will # need. We have to pick, for each value of `w2`, the largest possible From 29ecb3f6bb867b2a6027a6320a4a3ed94ce942b0 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Tue, 14 May 2024 01:26:53 -0500 Subject: [PATCH 56/79] Remove a mention of exact reciprocals, since they're no longer used. --- Lib/_pylong.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 672a9ebf0ef2a6..cf6c09a95c4b26 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -562,8 +562,7 @@ def int_divmod(a, b): # hi = +n * +recip # unary `+` chops to ctx.prec digits # # we have 3 visible chopped operationa, but there's also a 4th: -# precomuting `recip`, chopped back from the exact reciprocal as part of -# setup. +# precomuting a truncatrd `recip` as part of setup. # # So the computed product is exactly equal to the true product times # (1-e1)*(1-e2)*(1-e3)*(1-e4); since the e's are all very small, an From 617b5e20cb34d2f689fc94a41f6cbeb77c913f18 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Tue, 14 May 2024 21:44:34 -0500 Subject: [PATCH 57/79] Update comments. No code changes. --- Lib/_pylong.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index cf6c09a95c4b26..729c5127065a89 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -271,8 +271,8 @@ def inner(n, w): # lower bound on the correct quotient. assert lo >= 0 # Adjust quotient up if needed. It usually isn't. In random - # testing on inputs through 2.5 billion digit strings, the - # test triggered about one in 100 thousand cases. + # testing on inputs through 5 billion digit strings, the + # test triggered once in about 200 thousand tries. count = 0 if lo >= p256: count = 1 @@ -337,6 +337,16 @@ def inner(n, w): # as 5**i, generally over twice the number in 2**i, ctx.prec = v.adjusted() + GUARD + 4 pow256[k] = v, rD256**k + # Note: it's faster to use compute_powers(rD256, ...) at the + # start to conpute exact reciprocals, chop them back in this + # loop, then throw them away. Despite that it creates over + # twice as many significant digits overall, the function is + # much smarter about _how_ to compute the collection of + # powers needed than repeated `**k` (the function uses `**` + # only for the smallest power needed). But the larger the + # input, the less this fine-tuning matters to overall time - + # our goal is better asymptotics, not peak speed in all + # cases. ctx.prec = decimal.MAX_PREC inner(D(s), w) return int.from_bytes(result) @@ -652,3 +662,13 @@ def int_divmod(a, b): # Note: 4 is a rigorous upper bound on what's needed, but even more # tedious analysis may show that 3, or even 2, are enough. I never # saw a failure at 2, but did at 1. +# +# Note: this is the only part of the proof where the target base (256) +# matters. It shows up because it directly determines how badly an +# unbalanced split can hurt. The larger the target base, the higher "4" +# may need to become. If we split on the ceiling of w/2 instead, odd `w` +# would become the best cases, even `w` the worst, and the same kind of +# proof would see the output base effectively "cancel out". "4" could be +# replaced by "1" regardless of base. So it would be more elegant that +# way. Alas, splitting on the ceiling could make other parts messier (in +# particular, `compute_powers()` would have to change to do more work). 
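Before the next patch lands, the floor-versus-ceiling point made above can be previewed numerically (an illustrative fragment with made-up variable names): splitting on the ceiling makes the high piece the narrow one, so the quotient can no longer outgrow the divisor.

    w = 5                        # an odd width, the awkward case
    # floor split: divide by 256**2; the high piece spans 3 bytes
    d_floor = w >> 1
    assert w - d_floor > d_floor         # 3 > 2: hi can exceed 256**2
    # ceiling split: divide by 256**3; the high piece spans 2 bytes
    d_ceil = w - (w >> 1)
    assert w - d_ceil <= d_ceil          # 2 <= 3: hi < 256**3 always
    n = 256**w - 1
    assert n // 256**d_floor >= 256**d_floor   # outgrows the divisor
    assert n // 256**d_ceil < 256**d_ceil      # stays below it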
From 28552fb4c321899408c040e4cae8032652061f41 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Tue, 14 May 2024 23:48:29 -0500 Subject: [PATCH 58/79] And exact reciprocals are back. Can't beat them for speed, and that matters most at smaller inputs. So it's either bring them back, or raise the crossover point. --- Lib/_pylong.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 729c5127065a89..4c16d8c3d45887 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -321,6 +321,7 @@ def inner(n, w): with decimal.localcontext(_unbounded_dec_context) as ctx: D256 = D(256) pow256 = compute_powers(w, D256, BYTELIM) + rpow256 = compute_powers(w, 1 / D256, BYTELIM) # We're going to do inexact, chopped arithmetic, multiplying by # an approximation to the reciprocal of 256**i. We chop to get a # lower bound on the true integer quotient. Our approximation is @@ -328,7 +329,6 @@ def inner(n, w): # to_integral_value() is also chopped. ctx.traps[decimal.Inexact] = 0 ctx.rounding = decimal.ROUND_DOWN - rD256 = 1 / D256 for k, v in pow256.items(): # No need to save much more precision in the reciprocal than # the power of 256 has, plus some guard digits to absorb @@ -336,17 +336,9 @@ def inner(n, w): # 1/2**i has the same number of significant decimal digits # as 5**i, generally over twice the number in 2**i, ctx.prec = v.adjusted() + GUARD + 4 - pow256[k] = v, rD256**k - # Note: it's faster to use compute_powers(rD256, ...) at the - # start to conpute exact reciprocals, chop them back in this - # loop, then throw them away. Despite that it creates over - # twice as many significant digits overall, the function is - # much smarter about _how_ to compute the collection of - # powers needed than repeated `**k` (the function uses `**` - # only for the smallest power needed). But the larger the - # input, the less this fine-tuning matters to overall time - - # our goal is better asymptotics, not peak speed in all - # cases. + # The unary "+" chope the reciprocal back to that precision. + pow256[k] = v, +rpow256[k] + del rpow256 # exact reciprocals no longer needed ctx.prec = decimal.MAX_PREC inner(D(s), w) return int.from_bytes(result) @@ -612,6 +604,22 @@ def int_divmod(a, b): # # On Computing Reciprocals # ------------------------ +# In general, the exact reciprocals we compute have over twice as many +# significant digts as needed. 1/256**i has the same number of +# significant decimal digits as 5**i. It's a significant waste of RAM +# to store all those unneded digits. +# +# So we cut exact repicroals back to the least precision that can +# be needed so that the error analysis above is valid, +# +# [Note: turns out it's very significantly faster to do it this way than +# to compute 1 / 256**i directly to the desired precision, because the +# power method doesn't require division. It's also faster than computing +# (1/256)**i directly to the desired precision - no material division +# there, but `compute_powers()` is much smarter about _how_ to compute +# all the powers needed than repeated applications of `**` - that +# function invokes `**` for only the smallest power needed.] +# # The hard part is that chopping back to a shorter width occurs # _outside_ of `inner`. We can't know then what `prec` `inner()` will # need. 
We have to pick, for each value of `w2`, the largest possible

From 03225f38162538bbe26fa8a59ccafe1e165788be Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Wed, 15 May 2024 10:45:56 -0500
Subject: [PATCH 59/79] Split on ceiling(w/2) instead of on the floor.

This makes reasoning about reciprocal precision needed much easier, and
now nothing in the proofs depends on the output base (256). Alas,
`compute_powers()` may, if its new `need_hi=True` argument is
specified, need to invoke `**` more than once now.
---
 Lib/_pylong.py | 145 +++++++++++++++++++++++--------------------------
 1 file changed, 68 insertions(+), 77 deletions(-)

diff --git a/Lib/_pylong.py b/Lib/_pylong.py
index 4c16d8c3d45887..0226eaa11b2451 100644
--- a/Lib/_pylong.py
+++ b/Lib/_pylong.py
@@ -45,12 +45,16 @@
 #
 # and `mycache[lo]` replaces `base**lo` in the inner function.
 #
+# If an algorithm wants the powers of ceiling(w/2) instead of the floor,
+# pass keyword argument `need_hi=True` instead.
+#
 # While this does give minor speedups (a few percent at best), the primary
 # intent is to simplify the functions using this, by eliminating the need for
 # them to craft their own ad-hoc caching schemes.
-def compute_powers(w, base, more_than, show=False):
+def compute_powers(w, base, more_than, *, need_hi=False, show=False):
     seen = set()
     need = set()
+    maybe_not_needed = set()
     ws = {w}
     while ws:
         w = ws.pop() # any element is fine to use next
@@ -58,11 +62,13 @@ def compute_powers(w, base, more_than, show=False):
             continue
         seen.add(w)
         lo = w >> 1
-        # only _need_ lo here; some other path may, or may not, need hi
-        need.add(lo)
-        ws.add(lo)
-        if w & 1:
-            ws.add(lo + 1)
+        hi = w - lo
+        # only _need_ one here; the other may, or may not, be needed
+        which = hi if need_hi else lo
+        need.add(which)
+        ws.add(which)
+        if lo != hi:
+            ws.add(w - which)
 
     d = {}
     if not need:
@@ -72,6 +78,7 @@
         if show:
             print("pow at", first)
         d[first] = base ** first
+    last = first
     for this in it:
         if this - 1 in d:
             if show:
@@ -80,18 +87,25 @@
         else:
             lo = this >> 1
             hi = this - lo
-            assert lo in d
-            if show:
-                print("square at", this)
-            # Multiplying a bigint by itself (same object!) is about twice
-            # as fast in CPython.
-            sq = d[lo] * d[lo]
-            if hi != lo:
-                assert hi == lo + 1
+            if lo in d:
                 if show:
-                    print("    and * base")
-                sq *= base
-            d[this] = sq
+                    print("square at", this)
+                # Multiplying a bigint by itself is about twice as fast
+                # in CPython provided it's the same object.
+                sq = d[lo] * d[lo] # same object
+                if lo != hi:
+                    assert this == 2 * lo + 1
+                    if show:
+                        print("    and * base")
+                    sq *= base
+                d[this] = sq
+            else:
+                assert need_hi
+                diff = this - last
+                if show:
+                    print(last, "*", diff, "at", this)
+                d[this] = d[last] * (d[diff] if diff in d else base ** diff)
+            last = this
     return d
 
 _unbounded_dec_context = decimal.getcontext().copy()
@@ -248,7 +262,8 @@ def inner(n, w):
             # repaired.
             result.extend(int(str(n)).to_bytes(w)) # big-endian default
             return
-        w2 = w >> 1
+        w1 = w >> 1
+        w2 = w - w1
         if 0:
             # This is maximally clear, but "too slow". `decimal`
             # division is asymptotically fast, but we have no way to
@@ -290,7 +305,7 @@ def inner(n, w):
         # The assert should always succeed, but way too slow to keep
         # enabled.
#assert hi, lo == divmod(n, pow256[w2][0]) - inner(hi, w - w2) + inner(hi, w1) del hi # at top levels, can free a lot of RAM "early" inner(lo, w2) @@ -320,8 +335,8 @@ def inner(n, w): raise ValueError(f"cannot convert string of len {lenS} to int") with decimal.localcontext(_unbounded_dec_context) as ctx: D256 = D(256) - pow256 = compute_powers(w, D256, BYTELIM) - rpow256 = compute_powers(w, 1 / D256, BYTELIM) + pow256 = compute_powers(w, D256, BYTELIM, need_hi=True) + rpow256 = compute_powers(w, 1 / D256, BYTELIM, need_hi=True) # We're going to do inexact, chopped arithmetic, multiplying by # an approximation to the reciprocal of 256**i. We chop to get a # lower bound on the true integer quotient. Our approximation is @@ -335,7 +350,7 @@ def inner(n, w): # most relevant rounding errors. This is highly signficant: # 1/2**i has the same number of significant decimal digits # as 5**i, generally over twice the number in 2**i, - ctx.prec = v.adjusted() + GUARD + 4 + ctx.prec = v.adjusted() + GUARD + 1 # The unary "+" chope the reciprocal back to that precision. pow256[k] = v, +rpow256[k] del rpow256 # exact reciprocals no longer needed @@ -531,15 +546,22 @@ def int_divmod(a, b): # # Observation; if x is an integer, len(str(x)) = x.a + 1. # -# Lemma 1: ceiling(log10(x/y)) <= x.a - y.a + 1 +# Lemma 1: (x * y).a = x.a + y.a, or one larger # -# To see that, write x = f * 10**x.a and y = g * 10**y.a, where f and g -# are in [1, 10). Then x/y = f/g * 10**(x.a - y.a), where 1/10 < f/g < -# 10. If 1 <= f/g, (x/y).a is x.a-y.a. Else multiply f/g by 10 to bring -# it back into [1, 10], and subtract 1 from the exponent to compensate. -# Then (x/y).a is x.a-y.a-1. So the largest (x/y).a can be is x.a-y.a. -# Since that's the floor of log10(x/y). the ceiling is at most 1 larger -# (with equality iff f/g = 1 exactly). +# Proof: Write x = f * 10**x.a and y = g * 10**y.a, where f and g are in +# [1, 10). Then x*y = f*g * 10**(x.a + y.a), where 1 <= f*g < 100. If +# f*g < 10, (x*y).a is x.a+y.a. Else divide f*g by 10 to bring it back +# into [1, 10], and add 1 to the exponent to compensate. Then (x*y).a is +# x.a+y.a-1. +# +# Lemma 2: ceiling(log10(x/y)) <= x.a - y.a + 1 +# +# Peood: Express x and y is in Lemma 1. Then x/y = f/g * 10**(x.a - +# y.a), where 1/10 < f/g < 10. If 1 <= f/g, (x/y).a is x.a-y.a. Else +# multiply f/g by 10 to bring it back into [1, 10], and subtract 1 from +# the exponent to compensate. Then (x/y).a is x.a-y.a-1. So the largest +# (x/y).a can be is x.a-y.a. Since that's the floor of log10(x/y). the +# ceiling is at most 1 larger (with equality iff f/g = 1 exactly). # # GUARD digits # ------------ @@ -590,7 +612,7 @@ def int_divmod(a, b): # -prec <= -1.602 - log10(prod) # prec >= log10(prod) + 1.602 # -# The true product is the same as the true ratio n/p256. By Lemma 1 +# The true product is the same as the true ratio n/p256. By Lemma 2 # above, n.a - p256.a + 1 is an upper bound on the ceiling of # log10(prod). Then 2 is the ceiling of 1.602. so n.a - p256.a + 3 is an # upper bound on the right hand side of the inequality. Any prec >= that @@ -628,55 +650,24 @@ def int_divmod(a, b): # This is the `prec` inner() uses: # max(n.a - p256.a, 0) + GUARD # and what setup uses (renaming its `v` to `p256` - same thing): -# p256.a + GUARD + 4 +# p256.a + GUARD + 1 # # We need that the second is always at least as large as the first, # which is the same as requiring # -# n.a - 2 * p256.a <= 4 -# -# Express p256 as f * 10**pa, 1 <= f < 10, and pa = p256.a. 
+# n.a - 2 * p256.a <= 1 # # What's the largest n can be? n < 255**w = 256**(w2 + (w - w2)). The -# worst case in this context is when w ix odd. and then w-w2 = w2+1. So -# n < 256**(2*w2 + 1) = -# (256**w2)**2 * 256 = -# p259**2 * 256 = -# (f * 10**pa)**2 * 256 = -# f**2 * 10**(2*pa) * 256 < -# 10**2 * 10**(2*pa) * 256 = -# 25600 * 10**(2*pa) = -# 2.56 * 10**(2*pa + 4) -# -# So n.a is at most `2*pa + 4`, ao n.a - 2 * p256.a is at most 2*pa + 4 -# - 2*pa = 4. QED -# -# For concreteness, the smallest "worst case" `w` is 5, and so w2 is -# 2: -# -# >>> n = D256**5 -# >>< n -# Decimal('1099511627776') -# >>> n.adjusted() @ just barely 12 - n close to a power of 2 -# 12 -# >>> p = D256**2 -# >>> p -# Decimal('65536') -# >>> p.adjusted() -# 4 -# -# n.adjusted() - 2 * p256.adjusted() is then 12 - 2*4 = 4, at the bound. -# -# Note: 4 is a rigorous upper bound on what's needed, but even more -# tedious analysis may show that 3, or even 2, are enough. I never -# saw a failure at 2, but did at 1. -# -# Note: this is the only part of the proof where the target base (256) -# matters. It shows up because it directly determines how badly an -# unbalanced split can hurt. The larger the target base, the higher "4" -# may need to become. If we split on the ceiling of w/2 instead, odd `w` -# would become the best cases, even `w` the worst, and the same kind of -# proof would see the output base effectively "cancel out". "4" could be -# replaced by "1" regardless of base. So it would be more elegant that -# way. Alas, splitting on the ceiling could make other parts messier (in -# particular, `compute_powers()` would have to change to do more work). +# worst case in this context is when w ix even. and then w = 2*w2, ao +# n < 256**(2*w2) = (256**w2)**2 = p256**2. By Lemma 1, then, n.a +# is at most p256.a + p256.a + 1. +# +# So the most n.a - 2 * p256.a can be is +# p256.a + p256.a + 1 - 2 * p256.a = 1. QES +# +# Note: an earlier version of the code split on floor(e/2) instead of on +# the ceiiing. Thw worst case then is odd `w`, and a more involved proof +# was needed to show that adding 4 (instead of 1) may be neceasary. +# Basically because, in that cass, n may be up to 256 times larger than +# p256**2. Curiously enough, by splitting on the ceiling instead, +# nothing in any proof here actually depends on the output base (256). From 0b454ba9aaf7f86a137dddddf15cb186909ef308 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 15 May 2024 11:09:02 -0500 Subject: [PATCH 60/79] Delete a line of unused code. 
--- Lib/_pylong.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 0226eaa11b2451..9a5c414e7e92c4 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -54,7 +54,6 @@ def compute_powers(w, base, more_than, *, need_hi=False, show=False): seen = set() need = set() - maybe_not_needed = set() ws = {w} while ws: w = ws.pop() # any element is fine to use next From 1afe4df81bc0dece023732505ad93e26c388d825 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 15 May 2024 11:10:20 -0500 Subject: [PATCH 61/79] Update Lib/_pylong.py Co-authored-by: sstandre <43125375+sstandre@users.noreply.github.com> --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 0226eaa11b2451..bbc4c9ba812419 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -351,7 +351,7 @@ def inner(n, w): # 1/2**i has the same number of significant decimal digits # as 5**i, generally over twice the number in 2**i, ctx.prec = v.adjusted() + GUARD + 1 - # The unary "+" chope the reciprocal back to that precision. + # The unary "+" chops the reciprocal back to that precision. pow256[k] = v, +rpow256[k] del rpow256 # exact reciprocals no longer needed ctx.prec = decimal.MAX_PREC From 1f52d1da2161025df5a50fb96050dceb6aafc814 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 15 May 2024 11:17:24 -0500 Subject: [PATCH 62/79] typo --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index e9f1e898e51b99..86dddb15db3f64 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -555,7 +555,7 @@ def int_divmod(a, b): # # Lemma 2: ceiling(log10(x/y)) <= x.a - y.a + 1 # -# Peood: Express x and y is in Lemma 1. Then x/y = f/g * 10**(x.a - +# Peood: Express x and y as in Lemma 1. Then x/y = f/g * 10**(x.a - # y.a), where 1/10 < f/g < 10. If 1 <= f/g, (x/y).a is x.a-y.a. Else # multiply f/g by 10 to bring it back into [1, 10], and subtract 1 from # the exponent to compensate. Then (x/y).a is x.a-y.a-1. So the largest From f1469cfc2fceaccb0a95f144691b3fd49fe30954 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 15 May 2024 11:20:59 -0500 Subject: [PATCH 63/79] Bah. Another typo. --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 86dddb15db3f64..92876691d199ea 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -662,7 +662,7 @@ def int_divmod(a, b): # is at most p256.a + p256.a + 1. # # So the most n.a - 2 * p256.a can be is -# p256.a + p256.a + 1 - 2 * p256.a = 1. QES +# p256.a + p256.a + 1 - 2 * p256.a = 1. QED # # Note: an earlier version of the code split on floor(e/2) instead of on # the ceiiing. Thw worst case then is odd `w`, and a more involved proof From d24ae92109fc852e538762c2074dafc8002d9f11 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 15 May 2024 11:47:12 -0500 Subject: [PATCH 64/79] Repair thinko in comment. --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 92876691d199ea..ca8746fa248e4e 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -551,7 +551,7 @@ def int_divmod(a, b): # [1, 10). Then x*y = f*g * 10**(x.a + y.a), where 1 <= f*g < 100. If # f*g < 10, (x*y).a is x.a+y.a. Else divide f*g by 10 to bring it back # into [1, 10], and add 1 to the exponent to compensate. Then (x*y).a is -# x.a+y.a-1. +# x.a+y.a+1. 
# # Lemma 2: ceiling(log10(x/y)) <= x.a - y.a + 1 # From 3d74801178866fa5c12117b8ce3203a1f6249fe4 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 15 May 2024 15:21:34 -0500 Subject: [PATCH 65/79] Splitting on ceiling(w/2) gave compute_powers() a harder job. So make it smarter. --- Lib/_pylong.py | 75 +++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index ca8746fa248e4e..0b80e541037011 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -46,12 +46,12 @@ # and `mycache[lo]` replaces `base**lo` in the inner function. # # If an algorithm wants the powers of ceiling(w/2) instead of the floor, -# pass keyword argument `need_hi=True` instead. +# pass keyword argument `need_hi=True`. # -# While this does give minor speedups (a few percent at best), the primary -# intent is to simplify the functions using this, by eliminating the need for -# them to craft their own ad-hoc caching schemes. -def compute_powers(w, base, more_than, *, need_hi=False, show=False): +# While this does give minor speedups (a few percent at best), the +# primary intent is to simplify the functions using this, by eliminating +# the need for them to craft their own ad-hoc caching schemes. +def compute_powers(w, base, more_than, *, need_hi=False): seen = set() need = set() ws = {w} @@ -69,42 +69,41 @@ def compute_powers(w, base, more_than, *, need_hi=False, show=False): if lo != hi: ws.add(w - which) + # powbase() knows how to compute powers efficiently. When `need` is + # processed in sorted order, and need_hi is False, powbase won't + # actually recurse: it always finds the predecossor it wants already + # in the cache (`d`). But when need_hi is True, it may need to + # recurse to find powers of predecessora that weren't themselves + # needed. d = {} - if not need: - return d - it = iter(sorted(need)) - first = next(it) - if show: - print("pow at", first) - d[first] = base ** first - last = first - for this in it: - if this - 1 in d: - if show: - print("* base at", this) - d[this] = d[this - 1] * base # cheap - else: - lo = this >> 1 - hi = this - lo - if lo in d: - if show: - print("square at", this) - # Multiplying a bigint by itself is about twice as fast - # in CPython provided it's the same object. - sq = d[lo] * d[lo] # same object - if lo != hi: - assert this == 2 * lo + 1 - if show: - print(" and * base") - sq *= base - d[this] = sq + def powbase(n): + if (result := d.get(n)) is None: + lo = n >> 1 + hi = n - lo + if n-1 in d: + result = d[n-1] * base # cheap! + elif lo in d: + # Multiplying a bigint by itself is about twice as + # fast in CPython provided it's the same object. + result = d[lo] * d[lo] # same object + if hi != lo: + assert 2 * lo + 1 == n + result *= base + elif n <= more_than: # rare + result = base ** n else: assert need_hi - diff = this - last - if show: - print(last, "*", diff, "at", this) - d[this] = d[last] * (d[diff] if diff in d else base ** diff) - last = this + result = powbase(lo) * powbase(hi) + d[n] = result + return result + + for n in sorted(need): + powbase(n) + if need_hi: + for n in d.keys() - need: + del d[n] + else: + assert d.keys() == need return d _unbounded_dec_context = decimal.getcontext().copy() From c6f6126c77f2a7fff68b75940549f94e7c2639bf Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Wed, 15 May 2024 16:47:26 -0500 Subject: [PATCH 66/79] Reduce excessive indentation. 
--- Lib/_pylong.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 0b80e541037011..4dc588b3b301c7 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -83,12 +83,12 @@ def powbase(n): if n-1 in d: result = d[n-1] * base # cheap! elif lo in d: - # Multiplying a bigint by itself is about twice as - # fast in CPython provided it's the same object. - result = d[lo] * d[lo] # same object - if hi != lo: - assert 2 * lo + 1 == n - result *= base + # Multiplying a bigint by itself is about twice as fast + # in CPython provided it's the same object. + result = d[lo] * d[lo] # same object + if hi != lo: + assert 2 * lo + 1 == n + result *= base elif n <= more_than: # rare result = base ** n else: From 61253a45bcbb9e72de0d16d0766811433afca909 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 16 May 2024 12:41:11 -0500 Subject: [PATCH 67/79] Give compute_powers() another IQ boost. Also run millions of tests, but, at least for now, they need to be enabled "by hand". --- Lib/_pylong.py | 120 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 34 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 4dc588b3b301c7..92f56a92ded7c0 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -51,7 +51,10 @@ # While this does give minor speedups (a few percent at best), the # primary intent is to simplify the functions using this, by eliminating # the need for them to craft their own ad-hoc caching schemes. -def compute_powers(w, base, more_than, *, need_hi=False): +# +# See code near end of file for a block of code that can be enabled to +# run millions of tests. +def compute_powers(w, base, more_than, *, need_hi=False, show=False): seen = set() need = set() ws = {w} @@ -69,41 +72,62 @@ def compute_powers(w, base, more_than, *, need_hi=False): if lo != hi: ws.add(w - which) - # powbase() knows how to compute powers efficiently. When `need` is - # processed in sorted order, and need_hi is False, powbase won't - # actually recurse: it always finds the predecossor it wants already - # in the cache (`d`). But when need_hi is True, it may need to - # recurse to find powers of predecessora that weren't themselves - # needed. + # `need` is the set of exponents needed. To compute them all + # efficiently, possibly add other exponents to `extra`. The goal is + # to ensure that each exponent can be gotten from a smaller one via + # multiplying by the base, squaring it, or squaring and then + # multiplying by the base. + # + # If need_hi is False, this is already the case (w can always be + # gotten from w >> 1 via one of the squaring strategies). But we do + # the work anyway, just in case ;-) + # + # Note that speed is irrelevant. These loops are working on little + # ints (exponents) and go around O(log w) times. The total cost is + # insignificant compared to just one of the bigint multiplies. + cands = need.copy() + extra = set() + while cands: + w = max(cands) + cands.remove(w) + lo = w >> 1 + if lo > more_than and w-1 not in cands and lo not in cands: + extra.add(lo) + cands.add(lo) + assert need_hi or not extra + d = {} - def powbase(n): - if (result := d.get(n)) is None: - lo = n >> 1 - hi = n - lo - if n-1 in d: - result = d[n-1] * base # cheap! - elif lo in d: - # Multiplying a bigint by itself is about twice as fast - # in CPython provided it's the same object. 
-                result = d[lo] * d[lo] # same object
-                if hi != lo:
-                    assert 2 * lo + 1 == n
-                    result *= base
-            elif n <= more_than: # rare
-                result = base ** n
-            else:
-                assert need_hi
-                result = powbase(lo) * powbase(hi)
-            d[n] = result
-        return result
-
-    for n in sorted(need):
-        powbase(n)
-    if need_hi:
-        for n in d.keys() - need:
+    for n in sorted(need | extra):
+        lo = n >> 1
+        hi = n - lo
+        if n-1 in d:
+            if show:
+                print("* base", end="")
+            result = d[n-1] * base # cheap!
+        elif lo in d:
+            # Multiplying a bigint by itself is about twice as fast
+            # in CPython provided it's the same object.
+            if show:
+                print("square", end="")
+            result = d[lo] * d[lo] # same object
+            if hi != lo:
+                if show:
+                    print(" * base", end="")
+                assert 2 * lo + 1 == n
+                result *= base
+        else: # rare
+            if show:
+                print("pow", end='')
+            result = base ** n
+        if show:
+            print(" at", n, "needed" if n in need else "extra")
+        d[n] = result
+
+    assert need <= d.keys()
+    if excess := d.keys() - need:
+        assert need_hi
+        for n in excess:
             del d[n]
-    else:
-        assert d.keys() == need
     return d
 
 _unbounded_dec_context = decimal.getcontext().copy()
@@ -669,3 +693,31 @@ def int_divmod(a, b):
 # Basically because, in that cass, n may be up to 256 times larger than
 # p256**2. Curiously enough, by splitting on the ceiling instead,
 # nothing in any proof here actually depends on the output base (256).
+
+# Enable for brute-force testing of compute_powers(). This takes about a
+# minute, because it tries millions of cases.
+if 0:
+    def consumer(w, limir, need_hi):
+        seen = set()
+        need = set()
+        def inner(w):
+            if w <= limit:
+                return
+            if w in seen:
+                return
+            seen.add(w)
+            lo = w >> 1
+            hi = w - lo
+            need.add(hi if need_hi else lo)
+            inner(lo)
+            inner(hi)
+        inner(w)
+        exp = compute_powers(w, 1, limir, need_hi=need_hi)
+        assert exp.keys() == need
+
+    from itertools import chain
+    for need_hi in (False, True):
+        for limit in (1, 10, 100, 1_000, 10_000, 100_000):
+            for w in chain(range(1, 100_000),
+                           (10**i for i in range(5, 30))):
+                consumer(w, limit, need_hi)

From a2814b538b03450b9795563d2f5d8a05163daeb4 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Thu, 16 May 2024 14:28:38 -0500
Subject: [PATCH 68/79] Add basic compute_powers() test.

---
 Lib/test/test_int.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py
index e9d0f3bb4e9ace..588b75869c9eac 100644
--- a/Lib/test/test_int.py
+++ b/Lib/test/test_int.py
@@ -971,5 +971,32 @@ def __len__(self):
                           _pylong._dec_str_to_int_inner,
                           liar)
 
+    @unittest.skipUnless(_pylong, "_pylong module required")
+    def test_pylong_compute_powers(self):
+        # Basic sanity tests. See end of _pylong.py for manual heavy tests.
+ def consumer(w, base, limir, need_hi): + seen = set() + need = set() + def inner(w): + if w <= limit or w in seen: + return + seen.add(w) + lo = w >> 1 + hi = w - lo + need.add(hi if need_hi else lo) + inner(lo) + inner(hi) + inner(w) + d = _pylong.compute_powers(w, base, limir, need_hi=need_hi) + self.assertEqual(d.keys(), need) + for k, v in d.items(): + self.assertEqual(v, base ** k) + + for base in 2, 5: + for need_hi in False, True: + for limit in 1, 11: + for w in range(250, 550): + consumer(w, base, limit, need_hi) + if __name__ == "__main__": unittest.main() From b16e639448979dadd9454c19c4d279fd658b7f4d Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 16 May 2024 14:32:28 -0500 Subject: [PATCH 69/79] At least whan I make a typo, auto-commplete reproduces it ;-) --- Lib/test/test_int.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py index 588b75869c9eac..b865e4e396e853 100644 --- a/Lib/test/test_int.py +++ b/Lib/test/test_int.py @@ -974,7 +974,7 @@ def __len__(self): @unittest.skipUnless(_pylong, "_pylong module required") def test_pylong_compute_powers(self): # Basic sanity tests. See end of _pylong.py for manual heavy tests. - def consumer(w, base, limir, need_hi): + def consumer(w, base, limit, need_hi): seen = set() need = set() def inner(w): @@ -987,7 +987,7 @@ def inner(w): inner(lo) inner(hi) inner(w) - d = _pylong.compute_powers(w, base, limir, need_hi=need_hi) + d = _pylong.compute_powers(w, base, limit, need_hi=need_hi) self.assertEqual(d.keys(), need) for k, v in d.items(): self.assertEqual(v, base ** k) From bc440b8eb90c3c3a8a0ea1822694590f415668f2 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 16 May 2024 15:22:12 -0500 Subject: [PATCH 70/79] Fix old typo in comment everyone missed ;-) --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 92f56a92ded7c0..8c03f429581b2b 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -651,7 +651,7 @@ def int_divmod(a, b): # In general, the exact reciprocals we compute have over twice as many # significant digts as needed. 1/256**i has the same number of # significant decimal digits as 5**i. It's a significant waste of RAM -# to store all those unneded digits. +# to store all those unneeded digits. # # So we cut exact repicroals back to the least precision that can # be needed so that the error analysis above is valid, From 60797ab504791a7bbeae863bbdb7e0cabc294a90 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 16 May 2024 15:24:01 -0500 Subject: [PATCH 71/79] Correct technical detail in comment. --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 8c03f429581b2b..67cd964a60bd14 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -662,7 +662,7 @@ def int_divmod(a, b): # (1/256)**i directly to the desired precision - no material division # there, but `compute_powers()` is much smarter about _how_ to compute # all the powers needed than repeated applications of `**` - that -# function invokes `**` for only the smallest power needed.] +# function invokes `**` for at most the few smallest powers needed.] # # The hard part is that chopping back to a shorter width occurs # _outside_ of `inner`. 
We can't know then what `prec` `inner()` will From 1d4f3a0913d1527fc221834a07ba3cbf3b113a75 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Thu, 16 May 2024 15:25:56 -0500 Subject: [PATCH 72/79] And another stray typo :-( --- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 67cd964a60bd14..32419e9afcab4b 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -691,7 +691,7 @@ def int_divmod(a, b): # the ceiiing. Thw worst case then is odd `w`, and a more involved proof # was needed to show that adding 4 (instead of 1) may be neceasary. # Basically because, in that cass, n may be up to 256 times larger than -# p256**2. Curiously enough, by splitting on the ceiling instead, +# p256**2. Curiously enough, by splitting on the ceiling instead, # nothing in any proof here actually depends on the output base (256). # Enable for brute-force testing of compute_powers(). This takes about a From 057b5e91a7889dd4d2728eb370debac8cfc20695 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 17 May 2024 12:47:09 -0500 Subject: [PATCH 73/79] Apply suggestions from code review Co-authored-by: Nice Zombies --- Lib/_pylong.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 32419e9afcab4b..5baa27997b2c3a 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -351,7 +351,7 @@ def inner(n, w): # And what if we've plain exhausted the limits of HW floats? We # could compute the log to any desired precision using `decimal`, # but it's not plausible that anyone will pass a string requiring - # trillions of bytes (unles they're just trying to "break things"). + # trillions of bytes (unless they're just trying to "break things"). if w.bit_length() >= 46: # "Only" had < 53 - 46 = 7 bits to spare in IEEE-754 double. raise ValueError(f"cannot convert string of len {lenS} to int") @@ -369,7 +369,7 @@ def inner(n, w): for k, v in pow256.items(): # No need to save much more precision in the reciprocal than # the power of 256 has, plus some guard digits to absorb - # most relevant rounding errors. This is highly signficant: + # most relevant rounding errors. This is highly significant: # 1/2**i has the same number of significant decimal digits # as 5**i, generally over twice the number in 2**i, ctx.prec = v.adjusted() + GUARD + 1 @@ -578,7 +578,7 @@ def int_divmod(a, b): # # Lemma 2: ceiling(log10(x/y)) <= x.a - y.a + 1 # -# Peood: Express x and y as in Lemma 1. Then x/y = f/g * 10**(x.a - +# Proof: Express x and y as in Lemma 1. Then x/y = f/g * 10**(x.a - # y.a), where 1/10 < f/g < 10. If 1 <= f/g, (x/y).a is x.a-y.a. Else # multiply f/g by 10 to bring it back into [1, 10], and subtract 1 from # the exponent to compensate. Then (x/y).a is x.a-y.a-1. So the largest @@ -608,7 +608,7 @@ def int_divmod(a, b): # hi = +n * +recip # unary `+` chops to ctx.prec digits # # we have 3 visible chopped operationa, but there's also a 4th: -# precomuting a truncatrd `recip` as part of setup. +# precomuting a truncated `recip` as part of setup. # # So the computed product is exactly equal to the true product times # (1-e1)*(1-e2)*(1-e3)*(1-e4); since the e's are all very small, an @@ -649,11 +649,11 @@ def int_divmod(a, b): # On Computing Reciprocals # ------------------------ # In general, the exact reciprocals we compute have over twice as many -# significant digts as needed. 1/256**i has the same number of +# significant digits as needed. 
1/256**i has the same number of # significant decimal digits as 5**i. It's a significant waste of RAM # to store all those unneeded digits. # -# So we cut exact repicroals back to the least precision that can +# So we cut exact reciprocals back to the least precision that can # be needed so that the error analysis above is valid, # # [Note: turns out it's very significantly faster to do it this way than @@ -688,9 +688,9 @@ def int_divmod(a, b): # p256.a + p256.a + 1 - 2 * p256.a = 1. QED # # Note: an earlier version of the code split on floor(e/2) instead of on -# the ceiiing. Thw worst case then is odd `w`, and a more involved proof +# the ceiling. The worst case then is odd `w`, and a more involved proof # was needed to show that adding 4 (instead of 1) may be neceasary. -# Basically because, in that cass, n may be up to 256 times larger than +# Basically because, in that case, n may be up to 256 times larger than # p256**2. Curiously enough, by splitting on the ceiling instead, # nothing in any proof here actually depends on the output base (256). From fc09650ee0b55a8fed55cd19a1704a9cfabe692a Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 17 May 2024 13:19:03 -0500 Subject: [PATCH 74/79] Apply suggestions from code review Co-authored-by: Nice Zombies --- Lib/_pylong.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 5baa27997b2c3a..fb05ebaeb82949 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -594,7 +594,7 @@ def int_divmod(a, b): # GUARD additional digits. # # The default 8 is more than enough so no more than 1 correction step -# was ever needed for all inputs tried through 2.5 billion digita. In +# was ever needed for all inputs tried through 2.5 billion digits. In # fact, I believe 3 guard digits are always enough - but the proof is # very involved, so better safe than sorry. # @@ -608,7 +608,7 @@ def int_divmod(a, b): # hi = +n * +recip # unary `+` chops to ctx.prec digits # # we have 3 visible chopped operationa, but there's also a 4th: -# precomuting a truncated `recip` as part of setup. +# precomputing a truncated `recip` as part of setup. # # So the computed product is exactly equal to the true product times # (1-e1)*(1-e2)*(1-e3)*(1-e4); since the e's are all very small, an @@ -680,7 +680,7 @@ def int_divmod(a, b): # n.a - 2 * p256.a <= 1 # # What's the largest n can be? n < 255**w = 256**(w2 + (w - w2)). The -# worst case in this context is when w ix even. and then w = 2*w2, ao +# worst case in this context is when w ix even. and then w = 2*w2, so # n < 256**(2*w2) = (256**w2)**2 = p256**2. By Lemma 1, then, n.a # is at most p256.a + p256.a + 1. # @@ -689,7 +689,7 @@ def int_divmod(a, b): # # Note: an earlier version of the code split on floor(e/2) instead of on # the ceiling. The worst case then is odd `w`, and a more involved proof -# was needed to show that adding 4 (instead of 1) may be neceasary. +# was needed to show that adding 4 (instead of 1) may be necessary. # Basically because, in that case, n may be up to 256 times larger than # p256**2. Curiously enough, by splitting on the ceiling instead, # nothing in any proof here actually depends on the output base (256). From 6c634c713801ccae7f70cd031caf85428b17d50b Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Fri, 17 May 2024 22:38:56 -0500 Subject: [PATCH 75/79] Add limit=0 to compute_powers() "by hand" testing. No particularly good reason to - it just "looked funny" that 0 _wasn't_ tested. It's not a realistic value to use, though. 
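For orientation, here is a single probe of the kind that block
performs, written out standalone (illustrative only: `compute_powers`
is the one in Lib/_pylong.py, and the tiny `w`, base 10, and the
spelled-out key set are just one worked example of the halving
recursion with `need_hi=True`):

    from _pylong import compute_powers

    w, limit = 40, 4
    d = compute_powers(w, 10, limit, need_hi=True)
    # hi-side splits reachable from 40: 20, 10, 5, and ceil(5/2) == 3.
    assert d.keys() == {3, 5, 10, 20}
    assert all(v == 10**k for k, v in d.items())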
--- Lib/_pylong.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index fb05ebaeb82949..1ff9afed89b9e4 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -717,7 +717,7 @@ def inner(w): from itertools import chain for need_hi in (False, True): - for limit in (1, 10, 100, 1_000, 10_000, 100_000): + for limit in (0, 1, 10, 100, 1_000, 10_000, 100_000): for w in chain(range(1, 100_000), (10**i for i in range(5, 30))): consumer(w, limit, need_hi) From 9ee61f6d20fe7e4e629a014f86b884c64c232654 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 18 May 2024 02:03:12 -0500 Subject: [PATCH 76/79] Reduce cutoff for calling the new implementation from 3.5M to 2M. It could probably be reduced more as-is, and more still with more fine-tuning. This was remarkably easy: it just required inner() to let Python convert much larger strings to ints, so large that the existing Karatsuba int(str) usually gets invoked. --- Lib/_pylong.py | 9 +++++++-- Lib/test/test_int.py | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 1ff9afed89b9e4..613085eaee9992 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -269,7 +269,12 @@ def inner(a, b): del defaultdict def _dec_str_to_int_inner(s, *, GUARD=8): - BYTELIM = 512 + # Yes, BYTELIM is "large". Large enough that CPython will usually + # use the Karatsuba _str_to_int_inner to convert the string. This + # allowed reducing the cutoff for calling _this_ function from 3.5M + # to 2M digits. We could almost certainly do even better by + # fine-tuning this and/or using a larger output base than 256. + BYTELIM = 100_000 D = decimal.Decimal result = bytearray() # See notes at end of file for discussion of GUARD. @@ -389,7 +394,7 @@ def int_from_string(s): # contain underscores and have trailing whitespace. s = s.rstrip().replace('_', '') func = _str_to_int_inner - if len(s) >= 3_500_000 and _decimal is not None: + if len(s) >= 2_000_000 and _decimal is not None: func = _dec_str_to_int_inner return func(s) diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py index b865e4e396e853..67c080117edcc3 100644 --- a/Lib/test/test_int.py +++ b/Lib/test/test_int.py @@ -938,8 +938,8 @@ def test_whitebox_dec_str_to_int_inner_failsafe(self): # wrong about that. We have no input that reaches that block. # Here we test a contrived input that _does_ reach that block, # provided the number of guard digits is reduced to 1. - sn = "6" * (4000000 - 1) - n = (10**len(sn) - 1) // 9 * 6 + sn = "9" * 2000156 + n = 10**len(sn) - 1 orig_spread = _pylong._spread.copy() _pylong._spread.clear() try: From 8d5bc36772f4c0e2c15ee52d85ad206880637bd8 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 18 May 2024 02:39:59 -0500 Subject: [PATCH 77/79] Update NEWS. --- .../2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst index e56580ba83eba5..fefdf540eaec26 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst @@ -1 +1 @@ -If the C version of the ``decimal`` module is available, ``int(str)`` now uses it to supply an asymptotically much faster conversion. However, this only applies if the string contains over about 3.5 million digits. 
+If the C version of the ``decimal`` module is available, ``int(str)`` now uses it to supply an asymptotically much faster conversion. However, this only applies if the string contains over about 3 million digits. From 46cf316c213b1b7920fcd279c465e2a1a0ea80d4 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 18 May 2024 02:42:49 -0500 Subject: [PATCH 78/79] Sheesh - put a wrong number in the new NEWS. --- .../2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst index fefdf540eaec26..727427d451d1e0 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst @@ -1 +1 @@ -If the C version of the ``decimal`` module is available, ``int(str)`` now uses it to supply an asymptotically much faster conversion. However, this only applies if the string contains over about 3 million digits. +If the C version of the ``decimal`` module is available, ``int(str)`` now uses it to supply an asymptotically much faster conversion. However, this only applies if the string contains over about 2 million digits. From f1cf31591a4f7f8e42a8171fa7b2e6318a63c9ed Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 18 May 2024 14:05:03 -0500 Subject: [PATCH 79/79] Spell out the additional new reason not to do int(Decimal) directly. --- Lib/_pylong.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Lib/_pylong.py b/Lib/_pylong.py index 613085eaee9992..f7aabde1434725 100644 --- a/Lib/_pylong.py +++ b/Lib/_pylong.py @@ -285,8 +285,9 @@ def inner(n, w): if w <= BYTELIM: # XXX Stefan Pochmann discovered that, for 1024-bit ints, # `int(Decimal)` took 2.5x longer than `int(str(Decimal))`. - # So simplify this code to the former if/when that gets - # repaired. + # Worse, `int(Decimal) is still quadratic-time for much + # larger ints. So unless/until all that is repaired, the + # seemingly redundant `str(Decimal)` is crucial to speed. result.extend(int(str(n)).to_bytes(w)) # big-endian default return w1 = w >> 1