
Commit b047fa5

Authored by Pablo Galindo
gh-105549: Tokenize separately NUMBER and NAME tokens and allow 0-prefixed literals (#105555)
1 parent 00b599a commit b047fa5
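
In practice, this lets the tokenize module split an input such as "2sin(x)" into a NUMBER token followed by a NAME token instead of failing, and lets it accept 0-prefixed literals such as "01234". A minimal sketch of the new behavior (expected output taken from the tests added below):

import token
from io import StringIO
from tokenize import generate_tokens

# "2sin(x)" is not valid Python, but it now tokenizes cleanly:
for tok in generate_tokens(StringIO("2sin(x)").readline):
    print(token.tok_name[tok.type], repr(tok.string))
# NUMBER '2'
# NAME 'sin'
# OP '('
# NAME 'x'
# OP ')'
# NEWLINE ''
# ENDMARKER ''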

File tree

3 files changed: +45 -3 lines changed

Lib/test/test_tokenize.py

+33
@@ -284,7 +284,12 @@ def number_token(s):
                 # this won't work with compound complex inputs
                 continue
             self.assertEqual(number_token(lit), lit)
+        # Valid cases with extra underscores in the tokenize module
+        # See gh-105549 for context
+        extra_valid_cases = {"0_7", "09_99"}
         for lit in INVALID_UNDERSCORE_LITERALS:
+            if lit in extra_valid_cases:
+                continue
             try:
                 number_token(lit)
             except TokenError:
@@ -1873,6 +1878,34 @@ def test_indentation_semantics_retained(self):
         self.check_roundtrip(code)


+class InvalidPythonTests(TestCase):
+    def test_number_followed_by_name(self):
+        # See issue #gh-105549
+        source = "2sin(x)"
+        expected_tokens = [
+            TokenInfo(type=token.NUMBER, string='2', start=(1, 0), end=(1, 1), line='2sin(x)'),
+            TokenInfo(type=token.NAME, string='sin', start=(1, 1), end=(1, 4), line='2sin(x)'),
+            TokenInfo(type=token.OP, string='(', start=(1, 4), end=(1, 5), line='2sin(x)'),
+            TokenInfo(type=token.NAME, string='x', start=(1, 5), end=(1, 6), line='2sin(x)'),
+            TokenInfo(type=token.OP, string=')', start=(1, 6), end=(1, 7), line='2sin(x)'),
+            TokenInfo(type=token.NEWLINE, string='', start=(1, 7), end=(1, 8), line='2sin(x)'),
+            TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+
+        tokens = list(generate_tokens(StringIO(source).readline))
+        self.assertEqual(tokens, expected_tokens)
+
+    def test_number_starting_with_zero(self):
+        source = "01234"
+        expected_tokens = [
+            TokenInfo(type=token.NUMBER, string='01234', start=(1, 0), end=(1, 5), line='01234'),
+            TokenInfo(type=token.NEWLINE, string='', start=(1, 5), end=(1, 6), line='01234'),
+            TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+
+        tokens = list(generate_tokens(StringIO(source).readline))
+        self.assertEqual(tokens, expected_tokens)
+
 class CTokenizeTest(TestCase):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
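
Per the extra_valid_cases exemption above, "0_7" and "09_99" remain invalid as Python source but now tokenize as a single NUMBER token; a quick hedged check of that expectation:

import token
from io import StringIO
from tokenize import generate_tokens

# These literals would be a SyntaxError if compiled, but the tokenize
# module is now deliberately more liberal about them.
for lit in ("0_7", "09_99"):
    first = next(generate_tokens(StringIO(lit).readline))
    assert first.type == token.NUMBER and first.string == lit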
Misc/NEWS.d entry (new file)

+2

@@ -0,0 +1,2 @@
+Tokenize separately `NUMBER` and `NAME` tokens that are not ambiguous. Patch
+by Pablo Galindo

Parser/tokenizer.c

+10 -3
@@ -1600,8 +1600,12 @@ lookahead(struct tok_state *tok, const char *test)
 }

 static int
-verify_end_of_number(struct tok_state *tok, int c, const char *kind)
-{
+verify_end_of_number(struct tok_state *tok, int c, const char *kind) {
+    if (tok->tok_extra_tokens) {
+        // When we are parsing extra tokens, we don't want to emit warnings
+        // about invalid literals, because we want to be a bit more liberal.
+        return 1;
+    }
     /* Emit a deprecation warning only if the numeric literal is immediately
      * followed by one of keywords which can occur after a numeric literal
      * in valid code: "and", "else", "for", "if", "in", "is" and "or".
@@ -1659,6 +1663,9 @@ verify_end_of_number(struct tok_state *tok, int c, const char *kind)
 static int
 verify_identifier(struct tok_state *tok)
 {
+    if (tok->tok_extra_tokens) {
+        return 1;
+    }
     PyObject *s;
     if (tok->decoding_erred)
         return 0;
@@ -2318,7 +2325,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         else if (c == 'j' || c == 'J') {
             goto imaginary;
         }
-        else if (nonzero) {
+        else if (nonzero && !tok->tok_extra_tokens) {
             /* Old-style octal: now disallowed. */
             tok_backup(tok, c);
             return MAKE_TOKEN(syntaxerror_known_range(
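
The tok_extra_tokens flag consulted by all three guards appears to be set only when the C tokenizer backs the tokenize module, so the compiler's behavior is unchanged: 0-prefixed literals still raise SyntaxError when compiled. A small sketch of the contrast, assuming CPython with this change applied:

from io import StringIO
from tokenize import generate_tokens

# The tokenize module now accepts an old-style octal literal...
print(next(generate_tokens(StringIO("01234").readline)).string)  # 01234

# ...while the compiler path (tok_extra_tokens unset) still rejects it.
try:
    compile("01234", "<example>", "eval")
except SyntaxError as exc:
    print("compile() still raises:", exc.msg)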
