gh-105549: Tokenize separately NUMBER and NAME tokens and allow 0-prefixed literals #105555

Merged · merged 2 commits · Jun 9, 2023
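For context, this PR makes the tokenize module split input such as 2sin(x) into separate NUMBER and NAME tokens instead of failing. A minimal sketch of the resulting behavior (not part of the diff; assumes Python 3.12+, where this change landed):

import io
import tokenize

# "2sin(x)" is not valid Python, but tokenize now yields NUMBER '2'
# followed by NAME 'sin' rather than raising.
for tok in tokenize.generate_tokens(io.StringIO("2sin(x)").readline):
    print(tokenize.tok_name[tok.type], repr(tok.string))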
33 changes: 33 additions & 0 deletions Lib/test/test_tokenize.py
@@ -284,7 +284,12 @@ def number_token(s):
# this won't work with compound complex inputs
continue
self.assertEqual(number_token(lit), lit)
# Valid cases with extra underscores in the tokenize module
# See gh-105549 for context
extra_valid_cases = {"0_7", "09_99"}
for lit in INVALID_UNDERSCORE_LITERALS:
if lit in extra_valid_cases:
continue
try:
number_token(lit)
except TokenError:
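A sketch of what the extra_valid_cases carve-out above covers (assumes Python 3.12+): literals such as 0_7 remain invalid for the compiler, but the tokenize module no longer raises on them.

import io
import tokenize

# tokenize accepts "0_7" without a TokenError after this change...
list(tokenize.generate_tokens(io.StringIO("0_7").readline))

# ...while compile() still rejects it as invalid source.
try:
    compile("0_7", "<example>", "eval")
except SyntaxError as exc:
    print("still a SyntaxError:", exc.msg)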
@@ -1873,6 +1878,34 @@ def test_indentation_semantics_retained(self):
self.check_roundtrip(code)


class InvalidPythonTests(TestCase):
def test_number_followed_by_name(self):
# See gh-105549
source = "2sin(x)"
expected_tokens = [
TokenInfo(type=token.NUMBER, string='2', start=(1, 0), end=(1, 1), line='2sin(x)'),
TokenInfo(type=token.NAME, string='sin', start=(1, 1), end=(1, 4), line='2sin(x)'),
TokenInfo(type=token.OP, string='(', start=(1, 4), end=(1, 5), line='2sin(x)'),
TokenInfo(type=token.NAME, string='x', start=(1, 5), end=(1, 6), line='2sin(x)'),
TokenInfo(type=token.OP, string=')', start=(1, 6), end=(1, 7), line='2sin(x)'),
TokenInfo(type=token.NEWLINE, string='', start=(1, 7), end=(1, 8), line='2sin(x)'),
TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
]

tokens = list(generate_tokens(StringIO(source).readline))
self.assertEqual(tokens, expected_tokens)

def test_number_starting_with_zero(self):
source = "01234"
expected_tokens = [
TokenInfo(type=token.NUMBER, string='01234', start=(1, 0), end=(1, 5), line='01234'),
TokenInfo(type=token.NEWLINE, string='', start=(1, 5), end=(1, 6), line='01234'),
TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
]

tokens = list(generate_tokens(StringIO(source).readline))
self.assertEqual(tokens, expected_tokens)
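A plausible use of this liberality (a hedged reading of gh-105549): tokenize-based tools can round-trip source that the compiler rejects. A minimal sketch:

import io
import tokenize

src = "01234\n"
toks = tokenize.generate_tokens(io.StringIO(src).readline)
# untokenize() reconstructs the original text from the token stream,
# even though "01234" would be a SyntaxError if compiled.
print(tokenize.untokenize(toks) == src)  # True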

class CTokenizeTest(TestCase):
def check_tokenize(self, s, expected):
# Format the tokens in s in a table format.
2 changes: 2 additions & 0 deletions Misc/NEWS.d/… (new file)
@@ -0,0 +1,2 @@
Tokenize separately `NUMBER` and `NAME` tokens that are not ambiguous. Patch
by Pablo Galindo
13 changes: 10 additions & 3 deletions Parser/tokenizer.c
@@ -1600,8 +1600,12 @@ lookahead(struct tok_state *tok, const char *test)
}

static int
-verify_end_of_number(struct tok_state *tok, int c, const char *kind)
-{
+verify_end_of_number(struct tok_state *tok, int c, const char *kind) {
if (tok->tok_extra_tokens) {
// When we are parsing extra tokens, we don't want to emit warnings
// about invalid literals, because we want to be a bit more liberal.
return 1;
}
/* Emit a deprecation warning only if the numeric literal is immediately
* followed by one of keywords which can occur after a numeric literal
* in valid code: "and", "else", "for", "if", "in", "is" and "or".
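To illustrate the warning described in the comment above from the Python side (a sketch; the exact warning category has varied across releases, appearing as SyntaxWarning in recent ones):

import warnings

# "1if" puts the keyword "if" immediately after a numeric literal;
# the tokenizer warns instead of raising a syntax error.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    compile("x = 1if True else 2", "<example>", "exec")
for w in caught:
    print(w.category.__name__, w.message)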
@@ -1659,6 +1663,9 @@ verify_end_of_number(struct tok_state *tok, int c, const char *kind)
static int
verify_identifier(struct tok_state *tok)
{
if (tok->tok_extra_tokens) {
return 1;
}
PyObject *s;
if (tok->decoding_erred)
return 0;
@@ -2318,7 +2325,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
else if (c == 'j' || c == 'J') {
goto imaginary;
}
-else if (nonzero) {
+else if (nonzero && !tok->tok_extra_tokens) {
/* Old-style octal: now disallowed. */
tok_backup(tok, c);
return MAKE_TOKEN(syntaxerror_known_range(
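The branch above produces the usual old-style-octal error during normal compilation and is now skipped when tok_extra_tokens is set (as it is for the tokenize module). A sketch of both sides (assumes Python 3.12+):

import io
import tokenize

# compile() still reaches the syntaxerror_known_range() branch...
try:
    compile("0123", "<example>", "eval")
except SyntaxError as exc:
    print(exc.msg)  # leading zeros are not permitted

# ...while the tokenize module does not.
print([t.string for t in tokenize.generate_tokens(io.StringIO("0123").readline)])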