
Commit 2c02c68

[3.12] gh-104976: Ensure trailing dedent tokens are emitted as the previous tokenizer (GH-104980) (#105000)
1 parent: 05189f3 · commit: 2c02c68

4 files changed: +34, -15 lines

Lib/test/test_tokenize.py (+8, -7)

@@ -82,7 +82,7 @@ def test_basic(self):
     NAME 'False' (4, 11) (4, 16)
     COMMENT '# NEWLINE' (4, 17) (4, 26)
     NEWLINE '\\n' (4, 26) (4, 27)
-    DEDENT '' (4, 27) (4, 27)
+    DEDENT '' (5, 0) (5, 0)
     """)
         indent_error_file = b"""\
 def k(x):
@@ -755,8 +755,8 @@ def test_tabs(self):
     NEWLINE '\\n' (2, 5) (2, 6)
     INDENT ' \\t' (3, 0) (3, 9)
     NAME 'pass' (3, 9) (3, 13)
-    DEDENT '' (3, 14) (3, 14)
-    DEDENT '' (3, 14) (3, 14)
+    DEDENT '' (4, 0) (4, 0)
+    DEDENT '' (4, 0) (4, 0)
     """)
 
     def test_non_ascii_identifiers(self):
@@ -968,7 +968,7 @@ async def foo():
     NUMBER '1' (2, 17) (2, 18)
     OP ':' (2, 18) (2, 19)
     NAME 'pass' (2, 20) (2, 24)
-    DEDENT '' (2, 25) (2, 25)
+    DEDENT '' (3, 0) (3, 0)
     """)
 
         self.check_tokenize('''async def foo(async): await''', """\
@@ -1016,7 +1016,7 @@ async def bar(): pass
     NAME 'await' (6, 2) (6, 7)
     OP '=' (6, 8) (6, 9)
     NUMBER '2' (6, 10) (6, 11)
-    DEDENT '' (6, 12) (6, 12)
+    DEDENT '' (7, 0) (7, 0)
     """)
 
         self.check_tokenize('''\
@@ -1054,7 +1054,7 @@ async def bar(): pass
     NAME 'await' (6, 2) (6, 7)
     OP '=' (6, 8) (6, 9)
     NUMBER '2' (6, 10) (6, 11)
-    DEDENT '' (6, 12) (6, 12)
+    DEDENT '' (7, 0) (7, 0)
     """)
 
     def test_newline_after_parenthesized_block_with_comment(self):
@@ -2680,7 +2680,8 @@ def generate_source(indents):
 
         valid = generate_source(MAXINDENT - 1)
         tokens = list(_generate_tokens_from_c_tokenizer(valid))
-        self.assertEqual(tokens[-1].type, DEDENT)
+        self.assertEqual(tokens[-2].type, DEDENT)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
         compile(valid, "<string>", "exec")
 
         invalid = generate_source(MAXINDENT)
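The tests above pin down the user-visible change: trailing DEDENT tokens are again reported at column 0 of the line after the last line of input, as the pure-Python tokenizer did in 3.11, and the stream now ends with the DEDENTs followed by ENDMARKER. A minimal sketch to observe this (not part of the commit; the expected output follows from the test expectations above):

    import io
    import tokenize

    source = "if False:\n    pass\n"
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        print(tokenize.tok_name[tok.type], tok.start, tok.end)

    # With this fix, the stream ends with:
    #   DEDENT (3, 0) (3, 0)      <- line after the last source line, column 0
    #   ENDMARKER (3, 0) (3, 0)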

Lib/tokenize.py (-5)

@@ -447,13 +447,8 @@ def tokenize(readline):
 
 def _tokenize(rl_gen, encoding):
     source = b"".join(rl_gen).decode(encoding)
-    token = None
     for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
         yield token
-    if token is not None:
-        last_line, _ = token.start
-        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
-
 
 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
Misc/NEWS.d entry (new file, +3)

@@ -0,0 +1,3 @@
+Ensure that trailing ``DEDENT`` :class:`tokenize.TokenInfo` objects emitted
+by the :mod:`tokenize` module are reported as in Python 3.11. Patch by Pablo
+Galindo

Python/Python-tokenize.c (+23, -3)

@@ -30,6 +30,7 @@ class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_t
 typedef struct
 {
     PyObject_HEAD struct tok_state *tok;
+    int done;
 } tokenizeriterobject;
 
 /*[clinic input]
@@ -63,6 +64,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (extra_tokens) {
         self->tok->tok_extra_tokens = 1;
     }
+    self->done = 0;
     return (PyObject *)self;
 }
 
@@ -179,8 +181,9 @@ tokenizeriter_next(tokenizeriterobject *it)
         }
         goto exit;
     }
-    if (type == ERRORTOKEN || type == ENDMARKER) {
+    if (it->done || type == ERRORTOKEN) {
         PyErr_SetString(PyExc_StopIteration, "EOF");
+        it->done = 1;
         goto exit;
     }
     PyObject *str = NULL;
@@ -194,9 +197,19 @@ tokenizeriter_next(tokenizeriterobject *it)
         goto exit;
     }
 
+    int is_trailing_token = 0;
+    if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
+        is_trailing_token = 1;
+    }
+
     const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
-    Py_ssize_t size = it->tok->inp - line_start;
-    PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+    PyObject* line = NULL;
+    if (it->tok->tok_extra_tokens && is_trailing_token) {
+        line = PyUnicode_FromString("");
+    } else {
+        Py_ssize_t size = it->tok->inp - line_start;
+        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+    }
     if (line == NULL) {
         Py_DECREF(str);
         goto exit;
@@ -214,6 +227,10 @@ tokenizeriter_next(tokenizeriterobject *it)
     }
 
     if (it->tok->tok_extra_tokens) {
+        if (is_trailing_token) {
+            lineno = end_lineno = lineno + 1;
+            col_offset = end_col_offset = 0;
+        }
         // Necessary adjustments to match the original Python tokenize
         // implementation
         if (type > DEDENT && type < OP) {
@@ -231,6 +248,9 @@ tokenizeriter_next(tokenizeriterobject *it)
     result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 exit:
     _PyToken_Free(&token);
+    if (type == ENDMARKER) {
+        it->done = 1;
+    }
     return result;
 }
 
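Taken together: the iterator grows a done flag so that ENDMARKER is yielded exactly once before StopIteration, and a trailing token (ENDMARKER, or a DEDENT emitted once the tokenizer has reached E_EOF) is reported with an empty line attribute at column 0 of the line after the input. A rough Python rendering of that adjustment (simplified sketch; the names here are illustrative, not CPython API):

    def adjust_trailing(tok_type, at_eof, lineno, col, end_lineno, end_col, line):
        # ENDMARKER, or a DEDENT emitted at EOF, counts as "trailing".
        is_trailing = tok_type == "ENDMARKER" or (tok_type == "DEDENT" and at_eof)
        if is_trailing:
            line = ""                         # mirrors PyUnicode_FromString("")
            lineno = end_lineno = lineno + 1  # line after the input
            col = end_col = 0
        return lineno, col, end_lineno, end_col, line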