Skip to content

Commit 3b87e52

Browse files
lysnikolaou, FFY00, and serhiy-storchaka
authored
[3.12] gh-107450: Check for overflow in the tokenizer and fix overflow test (GH-110832) (#110931)
(cherry picked from commit a1ac559)

Co-authored-by: Lysandros Nikolaou <[email protected]>
Co-authored-by: Filipe Laíns <[email protected]>
Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent b8e5b1b commit 3b87e52

File tree

4 files changed

+40
-29
lines changed

4 files changed

+40
-29
lines changed

Include/errcode.h

+19 -19
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44
extern "C" {
55
#endif
66

7-
87
/* Error codes passed around between file input, tokenizer, parser and
98
interpreter. This is necessary so we can turn them into Python
109
exceptions at a higher level. Note that some errors have a
@@ -13,24 +12,25 @@ extern "C" {
1312
the parser only returns E_EOF when it hits EOF immediately, and it
1413
never returns E_OK. */
1514

16-
#define E_OK 10 /* No error */
17-
#define E_EOF 11 /* End Of File */
18-
#define E_INTR 12 /* Interrupted */
19-
#define E_TOKEN 13 /* Bad token */
20-
#define E_SYNTAX 14 /* Syntax error */
21-
#define E_NOMEM 15 /* Ran out of memory */
22-
#define E_DONE 16 /* Parsing complete */
23-
#define E_ERROR 17 /* Execution error */
24-
#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */
25-
#define E_OVERFLOW 19 /* Node had too many children */
26-
#define E_TOODEEP 20 /* Too many indentation levels */
27-
#define E_DEDENT 21 /* No matching outer block for dedent */
28-
#define E_DECODE 22 /* Error in decoding into Unicode */
29-
#define E_EOFS 23 /* EOF in triple-quoted string */
30-
#define E_EOLS 24 /* EOL in single-quoted string */
31-
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
32-
#define E_BADSINGLE 27 /* Ill-formed single statement input */
33-
#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */
15+
#define E_OK 10 /* No error */
16+
#define E_EOF 11 /* End Of File */
17+
#define E_INTR 12 /* Interrupted */
18+
#define E_TOKEN 13 /* Bad token */
19+
#define E_SYNTAX 14 /* Syntax error */
20+
#define E_NOMEM 15 /* Ran out of memory */
21+
#define E_DONE 16 /* Parsing complete */
22+
#define E_ERROR 17 /* Execution error */
23+
#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */
24+
#define E_OVERFLOW 19 /* Node had too many children */
25+
#define E_TOODEEP 20 /* Too many indentation levels */
26+
#define E_DEDENT 21 /* No matching outer block for dedent */
27+
#define E_DECODE 22 /* Error in decoding into Unicode */
28+
#define E_EOFS 23 /* EOF in triple-quoted string */
29+
#define E_EOLS 24 /* EOL in single-quoted string */
30+
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
31+
#define E_BADSINGLE 27 /* Ill-formed single statement input */
32+
#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */
33+
#define E_COLUMNOVERFLOW 29 /* Column offset overflow */
3434

3535
#ifdef __cplusplus
3636
}

Lib/test/test_exceptions.py

+12 -4
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,12 @@
1818
from test.support.warnings_helper import check_warnings
1919
from test import support
2020

21+
try:
22+
from _testcapi import INT_MAX
23+
except ImportError:
24+
INT_MAX = 2**31 - 1
25+
26+
2127

2228
class NaiveException(Exception):
2329
def __init__(self, x):
@@ -318,11 +324,13 @@ def baz():
318324
check('(yield i) = 2', 1, 2)
319325
check('def f(*):\n pass', 1, 7)
320326

327+
@unittest.skipIf(INT_MAX >= sys.maxsize, "Downcasting to int is safe for col_offset")
321328
@support.requires_resource('cpu')
322-
@support.bigmemtest(support._2G, memuse=1.5)
323-
def testMemoryErrorBigSource(self, _size):
324-
with self.assertRaises(OverflowError):
325-
exec(f"if True:\n {' ' * 2**31}print('hello world')")
329+
@support.bigmemtest(INT_MAX, memuse=2, dry_run=False)
330+
def testMemoryErrorBigSource(self, size):
331+
src = b"if True:\n%*s" % (size, b"pass")
332+
with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"):
333+
compile(src, '<fragment>', 'exec')
326334

327335
@cpython_only
328336
def testSettingException(self):

Parser/pegen_errors.c

+5 -6
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,7 @@ _Pypegen_tokenizer_error(Parser *p)
6666
const char *msg = NULL;
6767
PyObject* errtype = PyExc_SyntaxError;
6868
Py_ssize_t col_offset = -1;
69+
p->error_indicator = 1;
6970
switch (p->tok->done) {
7071
case E_TOKEN:
7172
msg = "invalid token";
@@ -101,6 +102,10 @@ _Pypegen_tokenizer_error(Parser *p)
101102
msg = "unexpected character after line continuation character";
102103
break;
103104
}
105+
case E_COLUMNOVERFLOW:
106+
PyErr_SetString(PyExc_OverflowError,
107+
"Parser column offset overflow - source line is too big");
108+
return -1;
104109
default:
105110
msg = "unknown parsing error";
106111
}
@@ -233,12 +238,6 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *err
233238
col_offset = 0;
234239
} else {
235240
const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
236-
if (p->tok->cur - start > INT_MAX) {
237-
PyErr_SetString(PyExc_OverflowError,
238-
"Parser column offset overflow - source line is too big");
239-
p->error_indicator = 1;
240-
return NULL;
241-
}
242241
col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
243242
}
244243
} else {

Parser/tokenizer.c

+4
Original file line number | Diff line number | Diff line change
@@ -1366,6 +1366,10 @@ tok_nextc(struct tok_state *tok)
13661366
int rc;
13671367
for (;;) {
13681368
if (tok->cur != tok->inp) {
1369+
if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
1370+
tok->done = E_COLUMNOVERFLOW;
1371+
return EOF;
1372+
}
13691373
tok->col_offset++;
13701374
return Py_CHARMASK(*tok->cur++); /* Fast path */
13711375
}

0 commit comments

Comments
 (0)