Skip to content

Commit 3036fad

Browse files
committed
gh-107450: Check for overflow in the tokenizer and fix overflow test
1 parent e1d8c65 commit 3036fad

File tree

4 files changed

+44
-22
lines changed

4 files changed

+44
-22
lines changed

Include/errcode.h

+19 -18
Original file line number | Diff line number | Diff line change
@@ -19,24 +19,25 @@
1919
extern "C" {
2020
#endif
2121

22-
#define E_OK 10 /* No error */
23-
#define E_EOF 11 /* End Of File */
24-
#define E_INTR 12 /* Interrupted */
25-
#define E_TOKEN 13 /* Bad token */
26-
#define E_SYNTAX 14 /* Syntax error */
27-
#define E_NOMEM 15 /* Ran out of memory */
28-
#define E_DONE 16 /* Parsing complete */
29-
#define E_ERROR 17 /* Execution error */
30-
#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */
31-
#define E_OVERFLOW 19 /* Node had too many children */
32-
#define E_TOODEEP 20 /* Too many indentation levels */
33-
#define E_DEDENT 21 /* No matching outer block for dedent */
34-
#define E_DECODE 22 /* Error in decoding into Unicode */
35-
#define E_EOFS 23 /* EOF in triple-quoted string */
36-
#define E_EOLS 24 /* EOL in single-quoted string */
37-
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
38-
#define E_BADSINGLE 27 /* Ill-formed single statement input */
39-
#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */
22+
#define E_OK 10 /* No error */
23+
#define E_EOF 11 /* End Of File */
24+
#define E_INTR 12 /* Interrupted */
25+
#define E_TOKEN 13 /* Bad token */
26+
#define E_SYNTAX 14 /* Syntax error */
27+
#define E_NOMEM 15 /* Ran out of memory */
28+
#define E_DONE 16 /* Parsing complete */
29+
#define E_ERROR 17 /* Execution error */
30+
#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */
31+
#define E_OVERFLOW 19 /* Node had too many children */
32+
#define E_TOODEEP 20 /* Too many indentation levels */
33+
#define E_DEDENT 21 /* No matching outer block for dedent */
34+
#define E_DECODE 22 /* Error in decoding into Unicode */
35+
#define E_EOFS 23 /* EOF in triple-quoted string */
36+
#define E_EOLS 24 /* EOL in single-quoted string */
37+
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
38+
#define E_BADSINGLE 27 /* Ill-formed single statement input */
39+
#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */
40+
#define E_COLUMNOVERFLOW 29 /* Column offset overflow */
4041

4142
#ifdef __cplusplus
4243
}

Lib/test/test_exceptions.py

+16 -4
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# Python test set -- part 5, built-in exceptions
22

33
import copy
4+
import ctypes
45
import os
56
import sys
67
import unittest
@@ -318,11 +319,22 @@ def baz():
318319
check('(yield i) = 2', 1, 2)
319320
check('def f(*):\n pass', 1, 7)
320321

322+
@unittest.skipIf(ctypes.sizeof(ctypes.c_int) >= ctypes.sizeof(ctypes.c_ssize_t),
323+
"Downcasting to int is safe for col_offset")
321324
@support.requires_resource('cpu')
322-
@support.bigmemtest(support._2G, memuse=1.5)
323-
def testMemoryErrorBigSource(self, _size):
324-
with self.assertRaises(OverflowError):
325-
exec(f"if True:\n {' ' * 2**31}print('hello world')")
325+
@support.bigmemtest(2**(ctypes.sizeof(ctypes.c_int)*8-1)-1-len("pass"), memuse=1)
326+
def testMemoryErrorBigSource(self, size):
327+
if size < 2**(ctypes.sizeof(ctypes.c_int)*8-1)-1-len("pass"):
328+
self.skipTest('Not enough memory for overflow to occur')
329+
330+
# Construct buffer to hold just enough characters so that the tokenizer offset overflows.
331+
# This makes sure that we don't overflow in the string creation itself
332+
distance_to_prev_divisible_by_8 = size & 7
333+
padding = ' ' * distance_to_prev_divisible_by_8
334+
padding += ' ' * ((size - distance_to_prev_divisible_by_8) // 8)
335+
336+
with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"):
337+
exec(f"if True:\n{padding}pass")
326338

327339
@cpython_only
328340
def testSettingException(self):

Parser/lexer/lexer.c

+4
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,10 @@ tok_nextc(struct tok_state *tok)
5959
int rc;
6060
for (;;) {
6161
if (tok->cur != tok->inp) {
62+
if (INT_MAX - tok->col_offset - 1 < 0) {
63+
tok->done = E_COLUMNOVERFLOW;
64+
return EOF;
65+
}
6266
tok->col_offset++;
6367
return Py_CHARMASK(*tok->cur++); /* Fast path */
6468
}

Parser/pegen_errors.c

+5
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,7 @@ _Pypegen_tokenizer_error(Parser *p)
6868
const char *msg = NULL;
6969
PyObject* errtype = PyExc_SyntaxError;
7070
Py_ssize_t col_offset = -1;
71+
p->error_indicator = 1;
7172
switch (p->tok->done) {
7273
case E_TOKEN:
7374
msg = "invalid token";
@@ -103,6 +104,10 @@ _Pypegen_tokenizer_error(Parser *p)
103104
msg = "unexpected character after line continuation character";
104105
break;
105106
}
107+
case E_COLUMNOVERFLOW:
108+
PyErr_SetString(PyExc_OverflowError,
109+
"Parser column offset overflow - source line is too big");
110+
return -1;
106111
default:
107112
msg = "unknown parsing error";
108113
}

0 commit comments

Comments
 (0)