Skip to content

gh-107450: Check for overflow in the tokenizer and fix overflow test #110832

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 19 additions & 18 deletions Include/errcode.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,25 @@
extern "C" {
#endif

#define E_OK 10 /* No error */
#define E_EOF 11 /* End Of File */
#define E_INTR 12 /* Interrupted */
#define E_TOKEN 13 /* Bad token */
#define E_SYNTAX 14 /* Syntax error */
#define E_NOMEM 15 /* Ran out of memory */
#define E_DONE 16 /* Parsing complete */
#define E_ERROR 17 /* Execution error */
#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */
#define E_OVERFLOW 19 /* Node had too many children */
#define E_TOODEEP 20 /* Too many indentation levels */
#define E_DEDENT 21 /* No matching outer block for dedent */
#define E_DECODE 22 /* Error in decoding into Unicode */
#define E_EOFS 23 /* EOF in triple-quoted string */
#define E_EOLS 24 /* EOL in single-quoted string */
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
#define E_BADSINGLE 27 /* Ill-formed single statement input */
#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */
#define E_OK 10 /* No error */
#define E_EOF 11 /* End Of File */
#define E_INTR 12 /* Interrupted */
#define E_TOKEN 13 /* Bad token */
#define E_SYNTAX 14 /* Syntax error */
#define E_NOMEM 15 /* Ran out of memory */
#define E_DONE 16 /* Parsing complete */
#define E_ERROR 17 /* Execution error */
#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */
#define E_OVERFLOW 19 /* Node had too many children */
#define E_TOODEEP 20 /* Too many indentation levels */
#define E_DEDENT 21 /* No matching outer block for dedent */
#define E_DECODE 22 /* Error in decoding into Unicode */
#define E_EOFS 23 /* EOF in triple-quoted string */
#define E_EOLS 24 /* EOL in single-quoted string */
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
#define E_BADSINGLE 27 /* Ill-formed single statement input */
#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */
#define E_COLUMNOVERFLOW 29 /* Column offset overflow */

#ifdef __cplusplus
}
Expand Down
16 changes: 12 additions & 4 deletions Lib/test/test_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
from test.support.warnings_helper import check_warnings
from test import support

try:
from _testcapi import INT_MAX
except ImportError:
INT_MAX = 2**31 - 1



class NaiveException(Exception):
def __init__(self, x):
Expand Down Expand Up @@ -318,11 +324,13 @@ def baz():
check('(yield i) = 2', 1, 2)
check('def f(*):\n pass', 1, 7)

@unittest.skipIf(INT_MAX >= sys.maxsize, "Downcasting to int is safe for col_offset")
@support.requires_resource('cpu')
@support.bigmemtest(support._2G, memuse=1.5)
def testMemoryErrorBigSource(self, _size):
with self.assertRaises(OverflowError):
exec(f"if True:\n {' ' * 2**31}print('hello world')")
@support.bigmemtest(INT_MAX, memuse=2, dry_run=False)
def testMemoryErrorBigSource(self, size):
src = b"if True:\n%*s" % (size, b"pass")
with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"):
compile(src, '<fragment>', 'exec')

@cpython_only
def testSettingException(self):
Expand Down
4 changes: 4 additions & 0 deletions Parser/lexer/lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ tok_nextc(struct tok_state *tok)
int rc;
for (;;) {
if (tok->cur != tok->inp) {
if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
tok->done = E_COLUMNOVERFLOW;
return EOF;
}
tok->col_offset++;
return Py_CHARMASK(*tok->cur++); /* Fast path */
}
Expand Down
5 changes: 5 additions & 0 deletions Parser/pegen_errors.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ _Pypegen_tokenizer_error(Parser *p)
const char *msg = NULL;
PyObject* errtype = PyExc_SyntaxError;
Py_ssize_t col_offset = -1;
p->error_indicator = 1;
switch (p->tok->done) {
case E_TOKEN:
msg = "invalid token";
Expand Down Expand Up @@ -103,6 +104,10 @@ _Pypegen_tokenizer_error(Parser *p)
msg = "unexpected character after line continuation character";
break;
}
case E_COLUMNOVERFLOW:
PyErr_SetString(PyExc_OverflowError,
"Parser column offset overflow - source line is too big");
return -1;
default:
msg = "unknown parsing error";
}
Expand Down