From 99047c07dbe8f7c970070df59cf6d46df3b7729b Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Mon, 16 Oct 2023 16:42:49 +0200 Subject: [PATCH 1/2] [3.12] gh-107450: Check for overflow in the tokenizer and fix overflow test (GH-110832) (cherry picked from commit a1ac5590e0f8fe008e5562d22edab65d0c1c5507) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Lysandros Nikolaou Co-authored-by: Filipe Laíns Co-authored-by: Serhiy Storchaka --- Include/errcode.h | 38 ++++++++++++++++++------------------- Lib/test/test_exceptions.py | 16 ++++++++++++---- Parser/pegen_errors.c | 5 +++++ Parser/tokenizer.c | 4 ++++ 4 files changed, 40 insertions(+), 23 deletions(-) diff --git a/Include/errcode.h b/Include/errcode.h index 54ae929bf25870..bd9066bb41516f 100644 --- a/Include/errcode.h +++ b/Include/errcode.h @@ -4,7 +4,6 @@ extern "C" { #endif - /* Error codes passed around between file input, tokenizer, parser and interpreter. This is necessary so we can turn them into Python exceptions at a higher level. Note that some errors have a @@ -13,24 +12,25 @@ extern "C" { the parser only returns E_EOF when it hits EOF immediately, and it never returns E_OK. 
*/ -#define E_OK 10 /* No error */ -#define E_EOF 11 /* End Of File */ -#define E_INTR 12 /* Interrupted */ -#define E_TOKEN 13 /* Bad token */ -#define E_SYNTAX 14 /* Syntax error */ -#define E_NOMEM 15 /* Ran out of memory */ -#define E_DONE 16 /* Parsing complete */ -#define E_ERROR 17 /* Execution error */ -#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */ -#define E_OVERFLOW 19 /* Node had too many children */ -#define E_TOODEEP 20 /* Too many indentation levels */ -#define E_DEDENT 21 /* No matching outer block for dedent */ -#define E_DECODE 22 /* Error in decoding into Unicode */ -#define E_EOFS 23 /* EOF in triple-quoted string */ -#define E_EOLS 24 /* EOL in single-quoted string */ -#define E_LINECONT 25 /* Unexpected characters after a line continuation */ -#define E_BADSINGLE 27 /* Ill-formed single statement input */ -#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */ +#define E_OK 10 /* No error */ +#define E_EOF 11 /* End Of File */ +#define E_INTR 12 /* Interrupted */ +#define E_TOKEN 13 /* Bad token */ +#define E_SYNTAX 14 /* Syntax error */ +#define E_NOMEM 15 /* Ran out of memory */ +#define E_DONE 16 /* Parsing complete */ +#define E_ERROR 17 /* Execution error */ +#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */ +#define E_OVERFLOW 19 /* Node had too many children */ +#define E_TOODEEP 20 /* Too many indentation levels */ +#define E_DEDENT 21 /* No matching outer block for dedent */ +#define E_DECODE 22 /* Error in decoding into Unicode */ +#define E_EOFS 23 /* EOF in triple-quoted string */ +#define E_EOLS 24 /* EOL in single-quoted string */ +#define E_LINECONT 25 /* Unexpected characters after a line continuation */ +#define E_BADSINGLE 27 /* Ill-formed single statement input */ +#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */ +#define E_COLUMNOVERFLOW 29 /* Column offset overflow */ #ifdef __cplusplus } diff --git a/Lib/test/test_exceptions.py 
b/Lib/test/test_exceptions.py index 9de7e7355e5742..304901515992e8 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -18,6 +18,12 @@ from test.support.warnings_helper import check_warnings from test import support +try: + from _testcapi import INT_MAX +except ImportError: + INT_MAX = 2**31 - 1 + + class NaiveException(Exception): def __init__(self, x): @@ -318,11 +324,13 @@ def baz(): check('(yield i) = 2', 1, 2) check('def f(*):\n pass', 1, 7) + @unittest.skipIf(INT_MAX >= sys.maxsize, "Downcasting to int is safe for col_offset") @support.requires_resource('cpu') - @support.bigmemtest(support._2G, memuse=1.5) - def testMemoryErrorBigSource(self, _size): - with self.assertRaises(OverflowError): - exec(f"if True:\n {' ' * 2**31}print('hello world')") + @support.bigmemtest(INT_MAX, memuse=2, dry_run=False) + def testMemoryErrorBigSource(self, size): + src = b"if True:\n%*s" % (size, b"pass") + with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"): + compile(src, '', 'exec') @cpython_only def testSettingException(self): diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c index 71c476517375bb..ec671f6bddb898 100644 --- a/Parser/pegen_errors.c +++ b/Parser/pegen_errors.c @@ -66,6 +66,7 @@ _Pypegen_tokenizer_error(Parser *p) const char *msg = NULL; PyObject* errtype = PyExc_SyntaxError; Py_ssize_t col_offset = -1; + p->error_indicator = 1; switch (p->tok->done) { case E_TOKEN: msg = "invalid token"; @@ -101,6 +102,10 @@ _Pypegen_tokenizer_error(Parser *p) msg = "unexpected character after line continuation character"; break; } + case E_COLUMNOVERFLOW: + PyErr_SetString(PyExc_OverflowError, + "Parser column offset overflow - source line is too big"); + return -1; default: msg = "unknown parsing error"; } diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index a7786d0c17e9e3..a59b728e60c1fa 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1366,6 +1366,10 @@ tok_nextc(struct tok_state *tok) int rc; for 
(;;) { if (tok->cur != tok->inp) { + if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) { + tok->done = E_COLUMNOVERFLOW; + return EOF; + } tok->col_offset++; return Py_CHARMASK(*tok->cur++); /* Fast path */ } From 18e4807f0024238dac384d84ea198eae5efb479c Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Mon, 16 Oct 2023 18:14:38 +0200 Subject: [PATCH 2/2] Remove unnecessary check in _PyPegen_raise_errors --- Parser/pegen_errors.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c index ec671f6bddb898..6390a66719259a 100644 --- a/Parser/pegen_errors.c +++ b/Parser/pegen_errors.c @@ -238,12 +238,6 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *err col_offset = 0; } else { const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf; - if (p->tok->cur - start > INT_MAX) { - PyErr_SetString(PyExc_OverflowError, - "Parser column offset overflow - source line is too big"); - p->error_indicator = 1; - return NULL; - } col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int); } } else {