From 99047c07dbe8f7c970070df59cf6d46df3b7729b Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou <lisandrosnik@gmail.com>
Date: Mon, 16 Oct 2023 16:42:49 +0200
Subject: [PATCH 1/2] [3.12] gh-107450: Check for overflow in the tokenizer and
 fix overflow test (GH-110832) (cherry picked from commit
 a1ac5590e0f8fe008e5562d22edab65d0c1c5507)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Lysandros Nikolaou <lisandrosnik@gmail.com>
Co-authored-by: Filipe Laíns <lains@riseup.net>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
---
 Include/errcode.h           | 38 ++++++++++++++++++-------------------
 Lib/test/test_exceptions.py | 16 ++++++++++++----
 Parser/pegen_errors.c       |  5 +++++
 Parser/tokenizer.c          |  4 ++++
 4 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/Include/errcode.h b/Include/errcode.h
index 54ae929bf25870..bd9066bb41516f 100644
--- a/Include/errcode.h
+++ b/Include/errcode.h
@@ -4,7 +4,6 @@
 extern "C" {
 #endif
 
-
 /* Error codes passed around between file input, tokenizer, parser and
    interpreter.  This is necessary so we can turn them into Python
    exceptions at a higher level.  Note that some errors have a
@@ -13,24 +12,25 @@ extern "C" {
    the parser only returns E_EOF when it hits EOF immediately, and it
    never returns E_OK. */
 
-#define E_OK            10      /* No error */
-#define E_EOF           11      /* End Of File */
-#define E_INTR          12      /* Interrupted */
-#define E_TOKEN         13      /* Bad token */
-#define E_SYNTAX        14      /* Syntax error */
-#define E_NOMEM         15      /* Ran out of memory */
-#define E_DONE          16      /* Parsing complete */
-#define E_ERROR         17      /* Execution error */
-#define E_TABSPACE      18      /* Inconsistent mixing of tabs and spaces */
-#define E_OVERFLOW      19      /* Node had too many children */
-#define E_TOODEEP       20      /* Too many indentation levels */
-#define E_DEDENT        21      /* No matching outer block for dedent */
-#define E_DECODE        22      /* Error in decoding into Unicode */
-#define E_EOFS          23      /* EOF in triple-quoted string */
-#define E_EOLS          24      /* EOL in single-quoted string */
-#define E_LINECONT      25      /* Unexpected characters after a line continuation */
-#define E_BADSINGLE     27      /* Ill-formed single statement input */
-#define E_INTERACT_STOP 28      /* Interactive mode stopped tokenization */
+#define E_OK             10      /* No error */
+#define E_EOF            11      /* End Of File */
+#define E_INTR           12      /* Interrupted */
+#define E_TOKEN          13      /* Bad token */
+#define E_SYNTAX         14      /* Syntax error */
+#define E_NOMEM          15      /* Ran out of memory */
+#define E_DONE           16      /* Parsing complete */
+#define E_ERROR          17      /* Execution error */
+#define E_TABSPACE       18      /* Inconsistent mixing of tabs and spaces */
+#define E_OVERFLOW       19      /* Node had too many children */
+#define E_TOODEEP        20      /* Too many indentation levels */
+#define E_DEDENT         21      /* No matching outer block for dedent */
+#define E_DECODE         22      /* Error in decoding into Unicode */
+#define E_EOFS           23      /* EOF in triple-quoted string */
+#define E_EOLS           24      /* EOL in single-quoted string */
+#define E_LINECONT       25      /* Unexpected characters after a line continuation */
+#define E_BADSINGLE      27      /* Ill-formed single statement input */
+#define E_INTERACT_STOP  28      /* Interactive mode stopped tokenization */
+#define E_COLUMNOVERFLOW 29      /* Column offset overflow */
 
 #ifdef __cplusplus
 }
diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py
index 9de7e7355e5742..304901515992e8 100644
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -18,6 +18,12 @@
 from test.support.warnings_helper import check_warnings
 from test import support
 
+try:
+    from _testcapi import INT_MAX
+except ImportError:
+    INT_MAX = 2**31 - 1
+
+
 
 class NaiveException(Exception):
     def __init__(self, x):
@@ -318,11 +324,13 @@ def baz():
         check('(yield i) = 2', 1, 2)
         check('def f(*):\n  pass', 1, 7)
 
+    @unittest.skipIf(INT_MAX >= sys.maxsize, "Downcasting to int is safe for col_offset")
     @support.requires_resource('cpu')
-    @support.bigmemtest(support._2G, memuse=1.5)
-    def testMemoryErrorBigSource(self, _size):
-        with self.assertRaises(OverflowError):
-            exec(f"if True:\n {' ' * 2**31}print('hello world')")
+    @support.bigmemtest(INT_MAX, memuse=2, dry_run=False)
+    def testMemoryErrorBigSource(self, size):
+        src = b"if True:\n%*s" % (size, b"pass")
+        with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"):
+            compile(src, '<fragment>', 'exec')
 
     @cpython_only
     def testSettingException(self):
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
index 71c476517375bb..ec671f6bddb898 100644
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -66,6 +66,7 @@ _Pypegen_tokenizer_error(Parser *p)
     const char *msg = NULL;
     PyObject* errtype = PyExc_SyntaxError;
     Py_ssize_t col_offset = -1;
+    p->error_indicator = 1;
     switch (p->tok->done) {
         case E_TOKEN:
             msg = "invalid token";
@@ -101,6 +102,10 @@ _Pypegen_tokenizer_error(Parser *p)
             msg = "unexpected character after line continuation character";
             break;
         }
+        case E_COLUMNOVERFLOW:
+            PyErr_SetString(PyExc_OverflowError,
+                    "Parser column offset overflow - source line is too big");
+            return -1;
         default:
             msg = "unknown parsing error";
     }
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index a7786d0c17e9e3..a59b728e60c1fa 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1366,6 +1366,10 @@ tok_nextc(struct tok_state *tok)
     int rc;
     for (;;) {
         if (tok->cur != tok->inp) {
+            if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
+                tok->done = E_COLUMNOVERFLOW;
+                return EOF;
+            }
             tok->col_offset++;
             return Py_CHARMASK(*tok->cur++); /* Fast path */
         }

From 18e4807f0024238dac384d84ea198eae5efb479c Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou <lisandrosnik@gmail.com>
Date: Mon, 16 Oct 2023 18:14:38 +0200
Subject: [PATCH 2/2] Remove unnecessary check in _PyPegen_raise_error

---
 Parser/pegen_errors.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
index ec671f6bddb898..6390a66719259a 100644
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -238,12 +238,6 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *err
             col_offset = 0;
         } else {
             const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
-            if (p->tok->cur - start > INT_MAX) {
-                PyErr_SetString(PyExc_OverflowError,
-                    "Parser column offset overflow - source line is too big");
-                p->error_indicator = 1;
-                return NULL;
-            }
             col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
         }
     } else {