From 8909d1481f4e1a96e9d01ba4fa4a776dccfea635 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Tue, 19 Apr 2022 14:43:01 +0300
Subject: [PATCH 1/3] gh-91760: More strict rules for numerical group
 references and group names in RE

Only a sequence of ASCII digits not starting with 0 (except group 0) is
now accepted as a numerical reference.
The group name in bytes patterns and replacement strings can now only
contain ASCII letters, digits and underscores.
---
 Doc/library/re.rst                            | 16 ++++-
 Doc/whatsnew/3.11.rst                         |  8 +++
 Lib/re/_parser.py                             | 59 +++++++++----------
 Lib/test/test_re.py                           | 29 +++++++++
 ...2-04-21-19-14-29.gh-issue-91760.54AR-m.rst |  5 ++
 5 files changed, 84 insertions(+), 33 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index 89de9286ace79c..c65aaf58990b73 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -395,7 +395,8 @@ The special characters are:
 ``(?P<name>...)``
    Similar to regular parentheses, but the substring matched by the group is
    accessible via the symbolic group name *name*.  Group names must be valid
-   Python identifiers, and each group name must be defined only once within a
+   Python identifiers, and in bytes patterns they must contain only characters
+   in the ASCII range.  Each group name must be defined only once within a
    regular expression.  A symbolic group is also a numbered group, just as if
    the group were not named.
 
@@ -417,6 +418,10 @@ The special characters are:
    |                                       | * ``\1``                         |
    +---------------------------------------+----------------------------------+
 
+   .. versionchanged:: 3.11
+      In bytes patterns group names must contain only characters in
+      the ASCII range.
+
 .. index:: single: (?P=; in regular expressions
 
 ``(?P=name)``
@@ -486,6 +491,9 @@ The special characters are:
    will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
    not with ``'<user@host.com'`` nor ``'user@host.com>'``.
 
+   .. versionchanged:: 3.11
+      Group *id* can only contain ASCII digits and cannot start with ``0``.
+
 
 The special sequences consist of ``'\'`` and a character from the list below.
 If the ordinary character is not an ASCII digit or an ASCII letter, then the
@@ -995,6 +1003,12 @@ form.
       Empty matches for the pattern are replaced when adjacent to a previous
       non-empty match.
 
+   .. versionchanged:: 3.11
+      Group *id* can only contain ASCII digits and cannot start with ``0``
+      (except group 0).
+      In bytes replacement strings group names must contain only characters
+      in the ASCII range.
+
 
 .. function:: subn(pattern, repl, string, count=0, flags=0)
 
diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
index 6540a255a0ed82..88881e5c7180e1 100644
--- a/Doc/whatsnew/3.11.rst
+++ b/Doc/whatsnew/3.11.rst
@@ -1060,6 +1060,14 @@ Changes in the Python API
   before.
   (Contributed by Ma Lin in :issue:`35859`.)
 
+* More strict rules are now applied for numerical group references and
+  group names in regular expressions.
+  Only sequence of ASCII digits not starting with ``0`` (except group 0) is
+  now accepted as a numerical reference.
+  The group name in bytes patterns and replacement strings can now only
+  contain ASCII letters and digits and underscore.
+  (Contributed by Serhiy Storchaka in :issue:`91760`.)
+
 * The *population* parameter of :func:`random.sample` must be a sequence.
   Automatic conversion of sets to lists is no longer supported. If the sample size
   is larger than the population size, a :exc:`ValueError` is raised.
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index f191f809a1491e..486bf8c2d307a6 100644
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -295,6 +295,14 @@ def seek(self, index):
     def error(self, msg, offset=0):
         return error(msg, self.string, self.tell() - offset)
 
+    def checkgroupname(self, name, offset):
+        if not (self.istext or name.isascii()):
+            msg = "bad character in group name %a" % name
+            raise self.error(msg, len(name) + offset)
+        if not name.isidentifier():
+            msg = "bad character in group name %r" % name
+            raise self.error(msg, len(name) + offset)
+
 def _class_escape(source, escape):
     # handle escape code inside character class
     code = ESCAPES.get(escape)
@@ -707,15 +715,11 @@ def _parse(source, state, verbose, nested, first=False):
                     if sourcematch("<"):
                         # named group: skip forward to end of name
                         name = source.getuntil(">", "group name")
-                        if not name.isidentifier():
-                            msg = "bad character in group name %r" % name
-                            raise source.error(msg, len(name) + 1)
+                        source.checkgroupname(name, 1)
                     elif sourcematch("="):
                         # named backreference
                         name = source.getuntil(")", "group name")
-                        if not name.isidentifier():
-                            msg = "bad character in group name %r" % name
-                            raise source.error(msg, len(name) + 1)
+                        source.checkgroupname(name, 1)
                         gid = state.groupdict.get(name)
                         if gid is None:
                             msg = "unknown group name %r" % name
@@ -776,25 +780,21 @@ def _parse(source, state, verbose, nested, first=False):
                 elif char == "(":
                     # conditional backreference group
                     condname = source.getuntil(")", "group name")
-                    if condname.isidentifier():
-                        condgroup = state.groupdict.get(condname)
-                        if condgroup is None:
-                            msg = "unknown group name %r" % condname
-                            raise source.error(msg, len(condname) + 1)
-                    else:
-                        try:
-                            condgroup = int(condname)
-                            if condgroup < 0:
-                                raise ValueError
-                        except ValueError:
-                            msg = "bad character in group name %r" % condname
-                            raise source.error(msg, len(condname) + 1) from None
+                    if (condname.isdecimal() and condname.isascii() and
+                            (condname[0] != "0" or condname == "0")):
+                        condgroup = int(condname)
                         if not condgroup:
                             raise source.error("bad group number",
                                                len(condname) + 1)
                         if condgroup >= MAXGROUPS:
                             msg = "invalid group reference %d" % condgroup
                             raise source.error(msg, len(condname) + 1)
+                    else:
+                        source.checkgroupname(condname, 1)
+                        condgroup = state.groupdict.get(condname)
+                        if condgroup is None:
+                            msg = "unknown group name %r" % condname
+                            raise source.error(msg, len(condname) + 1)
                     state.checklookbehindgroup(condgroup, source)
                     item_yes = _parse(source, state, verbose, nested + 1)
                     if source.match("|"):
@@ -1006,26 +1006,21 @@ def addgroup(index, pos):
             # group
             c = this[1]
             if c == "g":
-                name = ""
                 if not s.match("<"):
                     raise s.error("missing <")
                 name = s.getuntil(">", "group name")
-                if name.isidentifier():
+                if (name.isdecimal() and name.isascii() and
+                        (name[0] != "0" or name == "0")):
+                    index = int(name)
+                    if index >= MAXGROUPS:
+                        raise s.error("invalid group reference %d" % index,
+                                      len(name) + 1)
+                else:
+                    s.checkgroupname(name, 1)
                     try:
                         index = groupindex[name]
                     except KeyError:
                         raise IndexError("unknown group name %r" % name) from None
-                else:
-                    try:
-                        index = int(name)
-                        if index < 0:
-                            raise ValueError
-                    except ValueError:
-                        raise s.error("bad character in group name %r" % name,
-                                      len(name) + 1) from None
-                    if index >= MAXGROUPS:
-                        raise s.error("invalid group reference %d" % index,
-                                      len(name) + 1)
                 addgroup(index, len(name) + 1)
             elif c == "0":
                 if s.next in OCTDIGITS:
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 781bfd6ea2edac..3316296ffc86e0 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -135,6 +135,7 @@ def test_basic_re_sub(self):
         self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
         self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
         self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
+        self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx')
 
         self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
         self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
@@ -274,6 +275,12 @@ def test_symbolic_groups_errors(self):
         self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
         self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
         self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
+        self.checkPatternError(b'(?P<\xc2\xb5>x)',
+                               r"bad character in group name '\xc2\xb5'", 4)
+        self.checkPatternError(b'(?P=\xc2\xb5)',
+                               r"bad character in group name '\xc2\xb5'", 4)
+        self.checkPatternError(b'(?(\xc2\xb5)y)',
+                               r"bad character in group name '\xc2\xb5'", 3)
 
     def test_symbolic_refs(self):
         self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
@@ -306,12 +313,24 @@ def test_symbolic_refs_errors(self):
             re.sub('(?P<a>x)', r'\g<ab>', 'xx')
         self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                                 "bad character in group name '-1'", 3)
+        self.checkTemplateError('(?P<a>x)', r'\g<+1>', 'xx',
+                                "bad character in group name '+1'", 3)
+        self.checkTemplateError('(?P<a>x)', r'\g<01>', 'xx',
+                                "bad character in group name '01'", 3)
+        self.checkTemplateError('()'*10, r'\g<1_0>', 'xx',
+                                "bad character in group name '1_0'", 3)
+        self.checkTemplateError('(?P<a>x)', r'\g< 1 >', 'xx',
+                                "bad character in group name ' 1 '", 3)
         self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                                 "bad character in group name '©'", 3)
+        self.checkTemplateError(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx',
+                                r"bad character in group name '\xc2\xb5'", 3)
         self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
                                 "bad character in group name '㊀'", 3)
         self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
                                 "bad character in group name '¹'", 3)
+        self.checkTemplateError('(?P<a>x)', r'\g<१>', 'xx',
+                                "bad character in group name '१'", 3)
 
     def test_re_subn(self):
         self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@@ -577,10 +596,20 @@ def test_re_groupref_exists_errors(self):
         self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
         self.checkPatternError(r'()(?(-1)a|b)',
                                "bad character in group name '-1'", 5)
+        self.checkPatternError(r'()(?(+1)a|b)',
+                               "bad character in group name '+1'", 5)
+        self.checkPatternError(r'()(?(01)a|b)',
+                               "bad character in group name '01'", 5)
+        self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)',
+                               "bad character in group name '1_0'", 23)
+        self.checkPatternError(r'()(?( 1 )a|b)',
+                               "bad character in group name ' 1 '", 5)
         self.checkPatternError(r'()(?(㊀)a|b)',
                                "bad character in group name '㊀'", 5)
         self.checkPatternError(r'()(?(¹)a|b)',
                                "bad character in group name '¹'", 5)
+        self.checkPatternError(r'()(?(१)a|b)',
+                               "bad character in group name '१'", 5)
         self.checkPatternError(r'()(?(1',
                                "missing ), unterminated name", 5)
         self.checkPatternError(r'()(?(1)a',
diff --git a/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst b/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst
new file mode 100644
index 00000000000000..439313c433bf36
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst
@@ -0,0 +1,5 @@
+Apply more strict rules for numerical group references and group names in
+regular expressions. Only sequence of ASCII digits not starting with 0
+(except group 0) is now accepted as a numerical reference. The group name in
+bytes patterns and replacement strings can now only contain ASCII letters
+and digits and underscore.

From 5026649f9833eb0879a7951f8aeb003414695cf1 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sun, 24 Apr 2022 22:42:18 +0300
Subject: [PATCH 2/3] Address review comments and minimize the diff.

---
 Doc/library/re.rst                            |  5 ++--
 Doc/whatsnew/3.11.rst                         |  5 ++--
 Lib/re/_parser.py                             | 28 +++++++++----------
 Lib/test/test_re.py                           |  4 ---
 ...2-04-21-19-14-29.gh-issue-91760.54AR-m.rst |  4 +--
 5 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index c65aaf58990b73..714d2f7ffd0dfe 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -492,7 +492,7 @@ The special characters are:
    not with ``'<user@host.com'`` nor ``'user@host.com>'``.
 
    .. versionchanged:: 3.11
-      Group *id* can only contain ASCII digits and cannot start with ``0``.
+      Group *id* can only contain ASCII digits.
 
 
 The special sequences consist of ``'\'`` and a character from the list below.
@@ -1004,8 +1004,7 @@ form.
       non-empty match.
 
    .. versionchanged:: 3.11
-      Group *id* can only contain ASCII digits and cannot start with ``0``
-      (except group 0).
+      Group *id* can only contain ASCII digits.
       In bytes replacement strings group names must contain only characters
       in the ASCII range.
 
diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
index 61eaf19b3b4b68..cc87e5e4ee3b98 100644
--- a/Doc/whatsnew/3.11.rst
+++ b/Doc/whatsnew/3.11.rst
@@ -1080,11 +1080,10 @@ Changes in the Python API
 
 * More strict rules are now applied for numerical group references and
   group names in regular expressions.
-  Only sequence of ASCII digits not starting with ``0`` (except group 0) is
-  now accepted as a numerical reference.
+  Only sequence of ASCII digits is now accepted as a numerical reference.
   The group name in bytes patterns and replacement strings can now only
   contain ASCII letters and digits and underscore.
-  (Contributed by Serhiy Storchaka in :issue:`91760`.)
+  (Contributed by Serhiy Storchaka in :gh:`91760`.)
 
 * The *population* parameter of :func:`random.sample` must be a sequence.
   Automatic conversion of sets to lists is no longer supported. If the sample size
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index 8db7f451a8cf18..a2f98620b6a05d 100644
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -780,8 +780,13 @@ def _parse(source, state, verbose, nested, first=False):
                 elif char == "(":
                     # conditional backreference group
                     condname = source.getuntil(")", "group name")
-                    if (condname.isdecimal() and condname.isascii() and
-                            (condname[0] != "0" or condname == "0")):
+                    if not (condname.isdecimal() and condname.isascii()):
+                        source.checkgroupname(condname, 1)
+                        condgroup = state.groupdict.get(condname)
+                        if condgroup is None:
+                            msg = "unknown group name %r" % condname
+                            raise source.error(msg, len(condname) + 1)
+                    else:
                         condgroup = int(condname)
                         if not condgroup:
                             raise source.error("bad group number",
@@ -793,12 +798,6 @@ def _parse(source, state, verbose, nested, first=False):
                             state.grouprefpos[condgroup] = (
                                 source.tell() - len(condname) - 1
                             )
-                    else:
-                        source.checkgroupname(condname, 1)
-                        condgroup = state.groupdict.get(condname)
-                        if condgroup is None:
-                            msg = "unknown group name %r" % condname
-                            raise source.error(msg, len(condname) + 1)
                     state.checklookbehindgroup(condgroup, source)
                     item_yes = _parse(source, state, verbose, nested + 1)
                     if source.match("|"):
@@ -1007,18 +1006,17 @@ def addgroup(index, pos):
                 if not s.match("<"):
                     raise s.error("missing <")
                 name = s.getuntil(">", "group name")
-                if (name.isdecimal() and name.isascii() and
-                        (name[0] != "0" or name == "0")):
-                    index = int(name)
-                    if index >= MAXGROUPS:
-                        raise s.error("invalid group reference %d" % index,
-                                      len(name) + 1)
-                else:
+                if not (name.isdecimal() and name.isascii()):
                     s.checkgroupname(name, 1)
                     try:
                         index = groupindex[name]
                     except KeyError:
                         raise IndexError("unknown group name %r" % name) from None
+                else:
+                    index = int(name)
+                    if index >= MAXGROUPS:
+                        raise s.error("invalid group reference %d" % index,
+                                      len(name) + 1)
                 addgroup(index, len(name) + 1)
             elif c == "0":
                 if s.next in OCTDIGITS:
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 626e086eaff398..ba70de4344bd9d 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -315,8 +315,6 @@ def test_symbolic_refs_errors(self):
                                 "bad character in group name '-1'", 3)
         self.checkTemplateError('(?P<a>x)', r'\g<+1>', 'xx',
                                 "bad character in group name '+1'", 3)
-        self.checkTemplateError('(?P<a>x)', r'\g<01>', 'xx',
-                                "bad character in group name '01'", 3)
         self.checkTemplateError('()'*10, r'\g<1_0>', 'xx',
                                 "bad character in group name '1_0'", 3)
         self.checkTemplateError('(?P<a>x)', r'\g< 1 >', 'xx',
@@ -598,8 +596,6 @@ def test_re_groupref_exists_errors(self):
                                "bad character in group name '-1'", 5)
         self.checkPatternError(r'()(?(+1)a|b)',
                                "bad character in group name '+1'", 5)
-        self.checkPatternError(r'()(?(01)a|b)',
-                               "bad character in group name '01'", 5)
         self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)',
                                "bad character in group name '1_0'", 23)
         self.checkPatternError(r'()(?( 1 )a|b)',
diff --git a/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst b/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst
index 439313c433bf36..ac3e7cdd4bace2 100644
--- a/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst
+++ b/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst
@@ -1,5 +1,5 @@
 Apply more strict rules for numerical group references and group names in
-regular expressions. Only sequence of ASCII digits not starting with 0
-(except group 0) is now accepted as a numerical reference. The group name in
+regular expressions. Only a sequence of ASCII digits is now accepted as
+a numerical reference. The group name in
 bytes patterns and replacement strings can now only contain ASCII letters
 and digits and underscore.

From 3c0dfcc91894c895add3563a42e9cf178ed5b737 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sun, 8 May 2022 18:28:35 +0300
Subject: [PATCH 3/3] Update What's New

---
 Doc/whatsnew/3.11.rst |  7 -------
 Doc/whatsnew/3.12.rst | 10 ++++++++++
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
index 90f76f284f05fe..ddae025af5b162 100644
--- a/Doc/whatsnew/3.11.rst
+++ b/Doc/whatsnew/3.11.rst
@@ -1422,13 +1422,6 @@ Changes in the Python API
   before.
   (Contributed by Ma Lin in :issue:`35859`.)
 
-* More strict rules are now applied for numerical group references and
-  group names in regular expressions.
-  Only sequence of ASCII digits is now accepted as a numerical reference.
-  The group name in bytes patterns and replacement strings can now only
-  contain ASCII letters and digits and underscore.
-  (Contributed by Serhiy Storchaka in :gh:`91760`.)
-
 * The *population* parameter of :func:`random.sample` must be a sequence.
   Automatic conversion of sets to lists is no longer supported. If the sample size
   is larger than the population size, a :exc:`ValueError` is raised.
diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst
index bb6730c073e279..4f17175508fc41 100644
--- a/Doc/whatsnew/3.12.rst
+++ b/Doc/whatsnew/3.12.rst
@@ -114,3 +114,13 @@ Porting to Python 3.12
 
 This section lists previously described changes and other bugfixes
 that may require changes to your code.
+
+Changes in the Python API
+-------------------------
+
+* More strict rules are now applied for numerical group references and
+  group names in regular expressions.
+  Only a sequence of ASCII digits is now accepted as a numerical reference.
+  The group name in bytes patterns and replacement strings can now only
+  contain ASCII letters, digits and underscores.
+  (Contributed by Serhiy Storchaka in :gh:`91760`.)