From 8909d1481f4e1a96e9d01ba4fa4a776dccfea635 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 19 Apr 2022 14:43:01 +0300 Subject: [PATCH 1/3] gh-91760: More strict rules for numerical group references and group names in RE Only sequence of ASCII digits not starting with 0 (except group 0) is now accepted as a numerical reference. The group name in bytes patterns and replacement strings can now only contain ASCII letters and digits and underscore. --- Doc/library/re.rst | 16 ++++- Doc/whatsnew/3.11.rst | 8 +++ Lib/re/_parser.py | 59 +++++++++---------- Lib/test/test_re.py | 29 +++++++++ ...2-04-21-19-14-29.gh-issue-91760.54AR-m.rst | 5 ++ 5 files changed, 84 insertions(+), 33 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 89de9286ace79c..c65aaf58990b73 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -395,7 +395,8 @@ The special characters are: ``(?P<name>...)`` Similar to regular parentheses, but the substring matched by the group is accessible via the symbolic group name *name*. Group names must be valid - Python identifiers, and each group name must be defined only once within a + Python identifiers, and in bytes patterns they must contain only characters + in the ASCII range. Each group name must be defined only once within a regular expression. A symbolic group is also a numbered group, just as if the group were not named. @@ -417,6 +418,10 @@ The special characters are: | | * ``\1`` | +---------------------------------------+----------------------------------+ + .. versionchanged:: 3.11 + In bytes patterns group names must contain only characters in + the ASCII range. + .. index:: single: (?P=; in regular expressions ``(?P=name)`` @@ -486,6 +491,9 @@ The special characters are: will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but not with ``'<user@host.com'`` nor ``'user@host.com>'``. + .. 
versionchanged:: 3.11 + Group *id* can only contain ASCII digits and cannot start with ``0``. + The special sequences consist of ``'\'`` and a character from the list below. If the ordinary character is not an ASCII digit or an ASCII letter, then the @@ -995,6 +1003,12 @@ form. Empty matches for the pattern are replaced when adjacent to a previous non-empty match. + .. versionchanged:: 3.11 + Group *id* can only contain ASCII digits and cannot start with ``0`` + (except group 0). + In bytes replacement strings group names must contain only characters + in the ASCII range. + .. function:: subn(pattern, repl, string, count=0, flags=0) diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 6540a255a0ed82..88881e5c7180e1 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -1060,6 +1060,14 @@ Changes in the Python API before. (Contributed by Ma Lin in :issue:`35859`.) +* More strict rules are now applied for numerical group references and + group names in regular expressions. + Only sequence of ASCII digits not starting with ``0`` (except group 0) is + now accepted as a numerical reference. + The group name in bytes patterns and replacement strings can now only + contain ASCII letters and digits and underscore. + (Contributed by Serhiy Storchaka in :issue:`91760`.) + * The *population* parameter of :func:`random.sample` must be a sequence. Automatic conversion of sets to lists is no longer supported. If the sample size is larger than the population size, a :exc:`ValueError` is raised. 
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index f191f809a1491e..486bf8c2d307a6 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -295,6 +295,14 @@ def seek(self, index): def error(self, msg, offset=0): return error(msg, self.string, self.tell() - offset) + def checkgroupname(self, name, offset): + if not (self.istext or name.isascii()): + msg = "bad character in group name %a" % name + raise self.error(msg, len(name) + offset) + if not name.isidentifier(): + msg = "bad character in group name %r" % name + raise self.error(msg, len(name) + offset) + def _class_escape(source, escape): # handle escape code inside character class code = ESCAPES.get(escape) @@ -707,15 +715,11 @@ def _parse(source, state, verbose, nested, first=False): if sourcematch("<"): # named group: skip forward to end of name name = source.getuntil(">", "group name") - if not name.isidentifier(): - msg = "bad character in group name %r" % name - raise source.error(msg, len(name) + 1) + source.checkgroupname(name, 1) elif sourcematch("="): # named backreference name = source.getuntil(")", "group name") - if not name.isidentifier(): - msg = "bad character in group name %r" % name - raise source.error(msg, len(name) + 1) + source.checkgroupname(name, 1) gid = state.groupdict.get(name) if gid is None: msg = "unknown group name %r" % name @@ -776,25 +780,21 @@ def _parse(source, state, verbose, nested, first=False): elif char == "(": # conditional backreference group condname = source.getuntil(")", "group name") - if condname.isidentifier(): - condgroup = state.groupdict.get(condname) - if condgroup is None: - msg = "unknown group name %r" % condname - raise source.error(msg, len(condname) + 1) - else: - try: - condgroup = int(condname) - if condgroup < 0: - raise ValueError - except ValueError: - msg = "bad character in group name %r" % condname - raise source.error(msg, len(condname) + 1) from None + if (condname.isdecimal() and condname.isascii() and + (condname[0] != "0" or condname 
== "0")): + condgroup = int(condname) if not condgroup: raise source.error("bad group number", len(condname) + 1) if condgroup >= MAXGROUPS: msg = "invalid group reference %d" % condgroup raise source.error(msg, len(condname) + 1) + else: + source.checkgroupname(condname, 1) + condgroup = state.groupdict.get(condname) + if condgroup is None: + msg = "unknown group name %r" % condname + raise source.error(msg, len(condname) + 1) state.checklookbehindgroup(condgroup, source) item_yes = _parse(source, state, verbose, nested + 1) if source.match("|"): @@ -1006,26 +1006,21 @@ def addgroup(index, pos): # group c = this[1] if c == "g": - name = "" if not s.match("<"): raise s.error("missing <") name = s.getuntil(">", "group name") - if name.isidentifier(): + if (name.isdecimal() and name.isascii() and + (name[0] != "0" or name == "0")): + index = int(name) + if index >= MAXGROUPS: + raise s.error("invalid group reference %d" % index, + len(name) + 1) + else: + s.checkgroupname(name, 1) try: index = groupindex[name] except KeyError: raise IndexError("unknown group name %r" % name) from None - else: - try: - index = int(name) - if index < 0: - raise ValueError - except ValueError: - raise s.error("bad character in group name %r" % name, - len(name) + 1) from None - if index >= MAXGROUPS: - raise s.error("invalid group reference %d" % index, - len(name) + 1) addgroup(index, len(name) + 1) elif c == "0": if s.next in OCTDIGITS: diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 781bfd6ea2edac..3316296ffc86e0 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -135,6 +135,7 @@ def test_basic_re_sub(self): self.assertEqual(re.sub('(?Px)', r'\g\g<1>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?Px)', r'\g\g', 'xx'), 'xxxx') self.assertEqual(re.sub('(?Px)', r'\g<1>\g<1>', 'xx'), 'xxxx') + self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx') self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') self.assertEqual(re.sub('a', 
'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') @@ -274,6 +275,12 @@ def test_symbolic_groups_errors(self): self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4) self.checkPatternError('(?P=©)', "bad character in group name '©'", 4) self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3) + self.checkPatternError(b'(?P<\xc2\xb5>x)', + r"bad character in group name '\xc2\xb5'", 4) + self.checkPatternError(b'(?P=\xc2\xb5)', + r"bad character in group name '\xc2\xb5'", 4) + self.checkPatternError(b'(?(\xc2\xb5)y)', + r"bad character in group name '\xc2\xb5'", 3) def test_symbolic_refs(self): self.assertEqual(re.sub('(?Px)|(?Py)', r'\g', 'xx'), '') @@ -306,12 +313,24 @@ def test_symbolic_refs_errors(self): re.sub('(?Px)', r'\g', 'xx') self.checkTemplateError('(?Px)', r'\g<-1>', 'xx', "bad character in group name '-1'", 3) + self.checkTemplateError('(?Px)', r'\g<+1>', 'xx', + "bad character in group name '+1'", 3) + self.checkTemplateError('(?Px)', r'\g<01>', 'xx', + "bad character in group name '01'", 3) + self.checkTemplateError('()'*10, r'\g<1_0>', 'xx', + "bad character in group name '1_0'", 3) + self.checkTemplateError('(?Px)', r'\g< 1 >', 'xx', + "bad character in group name ' 1 '", 3) self.checkTemplateError('(?Px)', r'\g<©>', 'xx', "bad character in group name '©'", 3) + self.checkTemplateError(b'(?Px)', b'\\g<\xc2\xb5>', b'xx', + r"bad character in group name '\xc2\xb5'", 3) self.checkTemplateError('(?Px)', r'\g<㊀>', 'xx', "bad character in group name '㊀'", 3) self.checkTemplateError('(?Px)', r'\g<¹>', 'xx', "bad character in group name '¹'", 3) + self.checkTemplateError('(?Px)', r'\g<१>', 'xx', + "bad character in group name '१'", 3) def test_re_subn(self): self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) @@ -577,10 +596,20 @@ def test_re_groupref_exists_errors(self): self.checkPatternError(r'(?P)(?(0)a|b)', 'bad group number', 10) self.checkPatternError(r'()(?(-1)a|b)', "bad character in group name '-1'", 5) + 
self.checkPatternError(r'()(?(+1)a|b)', + "bad character in group name '+1'", 5) + self.checkPatternError(r'()(?(01)a|b)', + "bad character in group name '01'", 5) + self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)', + "bad character in group name '1_0'", 23) + self.checkPatternError(r'()(?( 1 )a|b)', + "bad character in group name ' 1 '", 5) self.checkPatternError(r'()(?(㊀)a|b)', "bad character in group name '㊀'", 5) self.checkPatternError(r'()(?(¹)a|b)', "bad character in group name '¹'", 5) + self.checkPatternError(r'()(?(१)a|b)', + "bad character in group name '१'", 5) self.checkPatternError(r'()(?(1', "missing ), unterminated name", 5) self.checkPatternError(r'()(?(1)a', diff --git a/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst b/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst new file mode 100644 index 00000000000000..439313c433bf36 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst @@ -0,0 +1,5 @@ +Apply more strict rules for numerical group references and group names in +regular expressions. Only sequence of ASCII digits not starting with 0 +(except group 0) is now accepted as a numerical reference. The group name in +bytes patterns and replacement strings can now only contain ASCII letters +and digits and underscore. From 5026649f9833eb0879a7951f8aeb003414695cf1 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 24 Apr 2022 22:42:18 +0300 Subject: [PATCH 2/3] Address review comments and minimize the diff. --- Doc/library/re.rst | 5 ++-- Doc/whatsnew/3.11.rst | 5 ++-- Lib/re/_parser.py | 28 +++++++++---------- Lib/test/test_re.py | 4 --- ...2-04-21-19-14-29.gh-issue-91760.54AR-m.rst | 4 +-- 5 files changed, 19 insertions(+), 27 deletions(-) diff --git a/Doc/library/re.rst b/Doc/library/re.rst index c65aaf58990b73..714d2f7ffd0dfe 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -492,7 +492,7 @@ The special characters are: not with ``'<user@host.com'`` nor ``'user@host.com>'``. .. 
versionchanged:: 3.11 - Group *id* can only contain ASCII digits and cannot start with ``0``. + Group *id* can only contain ASCII digits. The special sequences consist of ``'\'`` and a character from the list below. @@ -1004,8 +1004,7 @@ form. non-empty match. .. versionchanged:: 3.11 - Group *id* can only contain ASCII digits and cannot start with ``0`` - (except group 0). + Group *id* can only contain ASCII digits. In bytes replacement strings group names must contain only characters in the ASCII range. diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 61eaf19b3b4b68..cc87e5e4ee3b98 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -1080,11 +1080,10 @@ Changes in the Python API * More strict rules are now applied for numerical group references and group names in regular expressions. - Only sequence of ASCII digits not starting with ``0`` (except group 0) is - now accepted as a numerical reference. + Only sequence of ASCII digits is now accepted as a numerical reference. The group name in bytes patterns and replacement strings can now only contain ASCII letters and digits and underscore. - (Contributed by Serhiy Storchaka in :issue:`91760`.) + (Contributed by Serhiy Storchaka in :gh:`91760`.) * The *population* parameter of :func:`random.sample` must be a sequence. Automatic conversion of sets to lists is no longer supported. 
If the sample size diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index 8db7f451a8cf18..a2f98620b6a05d 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -780,8 +780,13 @@ def _parse(source, state, verbose, nested, first=False): elif char == "(": # conditional backreference group condname = source.getuntil(")", "group name") - if (condname.isdecimal() and condname.isascii() and - (condname[0] != "0" or condname == "0")): + if not (condname.isdecimal() and condname.isascii()): + source.checkgroupname(condname, 1) + condgroup = state.groupdict.get(condname) + if condgroup is None: + msg = "unknown group name %r" % condname + raise source.error(msg, len(condname) + 1) + else: condgroup = int(condname) if not condgroup: raise source.error("bad group number", @@ -793,12 +798,6 @@ def _parse(source, state, verbose, nested, first=False): state.grouprefpos[condgroup] = ( source.tell() - len(condname) - 1 ) - else: - source.checkgroupname(condname, 1) - condgroup = state.groupdict.get(condname) - if condgroup is None: - msg = "unknown group name %r" % condname - raise source.error(msg, len(condname) + 1) state.checklookbehindgroup(condgroup, source) item_yes = _parse(source, state, verbose, nested + 1) if source.match("|"): @@ -1007,18 +1006,17 @@ def addgroup(index, pos): if not s.match("<"): raise s.error("missing <") name = s.getuntil(">", "group name") - if (name.isdecimal() and name.isascii() and - (name[0] != "0" or name == "0")): - index = int(name) - if index >= MAXGROUPS: - raise s.error("invalid group reference %d" % index, - len(name) + 1) - else: + if not (name.isdecimal() and name.isascii()): s.checkgroupname(name, 1) try: index = groupindex[name] except KeyError: raise IndexError("unknown group name %r" % name) from None + else: + index = int(name) + if index >= MAXGROUPS: + raise s.error("invalid group reference %d" % index, + len(name) + 1) addgroup(index, len(name) + 1) elif c == "0": if s.next in OCTDIGITS: diff --git a/Lib/test/test_re.py 
b/Lib/test/test_re.py index 626e086eaff398..ba70de4344bd9d 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -315,8 +315,6 @@ def test_symbolic_refs_errors(self): "bad character in group name '-1'", 3) self.checkTemplateError('(?Px)', r'\g<+1>', 'xx', "bad character in group name '+1'", 3) - self.checkTemplateError('(?Px)', r'\g<01>', 'xx', - "bad character in group name '01'", 3) self.checkTemplateError('()'*10, r'\g<1_0>', 'xx', "bad character in group name '1_0'", 3) self.checkTemplateError('(?Px)', r'\g< 1 >', 'xx', @@ -598,8 +596,6 @@ def test_re_groupref_exists_errors(self): "bad character in group name '-1'", 5) self.checkPatternError(r'()(?(+1)a|b)', "bad character in group name '+1'", 5) - self.checkPatternError(r'()(?(01)a|b)', - "bad character in group name '01'", 5) self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)', "bad character in group name '1_0'", 23) self.checkPatternError(r'()(?( 1 )a|b)', diff --git a/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst b/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst index 439313c433bf36..ac3e7cdd4bace2 100644 --- a/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst +++ b/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst @@ -1,5 +1,5 @@ Apply more strict rules for numerical group references and group names in -regular expressions. Only sequence of ASCII digits not starting with 0 -(except group 0) is now accepted as a numerical reference. The group name in +regular expressions. Only a sequence of ASCII digits is now accepted as +a numerical reference. The group name in bytes patterns and replacement strings can now only contain ASCII letters and digits and underscore. 
From 3c0dfcc91894c895add3563a42e9cf178ed5b737 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 8 May 2022 18:28:35 +0300 Subject: [PATCH 3/3] Update What's New --- Doc/whatsnew/3.11.rst | 7 ------- Doc/whatsnew/3.12.rst | 10 ++++++++++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 90f76f284f05fe..ddae025af5b162 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -1422,13 +1422,6 @@ Changes in the Python API before. (Contributed by Ma Lin in :issue:`35859`.) -* More strict rules are now applied for numerical group references and - group names in regular expressions. - Only sequence of ASCII digits is now accepted as a numerical reference. - The group name in bytes patterns and replacement strings can now only - contain ASCII letters and digits and underscore. - (Contributed by Serhiy Storchaka in :gh:`91760`.) - * The *population* parameter of :func:`random.sample` must be a sequence. Automatic conversion of sets to lists is no longer supported. If the sample size is larger than the population size, a :exc:`ValueError` is raised. diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index bb6730c073e279..4f17175508fc41 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -114,3 +114,13 @@ Porting to Python 3.12 This section lists previously described changes and other bugfixes that may require changes to your code. + +Changes in the Python API +------------------------- + +* More strict rules are now applied for numerical group references and + group names in regular expressions. + Only a sequence of ASCII digits is now accepted as a numerical reference. + The group name in bytes patterns and replacement strings can now only + contain ASCII letters, digits and underscores. + (Contributed by Serhiy Storchaka in :gh:`91760`.)