Skip to content

gh-91760: More strict rules for numerical group references and group names in RE #91792

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 8, 2022
Merged
16 changes: 15 additions & 1 deletion Doc/library/re.rst
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,8 @@ The special characters are:
``(?P<name>...)``
Similar to regular parentheses, but the substring matched by the group is
accessible via the symbolic group name *name*. Group names must be valid
Python identifiers, and each group name must be defined only once within a
Python identifiers, and in bytes patterns they must contain only characters
in the ASCII range. Each group name must be defined only once within a
regular expression. A symbolic group is also a numbered group, just as if
the group were not named.

Expand All @@ -417,6 +418,10 @@ The special characters are:
| | * ``\1`` |
+---------------------------------------+----------------------------------+

.. versionchanged:: 3.11
In bytes patterns group names must contain only characters in
the ASCII range.

.. index:: single: (?P=; in regular expressions

``(?P=name)``
Expand Down Expand Up @@ -486,6 +491,9 @@ The special characters are:
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
not with ``'<user@host.com'`` nor ``'user@host.com>'``.

.. versionchanged:: 3.11
Group *id* can only contain ASCII digits and cannot start with ``0``.


The special sequences consist of ``'\'`` and a character from the list below.
If the ordinary character is not an ASCII digit or an ASCII letter, then the
Expand Down Expand Up @@ -995,6 +1003,12 @@ form.
Empty matches for the pattern are replaced when adjacent to a previous
non-empty match.

.. versionchanged:: 3.11
Group *id* can only contain ASCII digits and cannot start with ``0``
(except group 0).
In bytes replacement strings group names must contain only characters
in the ASCII range.


.. function:: subn(pattern, repl, string, count=0, flags=0)

Expand Down
8 changes: 8 additions & 0 deletions Doc/whatsnew/3.11.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,14 @@ Changes in the Python API
before.
(Contributed by Ma Lin in :issue:`35859`.)

* More strict rules are now applied for numerical group references and
group names in regular expressions.
Only a sequence of ASCII digits not starting with ``0`` (except group 0) is
now accepted as a numerical reference.
The group name in bytes patterns and replacement strings can now only
contain ASCII letters, digits, and underscores.
(Contributed by Serhiy Storchaka in :issue:`91760`.)

* The *population* parameter of :func:`random.sample` must be a sequence.
Automatic conversion of sets to lists is no longer supported. If the sample size
is larger than the population size, a :exc:`ValueError` is raised.
Expand Down
59 changes: 27 additions & 32 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,14 @@ def seek(self, index):
def error(self, msg, offset=0):
return error(msg, self.string, self.tell() - offset)

def checkgroupname(self, name, offset):
    """Validate *name* as a group name, raising on the first violation.

    Two checks are applied in order:

    1. When ``self.istext`` is false, *name* must be pure ASCII.
    2. *name* must be a valid Python identifier.

    *offset* is added to ``len(name)`` and passed as the second argument
    to ``self.error``.  Returns ``None`` when both checks pass; otherwise
    raises whatever ``self.error`` returns.
    """
    # Both failure paths report the same position, so compute it once.
    position = len(name) + offset
    if not self.istext and not name.isascii():
        # %a (ascii()) keeps non-ASCII characters escaped in the message.
        raise self.error("bad character in group name %a" % name, position)
    if name.isidentifier():
        return
    raise self.error("bad character in group name %r" % name, position)

def _class_escape(source, escape):
# handle escape code inside character class
code = ESCAPES.get(escape)
Expand Down Expand Up @@ -707,15 +715,11 @@ def _parse(source, state, verbose, nested, first=False):
if sourcematch("<"):
# named group: skip forward to end of name
name = source.getuntil(">", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
source.checkgroupname(name, 1)
elif sourcematch("="):
# named backreference
name = source.getuntil(")", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
source.checkgroupname(name, 1)
gid = state.groupdict.get(name)
if gid is None:
msg = "unknown group name %r" % name
Expand Down Expand Up @@ -776,19 +780,9 @@ def _parse(source, state, verbose, nested, first=False):
elif char == "(":
# conditional backreference group
condname = source.getuntil(")", "group name")
if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name %r" % condname
raise source.error(msg, len(condname) + 1)
else:
try:
condgroup = int(condname)
if condgroup < 0:
raise ValueError
except ValueError:
msg = "bad character in group name %r" % condname
raise source.error(msg, len(condname) + 1) from None
if (condname.isdecimal() and condname.isascii() and
(condname[0] != "0" or condname == "0")):
condgroup = int(condname)
if not condgroup:
raise source.error("bad group number",
len(condname) + 1)
Expand All @@ -799,6 +793,12 @@ def _parse(source, state, verbose, nested, first=False):
state.grouprefpos[condgroup] = (
source.tell() - len(condname) - 1
)
else:
source.checkgroupname(condname, 1)
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name %r" % condname
raise source.error(msg, len(condname) + 1)
state.checklookbehindgroup(condgroup, source)
item_yes = _parse(source, state, verbose, nested + 1)
if source.match("|"):
Expand Down Expand Up @@ -1004,26 +1004,21 @@ def addgroup(index, pos):
# group
c = this[1]
if c == "g":
name = ""
if not s.match("<"):
raise s.error("missing <")
name = s.getuntil(">", "group name")
if name.isidentifier():
if (name.isdecimal() and name.isascii() and
(name[0] != "0" or name == "0")):
index = int(name)
if index >= MAXGROUPS:
raise s.error("invalid group reference %d" % index,
len(name) + 1)
else:
s.checkgroupname(name, 1)
try:
index = groupindex[name]
except KeyError:
raise IndexError("unknown group name %r" % name) from None
else:
try:
index = int(name)
if index < 0:
raise ValueError
except ValueError:
raise s.error("bad character in group name %r" % name,
len(name) + 1) from None
if index >= MAXGROUPS:
raise s.error("invalid group reference %d" % index,
len(name) + 1)
addgroup(index, len(name) + 1)
elif c == "0":
if s.next in OCTDIGITS:
Expand Down
29 changes: 29 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ def test_basic_re_sub(self):
self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx')

self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
Expand Down Expand Up @@ -274,6 +275,12 @@ def test_symbolic_groups_errors(self):
self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
self.checkPatternError(b'(?P<\xc2\xb5>x)',
r"bad character in group name '\xc2\xb5'", 4)
self.checkPatternError(b'(?P=\xc2\xb5)',
r"bad character in group name '\xc2\xb5'", 4)
self.checkPatternError(b'(?(\xc2\xb5)y)',
r"bad character in group name '\xc2\xb5'", 3)

def test_symbolic_refs(self):
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
Expand Down Expand Up @@ -306,12 +313,24 @@ def test_symbolic_refs_errors(self):
re.sub('(?P<a>x)', r'\g<ab>', 'xx')
self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
"bad character in group name '-1'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<+1>', 'xx',
"bad character in group name '+1'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<01>', 'xx',
"bad character in group name '01'", 3)
self.checkTemplateError('()'*10, r'\g<1_0>', 'xx',
"bad character in group name '1_0'", 3)
self.checkTemplateError('(?P<a>x)', r'\g< 1 >', 'xx',
"bad character in group name ' 1 '", 3)
self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
"bad character in group name '©'", 3)
self.checkTemplateError(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx',
r"bad character in group name '\xc2\xb5'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
"bad character in group name '㊀'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
"bad character in group name '¹'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<१>', 'xx',
"bad character in group name '१'", 3)

def test_re_subn(self):
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
Expand Down Expand Up @@ -577,10 +596,20 @@ def test_re_groupref_exists_errors(self):
self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
self.checkPatternError(r'()(?(-1)a|b)',
"bad character in group name '-1'", 5)
self.checkPatternError(r'()(?(+1)a|b)',
"bad character in group name '+1'", 5)
self.checkPatternError(r'()(?(01)a|b)',
"bad character in group name '01'", 5)
self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)',
"bad character in group name '1_0'", 23)
self.checkPatternError(r'()(?( 1 )a|b)',
"bad character in group name ' 1 '", 5)
self.checkPatternError(r'()(?(㊀)a|b)',
"bad character in group name '㊀'", 5)
self.checkPatternError(r'()(?(¹)a|b)',
"bad character in group name '¹'", 5)
self.checkPatternError(r'()(?(१)a|b)',
"bad character in group name '१'", 5)
self.checkPatternError(r'()(?(1',
"missing ), unterminated name", 5)
self.checkPatternError(r'()(?(1)a',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Apply stricter rules for numerical group references and group names in
regular expressions. Only a sequence of ASCII digits not starting with 0
(except group 0) is now accepted as a numerical reference. The group name in
bytes patterns and replacement strings can now only contain ASCII letters,
digits, and underscores.