Skip to content

Commit 8909d14

Browse files
gh-91760: More strict rules for numerical group references and group names in RE
Only a sequence of ASCII digits not starting with 0 (except group 0 itself) is now accepted as a numerical reference. Group names in bytes patterns and replacement strings can now only contain ASCII letters, digits and underscores.
1 parent 944fffe commit 8909d14

File tree

5 files changed

+84
-33
lines changed

5 files changed

+84
-33
lines changed

Doc/library/re.rst

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -395,7 +395,8 @@ The special characters are:
395395
``(?P<name>...)``
396396
Similar to regular parentheses, but the substring matched by the group is
397397
accessible via the symbolic group name *name*. Group names must be valid
398-
Python identifiers, and each group name must be defined only once within a
398+
Python identifiers, and in bytes patterns they must contain only characters
399+
in the ASCII range. Each group name must be defined only once within a
399400
regular expression. A symbolic group is also a numbered group, just as if
400401
the group were not named.
401402

@@ -417,6 +418,10 @@ The special characters are:
417418
| | * ``\1`` |
418419
+---------------------------------------+----------------------------------+
419420

421+
.. versionchanged:: 3.11
422+
In bytes patterns group names must contain only characters in
423+
the ASCII range.
424+
420425
.. index:: single: (?P=; in regular expressions
421426

422427
``(?P=name)``
@@ -486,6 +491,9 @@ The special characters are:
486491
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
487492
not with ``'<user@host.com'`` nor ``'user@host.com>'``.
488493

494+
.. versionchanged:: 3.11
495+
Group *id* can only contain ASCII digits and cannot start with ``0``.
496+
489497

490498
The special sequences consist of ``'\'`` and a character from the list below.
491499
If the ordinary character is not an ASCII digit or an ASCII letter, then the
@@ -995,6 +1003,12 @@ form.
9951003
Empty matches for the pattern are replaced when adjacent to a previous
9961004
non-empty match.
9971005

1006+
.. versionchanged:: 3.11
1007+
Group *id* can only contain ASCII digits and cannot start with ``0``
1008+
(except group 0).
1009+
In bytes replacement strings group names must contain only characters
1010+
in the ASCII range.
1011+
9981012

9991013
.. function:: subn(pattern, repl, string, count=0, flags=0)
10001014

Doc/whatsnew/3.11.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,6 +1060,14 @@ Changes in the Python API
10601060
before.
10611061
(Contributed by Ma Lin in :issue:`35859`.)
10621062

1063+
* More strict rules are now applied for numerical group references and
1064+
group names in regular expressions.
1065+
Only a sequence of ASCII digits not starting with ``0`` (except group 0) is
1066+
now accepted as a numerical reference.
1067+
The group name in bytes patterns and replacement strings can now only
1068+
contain ASCII letters, digits and underscores.
1069+
(Contributed by Serhiy Storchaka in :issue:`91760`.)
1070+
10631071
* The *population* parameter of :func:`random.sample` must be a sequence.
10641072
Automatic conversion of sets to lists is no longer supported. If the sample size
10651073
is larger than the population size, a :exc:`ValueError` is raised.

Lib/re/_parser.py

Lines changed: 27 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,14 @@ def seek(self, index):
295295
def error(self, msg, offset=0):
296296
return error(msg, self.string, self.tell() - offset)
297297

298+
def checkgroupname(self, name, offset):
299+
if not (self.istext or name.isascii()):
300+
msg = "bad character in group name %a" % name
301+
raise self.error(msg, len(name) + offset)
302+
if not name.isidentifier():
303+
msg = "bad character in group name %r" % name
304+
raise self.error(msg, len(name) + offset)
305+
298306
def _class_escape(source, escape):
299307
# handle escape code inside character class
300308
code = ESCAPES.get(escape)
@@ -707,15 +715,11 @@ def _parse(source, state, verbose, nested, first=False):
707715
if sourcematch("<"):
708716
# named group: skip forward to end of name
709717
name = source.getuntil(">", "group name")
710-
if not name.isidentifier():
711-
msg = "bad character in group name %r" % name
712-
raise source.error(msg, len(name) + 1)
718+
source.checkgroupname(name, 1)
713719
elif sourcematch("="):
714720
# named backreference
715721
name = source.getuntil(")", "group name")
716-
if not name.isidentifier():
717-
msg = "bad character in group name %r" % name
718-
raise source.error(msg, len(name) + 1)
722+
source.checkgroupname(name, 1)
719723
gid = state.groupdict.get(name)
720724
if gid is None:
721725
msg = "unknown group name %r" % name
@@ -776,25 +780,21 @@ def _parse(source, state, verbose, nested, first=False):
776780
elif char == "(":
777781
# conditional backreference group
778782
condname = source.getuntil(")", "group name")
779-
if condname.isidentifier():
780-
condgroup = state.groupdict.get(condname)
781-
if condgroup is None:
782-
msg = "unknown group name %r" % condname
783-
raise source.error(msg, len(condname) + 1)
784-
else:
785-
try:
786-
condgroup = int(condname)
787-
if condgroup < 0:
788-
raise ValueError
789-
except ValueError:
790-
msg = "bad character in group name %r" % condname
791-
raise source.error(msg, len(condname) + 1) from None
783+
if (condname.isdecimal() and condname.isascii() and
784+
(condname[0] != "0" or condname == "0")):
785+
condgroup = int(condname)
792786
if not condgroup:
793787
raise source.error("bad group number",
794788
len(condname) + 1)
795789
if condgroup >= MAXGROUPS:
796790
msg = "invalid group reference %d" % condgroup
797791
raise source.error(msg, len(condname) + 1)
792+
else:
793+
source.checkgroupname(condname, 1)
794+
condgroup = state.groupdict.get(condname)
795+
if condgroup is None:
796+
msg = "unknown group name %r" % condname
797+
raise source.error(msg, len(condname) + 1)
798798
state.checklookbehindgroup(condgroup, source)
799799
item_yes = _parse(source, state, verbose, nested + 1)
800800
if source.match("|"):
@@ -1006,26 +1006,21 @@ def addgroup(index, pos):
10061006
# group
10071007
c = this[1]
10081008
if c == "g":
1009-
name = ""
10101009
if not s.match("<"):
10111010
raise s.error("missing <")
10121011
name = s.getuntil(">", "group name")
1013-
if name.isidentifier():
1012+
if (name.isdecimal() and name.isascii() and
1013+
(name[0] != "0" or name == "0")):
1014+
index = int(name)
1015+
if index >= MAXGROUPS:
1016+
raise s.error("invalid group reference %d" % index,
1017+
len(name) + 1)
1018+
else:
1019+
s.checkgroupname(name, 1)
10141020
try:
10151021
index = groupindex[name]
10161022
except KeyError:
10171023
raise IndexError("unknown group name %r" % name) from None
1018-
else:
1019-
try:
1020-
index = int(name)
1021-
if index < 0:
1022-
raise ValueError
1023-
except ValueError:
1024-
raise s.error("bad character in group name %r" % name,
1025-
len(name) + 1) from None
1026-
if index >= MAXGROUPS:
1027-
raise s.error("invalid group reference %d" % index,
1028-
len(name) + 1)
10291024
addgroup(index, len(name) + 1)
10301025
elif c == "0":
10311026
if s.next in OCTDIGITS:

Lib/test/test_re.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ def test_basic_re_sub(self):
135135
self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
136136
self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
137137
self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
138+
self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx')
138139

139140
self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
140141
self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
@@ -274,6 +275,12 @@ def test_symbolic_groups_errors(self):
274275
self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
275276
self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
276277
self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
278+
self.checkPatternError(b'(?P<\xc2\xb5>x)',
279+
r"bad character in group name '\xc2\xb5'", 4)
280+
self.checkPatternError(b'(?P=\xc2\xb5)',
281+
r"bad character in group name '\xc2\xb5'", 4)
282+
self.checkPatternError(b'(?(\xc2\xb5)y)',
283+
r"bad character in group name '\xc2\xb5'", 3)
277284

278285
def test_symbolic_refs(self):
279286
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
@@ -306,12 +313,24 @@ def test_symbolic_refs_errors(self):
306313
re.sub('(?P<a>x)', r'\g<ab>', 'xx')
307314
self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
308315
"bad character in group name '-1'", 3)
316+
self.checkTemplateError('(?P<a>x)', r'\g<+1>', 'xx',
317+
"bad character in group name '+1'", 3)
318+
self.checkTemplateError('(?P<a>x)', r'\g<01>', 'xx',
319+
"bad character in group name '01'", 3)
320+
self.checkTemplateError('()'*10, r'\g<1_0>', 'xx',
321+
"bad character in group name '1_0'", 3)
322+
self.checkTemplateError('(?P<a>x)', r'\g< 1 >', 'xx',
323+
"bad character in group name ' 1 '", 3)
309324
self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
310325
"bad character in group name '©'", 3)
326+
self.checkTemplateError(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx',
327+
r"bad character in group name '\xc2\xb5'", 3)
311328
self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
312329
"bad character in group name '㊀'", 3)
313330
self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
314331
"bad character in group name '¹'", 3)
332+
self.checkTemplateError('(?P<a>x)', r'\g<१>', 'xx',
333+
"bad character in group name '१'", 3)
315334

316335
def test_re_subn(self):
317336
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@@ -577,10 +596,20 @@ def test_re_groupref_exists_errors(self):
577596
self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
578597
self.checkPatternError(r'()(?(-1)a|b)',
579598
"bad character in group name '-1'", 5)
599+
self.checkPatternError(r'()(?(+1)a|b)',
600+
"bad character in group name '+1'", 5)
601+
self.checkPatternError(r'()(?(01)a|b)',
602+
"bad character in group name '01'", 5)
603+
self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)',
604+
"bad character in group name '1_0'", 23)
605+
self.checkPatternError(r'()(?( 1 )a|b)',
606+
"bad character in group name ' 1 '", 5)
580607
self.checkPatternError(r'()(?(㊀)a|b)',
581608
"bad character in group name '㊀'", 5)
582609
self.checkPatternError(r'()(?(¹)a|b)',
583610
"bad character in group name '¹'", 5)
611+
self.checkPatternError(r'()(?(१)a|b)',
612+
"bad character in group name '१'", 5)
584613
self.checkPatternError(r'()(?(1',
585614
"missing ), unterminated name", 5)
586615
self.checkPatternError(r'()(?(1)a',
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Apply more strict rules for numerical group references and group names in
2+
regular expressions. Only a sequence of ASCII digits not starting with 0
3+
(except group 0) is now accepted as a numerical reference. The group name in
4+
bytes patterns and replacement strings can now only contain ASCII letters,
5+
digits and underscores.

0 commit comments

Comments
 (0)