Skip to content

Commit 75a6fad

Browse files
gh-91524: Speed up the regular expression substitution (#91525)
Functions re.sub() and re.subn() and the corresponding re.Pattern methods are now 2-3 times faster for replacement strings containing group references. Closes #91524. Primarily authored by Serhiy Storchaka (serhiy-storchaka). Minor-cleanups-by: Gregory P. Smith [Google] <[email protected]>
1 parent 176b6c5 commit 75a6fad

File tree

9 files changed

+358
-91
lines changed

9 files changed

+358
-91
lines changed

Doc/whatsnew/3.12.rst

+5
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,11 @@ Optimizations
205205
process, which improves performance by 1-5%.
206206
(Contributed by Kevin Modzelewski in :gh:`90536`.)
207207

208+
* Speed up the regular expression substitution (functions :func:`re.sub` and
209+
:func:`re.subn` and corresponding :class:`re.Pattern` methods) for
210+
replacement strings containing group references by 2--3 times.
211+
(Contributed by Serhiy Storchaka in :gh:`91524`.)
212+
208213

209214
CPython bytecode changes
210215
========================

Lib/re/__init__.py

+4-18
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@
124124
import enum
125125
from . import _compiler, _parser
126126
import functools
127+
import _sre
127128

128129

129130
# public symbols
@@ -230,7 +231,7 @@ def purge():
230231
"Clear the regular expression caches"
231232
_cache.clear()
232233
_cache2.clear()
233-
_compile_repl.cache_clear()
234+
_compile_template.cache_clear()
234235

235236
def template(pattern, flags=0):
236237
"Compile a template pattern, returning a Pattern object, deprecated"
@@ -328,24 +329,9 @@ def _compile(pattern, flags):
328329
return p
329330

330331
@functools.lru_cache(_MAXCACHE)
331-
def _compile_repl(repl, pattern):
332+
def _compile_template(pattern, repl):
332333
# internal: compile replacement pattern
333-
return _parser.parse_template(repl, pattern)
334-
335-
def _expand(pattern, match, template):
336-
# internal: Match.expand implementation hook
337-
template = _parser.parse_template(template, pattern)
338-
return _parser.expand_template(template, match)
339-
340-
def _subx(pattern, template):
341-
# internal: Pattern.sub/subn implementation helper
342-
template = _compile_repl(template, pattern)
343-
if not template[0] and len(template[1]) == 1:
344-
# literal replacement
345-
return template[1][0]
346-
def filter(match, template=template):
347-
return _parser.expand_template(template, match)
348-
return filter
334+
return _sre.template(pattern, _parser.parse_template(repl, pattern))
349335

350336
# register myself for pickling
351337

Lib/re/_constants.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
# update when constants are added or removed
1515

16-
MAGIC = 20220615
16+
MAGIC = 20221023
1717

1818
from _sre import MAXREPEAT, MAXGROUPS
1919

Lib/re/_parser.py

+16-29
Original file line numberDiff line numberDiff line change
@@ -984,24 +984,28 @@ def parse(str, flags=0, state=None):
984984

985985
return p
986986

987-
def parse_template(source, state):
987+
def parse_template(source, pattern):
988988
# parse 're' replacement string into list of literals and
989989
# group references
990990
s = Tokenizer(source)
991991
sget = s.get
992-
groups = []
993-
literals = []
992+
result = []
994993
literal = []
995994
lappend = literal.append
995+
def addliteral():
996+
if s.istext:
997+
result.append(''.join(literal))
998+
else:
999+
# The tokenizer implicitly decodes bytes objects as latin-1, we must
1000+
# therefore re-encode the final representation.
1001+
result.append(''.join(literal).encode('latin-1'))
1002+
del literal[:]
9961003
def addgroup(index, pos):
997-
if index > state.groups:
1004+
if index > pattern.groups:
9981005
raise s.error("invalid group reference %d" % index, pos)
999-
if literal:
1000-
literals.append(''.join(literal))
1001-
del literal[:]
1002-
groups.append((len(literals), index))
1003-
literals.append(None)
1004-
groupindex = state.groupindex
1006+
addliteral()
1007+
result.append(index)
1008+
groupindex = pattern.groupindex
10051009
while True:
10061010
this = sget()
10071011
if this is None:
@@ -1063,22 +1067,5 @@ def addgroup(index, pos):
10631067
lappend(this)
10641068
else:
10651069
lappend(this)
1066-
if literal:
1067-
literals.append(''.join(literal))
1068-
if not isinstance(source, str):
1069-
# The tokenizer implicitly decodes bytes objects as latin-1, we must
1070-
# therefore re-encode the final representation.
1071-
literals = [None if s is None else s.encode('latin-1') for s in literals]
1072-
return groups, literals
1073-
1074-
def expand_template(template, match):
1075-
g = match.group
1076-
empty = match.string[:0]
1077-
groups, literals = template
1078-
literals = literals[:]
1079-
try:
1080-
for index, group in groups:
1081-
literals[index] = g(group) or empty
1082-
except IndexError:
1083-
raise error("invalid group reference %d" % index) from None
1084-
return empty.join(literals)
1070+
addliteral()
1071+
return result
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Speed up the regular expression substitution (functions :func:`re.sub` and
2+
:func:`re.subn` and corresponding :class:`re.Pattern` methods) for
3+
replacement strings containing group references by 2--3 times.

Modules/_sre/clinic/sre.c.h

+40-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)