Skip to content

Commit 8116599

Browse files
sylvinus authored and rsc committed
regexp: optimize for provably too short inputs
For many patterns we can compute the minimum input length at compile time.
If the input is shorter, we can return early and get a huge speedup.

As pointed out by Damian Gryski, Perl's regex engine contains a number of
these kinds of fail-fast optimizations:
https://perldoc.perl.org/perlreguts.html#Peep-hole-Optimisation-and-Analysis

Benchmarks: (including new ones for compile time)

name               old time/op  new time/op  delta
Compile/Onepass-8  4.39µs ± 1%  4.40µs ± 0%  +0.34%  (p=0.029 n=9+8)
Compile/Medium-8   9.80µs ± 0%  9.91µs ± 0%  +1.17%  (p=0.000 n=10+10)
Compile/Hard-8     72.7µs ± 0%  73.5µs ± 0%  +1.10%  (p=0.000 n=9+10)

name                old time/op  new time/op  delta
Match/Easy0/16-8    52.6ns ± 5%  4.9ns ± 0%   -90.68%  (p=0.000 n=10+9)
Match/Easy0/32-8    64.1ns ±10%  61.4ns ± 1%  ~        (p=0.188 n=10+9)
Match/Easy0/1K-8    280ns ± 1%   277ns ± 2%   -0.97%   (p=0.004 n=10+10)
Match/Easy0/32K-8   4.61µs ± 1%  4.55µs ± 1%  -1.49%   (p=0.000 n=9+10)
Match/Easy0/1M-8    229µs ± 0%   226µs ± 1%   -1.29%   (p=0.000 n=8+10)
Match/Easy0/32M-8   7.50ms ± 1%  7.47ms ± 1%  ~        (p=0.165 n=10+10)
Match/Easy0i/16-8   533ns ± 1%   5ns ± 2%     -99.07%  (p=0.000 n=10+10)
Match/Easy0i/32-8   950ns ± 0%   950ns ± 1%   ~        (p=0.920 n=10+9)
Match/Easy0i/1K-8   27.5µs ± 1%  27.5µs ± 0%  ~        (p=0.739 n=10+10)
Match/Easy0i/32K-8  1.13ms ± 0%  1.13ms ± 1%  ~        (p=0.079 n=9+10)
Match/Easy0i/1M-8   36.7ms ± 2%  36.1ms ± 0%  -1.64%   (p=0.000 n=10+9)
Match/Easy0i/32M-8  1.17s ± 0%   1.16s ± 1%   -0.80%   (p=0.004 n=8+9)
Match/Easy1/16-8    55.5ns ± 6%  4.9ns ± 1%   -91.19%  (p=0.000 n=10+9)
Match/Easy1/32-8    58.3ns ± 8%  56.6ns ± 1%  ~        (p=0.449 n=10+8)
Match/Easy1/1K-8    750ns ± 0%   748ns ± 1%   ~        (p=0.072 n=8+10)
Match/Easy1/32K-8   31.8µs ± 0%  31.6µs ± 1%  -0.50%   (p=0.035 n=10+9)
Match/Easy1/1M-8    1.10ms ± 1%  1.09ms ± 0%  -0.95%   (p=0.000 n=10+9)
Match/Easy1/32M-8   35.5ms ± 0%  35.2ms ± 1%  -1.05%   (p=0.000 n=9+10)
Match/Medium/16-8   442ns ± 2%   5ns ± 1%     -98.89%  (p=0.000 n=10+10)
Match/Medium/32-8   875ns ± 0%   878ns ± 1%   ~        (p=0.071 n=9+10)
Match/Medium/1K-8   26.1µs ± 0%  25.9µs ± 0%  -0.64%   (p=0.000 n=10+10)
Match/Medium/32K-8  1.09ms ± 1%  1.08ms ± 0%  -0.84%   (p=0.000 n=10+9)
Match/Medium/1M-8   34.9ms ± 0%  34.6ms ± 1%  -0.98%   (p=0.000 n=9+10)
Match/Medium/32M-8  1.12s ± 1%   1.11s ± 1%   -0.98%   (p=0.000 n=10+9)
Match/Hard/16-8     721ns ± 1%   5ns ± 0%     -99.32%  (p=0.000 n=10+9)
Match/Hard/32-8     1.32µs ± 1%  1.31µs ± 0%  -0.71%   (p=0.000 n=9+9)
Match/Hard/1K-8     39.8µs ± 1%  39.7µs ± 1%  ~        (p=0.165 n=10+10)
Match/Hard/32K-8    1.57ms ± 0%  1.56ms ± 0%  -0.70%   (p=0.000 n=10+9)
Match/Hard/1M-8     50.4ms ± 1%  50.1ms ± 1%  -0.57%   (p=0.007 n=10+10)
Match/Hard/32M-8    1.62s ± 1%   1.60s ± 0%   -0.98%   (p=0.000 n=10+10)
Match/Hard1/16-8    3.88µs ± 1%  3.86µs ± 0%  ~        (p=0.118 n=10+10)
Match/Hard1/32-8    7.44µs ± 1%  7.46µs ± 1%  ~        (p=0.109 n=10+10)
Match/Hard1/1K-8    232µs ± 1%   229µs ± 1%   -1.31%   (p=0.000 n=10+9)
Match/Hard1/32K-8   7.41ms ± 2%  7.41ms ± 0%  ~        (p=0.237 n=10+8)
Match/Hard1/1M-8    238ms ± 1%   238ms ± 0%   ~        (p=0.481 n=10+10)
Match/Hard1/32M-8   7.69s ± 1%   7.61s ± 0%   -1.00%   (p=0.000 n=10+10)

Fixes #31329

Change-Id: I04640e8c59178ec8b3106e13ace9b109b6bdbc25
Reviewed-on: https://go-review.googlesource.com/c/go/+/171023
Reviewed-by: Rob Pike <[email protected]>
Reviewed-by: Russ Cox <[email protected]>
Run-TryBot: Rob Pike <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
1 parent cefc0bb commit 8116599

File tree

5 files changed

+90
-10
lines changed

5 files changed

+90
-10
lines changed

src/regexp/all_test.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -860,6 +860,25 @@ func BenchmarkQuoteMetaNone(b *testing.B) {
860860
}
861861
}
862862

863+
// compileBenchData lists the patterns compiled by BenchmarkCompile, each
// paired with the name its sub-benchmark is reported under.
var compileBenchData = []struct {
	name string
	re   string
}{
	{name: "Onepass", re: `^a.[l-nA-Cg-j]?e$`},
	{name: "Medium", re: `^((a|b|[d-z0-9])*(日){4,5}.)+$`},
	{name: "Hard", re: strings.Repeat(`((abc)*|`, 50) + strings.Repeat(`)`, 50)},
}
868+
869+
func BenchmarkCompile(b *testing.B) {
870+
for _, data := range compileBenchData {
871+
b.Run(data.name, func(b *testing.B) {
872+
b.ReportAllocs()
873+
for i := 0; i < b.N; i++ {
874+
if _, err := Compile(data.re); err != nil {
875+
b.Fatal(err)
876+
}
877+
}
878+
})
879+
}
880+
}
881+
863882
func TestDeepEqual(t *testing.T) {
864883
re1 := MustCompile("a.*b.*c.*d")
865884
re2 := MustCompile("a.*b.*c.*d")
@@ -882,3 +901,31 @@ func TestDeepEqual(t *testing.T) {
882901
t.Errorf("DeepEqual(re1, re2) = false, want true")
883902
}
884903
}
904+
905+
// minInputLenTests pairs each pattern with the minimum input length, in
// bytes, that TestMinInputLen expects minInputLen to report for it.
var minInputLenTests = []struct {
	Regexp string
	min    int
}{
	{Regexp: ``, min: 0},
	{Regexp: `a`, min: 1},
	{Regexp: `aa`, min: 2},
	{Regexp: `(aa)a`, min: 3},
	{Regexp: `(?:aa)a`, min: 3},
	{Regexp: `a?a`, min: 1},
	{Regexp: `(aaa)|(aa)`, min: 2},
	{Regexp: `(aa)+a`, min: 3},
	{Regexp: `(aa)*a`, min: 1},
	{Regexp: `(aa){3,5}`, min: 6},
	{Regexp: `[a-z]`, min: 1},
	{Regexp: `日`, min: 3},
}
922+
923+
func TestMinInputLen(t *testing.T) {
924+
for _, tt := range minInputLenTests {
925+
re, _ := syntax.Parse(tt.Regexp, syntax.Perl)
926+
m := minInputLen(re)
927+
if m != tt.min {
928+
t.Errorf("regexp %#q has minInputLen %d, should be %d", tt.Regexp, m, tt.min)
929+
}
930+
}
931+
}

src/regexp/exec.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,10 @@ func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap i
524524
dstCap = arrayNoInts[:0:0]
525525
}
526526

527+
if r == nil && len(b)+len(s) < re.minInputLen {
528+
return nil
529+
}
530+
527531
if re.onepass != nil {
528532
return re.doOnePass(r, b, s, pos, ncap, dstCap)
529533
}

src/regexp/exec_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -717,6 +717,7 @@ var benchSizes = []struct {
717717
name string
718718
n int
719719
}{
720+
{"16", 16},
720721
{"32", 32},
721722
{"1K", 1 << 10},
722723
{"32K", 32 << 10},

src/regexp/onepass_test.go

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -223,13 +223,3 @@ func TestRunOnePass(t *testing.T) {
223223
}
224224
}
225225
}
226-
227-
func BenchmarkCompileOnepass(b *testing.B) {
228-
b.ReportAllocs()
229-
const re = `^a.[l-nA-Cg-j]?e$`
230-
for i := 0; i < b.N; i++ {
231-
if _, err := Compile(re); err != nil {
232-
b.Fatal(err)
233-
}
234-
}
235-
}

src/regexp/regexp.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ type Regexp struct {
9494
matchcap int // size of recorded match lengths
9595
prefixComplete bool // prefix is the entire regexp
9696
cond syntax.EmptyOp // empty-width conditions required at start of match
97+
minInputLen int // minimum length of the input in bytes
9798

9899
// This field can be modified by the Longest method,
99100
// but it is otherwise read-only.
@@ -191,6 +192,7 @@ func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
191192
cond: prog.StartCond(),
192193
longest: longest,
193194
matchcap: matchcap,
195+
minInputLen: minInputLen(re),
194196
}
195197
if regexp.onepass == nil {
196198
regexp.prefix, regexp.prefixComplete = prog.Prefix()
@@ -264,6 +266,42 @@ func (re *Regexp) put(m *machine) {
264266
matchPool[re.mpool].Put(m)
265267
}
266268

269+
// minInputLen walks the regexp to find the minimum length, in bytes, of any
// matchable input. The result is a lower bound: operators not listed below
// contribute 0, so the bound may be smaller than the true minimum but must
// never be larger, because callers use it to reject short inputs outright.
func minInputLen(re *syntax.Regexp) int {
	switch re.Op {
	default:
		return 0
	case syntax.OpAnyChar, syntax.OpAnyCharNotNL, syntax.OpCharClass:
		// Any single character occupies at least one byte.
		return 1
	case syntax.OpLiteral:
		l := 0
		for _, r := range re.Rune {
			switch n := utf8.RuneLen(r); {
			case r == utf8.RuneError:
				// Each byte of invalid UTF-8 in the input is treated as
				// U+FFFD, so a literal U+FFFD can match a single byte even
				// though its own encoding is three bytes long.
				l++
			case n > 0:
				l += n
			}
			// RuneLen reports -1 for runes that cannot be encoded; they
			// add nothing so the bound cannot drop below zero.
		}
		return l
	case syntax.OpCapture, syntax.OpPlus:
		// A capture group or x+ needs at least one occurrence of its operand.
		return minInputLen(re.Sub[0])
	case syntax.OpRepeat:
		return re.Min * minInputLen(re.Sub[0])
	case syntax.OpConcat:
		l := 0
		for _, sub := range re.Sub {
			l += minInputLen(sub)
		}
		return l
	case syntax.OpAlternate:
		// The shortest alternative determines the bound.
		l := minInputLen(re.Sub[0])
		for _, sub := range re.Sub[1:] {
			if lnext := minInputLen(sub); lnext < l {
				l = lnext
			}
		}
		return l
	}
}
304+
267305
// MustCompile is like Compile but panics if the expression cannot be parsed.
268306
// It simplifies safe initialization of global variables holding compiled regular
269307
// expressions.

0 commit comments

Comments
 (0)