Skip to content

Commit 702e337

Browse files
committed
regexp: document and implement that invalid UTF-8 bytes are the same as U+FFFD
What should it mean to run a regexp match on invalid UTF-8 bytes?
The coherent behavior options are:

1. Invalid UTF-8 does not match any character classes, nor a U+FFFD literal (nor \x{fffd}).
2. Each byte of invalid UTF-8 is treated identically to a U+FFFD in the input, as a utf8.DecodeRune loop might.

RE2 uses Rule 1. Because it works byte at a time, it can also provide \C to match any single byte of input, which matches invalid UTF-8 as well. This provides the nice property that a match for a regexp without \C is guaranteed to be valid UTF-8.

Unfortunately, today Go has an incoherent mix of these two, although mostly Rule 2. This is a deviation from RE2, and it gives up the nice property, but we probably can't correct that at this point. In particular .* already matches entire inputs today, valid UTF-8 or not, and I doubt we can break that.

This CL adopts Rule 2 officially, fixing the few places that deviate from it.

Fixes #48749.

Change-Id: I96402527c5dfb1146212f568ffa09dde91d71244
Reviewed-on: https://go-review.googlesource.com/c/go/+/354569
Trust: Russ Cox <[email protected]>
Run-TryBot: Russ Cox <[email protected]>
TryBot-Result: Go Bot <[email protected]>
Reviewed-by: Rob Pike <[email protected]>
1 parent 34f7b1f commit 702e337

File tree

5 files changed

+21
-3
lines changed

5 files changed

+21
-3
lines changed

src/regexp/all_test.go

+3
Original file line number | Diff line number | Diff line change
@@ -372,6 +372,9 @@ var literalPrefixTests = []MetaTest{
372372
{`^^0$$`, ``, ``, false},
373373
{`^$^$`, ``, ``, false},
374374
{`$$0^^`, ``, ``, false},
375+
{`a\x{fffd}b`, ``, `a`, false},
376+
{`\x{fffd}b`, ``, ``, false},
377+
{"\ufffd", ``, ``, false},
375378
}
376379

377380
func TestQuoteMeta(t *testing.T) {

src/regexp/find_test.go

+7
Original file line number | Diff line number | Diff line change
@@ -116,6 +116,13 @@ var findTests = []FindTest{
116116
{"\\`", "`", build(1, 0, 1)},
117117
{"[\\`]+", "`", build(1, 0, 1)},
118118

119+
{"\ufffd", "\xff", build(1, 0, 1)},
120+
{"\ufffd", "hello\xffworld", build(1, 5, 6)},
121+
{`.*`, "hello\xffworld", build(1, 0, 11)},
122+
{`\x{fffd}`, "\xc2\x00", build(1, 0, 1)},
123+
{"[\ufffd]", "\xff", build(1, 0, 1)},
124+
{`[\x{fffd}]`, "\xc2\x00", build(1, 0, 1)},
125+
119126
// long set of matches (longer than startSize)
120127
{
121128
".",

src/regexp/onepass.go

+2 -1
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ import (
99
"sort"
1010
"strings"
1111
"unicode"
12+
"unicode/utf8"
1213
)
1314

1415
// "One-pass" regexp execution.
@@ -55,7 +56,7 @@ func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
5556

5657
// Have prefix; gather characters.
5758
var buf strings.Builder
58-
for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
59+
for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 && i.Rune[0] != utf8.RuneError {
5960
buf.WriteRune(i.Rune[0])
6061
pc, i = i.Out, &p.Inst[i.Out]
6162
}

src/regexp/regexp.go

+7 -1
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,8 @@
2020
// or any book about automata theory.
2121
//
2222
// All characters are UTF-8-encoded code points.
23+
// Following utf8.DecodeRune, each byte of an invalid UTF-8 sequence
24+
// is treated as if it encoded utf8.RuneError (U+FFFD).
2325
//
2426
// There are 16 methods of Regexp that match a regular expression and identify
2527
// the matched text. Their names are matched by this regular expression:
@@ -276,7 +278,11 @@ func minInputLen(re *syntax.Regexp) int {
276278
case syntax.OpLiteral:
277279
l := 0
278280
for _, r := range re.Rune {
279-
l += utf8.RuneLen(r)
281+
if r == utf8.RuneError {
282+
l++
283+
} else {
284+
l += utf8.RuneLen(r)
285+
}
280286
}
281287
return l
282288
case syntax.OpCapture, syntax.OpPlus:

src/regexp/syntax/prog.go

+2 -1
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ import (
88
"strconv"
99
"strings"
1010
"unicode"
11+
"unicode/utf8"
1112
)
1213

1314
// Compiled program.
@@ -154,7 +155,7 @@ func (p *Prog) Prefix() (prefix string, complete bool) {
154155

155156
// Have prefix; gather characters.
156157
var buf strings.Builder
157-
for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
158+
for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError {
158159
buf.WriteRune(i.Rune[0])
159160
i = p.skipNop(i.Out)
160161
}

0 commit comments

Comments
 (0)