Skip to content

Commit 11389ba

Browse files
committed
encoding/json: revert "avoid work when unquoting strings, take 2"
This reverts golang.org/cl/190659 and golang.org/cl/226218, minus the regression tests in the latter. The original work happened in golang.org/cl/151157, which was reverted in golang.org/cl/190909 due to a crash found by fuzzing. We tried a second time in golang.org/cl/190659, which shipped with Go 1.14. A bug was found, where strings would be mangled in certain edge cases. The fix for that was golang.org/cl/226218, which was backported into Go 1.14.4. Unfortunately, a second regression was just reported in #39555, which is a similar case of strings getting mangled when decoding under certain conditions. It would be possible to come up with another small patch to fix that edge case, but instead, let's just revert the entire optimization, as it has proved to do more harm than good. Moreover, it's hard to argue or prove that there will be no more such regressions. However, all the work wasn't for nothing. First, we learned that the way the decoder unquotes tokenized strings isn't simple; initially, we had wrongly assumed that each string was unquoted exactly once and in order. Second, we have gained a number of regression tests which will be useful to prevent the same mistakes in the future, including the test cases we add in this CL. Fixes #39555. Change-Id: I66a6919c2dd6d9789232482ba6cf3814eaa70f61 Reviewed-on: https://go-review.googlesource.com/c/go/+/237838 Run-TryBot: Daniel Martí <[email protected]> TryBot-Result: Gobot Gobot <[email protected]> Reviewed-by: Andrew Bonventre <[email protected]>
1 parent 9340bd6 commit 11389ba

File tree

2 files changed

+48
-42
lines changed

2 files changed

+48
-42
lines changed

src/encoding/json/decode.go

Lines changed: 32 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -212,9 +212,6 @@ type decodeState struct {
212212
savedError error
213213
useNumber bool
214214
disallowUnknownFields bool
215-
// safeUnquote is the number of current string literal bytes that don't
216-
// need to be unquoted. When negative, no bytes need unquoting.
217-
safeUnquote int
218215
}
219216

220217
// readIndex returns the position of the last byte read.
@@ -316,27 +313,13 @@ func (d *decodeState) rescanLiteral() {
316313
Switch:
317314
switch data[i-1] {
318315
case '"': // string
319-
// safeUnquote is initialized at -1, which means that all bytes
320-
// checked so far can be unquoted at a later time with no work
321-
// at all. When reaching the closing '"', if safeUnquote is
322-
// still -1, all bytes can be unquoted with no work. Otherwise,
323-
// only those bytes up until the first '\\' or non-ascii rune
324-
// can be safely unquoted.
325-
safeUnquote := -1
326316
for ; i < len(data); i++ {
327-
if c := data[i]; c == '\\' {
328-
if safeUnquote < 0 { // first unsafe byte
329-
safeUnquote = int(i - d.off)
330-
}
317+
switch data[i] {
318+
case '\\':
331319
i++ // escaped char
332-
} else if c == '"' {
333-
d.safeUnquote = safeUnquote
320+
case '"':
334321
i++ // tokenize the closing quote too
335322
break Switch
336-
} else if c >= utf8.RuneSelf {
337-
if safeUnquote < 0 { // first unsafe byte
338-
safeUnquote = int(i - d.off)
339-
}
340323
}
341324
}
342325
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-': // number
@@ -695,7 +678,7 @@ func (d *decodeState) object(v reflect.Value) error {
695678
start := d.readIndex()
696679
d.rescanLiteral()
697680
item := d.data[start:d.readIndex()]
698-
key, ok := d.unquoteBytes(item)
681+
key, ok := unquoteBytes(item)
699682
if !ok {
700683
panic(phasePanicMsg)
701684
}
@@ -896,7 +879,7 @@ func (d *decodeState) literalStore(item []byte, v reflect.Value, fromQuoted bool
896879
d.saveError(&UnmarshalTypeError{Value: val, Type: v.Type(), Offset: int64(d.readIndex())})
897880
return nil
898881
}
899-
s, ok := d.unquoteBytes(item)
882+
s, ok := unquoteBytes(item)
900883
if !ok {
901884
if fromQuoted {
902885
return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type())
@@ -947,7 +930,7 @@ func (d *decodeState) literalStore(item []byte, v reflect.Value, fromQuoted bool
947930
}
948931

949932
case '"': // string
950-
s, ok := d.unquoteBytes(item)
933+
s, ok := unquoteBytes(item)
951934
if !ok {
952935
if fromQuoted {
953936
return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type())
@@ -1107,7 +1090,7 @@ func (d *decodeState) objectInterface() map[string]interface{} {
11071090
start := d.readIndex()
11081091
d.rescanLiteral()
11091092
item := d.data[start:d.readIndex()]
1110-
key, ok := d.unquote(item)
1093+
key, ok := unquote(item)
11111094
if !ok {
11121095
panic(phasePanicMsg)
11131096
}
@@ -1156,7 +1139,7 @@ func (d *decodeState) literalInterface() interface{} {
11561139
return c == 't'
11571140

11581141
case '"': // string
1159-
s, ok := d.unquote(item)
1142+
s, ok := unquote(item)
11601143
if !ok {
11611144
panic(phasePanicMsg)
11621145
}
@@ -1199,33 +1182,40 @@ func getu4(s []byte) rune {
11991182

12001183
// unquote converts a quoted JSON string literal s into an actual string t.
12011184
// The rules are different than for Go, so cannot use strconv.Unquote.
1202-
// The first byte in s must be '"'.
1203-
func (d *decodeState) unquote(s []byte) (t string, ok bool) {
1204-
s, ok = d.unquoteBytes(s)
1185+
func unquote(s []byte) (t string, ok bool) {
1186+
s, ok = unquoteBytes(s)
12051187
t = string(s)
12061188
return
12071189
}
12081190

1209-
func (d *decodeState) unquoteBytes(s []byte) (t []byte, ok bool) {
1210-
// We already know that s[0] == '"'. However, we don't know that the
1211-
// closing quote exists in all cases, such as when the string is nested
1212-
// via the ",string" option.
1213-
if len(s) < 2 || s[len(s)-1] != '"' {
1191+
func unquoteBytes(s []byte) (t []byte, ok bool) {
1192+
if len(s) < 2 || s[0] != '"' || s[len(s)-1] != '"' {
12141193
return
12151194
}
12161195
s = s[1 : len(s)-1]
12171196

1218-
// If there are no unusual characters, no unquoting is needed, so return
1219-
// a slice of the original bytes.
1220-
r := d.safeUnquote
1221-
if r == -1 {
1197+
// Check for unusual characters. If there are none,
1198+
// then no unquoting is needed, so return a slice of the
1199+
// original bytes.
1200+
r := 0
1201+
for r < len(s) {
1202+
c := s[r]
1203+
if c == '\\' || c == '"' || c < ' ' {
1204+
break
1205+
}
1206+
if c < utf8.RuneSelf {
1207+
r++
1208+
continue
1209+
}
1210+
rr, size := utf8.DecodeRune(s[r:])
1211+
if rr == utf8.RuneError && size == 1 {
1212+
break
1213+
}
1214+
r += size
1215+
}
1216+
if r == len(s) {
12221217
return s, true
12231218
}
1224-
// Only perform up to one safe unquote for each re-scanned string
1225-
// literal. In some edge cases, the decoder unquotes a literal a second
1226-
// time, even after another literal has been re-scanned. Thus, only the
1227-
// first unquote can safely use safeUnquote.
1228-
d.safeUnquote = 0
12291219

12301220
b := make([]byte, len(s)+2*utf8.UTFMax)
12311221
w := copy(b, s[0:r])

src/encoding/json/decode_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2472,6 +2472,22 @@ func TestUnmarshalRescanLiteralMangledUnquote(t *testing.T) {
24722472
if t1 != t2 {
24732473
t.Errorf("Marshal and Unmarshal roundtrip mismatch: want %q got %q", t1, t2)
24742474
}
2475+
2476+
// See golang.org/issues/39555.
2477+
input := map[textUnmarshalerString]string{"FOO": "", `"`: ""}
2478+
2479+
encoded, err := Marshal(input)
2480+
if err != nil {
2481+
t.Fatalf("Marshal unexpected error: %v", err)
2482+
}
2483+
var got map[textUnmarshalerString]string
2484+
if err := Unmarshal(encoded, &got); err != nil {
2485+
t.Fatalf("Unmarshal unexpected error: %v", err)
2486+
}
2487+
want := map[textUnmarshalerString]string{"foo": "", `"`: ""}
2488+
if !reflect.DeepEqual(want, got) {
2489+
t.Fatalf("Unexpected roundtrip result:\nwant: %q\ngot: %q", want, got)
2490+
}
24752491
}
24762492

24772493
func TestUnmarshalMaxDepth(t *testing.T) {

0 commit comments

Comments
 (0)