Skip to content

Commit be0254a

Browse files
mvdandmitshur
authored andcommitted
[release-branch.go1.14] encoding/json: revert "avoid work when unquoting strings, take 2"
This reverts golang.org/cl/190659 and golang.org/cl/226218, minus the regression tests in the latter. The original work happened in golang.org/cl/151157, which was reverted in golang.org/cl/190909 due to a crash found by fuzzing. We tried a second time in golang.org/cl/190659, which shipped with Go 1.14. A bug was found, where strings would be mangled in certain edge cases. The fix for that was golang.org/cl/226218, which was backported into Go 1.14.4. Unfortunately, a second regression was just reported in #39555, which is a similar case of strings getting mangled when decoding under certain conditions. It would be possible to come up with another small patch to fix that edge case, but instead, let's just revert the entire optimization, as it has proved to do more harm than good. Moreover, it's hard to argue or prove that there will be no more such regressions. However, all the work wasn't for nothing. First, we learned that the way the decoder unquotes tokenized strings isn't simple; initially, we had wrongly assumed that each string was unquoted exactly once and in order. Second, we have gained a number of regression tests which will be useful to prevent the same mistakes in the future, including the test cases we add in this CL. For #39555. Fixes #39585. Change-Id: I66a6919c2dd6d9789232482ba6cf3814eaa70f61 Reviewed-on: https://go-review.googlesource.com/c/go/+/237838 Run-TryBot: Daniel Martí <[email protected]> TryBot-Result: Gobot Gobot <[email protected]> Reviewed-by: Andrew Bonventre <[email protected]> (cherry picked from commit 11389ba) Reviewed-on: https://go-review.googlesource.com/c/go/+/241081 Reviewed-by: Daniel Martí <[email protected]> Run-TryBot: Dmitri Shuralyov <[email protected]>
1 parent efed90a commit be0254a

File tree

2 files changed

+48
-42
lines changed

2 files changed

+48
-42
lines changed

src/encoding/json/decode.go

Lines changed: 32 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -213,9 +213,6 @@ type decodeState struct {
213213
savedError error
214214
useNumber bool
215215
disallowUnknownFields bool
216-
// safeUnquote is the number of current string literal bytes that don't
217-
// need to be unquoted. When negative, no bytes need unquoting.
218-
safeUnquote int
219216
}
220217

221218
// readIndex returns the position of the last byte read.
@@ -317,27 +314,13 @@ func (d *decodeState) rescanLiteral() {
317314
Switch:
318315
switch data[i-1] {
319316
case '"': // string
320-
// safeUnquote is initialized at -1, which means that all bytes
321-
// checked so far can be unquoted at a later time with no work
322-
// at all. When reaching the closing '"', if safeUnquote is
323-
// still -1, all bytes can be unquoted with no work. Otherwise,
324-
// only those bytes up until the first '\\' or non-ascii rune
325-
// can be safely unquoted.
326-
safeUnquote := -1
327317
for ; i < len(data); i++ {
328-
if c := data[i]; c == '\\' {
329-
if safeUnquote < 0 { // first unsafe byte
330-
safeUnquote = int(i - d.off)
331-
}
318+
switch data[i] {
319+
case '\\':
332320
i++ // escaped char
333-
} else if c == '"' {
334-
d.safeUnquote = safeUnquote
321+
case '"':
335322
i++ // tokenize the closing quote too
336323
break Switch
337-
} else if c >= utf8.RuneSelf {
338-
if safeUnquote < 0 { // first unsafe byte
339-
safeUnquote = int(i - d.off)
340-
}
341324
}
342325
}
343326
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-': // number
@@ -691,7 +674,7 @@ func (d *decodeState) object(v reflect.Value) error {
691674
start := d.readIndex()
692675
d.rescanLiteral()
693676
item := d.data[start:d.readIndex()]
694-
key, ok := d.unquoteBytes(item)
677+
key, ok := unquoteBytes(item)
695678
if !ok {
696679
panic(phasePanicMsg)
697680
}
@@ -892,7 +875,7 @@ func (d *decodeState) literalStore(item []byte, v reflect.Value, fromQuoted bool
892875
d.saveError(&UnmarshalTypeError{Value: val, Type: v.Type(), Offset: int64(d.readIndex())})
893876
return nil
894877
}
895-
s, ok := d.unquoteBytes(item)
878+
s, ok := unquoteBytes(item)
896879
if !ok {
897880
if fromQuoted {
898881
return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type())
@@ -943,7 +926,7 @@ func (d *decodeState) literalStore(item []byte, v reflect.Value, fromQuoted bool
943926
}
944927

945928
case '"': // string
946-
s, ok := d.unquoteBytes(item)
929+
s, ok := unquoteBytes(item)
947930
if !ok {
948931
if fromQuoted {
949932
return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type())
@@ -1103,7 +1086,7 @@ func (d *decodeState) objectInterface() map[string]interface{} {
11031086
start := d.readIndex()
11041087
d.rescanLiteral()
11051088
item := d.data[start:d.readIndex()]
1106-
key, ok := d.unquote(item)
1089+
key, ok := unquote(item)
11071090
if !ok {
11081091
panic(phasePanicMsg)
11091092
}
@@ -1152,7 +1135,7 @@ func (d *decodeState) literalInterface() interface{} {
11521135
return c == 't'
11531136

11541137
case '"': // string
1155-
s, ok := d.unquote(item)
1138+
s, ok := unquote(item)
11561139
if !ok {
11571140
panic(phasePanicMsg)
11581141
}
@@ -1195,33 +1178,40 @@ func getu4(s []byte) rune {
11951178

11961179
// unquote converts a quoted JSON string literal s into an actual string t.
11971180
// The rules are different than for Go, so cannot use strconv.Unquote.
1198-
// The first byte in s must be '"'.
1199-
func (d *decodeState) unquote(s []byte) (t string, ok bool) {
1200-
s, ok = d.unquoteBytes(s)
1181+
func unquote(s []byte) (t string, ok bool) {
1182+
s, ok = unquoteBytes(s)
12011183
t = string(s)
12021184
return
12031185
}
12041186

1205-
func (d *decodeState) unquoteBytes(s []byte) (t []byte, ok bool) {
1206-
// We already know that s[0] == '"'. However, we don't know that the
1207-
// closing quote exists in all cases, such as when the string is nested
1208-
// via the ",string" option.
1209-
if len(s) < 2 || s[len(s)-1] != '"' {
1187+
func unquoteBytes(s []byte) (t []byte, ok bool) {
1188+
if len(s) < 2 || s[0] != '"' || s[len(s)-1] != '"' {
12101189
return
12111190
}
12121191
s = s[1 : len(s)-1]
12131192

1214-
// If there are no unusual characters, no unquoting is needed, so return
1215-
// a slice of the original bytes.
1216-
r := d.safeUnquote
1217-
if r == -1 {
1193+
// Check for unusual characters. If there are none,
1194+
// then no unquoting is needed, so return a slice of the
1195+
// original bytes.
1196+
r := 0
1197+
for r < len(s) {
1198+
c := s[r]
1199+
if c == '\\' || c == '"' || c < ' ' {
1200+
break
1201+
}
1202+
if c < utf8.RuneSelf {
1203+
r++
1204+
continue
1205+
}
1206+
rr, size := utf8.DecodeRune(s[r:])
1207+
if rr == utf8.RuneError && size == 1 {
1208+
break
1209+
}
1210+
r += size
1211+
}
1212+
if r == len(s) {
12181213
return s, true
12191214
}
1220-
// Only perform up to one safe unquote for each re-scanned string
1221-
// literal. In some edge cases, the decoder unquotes a literal a second
1222-
// time, even after another literal has been re-scanned. Thus, only the
1223-
// first unquote can safely use safeUnquote.
1224-
d.safeUnquote = 0
12251215

12261216
b := make([]byte, len(s)+2*utf8.UTFMax)
12271217
w := copy(b, s[0:r])

src/encoding/json/decode_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2459,4 +2459,20 @@ func TestUnmarshalRescanLiteralMangledUnquote(t *testing.T) {
24592459
if t1 != t2 {
24602460
t.Errorf("Marshal and Unmarshal roundtrip mismatch: want %q got %q", t1, t2)
24612461
}
2462+
2463+
// See golang.org/issues/39555.
2464+
input := map[textUnmarshalerString]string{"FOO": "", `"`: ""}
2465+
2466+
encoded, err := Marshal(input)
2467+
if err != nil {
2468+
t.Fatalf("Marshal unexpected error: %v", err)
2469+
}
2470+
var got map[textUnmarshalerString]string
2471+
if err := Unmarshal(encoded, &got); err != nil {
2472+
t.Fatalf("Unmarshal unexpected error: %v", err)
2473+
}
2474+
want := map[textUnmarshalerString]string{"foo": "", `"`: ""}
2475+
if !reflect.DeepEqual(want, got) {
2476+
t.Fatalf("Unexpected roundtrip result:\nwant: %q\ngot: %q", want, got)
2477+
}
24622478
}

0 commit comments

Comments
 (0)