Skip to content

Commit b1c5619

Browse files
Remove broken surrogate handling for UTF-16 (#314)
The Matrix specification says everything should be UTF-8:

> For the default HTTP transport, all API calls use a Content-Type of application/json. In addition, all strings MUST be encoded as UTF-8.

... and our handling of UTF-16 surrogates was quite broken here, which sometimes resulted in broken JSON after `CompactJSON`.
1 parent 5e8740d commit b1c5619

File tree

2 files changed

+4
-24
lines changed

2 files changed

+4
-24
lines changed

json.go

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -294,8 +294,7 @@ func CompactJSON(input, output []byte) []byte {
294294
}
295295

296296
// compactUnicodeEscape unpacks a 4 byte unicode escape starting at index.
297-
// If the escape is a surrogate pair then decode the 6 byte \uXXXX escape
298-
// that follows. Returns the output slice and a new input index.
297+
// Returns the output slice and a new input index.
299298
func compactUnicodeEscape(input, output []byte, index int) ([]byte, int) {
300299
const (
301300
ESCAPES = "uuuuuuuubtnufruuuuuuuuuuuuuuuuuu"
@@ -306,7 +305,7 @@ func compactUnicodeEscape(input, output []byte, index int) ([]byte, int) {
306305
return output, len(input)
307306
}
308307
// Decode the 4 hex digits.
309-
c := readHexDigits(input[index:])
308+
c := readHexDigits(input[index : index+4])
310309
index += 4
311310
if c < ' ' {
312311
// If the character is less than SPACE 0x20 then it will need escaping.
@@ -318,26 +317,9 @@ func compactUnicodeEscape(input, output []byte, index int) ([]byte, int) {
318317
} else if c == '\\' || c == '"' {
319318
// Otherwise the character only needs escaping if it is a QUOTE '"' or BACKSLASH '\\'.
320319
output = append(output, '\\', byte(c))
321-
} else if c < 0xD800 || c >= 0xE000 {
322-
// If the character isn't a surrogate pair then encoded it directly as UTF-8.
323-
var buffer [4]byte
324-
n := utf8.EncodeRune(buffer[:], rune(c))
325-
output = append(output, buffer[:n]...)
326320
} else {
327-
// Otherwise the escaped character was the first part of a UTF-16 style surrogate pair.
328-
// The next 6 bytes MUST be a '\uXXXX'.
329-
// If there aren't enough bytes to decode the hex escape then return.
330-
if len(input)-index < 6 {
331-
return output, len(input)
332-
}
333-
// Decode the 4 hex digits from the '\uXXXX'.
334-
surrogate := readHexDigits(input[index+2:])
335-
index += 6
336-
// Reconstruct the UCS4 codepoint from the surrogates.
337-
codepoint := 0x10000 + (((c & 0x3FF) << 10) | (surrogate & 0x3FF))
338-
// Encode the charater as UTF-8.
339321
var buffer [4]byte
340-
n := utf8.EncodeRune(buffer[:], rune(codepoint))
322+
n := utf8.EncodeRune(buffer[:], rune(c))
341323
output = append(output, buffer[:n]...)
342324
}
343325
return output, index

json_test.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ func TestSortJSON(t *testing.T) {
8080
func testCompactJSON(t *testing.T, input, want string) {
8181
got := string(CompactJSON([]byte(input), nil))
8282
if got != want {
83-
t.Errorf("CompactJSON(%q): want %q got %q", input, want, got)
83+
t.Errorf("CompactJSON(%q):\n want: %q\n got: %q", input, want, got)
8484
}
8585
}
8686

@@ -108,8 +108,6 @@ func TestCompactJSON(t *testing.T) {
108108
testCompactJSON(t, `["\u0FFF"]`, "[\"\u0FFF\"]")
109109
testCompactJSON(t, `["\u1820"]`, "[\"\u1820\"]")
110110
testCompactJSON(t, `["\uFFFF"]`, "[\"\uFFFF\"]")
111-
testCompactJSON(t, `["\uD842\uDC20"]`, "[\"\U00020820\"]")
112-
testCompactJSON(t, `["\uDBFF\uDFFF"]`, "[\"\U0010FFFF\"]")
113111

114112
testCompactJSON(t, `["\"\\\/"]`, `["\"\\/"]`)
115113
}

0 commit comments

Comments
 (0)