Skip to content

Commit b26e170

Browse files
committed
UTF-16 surrogate handling (again)
This partially reverts #314 and refactors the UTF-16 surrogate handling so that faulty surrogates can no longer produce broken JSON.
1 parent b374612 commit b26e170

File tree

2 files changed

+42
-12
lines changed

2 files changed

+42
-12
lines changed

json.go

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"errors"
2121
"sort"
2222
"strings"
23+
"unicode/utf16"
2324
"unicode/utf8"
2425

2526
"github.com/tidwall/gjson"
@@ -78,8 +79,9 @@ func NewEventJSONsFromEvents(he []*Event) EventJSONs {
7879
// CanonicalJSON re-encodes the JSON in a canonical encoding. The encoding is
7980
// the shortest possible encoding using integer values with sorted object keys.
8081
// At present this function performs:
81-
// * shortest encoding, sorted lexicographically by UTF-8 codepoint:
82-
// https://matrix.org/docs/spec/appendices#canonical-json
82+
// - shortest encoding, sorted lexicographically by UTF-8 codepoint:
83+
// https://matrix.org/docs/spec/appendices#canonical-json
84+
//
8385
// Returns a gomatrixserverlib.BadJSONError if JSON validation fails.
8486
func CanonicalJSON(input []byte) ([]byte, error) {
8587
if !gjson.Valid(string(input)) {
@@ -91,10 +93,11 @@ func CanonicalJSON(input []byte) ([]byte, error) {
9193

9294
// Returns a gomatrixserverlib.BadJSONError if the canonical JSON fails enforced
9395
// checks or if JSON validation fails. At present this function performs:
94-
// * integer bounds checking for room version 6 and above:
95-
// https://matrix.org/docs/spec/rooms/v6#canonical-json
96-
// * shortest encoding, sorted lexicographically by UTF-8 codepoint:
97-
// https://matrix.org/docs/spec/appendices#canonical-json
96+
// - integer bounds checking for room version 6 and above:
97+
// https://matrix.org/docs/spec/rooms/v6#canonical-json
98+
// - shortest encoding, sorted lexicographically by UTF-8 codepoint:
99+
// https://matrix.org/docs/spec/appendices#canonical-json
100+
//
98101
// Returns a gomatrixserverlib.BadJSONError if JSON validation fails.
99102
func EnforcedCanonicalJSON(input []byte, roomVersion RoomVersion) ([]byte, error) {
100103
if enforce, err := roomVersion.EnforceCanonicalJSON(); err == nil && enforce {
@@ -296,6 +299,11 @@ func CompactJSON(input, output []byte) []byte {
296299
// compactUnicodeEscape unpacks a 4 byte unicode escape starting at index.
297300
// Returns the output slice and a new input index.
298301
func compactUnicodeEscape(input, output []byte, index int) ([]byte, int) {
302+
appendUTF8 := func(c rune) {
303+
var buffer [4]byte
304+
n := utf8.EncodeRune(buffer[:], c)
305+
output = append(output, buffer[:n]...)
306+
}
299307
const (
300308
ESCAPES = "uuuuuuuubtnufruuuuuuuuuuuuuuuuuu"
301309
HEX = "0123456789ABCDEF"
@@ -317,17 +325,26 @@ func compactUnicodeEscape(input, output []byte, index int) ([]byte, int) {
317325
} else if c == '\\' || c == '"' {
318326
// Otherwise the character only needs escaping if it is a QUOTE '"' or BACKSLASH '\\'.
319327
output = append(output, '\\', byte(c))
328+
} else if utf16.IsSurrogate(c) {
329+
if input[index] != '\\' && input[index+1] != 'u' {
330+
return output, index
331+
}
332+
index += 2 // skip the \u
333+
if len(input)-index < 4 {
334+
return output, index
335+
}
336+
c2 := readHexDigits(input[index : index+4])
337+
index += 4
338+
appendUTF8(utf16.DecodeRune(c, c2))
320339
} else {
321-
var buffer [4]byte
322-
n := utf8.EncodeRune(buffer[:], rune(c))
323-
output = append(output, buffer[:n]...)
340+
appendUTF8(c)
324341
}
325342
return output, index
326343
}
327344

328345
// Read 4 hex digits from the input slice.
329346
// Taken from https://github.com/NegativeMjark/indolentjson-rust/blob/8b959791fe2656a88f189c5d60d153be05fe3deb/src/readhex.rs#L21
330-
func readHexDigits(input []byte) uint32 {
347+
func readHexDigits(input []byte) rune {
331348
hex := binary.BigEndian.Uint32(input)
332349
// subtract '0'
333350
hex -= 0x30303030
@@ -341,7 +358,7 @@ func readHexDigits(input []byte) uint32 {
341358
hex |= hex >> 4
342359
hex &= 0xFF00FF
343360
hex |= hex >> 8
344-
return hex & 0xFFFF
361+
return rune(hex & 0xFFFF)
345362
}
346363

347364
// RawJSONFromResult extracts the raw JSON bytes pointed to by result.

json_test.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,11 +108,24 @@ func TestCompactJSON(t *testing.T) {
108108
testCompactJSON(t, `["\u0FFF"]`, "[\"\u0FFF\"]")
109109
testCompactJSON(t, `["\u1820"]`, "[\"\u1820\"]")
110110
testCompactJSON(t, `["\uFFFF"]`, "[\"\uFFFF\"]")
111+
testCompactJSON(t, `["\uD842\uDC20"]`, "[\"\U00020820\"]")
112+
testCompactJSON(t, `["\uDBFF\uDFFF"]`, "[\"\U0010FFFF\"]")
111113

112114
testCompactJSON(t, `["\"\\\/"]`, `["\"\\/"]`)
113115
}
114116

115-
func testReadHex(t *testing.T, input string, want uint32) {
117+
func TestCompactUnicodeEscapeWithUTF16Surrogate(t *testing.T) {
118+
input := []byte(`\ud83d\udc08`)
119+
output, n := compactUnicodeEscape(input[2:], nil, 0)
120+
if n != 10 {
121+
t.Fatalf("should have consumed 10 bytes but consumed only %d bytes", n)
122+
}
123+
if string(output) != "🐈" {
124+
t.Fatalf("expected a cat emoji")
125+
}
126+
}
127+
128+
func testReadHex(t *testing.T, input string, want rune) {
116129
got := readHexDigits([]byte(input))
117130
if want != got {
118131
t.Errorf("readHexDigits(%q): want 0x%x got 0x%x", input, want, got)

0 commit comments

Comments
 (0)