diff --git a/json.go b/json.go index 431a8af6..3e8c603e 100644 --- a/json.go +++ b/json.go @@ -20,6 +20,7 @@ import ( "errors" "sort" "strings" + "unicode/utf16" "unicode/utf8" "github.com/tidwall/gjson" @@ -78,8 +79,9 @@ func NewEventJSONsFromEvents(he []*Event) EventJSONs { // CanonicalJSON re-encodes the JSON in a canonical encoding. The encoding is // the shortest possible encoding using integer values with sorted object keys. // At present this function performs: -// * shortest encoding, sorted lexicographically by UTF-8 codepoint: -// https://matrix.org/docs/spec/appendices#canonical-json +// - shortest encoding, sorted lexicographically by UTF-8 codepoint: +// https://matrix.org/docs/spec/appendices#canonical-json +// // Returns a gomatrixserverlib.BadJSONError if JSON validation fails. func CanonicalJSON(input []byte) ([]byte, error) { if !gjson.Valid(string(input)) { @@ -91,10 +93,11 @@ func CanonicalJSON(input []byte) ([]byte, error) { // Returns a gomatrixserverlib.BadJSONError if the canonical JSON fails enforced // checks or if JSON validation fails. At present this function performs: -// * integer bounds checking for room version 6 and above: -// https://matrix.org/docs/spec/rooms/v6#canonical-json -// * shortest encoding, sorted lexicographically by UTF-8 codepoint: -// https://matrix.org/docs/spec/appendices#canonical-json +// - integer bounds checking for room version 6 and above: +// https://matrix.org/docs/spec/rooms/v6#canonical-json +// - shortest encoding, sorted lexicographically by UTF-8 codepoint: +// https://matrix.org/docs/spec/appendices#canonical-json +// // Returns a gomatrixserverlib.BadJSONError if JSON validation fails. func EnforcedCanonicalJSON(input []byte, roomVersion RoomVersion) ([]byte, error) { if enforce, err := roomVersion.EnforceCanonicalJSON(); err == nil && enforce { @@ -296,6 +299,11 @@ func CompactJSON(input, output []byte) []byte { // compactUnicodeEscape unpacks a 4 byte unicode escape starting at index. 
// Returns the output slice and a new input index. func compactUnicodeEscape(input, output []byte, index int) ([]byte, int) { + appendUTF8 := func(c rune) { + var buffer [4]byte + n := utf8.EncodeRune(buffer[:], c) + output = append(output, buffer[:n]...) + } const ( ESCAPES = "uuuuuuuubtnufruuuuuuuuuuuuuuuuuu" HEX = "0123456789ABCDEF" @@ -317,17 +325,26 @@ func compactUnicodeEscape(input, output []byte, index int) ([]byte, int) { } else if c == '\\' || c == '"' { // Otherwise the character only needs escaping if it is a QUOTE '"' or BACKSLASH '\\'. output = append(output, '\\', byte(c)) + } else if utf16.IsSurrogate(c) { + if len(input)-index < 2 || input[index] != '\\' || input[index+1] != 'u' { + return output, index + } + index += 2 // skip the "\u" prefix of the second (low) surrogate escape + if len(input)-index < 4 { + return output, index + } + c2 := readHexDigits(input[index : index+4]) + index += 4 + appendUTF8(utf16.DecodeRune(c, c2)) } else { - var buffer [4]byte - n := utf8.EncodeRune(buffer[:], rune(c)) - output = append(output, buffer[:n]...) + appendUTF8(c) } return output, index } // Read 4 hex digits from the input slice. // Taken from https://github.com/NegativeMjark/indolentjson-rust/blob/8b959791fe2656a88f189c5d60d153be05fe3deb/src/readhex.rs#L21 -func readHexDigits(input []byte) uint32 { +func readHexDigits(input []byte) rune { hex := binary.BigEndian.Uint32(input) // subtract '0' hex -= 0x30303030 @@ -341,7 +358,7 @@ func readHexDigits(input []byte) uint32 { hex |= hex >> 4 hex &= 0xFF00FF hex |= hex >> 8 - return hex & 0xFFFF + return rune(hex & 0xFFFF) } // RawJSONFromResult extracts the raw JSON bytes pointed to by result. 
diff --git a/json_test.go b/json_test.go index 38ee16ce..8c27802e 100644 --- a/json_test.go +++ b/json_test.go @@ -108,11 +108,24 @@ func TestCompactJSON(t *testing.T) { testCompactJSON(t, `["\u0FFF"]`, "[\"\u0FFF\"]") testCompactJSON(t, `["\u1820"]`, "[\"\u1820\"]") testCompactJSON(t, `["\uFFFF"]`, "[\"\uFFFF\"]") + testCompactJSON(t, `["\uD842\uDC20"]`, "[\"\U00020820\"]") + testCompactJSON(t, `["\uDBFF\uDFFF"]`, "[\"\U0010FFFF\"]") testCompactJSON(t, `["\"\\\/"]`, `["\"\\/"]`) } -func testReadHex(t *testing.T, input string, want uint32) { +func TestCompactUnicodeEscapeWithUTF16Surrogate(t *testing.T) { + input := []byte(`\ud83d\udc08`) + output, n := compactUnicodeEscape(input[2:], nil, 0) + if n != 10 { + t.Fatalf("should have consumed 10 bytes but consumed only %d bytes", n) + } + if string(output) != "🐈" { + t.Fatalf("expected a cat emoji but got %q", output) + } +} + +func testReadHex(t *testing.T, input string, want rune) { got := readHexDigits([]byte(input)) if want != got { t.Errorf("readHexDigits(%q): want 0x%x got 0x%x", input, want, got)