Skip to content

Commit 1711de8

Browse files
authored
[mypyc] Support unicode surrogates in string literals (#18209)
Previously, surrogates in string literals would trigger a compiler crash.
1 parent d39eacc commit 1711de8

File tree

3 files changed

+11
-2
lines changed

3 files changed

+11
-2
lines changed

mypyc/codegen/literals.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -230,7 +230,7 @@ def format_int(n: int) -> bytes:
230230

231231

232232
def format_str_literal(s: str) -> bytes:
233-
utf8 = s.encode("utf-8")
233+
utf8 = s.encode("utf-8", errors="surrogatepass")
234234
return format_int(len(utf8)) + utf8
235235

236236

mypyc/lib-rt/misc_ops.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -563,7 +563,7 @@ int CPyStatics_Initialize(PyObject **statics,
563563
while (num-- > 0) {
564564
size_t len;
565565
data = parse_int(data, &len);
566-
PyObject *obj = PyUnicode_FromStringAndSize(data, len);
566+
PyObject *obj = PyUnicode_DecodeUTF8(data, len, "surrogatepass");
567567
if (obj == NULL) {
568568
return -1;
569569
}

mypyc/test-data/run-strings.test

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -639,3 +639,12 @@ def test_encode() -> None:
639639
assert u'\u00E1'.encode('latin1') == b'\xe1'
640640
with assertRaises(UnicodeEncodeError):
641641
u.encode('latin1')
642+
643+
[case testUnicodeSurrogate]
644+
def f() -> str:
645+
return "\ud800"
646+
647+
def test_surrogate() -> None:
648+
assert ord(f()) == 0xd800
649+
assert ord("\udfff") == 0xdfff
650+
assert repr("foobar\x00\xab\ud912\U00012345") == r"'foobar\x00«\ud912𒍅'"

0 commit comments

Comments
 (0)