Skip to content

Commit cac2e87

Browse files
authored
BUG: to_json segfaults when exception occurs in UTF8 encoding of string (#50324)
1 parent d0dbd9f commit cac2e87

File tree

3 files changed

+21
-3
lines changed

3 files changed

+21
-3
lines changed

doc/source/whatsnew/v2.0.0.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -882,7 +882,7 @@ I/O
882882
- Bug in :func:`DataFrame.to_string` with ``header=False`` that printed the index name on the same line as the first row of the data (:issue:`49230`)
883883
- Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
884884
- Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)
885-
-
885+
- Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
886886

887887
Period
888888
^^^^^^

pandas/_libs/src/ujson/python/objToJSON.c

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -332,9 +332,18 @@ static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
332332
return PyBytes_AS_STRING(obj);
333333
}
334334

335-
static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
335+
static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc,
336336
size_t *_outLen) {
337-
return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen);
337+
char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj,
338+
(Py_ssize_t *)_outLen);
339+
if (encoded == NULL) {
340+
/* Something went wrong.
341+
Set errorMsg(to tell encoder to stop),
342+
and let Python exception propagate. */
343+
JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder;
344+
enc->errorMsg = "Encoding failed.";
345+
}
346+
return encoded;
338347
}
339348

340349
/* JSON callback. returns a char* and mutates the pointer to *len */

pandas/tests/io/json/test_ujson.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -291,6 +291,15 @@ def test_encode_unicode_4bytes_utf8highest(self):
291291
assert enc == json.dumps(four_bytes_input)
292292
assert dec == json.loads(enc)
293293

294+
def test_encode_unicode_error(self):
295+
string = "'\udac0'"
296+
msg = (
297+
r"'utf-8' codec can't encode character '\\udac0' "
298+
r"in position 1: surrogates not allowed"
299+
)
300+
with pytest.raises(UnicodeEncodeError, match=msg):
301+
ujson.dumps([string])
302+
294303
def test_encode_array_in_array(self):
295304
arr_in_arr_input = [[[[]]]]
296305
output = ujson.encode(arr_in_arr_input)

0 commit comments

Comments
 (0)