diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 31ef70703e2ca..9b11ae6c0054d 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -755,6 +755,7 @@ I/O
 - :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
+- :func:`read_csv()` and :func:`read_table()` will raise ``UnicodeError`` instead of dumping core on badly encoded strings (:issue:`22748`)
 - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
 - :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
 - :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c
index 8300e889d4157..19271c78501ba 100644
--- a/pandas/_libs/src/parser/io.c
+++ b/pandas/_libs/src/parser/io.c
@@ -150,7 +150,11 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
             return NULL;
         } else if (!PyBytes_Check(result)) {
             tmp = PyUnicode_AsUTF8String(result);
-            Py_XDECREF(result);
+            Py_DECREF(result);
+            if (tmp == NULL) {
+                PyGILState_Release(state);
+                return NULL;
+            }
             result = tmp;
         }
 
diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
index 9e871d27f0ce8..064385e60c4ec 100644
--- a/pandas/tests/io/parser/common.py
+++ b/pandas/tests/io/parser/common.py
@@ -9,6 +9,7 @@ import sys
 from datetime import datetime
 from collections import OrderedDict
+from io import TextIOWrapper
 
 import pytest
 import numpy as np
@@ -1609,3 +1610,11 @@ def test_skip_bad_lines(self):
         val = sys.stderr.getvalue()
         assert 'Skipping line 3' in val
         assert 'Skipping line 5' in val
+
+    def test_buffer_rd_bytes_bad_unicode(self):
+        # Regression test for #22748
+        t = BytesIO(b"\xB0")
+        if PY3:
+            t = TextIOWrapper(t, encoding='ascii', errors='surrogateescape')
+        with pytest.raises(UnicodeError):
+            pd.read_csv(t, encoding='UTF-8')
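
For reference, here is a standalone sketch of the scenario the new test exercises; it is not part of the patch itself and assumes Python 3 with a pandas build that includes this fix. Wrapping the byte stream with ``errors='surrogateescape'`` makes ``read()`` return a ``str`` containing a lone surrogate, which ``buffer_rd_bytes`` in the C parser cannot re-encode to UTF-8; with the patch applied this surfaces as a ``UnicodeError`` rather than a crash.

```python
from io import BytesIO, TextIOWrapper

import pandas as pd

# 0xB0 is not valid ASCII; 'surrogateescape' smuggles it through decoding as
# the lone surrogate '\udcb0', which cannot be encoded back to UTF-8.
handle = TextIOWrapper(BytesIO(b"\xB0"),
                       encoding="ascii", errors="surrogateescape")

try:
    pd.read_csv(handle, encoding="UTF-8")
except UnicodeError as exc:
    # With the patch applied, the parser raises instead of dumping core.
    print("parser raised:", exc)
```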