From 9a4f7c221027e3d0f49d9f24ccf7430a2158d500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Mon, 3 Jan 2022 17:46:07 -0500 Subject: [PATCH 1/4] BUG: encoding_errors=None with read_csv c-engine --- pandas/_libs/parsers.pyx | 2 ++ pandas/tests/io/test_common.py | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 4f80936359263..08c885fba172a 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -375,6 +375,8 @@ cdef class TextReader: # set encoding for native Python and C library if isinstance(encoding_errors, str): encoding_errors = encoding_errors.encode("utf-8") + elif encoding_errors is None: + encoding_errors = b"strict" Py_INCREF(encoding_errors) self.encoding_errors = PyBytes_AsString(encoding_errors) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 43a5a33a0fdd4..9a95a253daeea 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -549,15 +549,18 @@ def test_explicit_encoding(io_class, mode, msg): @pytest.mark.parametrize("encoding_errors", [None, "strict", "replace"]) @pytest.mark.parametrize("format", ["csv", "json"]) -def test_encoding_errors(encoding_errors, format): +def test_encoding_errors(encoding_errors, format, request): # GH39450 msg = "'utf-8' codec can't decode byte" bad_encoding = b"\xe4" if format == "csv": - return content = bad_encoding + b"\n" + bad_encoding reader = pd.read_csv + if encoding_errors == "replace": + request.applymarker( + pytest.mark.xfail(reason="Should work but needs more time to debug.") + ) else: content = ( b'{"' From 5510540f4ae55da582124f215ec92b882949ce19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Mon, 3 Jan 2022 20:01:09 -0500 Subject: [PATCH 2/4] fix test case and add whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/tests/io/test_common.py | 8 ++------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 4c3e53ddcfa26..6b86f01b1fc7a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -877,6 +877,7 @@ I/O - Bug in :func:`read_csv` silently ignoring errors when failing to create a memory-mapped file (:issue:`44766`) - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`) - Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing "://" (:issue:`36271`) +- Bug in :func:`read_csv` when `engine="c"` and `encoding_errors=None` which caused a segfault (:issue:`45180`) - Period diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 9a95a253daeea..07018fe58ef7c 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -555,12 +555,8 @@ def test_encoding_errors(encoding_errors, format, request): bad_encoding = b"\xe4" if format == "csv": - content = bad_encoding + b"\n" + bad_encoding - reader = pd.read_csv - if encoding_errors == "replace": - request.applymarker( - pytest.mark.xfail(reason="Should work but needs more time to debug.") - ) + content = b"," + bad_encoding + b"\n" + bad_encoding * 2 + b"," + bad_encoding + reader = partial(pd.read_csv, index_col=0) else: content = ( b'{"' From 25b78b925acac8c67a6651eef851849d9810e763 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Mon, 3 Jan 2022 20:02:16 -0500 Subject: [PATCH 3/4] remove request --- pandas/tests/io/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 07018fe58ef7c..b458f3351c860 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -549,7 +549,7 @@ def test_explicit_encoding(io_class, mode, msg): @pytest.mark.parametrize("encoding_errors", [None, "strict", "replace"]) @pytest.mark.parametrize("format", ["csv", "json"]) -def test_encoding_errors(encoding_errors, format, request): +def test_encoding_errors(encoding_errors, format): # GH39450 msg = "'utf-8' codec can't decode byte" bad_encoding = b"\xe4" From dbf9b02bd0712400eaafb9e2e3c83297e5f7c98d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Mon, 3 Jan 2022 21:07:59 -0500 Subject: [PATCH 4/4] double backticks --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 6b86f01b1fc7a..46b79c3dd6d71 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -877,7 +877,7 @@ I/O - Bug in :func:`read_csv` silently ignoring errors when failing to create a memory-mapped file (:issue:`44766`) - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`) - Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing "://" (:issue:`36271`) -- Bug in :func:`read_csv` when `engine="c"` and `encoding_errors=None` which caused a segfault (:issue:`45180`) +- Bug in :func:`read_csv` when ``engine="c"`` and ``encoding_errors=None`` which caused a segfault (:issue:`45180`) - Period