From e8b55bcf0d8b01986fd95440af1cc0ba3fdaecac Mon Sep 17 00:00:00 2001 From: amithkk Date: Sat, 19 Aug 2023 23:53:55 +0530 Subject: [PATCH 01/14] ENH: Add on_bad_lines for pyarrow (SQUASHED) --- pandas/io/parsers/arrow_parser_wrapper.py | 26 ++++++++++++++++++++++ pandas/io/parsers/readers.py | 13 ++++++++--- pandas/tests/io/parser/test_unsupported.py | 10 ++++++--- 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 71bfb00a95b50..18961adc4b954 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -1,11 +1,14 @@ from __future__ import annotations from typing import TYPE_CHECKING +import warnings from pandas._config import using_pyarrow_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency +from pandas.errors import ParserWarning +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.inference import is_integer @@ -85,6 +88,29 @@ def _get_pyarrow_options(self) -> None: and option_name in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") } + + if "on_bad_lines" in self.kwds: + if callable(self.kwds["on_bad_lines"]): + self.parse_options["invalid_row_handler"] = self.kwds["on_bad_lines"] + elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.ERROR: + self.parse_options[ + "invalid_row_handler" + ] = None # PyArrow raises an exception by default + elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.WARN: + + def handle_warning(invalid_row): + warnings.warn( + f"Expected {invalid_row.expected_columns} columns, but found " + f"{invalid_row.actual_columns}: {invalid_row.text}", + ParserWarning, + stacklevel=find_stack_level(), + ) + return "skip" + + self.parse_options["invalid_row_handler"] = handle_warning + elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.SKIP: + 
self.parse_options["invalid_row_handler"] = lambda _: "skip" + self.convert_options = { option_name: option_value for option_name, option_value in self.kwds.items() diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index f01595a684344..f4a095a580f9b 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -391,6 +391,13 @@ expected, a ``ParserWarning`` will be emitted while dropping extra elements. Only supported when ``engine='python'`` + .. versionchanged:: 1.4.1 + + - Callable, function with signature + as described in `pyarrow documentation + <https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html#pyarrow.csv.ParseOptions.invalid_row_handler>`_ when ``engine='pyarrow'`` + delim_whitespace : bool, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option @@ -484,7 +491,6 @@ class _Fwf_Defaults(TypedDict): "thousands", "memory_map", "dialect", - "on_bad_lines", "delim_whitespace", "quoting", "lineterminator", @@ -2053,9 +2059,10 @@ def _refine_defaults_read( elif on_bad_lines == "skip": kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP elif callable(on_bad_lines): - if engine != "python": + if engine not in ["python", "pyarrow"]: raise ValueError( - "on_bad_line can only be a callable function if engine='python'" + "on_bad_line can only be a callable function " + "if engine='python' or 'pyarrow'" ) kwds["on_bad_lines"] = on_bad_lines else: diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index f5a0bcd2c00fd..3fdd5198b639d 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -156,13 +156,17 @@ def test_pyarrow_engine(self): with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="pyarrow", **kwargs) - def test_on_bad_lines_callable_python_only(self, all_parsers): + def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers): # GH 5686 + # GH 54643 sio = 
StringIO("a,b\n1,2") bad_lines_func = lambda x: x parser = all_parsers - if all_parsers.engine != "python": - msg = "on_bad_line can only be a callable function if engine='python'" + if all_parsers.engine not in ["python", "pyarrow"]: + msg = ( + "on_bad_line can only be a callable " + "function if engine='python' or 'pyarrow'" + ) with pytest.raises(ValueError, match=msg): parser.read_csv(sio, on_bad_lines=bad_lines_func) else: From bee735133a5c25860972219b513a8419325d2bea Mon Sep 17 00:00:00 2001 From: amithkk Date: Tue, 22 Aug 2023 09:30:18 +0530 Subject: [PATCH 02/14] Update to appropriate version in docstring --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index f4a095a580f9b..6a508d20d4074 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -391,7 +391,7 @@ expected, a ``ParserWarning`` will be emitted while dropping extra elements. Only supported when ``engine='python'`` - .. versionchanged:: 1.4.1 + .. versionchanged:: 2.2.0 - Callable, function with signature as described in `pyarrow documentation From e7c485cde7df7466570cd5a98f70bcce7ff13595 Mon Sep 17 00:00:00 2001 From: amithkk Date: Wed, 23 Aug 2023 00:12:35 +0530 Subject: [PATCH 03/14] Address review comments --- doc/source/whatsnew/v2.2.0.rst | 31 ++++++++++++++++++++--- pandas/io/parsers/arrow_parser_wrapper.py | 11 ++++---- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d8b63a6d1395d..ddfcfa35a1fc0 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -14,10 +14,35 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_220.enhancements.enhancement1: +.. 
_whatsnew_220.enhancements.pyarrow_on_bad_lines: + +PyArrow engine support for handling misformatted lines in CSV +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Prior to this release, the ability to handle malformed lines in CSV files was +limited to the ``python`` engine with the use of the ``on_bad_lines`` parameter. +This release brings this capability to the `PyArrow `_ engine as well. + +The implementation supports both specifying ``skip`` and ``warn`` values as +well as passing down a callable as defined in the `PyArrow documentation +`_ + +*Example Usage* + +.. ipython:: python + + from io import StringIO + + bad_csv = """a,b,c + acol1,bcol1,ccol1 + acol2,ccol2 + """ + df_arrow = pd.read_csv(StringIO(bad_csv), engine='pyarrow', + dtype_backend='pyarrow', on_bad_lines='skip') + df_arrow + -enhancement1 -^^^^^^^^^^^^ .. _whatsnew_220.enhancements.enhancement2: diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 18961adc4b954..3a8ba09bef7a3 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -90,13 +90,14 @@ def _get_pyarrow_options(self) -> None: } if "on_bad_lines" in self.kwds: - if callable(self.kwds["on_bad_lines"]): - self.parse_options["invalid_row_handler"] = self.kwds["on_bad_lines"] - elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.ERROR: + on_bad_lines = self.kwds["on_bad_lines"] + if callable(on_bad_lines): + self.parse_options["invalid_row_handler"] = on_bad_lines + elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR: self.parse_options[ "invalid_row_handler" ] = None # PyArrow raises an exception by default - elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.WARN: + elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN: def handle_warning(invalid_row): warnings.warn( @@ -108,7 +109,7 @@ def handle_warning(invalid_row): return "skip" self.parse_options["invalid_row_handler"] = 
handle_warning - elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.SKIP: + elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP: self.parse_options["invalid_row_handler"] = lambda _: "skip" self.convert_options = { From bd8969ee40b154a2c51fd40977b996d613704024 Mon Sep 17 00:00:00 2001 From: amithkk Date: Wed, 23 Aug 2023 00:19:21 +0530 Subject: [PATCH 04/14] Refine whatsnew --- doc/source/whatsnew/v2.2.0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ddfcfa35a1fc0..b3ef7d73089f0 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -16,11 +16,11 @@ Enhancements .. _whatsnew_220.enhancements.pyarrow_on_bad_lines: -PyArrow engine support for handling misformatted lines in CSV -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +PyArrow engine support for handling malformed lines in CSV +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Prior to this release, the ability to handle malformed lines in CSV files was limited to the ``python`` engine with the use of the ``on_bad_lines`` parameter. -This release brings this capability to the `PyArrow `_ engine as well. The implementation supports both specifying ``skip`` and ``warn`` values as From e9c158f3a1810f85b4313e7a210dbeeb741dd9ef Mon Sep 17 00:00:00 2001 From: amithkk Date: Wed, 23 Aug 2023 00:23:56 +0530 Subject: [PATCH 05/14] Add "error" value --- doc/source/whatsnew/v2.2.0.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index b3ef7d73089f0..32aaecfb37283 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -16,17 +16,17 @@ Enhancements .. 
_whatsnew_220.enhancements.pyarrow_on_bad_lines: -PyArrow engine support for handling malformed lines in CSV -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +PyArrow engine support for handling malformed lines in CSV files +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Prior to this release, the ability to handle malformed lines in CSV files was limited to the ``python`` engine with the use of the ``on_bad_lines`` parameter. This release brings that capability to the `PyArrow `_ engine as well. -The implementation supports both specifying ``skip`` and ``warn`` values as -well as passing down a callable as defined in the `PyArrow documentation -`_ +The implementation supports both specifying ``skip``, ``error`` and ``warn`` +values as well as passing down a callable as defined in the `PyArrow +documentation `_ *Example Usage* From 4569f78967740112e7195955cfbb506a36d273f7 Mon Sep 17 00:00:00 2001 From: amithkk Date: Wed, 23 Aug 2023 00:51:05 +0530 Subject: [PATCH 06/14] Condense What's New --- doc/source/whatsnew/v2.2.0.rst | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 32aaecfb37283..533a56d28bdbd 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -18,31 +18,13 @@ Enhancements PyArrow engine support for handling malformed lines in CSV files ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Prior to this release, the ability to handle malformed lines in CSV files was -limited to the ``python`` engine with the use of the ``on_bad_lines`` parameter. -This release brings that capability to the `PyArrow `_ engine as well. - -The implementation supports both specifying ``skip``, ``error`` and ``warn`` -values as well as passing down a callable as defined in the `PyArrow -documentation `_ - -*Example Usage* - -.. 
ipython:: python - - from io import StringIO - - bad_csv = """a,b,c - acol1,bcol1,ccol1 - acol2,ccol2 - """ - df_arrow = pd.read_csv(StringIO(bad_csv), engine='pyarrow', - dtype_backend='pyarrow', on_bad_lines='skip') - df_arrow - - +This release brings the capability to handle malformed lines in CSV files to +the the `PyArrow `_ engine +using the ``on_bad_lines`` parameter.The implementation supports both specifying +``skip``, ``error`` and ``warn`` values as well as passing down a callable as +defined in the `PyArrow documentation `_. .. _whatsnew_220.enhancements.enhancement2: From 696d935b7f979d3454dcfda69831291ad5ef2ee6 Mon Sep 17 00:00:00 2001 From: amithkk Date: Wed, 23 Aug 2023 02:25:53 +0530 Subject: [PATCH 07/14] Move to "Other Enhancements" --- doc/source/whatsnew/v2.2.0.rst | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 533a56d28bdbd..a5da66f209f28 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -16,15 +16,6 @@ Enhancements .. _whatsnew_220.enhancements.pyarrow_on_bad_lines: -PyArrow engine support for handling malformed lines in CSV files -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This release brings the capability to handle malformed lines in CSV files to -the the `PyArrow `_ engine -using the ``on_bad_lines`` parameter.The implementation supports both specifying -``skip``, ``error`` and ``warn`` values as well as passing down a callable as -defined in the `PyArrow documentation `_. .. _whatsnew_220.enhancements.enhancement2: @@ -35,8 +26,8 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- -- +- Addition of the capability to handle malformed lines in CSV files to the the `PyArrow `_ engine using the ``on_bad_lines`` parameter. (:issue:`54480`) + .. --------------------------------------------------------------------------- .. 
_whatsnew_220.notable_bug_fixes: From 178468a7fd7542ce834e8e595a821b9f71f8eac0 Mon Sep 17 00:00:00 2001 From: amithkk Date: Thu, 24 Aug 2023 00:44:06 +0530 Subject: [PATCH 08/14] Refactor tests in "test_read_errors" to work with added capabilities --- .../io/parser/common/test_read_errors.py | 99 ++++++++++++++----- 1 file changed, 77 insertions(+), 22 deletions(-) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 492b4d5ec058e..e048707e3ed30 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -1,5 +1,5 @@ """ -Tests that work on both the Python and C engines but do not have a +Tests that work on both the Python, C and PyArrow engines but do not have a specific classification into the other test modules. """ import codecs @@ -9,18 +9,21 @@ from pathlib import Path import numpy as np +from pyarrow import ArrowInvalid import pytest from pandas.compat import PY311 from pandas.errors import ( EmptyDataError, ParserError, + ParserWarning, ) from pandas import DataFrame import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") def test_empty_decimal_marker(all_parsers): @@ -32,10 +35,17 @@ def test_empty_decimal_marker(all_parsers): msg = "Only length-1 decimal markers supported" parser = all_parsers + if parser.engine == "pyarrow": + msg = ( + "only single character unicode strings can be " + "converted to Py_UCS4, got length 0" + ) + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), decimal="") +@skip_pyarrow def test_bad_stream_exception(all_parsers, csv_dir_path): # see gh-13652 # @@ -56,6 +66,7 @@ def test_bad_stream_exception(all_parsers, csv_dir_path): parser.read_csv(stream) +@skip_pyarrow def test_malformed(all_parsers): # see gh-6607 parser = all_parsers @@ 
-70,6 +81,7 @@ def test_malformed(all_parsers): parser.read_csv(StringIO(data), header=1, comment="#") +@skip_pyarrow @pytest.mark.parametrize("nrows", [5, 3, None]) def test_malformed_chunks(all_parsers, nrows): data = """ignore @@ -89,6 +101,7 @@ def test_malformed_chunks(all_parsers, nrows): reader.read(nrows) +@skip_pyarrow def test_catch_too_many_names(all_parsers): # see gh-5156 data = """\ @@ -108,6 +121,7 @@ def test_catch_too_many_names(all_parsers): parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) +@skip_pyarrow @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -135,11 +149,16 @@ def test_suppress_error_output(all_parsers, capsys): data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), on_bad_lines="skip") - tm.assert_frame_equal(result, expected) + if parser.engine == "pyarrow": + with tm.assert_produces_warning(False): + result = parser.read_csv(StringIO(data), on_bad_lines="skip") + tm.assert_frame_equal(result, expected) + else: + result = parser.read_csv(StringIO(data), on_bad_lines="skip") + tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert captured.err == "" + captured = capsys.readouterr() + assert captured.err == "" def test_error_bad_lines(all_parsers): @@ -148,7 +167,13 @@ def test_error_bad_lines(all_parsers): data = "a\n1\n1,2,3\n4\n5,6,7" msg = "Expected 1 fields in line 3, saw 3" - with pytest.raises(ParserError, match=msg): + ex_type = ParserError + + if parser.engine == "pyarrow": + msg = "CSV parse error: Expected 1 columns, got 3: 1,2,3" + ex_type = ArrowInvalid + + with pytest.raises(ex_type, match=msg): parser.read_csv(StringIO(data), on_bad_lines="error") @@ -158,12 +183,21 @@ def test_warn_bad_lines(all_parsers, capsys): data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), on_bad_lines="warn") 
- tm.assert_frame_equal(result, expected) + if parser.engine == "pyarrow": + with tm.assert_produces_warning( + ParserWarning, + check_stacklevel=False, + match="Expected 1 columns, but found 3: 1,2,3", + ): + result = parser.read_csv(StringIO(data), on_bad_lines="warn") + tm.assert_frame_equal(result, expected) + else: + result = parser.read_csv(StringIO(data), on_bad_lines="warn") + tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - assert "Skipping line 5" in captured.err + captured = capsys.readouterr() + assert "Skipping line 3" in captured.err + assert "Skipping line 5" in captured.err def test_read_csv_wrong_num_columns(all_parsers): @@ -175,11 +209,17 @@ def test_read_csv_wrong_num_columns(all_parsers): """ parser = all_parsers msg = "Expected 6 fields in line 3, saw 7" + ex_type = ParserError - with pytest.raises(ParserError, match=msg): + if parser.engine == "pyarrow": + msg = "Expected 6 columns, got 7: 6,7,8,9,10,11,12" + ex_type = ArrowInvalid + + with pytest.raises(ex_type, match=msg): parser.read_csv(StringIO(data)) +@skip_pyarrow def test_null_byte_char(request, all_parsers): # see gh-2741 data = "\x00,foo" @@ -202,6 +242,7 @@ def test_null_byte_char(request, all_parsers): parser.read_csv(StringIO(data), names=names) +@skip_pyarrow @pytest.mark.filterwarnings("always::ResourceWarning") def test_open_file(request, all_parsers): # GH 39024 @@ -235,13 +276,17 @@ def test_bad_header_uniform_error(all_parsers): parser = all_parsers data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n" msg = "Expected 2 fields in line 2, saw 4" + ex_type = ParserError if parser.engine == "c": msg = ( "Could not construct index. Requested to use 1 " "number of columns, but 3 left to parse." 
) + elif parser.engine == "pyarrow": + ex_type = ArrowInvalid + msg = "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4" - with pytest.raises(ParserError, match=msg): + with pytest.raises(ex_type, match=msg): parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error") @@ -256,17 +301,27 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys): """ expected = DataFrame({"1": "a", "2": ["b"] * 2}) - result = parser.read_csv(StringIO(data), on_bad_lines="warn") - tm.assert_frame_equal(result, expected) + # pyarrow engine uses warnings instead of directly printing to stderr + if parser.engine == "pyarrow": + with tm.assert_produces_warning( + ParserWarning, + check_stacklevel=False, + match="Expected 2 columns, but found 3: a,b,c", + ): + result = parser.read_csv(StringIO(data), on_bad_lines="warn") + tm.assert_frame_equal(result, expected) + else: + result = parser.read_csv(StringIO(data), on_bad_lines="warn") + tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - if parser.engine == "c": - warn = """Skipping line 3: expected 2 fields, saw 3 + captured = capsys.readouterr() + if parser.engine == "c": + warn = """Skipping line 3: expected 2 fields, saw 3 Skipping line 4: expected 2 fields, saw 3 """ - else: - warn = """Skipping line 3: Expected 2 fields in line 3, saw 3 + else: + warn = """Skipping line 3: Expected 2 fields in line 3, saw 3 Skipping line 4: Expected 2 fields in line 4, saw 3 """ - assert captured.err == warn + assert captured.err == warn From 3eb1af8d276f5d9a6444814849f03278eb191cb2 Mon Sep 17 00:00:00 2001 From: amithkk Date: Thu, 24 Aug 2023 01:18:06 +0530 Subject: [PATCH 09/14] Conditionally import pyarrow error types --- pandas/tests/io/parser/common/test_read_errors.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index e048707e3ed30..091cbbc49a39a 100644 --- 
a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -9,7 +9,6 @@ from pathlib import Path import numpy as np -from pyarrow import ArrowInvalid import pytest from pandas.compat import PY311 @@ -22,6 +21,12 @@ from pandas import DataFrame import pandas._testing as tm +# PyArrow's error types are not available by default +try: + from pyarrow import ArrowInvalid +except ImportError: + pass + xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") From 23ce4abd866606d997736cc5ab383791d75efa31 Mon Sep 17 00:00:00 2001 From: amithkk Date: Sat, 26 Aug 2023 02:19:55 +0530 Subject: [PATCH 10/14] Revert changes in v2.2.0.rst > enhancements --- doc/source/whatsnew/v2.2.0.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index a5da66f209f28..93147f3b3674e 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -14,8 +14,10 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_220.enhancements.pyarrow_on_bad_lines: +.. _whatsnew_220.enhancements.enhancement1: +enhancement1 +^^^^^^^^^^^^ .. 
_whatsnew_220.enhancements.enhancement2: From 6cc4a7c84033fef4de1b178c54c5d6f620b1b020 Mon Sep 17 00:00:00 2001 From: amithkk Date: Sat, 26 Aug 2023 02:35:07 +0530 Subject: [PATCH 11/14] Address review comments --- doc/source/whatsnew/v2.2.0.rst | 2 +- .../tests/io/parser/common/test_read_errors.py | 17 +++++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 93147f3b3674e..c8437f43a4aaa 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -28,7 +28,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- Addition of the capability to handle malformed lines in CSV files to the the `PyArrow `_ engine using the ``on_bad_lines`` parameter. (:issue:`54480`) +- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) .. --------------------------------------------------------------------------- diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 091cbbc49a39a..24c8f50d475e8 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -1,5 +1,5 @@ """ -Tests that work on both the Python, C and PyArrow engines but do not have a +Tests that work on the Python, C and PyArrow engines but do not have a specific classification into the other test modules. 
""" import codecs @@ -21,12 +21,6 @@ from pandas import DataFrame import pandas._testing as tm -# PyArrow's error types are not available by default -try: - from pyarrow import ArrowInvalid -except ImportError: - pass - xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @@ -175,8 +169,9 @@ def test_error_bad_lines(all_parsers): ex_type = ParserError if parser.engine == "pyarrow": + pa = pytest.importorskip("pyarrow") + ex_type = pa.ArrowInvalid msg = "CSV parse error: Expected 1 columns, got 3: 1,2,3" - ex_type = ArrowInvalid with pytest.raises(ex_type, match=msg): parser.read_csv(StringIO(data), on_bad_lines="error") @@ -217,8 +212,9 @@ def test_read_csv_wrong_num_columns(all_parsers): ex_type = ParserError if parser.engine == "pyarrow": + pa = pytest.importorskip("pyarrow") + ex_type = pa.ArrowInvalid msg = "Expected 6 columns, got 7: 6,7,8,9,10,11,12" - ex_type = ArrowInvalid with pytest.raises(ex_type, match=msg): parser.read_csv(StringIO(data)) @@ -288,7 +284,8 @@ def test_bad_header_uniform_error(all_parsers): "number of columns, but 3 left to parse." 
) elif parser.engine == "pyarrow": - ex_type = ArrowInvalid + pa = pytest.importorskip("pyarrow") + ex_type = pa.ArrowInvalid msg = "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4" with pytest.raises(ex_type, match=msg): From 61dcaedcd3aa749b41caeb3d95abbebb84a62cb6 Mon Sep 17 00:00:00 2001 From: amithkk Date: Fri, 1 Sep 2023 23:38:41 +0530 Subject: [PATCH 12/14] Address review comments --- pandas/io/parsers/arrow_parser_wrapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 2cf60cae07389..4fdd84438820d 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -89,8 +89,8 @@ def _get_pyarrow_options(self) -> None: in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") } - if "on_bad_lines" in self.kwds: - on_bad_lines = self.kwds["on_bad_lines"] + on_bad_lines = self.kwds.get("on_bad_lines") + if on_bad_lines is not None: if callable(on_bad_lines): self.parse_options["invalid_row_handler"] = on_bad_lines elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR: From 1da74c7eb458ad7ecc232b691b3c0c2f2bb20746 Mon Sep 17 00:00:00 2001 From: amithkk Date: Sat, 9 Sep 2023 03:15:20 +0530 Subject: [PATCH 13/14] Wrap ArrowInvalid with ParserError --- pandas/io/parsers/arrow_parser_wrapper.py | 22 +++++++++++++------ .../io/parser/common/test_read_errors.py | 15 +++---------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 4fdd84438820d..6a07052dd0742 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,11 +3,16 @@ from typing import TYPE_CHECKING import warnings +from pyarrow import ArrowInvalid + from pandas._config import using_pyarrow_string_dtype from pandas._libs import lib from pandas.compat._optional import 
import_optional_dependency -from pandas.errors import ParserWarning +from pandas.errors import ( + ParserError, + ParserWarning, +) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.inference import is_integer @@ -217,12 +222,15 @@ def read(self) -> DataFrame: pyarrow_csv = import_optional_dependency("pyarrow.csv") self._get_pyarrow_options() - table = pyarrow_csv.read_csv( - self.src, - read_options=pyarrow_csv.ReadOptions(**self.read_options), - parse_options=pyarrow_csv.ParseOptions(**self.parse_options), - convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), - ) + try: + table = pyarrow_csv.read_csv( + self.src, + read_options=pyarrow_csv.ReadOptions(**self.read_options), + parse_options=pyarrow_csv.ParseOptions(**self.parse_options), + convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), + ) + except ArrowInvalid as e: + raise ParserError(e) from e dtype_backend = self.kwds["dtype_backend"] diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 24c8f50d475e8..6f9b6fcffbe9d 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -166,14 +166,11 @@ def test_error_bad_lines(all_parsers): data = "a\n1\n1,2,3\n4\n5,6,7" msg = "Expected 1 fields in line 3, saw 3" - ex_type = ParserError if parser.engine == "pyarrow": - pa = pytest.importorskip("pyarrow") - ex_type = pa.ArrowInvalid msg = "CSV parse error: Expected 1 columns, got 3: 1,2,3" - with pytest.raises(ex_type, match=msg): + with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), on_bad_lines="error") @@ -209,14 +206,11 @@ def test_read_csv_wrong_num_columns(all_parsers): """ parser = all_parsers msg = "Expected 6 fields in line 3, saw 7" - ex_type = ParserError if parser.engine == "pyarrow": - pa = pytest.importorskip("pyarrow") - ex_type = pa.ArrowInvalid msg = "Expected 6 columns, got 7: 
6,7,8,9,10,11,12" - with pytest.raises(ex_type, match=msg): + with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data)) @@ -277,18 +271,15 @@ def test_bad_header_uniform_error(all_parsers): parser = all_parsers data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n" msg = "Expected 2 fields in line 2, saw 4" - ex_type = ParserError if parser.engine == "c": msg = ( "Could not construct index. Requested to use 1 " "number of columns, but 3 left to parse." ) elif parser.engine == "pyarrow": - pa = pytest.importorskip("pyarrow") - ex_type = pa.ArrowInvalid msg = "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4" - with pytest.raises(ex_type, match=msg): + with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error") From e847e8cc215079df6710ea5508962f45dd28af55 Mon Sep 17 00:00:00 2001 From: amithkk Date: Sun, 10 Sep 2023 02:28:46 +0530 Subject: [PATCH 14/14] Change ArrowInvalid to optional import --- pandas/io/parsers/arrow_parser_wrapper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 6a07052dd0742..765a4ffcd2cb9 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,8 +3,6 @@ from typing import TYPE_CHECKING import warnings -from pyarrow import ArrowInvalid - from pandas._config import using_pyarrow_string_dtype from pandas._libs import lib @@ -229,7 +227,7 @@ def read(self) -> DataFrame: parse_options=pyarrow_csv.ParseOptions(**self.parse_options), convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), ) - except ArrowInvalid as e: + except pa.ArrowInvalid as e: raise ParserError(e) from e dtype_backend = self.kwds["dtype_backend"]