Skip to content

Commit e8b55bc

Browse files
committed
ENH: Add on_bad_lines for pyarrow (SQUASHED)
1 parent cc76b52 commit e8b55bc

File tree

3 files changed

+43
-6
lines changed

3 files changed

+43
-6
lines changed

pandas/io/parsers/arrow_parser_wrapper.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
from __future__ import annotations
22

33
from typing import TYPE_CHECKING
4+
import warnings
45

56
from pandas._config import using_pyarrow_string_dtype
67

78
from pandas._libs import lib
89
from pandas.compat._optional import import_optional_dependency
10+
from pandas.errors import ParserWarning
11+
from pandas.util._exceptions import find_stack_level
912

1013
from pandas.core.dtypes.inference import is_integer
1114

@@ -85,6 +88,29 @@ def _get_pyarrow_options(self) -> None:
8588
and option_name
8689
in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines")
8790
}
91+
92+
if "on_bad_lines" in self.kwds:
93+
if callable(self.kwds["on_bad_lines"]):
94+
self.parse_options["invalid_row_handler"] = self.kwds["on_bad_lines"]
95+
elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.ERROR:
96+
self.parse_options[
97+
"invalid_row_handler"
98+
] = None # PyArrow raises an exception by default
99+
elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.WARN:
100+
101+
def handle_warning(invalid_row):
102+
warnings.warn(
103+
f"Expected {invalid_row.expected_columns} columns, but found "
104+
f"{invalid_row.actual_columns}: {invalid_row.text}",
105+
ParserWarning,
106+
stacklevel=find_stack_level(),
107+
)
108+
return "skip"
109+
110+
self.parse_options["invalid_row_handler"] = handle_warning
111+
elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.SKIP:
112+
self.parse_options["invalid_row_handler"] = lambda _: "skip"
113+
88114
self.convert_options = {
89115
option_name: option_value
90116
for option_name, option_value in self.kwds.items()

pandas/io/parsers/readers.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,13 @@
391391
expected, a ``ParserWarning`` will be emitted while dropping extra elements.
392392
Only supported when ``engine='python'``
393393
394+
.. versionchanged:: 1.4.1
395+
396+
- Callable, function with signature
397+
as described in `pyarrow documentation
398+
<https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
399+
#pyarrow.csv.ParseOptions.invalid_row_handler>`_ when ``engine='pyarrow'``
400+
394401
delim_whitespace : bool, default False
395402
Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be
396403
used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option
@@ -484,7 +491,6 @@ class _Fwf_Defaults(TypedDict):
484491
"thousands",
485492
"memory_map",
486493
"dialect",
487-
"on_bad_lines",
488494
"delim_whitespace",
489495
"quoting",
490496
"lineterminator",
@@ -2053,9 +2059,10 @@ def _refine_defaults_read(
20532059
elif on_bad_lines == "skip":
20542060
kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
20552061
elif callable(on_bad_lines):
2056-
if engine != "python":
2062+
if engine not in ["python", "pyarrow"]:
20572063
raise ValueError(
2058-
"on_bad_line can only be a callable function if engine='python'"
2064+
"on_bad_line can only be a callable function "
2065+
"if engine='python' or 'pyarrow'"
20592066
)
20602067
kwds["on_bad_lines"] = on_bad_lines
20612068
else:

pandas/tests/io/parser/test_unsupported.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,13 +156,17 @@ def test_pyarrow_engine(self):
156156
with pytest.raises(ValueError, match=msg):
157157
read_csv(StringIO(data), engine="pyarrow", **kwargs)
158158

159-
def test_on_bad_lines_callable_python_only(self, all_parsers):
159+
def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
160160
# GH 5686
161+
# GH 54643
161162
sio = StringIO("a,b\n1,2")
162163
bad_lines_func = lambda x: x
163164
parser = all_parsers
164-
if all_parsers.engine != "python":
165-
msg = "on_bad_line can only be a callable function if engine='python'"
165+
if all_parsers.engine not in ["python", "pyarrow"]:
166+
msg = (
167+
"on_bad_line can only be a callable "
168+
"function if engine='python' or 'pyarrow'"
169+
)
166170
with pytest.raises(ValueError, match=msg):
167171
parser.read_csv(sio, on_bad_lines=bad_lines_func)
168172
else:

0 commit comments

Comments
 (0)