From e8b55bcf0d8b01986fd95440af1cc0ba3fdaecac Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Sat, 19 Aug 2023 23:53:55 +0530
Subject: [PATCH 01/14] ENH: Add on_bad_lines for pyarrow (SQUASHED)

---
 pandas/io/parsers/arrow_parser_wrapper.py  | 26 ++++++++++++++++++++++
 pandas/io/parsers/readers.py               | 13 ++++++++---
 pandas/tests/io/parser/test_unsupported.py | 10 ++++++---
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index 71bfb00a95b50..18961adc4b954 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -1,11 +1,14 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
+import warnings
 
 from pandas._config import using_pyarrow_string_dtype
 
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
+from pandas.errors import ParserWarning
+from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.inference import is_integer
 
@@ -85,6 +88,29 @@ def _get_pyarrow_options(self) -> None:
             and option_name
             in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines")
         }
+
+        if "on_bad_lines" in self.kwds:
+            if callable(self.kwds["on_bad_lines"]):
+                self.parse_options["invalid_row_handler"] = self.kwds["on_bad_lines"]
+            elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.ERROR:
+                self.parse_options[
+                    "invalid_row_handler"
+                ] = None  # PyArrow raises an exception by default
+            elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.WARN:
+
+                def handle_warning(invalid_row):
+                    warnings.warn(
+                        f"Expected {invalid_row.expected_columns} columns, but found "
+                        f"{invalid_row.actual_columns}: {invalid_row.text}",
+                        ParserWarning,
+                        stacklevel=find_stack_level(),
+                    )
+                    return "skip"
+
+                self.parse_options["invalid_row_handler"] = handle_warning
+            elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.SKIP:
+                self.parse_options["invalid_row_handler"] = lambda _: "skip"
+
         self.convert_options = {
             option_name: option_value
             for option_name, option_value in self.kwds.items()
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index f01595a684344..f4a095a580f9b 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -391,6 +391,13 @@
           expected, a ``ParserWarning`` will be emitted while dropping extra elements.
           Only supported when ``engine='python'``
 
+    .. versionchanged:: 1.4.1
+
+        - Callable, function with signature
+          as described in `pyarrow documentation
+          <https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
+          #pyarrow.csv.ParseOptions.invalid_row_handler>`_ when ``engine='pyarrow'``
+
 delim_whitespace : bool, default False
     Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be
     used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option
@@ -484,7 +491,6 @@ class _Fwf_Defaults(TypedDict):
     "thousands",
     "memory_map",
     "dialect",
-    "on_bad_lines",
     "delim_whitespace",
     "quoting",
     "lineterminator",
@@ -2053,9 +2059,10 @@ def _refine_defaults_read(
     elif on_bad_lines == "skip":
         kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
     elif callable(on_bad_lines):
-        if engine != "python":
+        if engine not in ["python", "pyarrow"]:
             raise ValueError(
-                "on_bad_line can only be a callable function if engine='python'"
+                "on_bad_line can only be a callable function "
+                "if engine='python' or 'pyarrow'"
             )
         kwds["on_bad_lines"] = on_bad_lines
     else:
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
index f5a0bcd2c00fd..3fdd5198b639d 100644
--- a/pandas/tests/io/parser/test_unsupported.py
+++ b/pandas/tests/io/parser/test_unsupported.py
@@ -156,13 +156,17 @@ def test_pyarrow_engine(self):
             with pytest.raises(ValueError, match=msg):
                 read_csv(StringIO(data), engine="pyarrow", **kwargs)
 
-    def test_on_bad_lines_callable_python_only(self, all_parsers):
+    def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
         # GH 5686
+        # GH 54643
         sio = StringIO("a,b\n1,2")
         bad_lines_func = lambda x: x
         parser = all_parsers
-        if all_parsers.engine != "python":
-            msg = "on_bad_line can only be a callable function if engine='python'"
+        if all_parsers.engine not in ["python", "pyarrow"]:
+            msg = (
+                "on_bad_line can only be a callable "
+                "function if engine='python' or 'pyarrow'"
+            )
             with pytest.raises(ValueError, match=msg):
                 parser.read_csv(sio, on_bad_lines=bad_lines_func)
         else:

From bee735133a5c25860972219b513a8419325d2bea Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Tue, 22 Aug 2023 09:30:18 +0530
Subject: [PATCH 02/14] Update to appropriate version in docstring

---
 pandas/io/parsers/readers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index f4a095a580f9b..6a508d20d4074 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -391,7 +391,7 @@
           expected, a ``ParserWarning`` will be emitted while dropping extra elements.
           Only supported when ``engine='python'``
 
-    .. versionchanged:: 1.4.1
+    .. versionchanged:: 2.2.0
 
         - Callable, function with signature
           as described in `pyarrow documentation

From e7c485cde7df7466570cd5a98f70bcce7ff13595 Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Wed, 23 Aug 2023 00:12:35 +0530
Subject: [PATCH 03/14] Address review comments

---
 doc/source/whatsnew/v2.2.0.rst            | 31 ++++++++++++++++++++---
 pandas/io/parsers/arrow_parser_wrapper.py | 11 ++++----
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index d8b63a6d1395d..ddfcfa35a1fc0 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -14,10 +14,35 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
-.. _whatsnew_220.enhancements.enhancement1:
+.. _whatsnew_220.enhancements.pyarrow_on_bad_lines:
+
+PyArrow engine support for handling misformatted lines in CSV
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Prior to this release, the ability to handle malformed lines in CSV files was
+limited to the ``python`` engine with the use of the ``on_bad_lines`` parameter.
+This release brings this capability to the `PyArrow <https://arrow.apache
+.org/docs/python/index.html>`_ engine as well.
+
+The implementation supports both specifying ``skip`` and ``warn`` values as
+well as passing down a callable as defined in the `PyArrow documentation
+<https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
+#pyarrow.csv.ParseOptions.invalid_row_handler>`_
+
+*Example Usage*
+
+.. ipython:: python
+
+    from io import StringIO
+
+    bad_csv = """a,b,c
+    acol1,bcol1,ccol1
+    acol2,ccol2
+    """
+    df_arrow = pd.read_csv(StringIO(bad_csv), engine='pyarrow',
+                           dtype_backend='pyarrow', on_bad_lines='skip')
+    df_arrow
+
 
-enhancement1
-^^^^^^^^^^^^
 
 .. _whatsnew_220.enhancements.enhancement2:
 
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index 18961adc4b954..3a8ba09bef7a3 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -90,13 +90,14 @@ def _get_pyarrow_options(self) -> None:
         }
 
         if "on_bad_lines" in self.kwds:
-            if callable(self.kwds["on_bad_lines"]):
-                self.parse_options["invalid_row_handler"] = self.kwds["on_bad_lines"]
-            elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.ERROR:
+            on_bad_lines = self.kwds["on_bad_lines"]
+            if callable(on_bad_lines):
+                self.parse_options["invalid_row_handler"] = on_bad_lines
+            elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR:
                 self.parse_options[
                     "invalid_row_handler"
                 ] = None  # PyArrow raises an exception by default
-            elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.WARN:
+            elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN:
 
                 def handle_warning(invalid_row):
                     warnings.warn(
@@ -108,7 +109,7 @@ def handle_warning(invalid_row):
                     return "skip"
 
                 self.parse_options["invalid_row_handler"] = handle_warning
-            elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.SKIP:
+            elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP:
                 self.parse_options["invalid_row_handler"] = lambda _: "skip"
 
         self.convert_options = {

From bd8969ee40b154a2c51fd40977b996d613704024 Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Wed, 23 Aug 2023 00:19:21 +0530
Subject: [PATCH 04/14] Refine whatsnew

---
 doc/source/whatsnew/v2.2.0.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index ddfcfa35a1fc0..b3ef7d73089f0 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -16,11 +16,11 @@ Enhancements
 
 .. _whatsnew_220.enhancements.pyarrow_on_bad_lines:
 
-PyArrow engine support for handling misformatted lines in CSV
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+PyArrow engine support for handling malformed lines in CSV
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 Prior to this release, the ability to handle malformed lines in CSV files was
 limited to the ``python`` engine with the use of the ``on_bad_lines`` parameter.
-This release brings this capability to the `PyArrow <https://arrow.apache
+This release brings that capability to the `PyArrow <https://arrow.apache
 .org/docs/python/index.html>`_ engine as well.
 
 The implementation supports both specifying ``skip`` and ``warn`` values as

From e9c158f3a1810f85b4313e7a210dbeeb741dd9ef Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Wed, 23 Aug 2023 00:23:56 +0530
Subject: [PATCH 05/14] Add "error" value

---
 doc/source/whatsnew/v2.2.0.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index b3ef7d73089f0..32aaecfb37283 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -16,17 +16,17 @@ Enhancements
 
 .. _whatsnew_220.enhancements.pyarrow_on_bad_lines:
 
-PyArrow engine support for handling malformed lines in CSV
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+PyArrow engine support for handling malformed lines in CSV files
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 Prior to this release, the ability to handle malformed lines in CSV files was
 limited to the ``python`` engine with the use of the ``on_bad_lines`` parameter.
 This release brings that capability to the `PyArrow <https://arrow.apache
 .org/docs/python/index.html>`_ engine as well.
 
-The implementation supports both specifying ``skip`` and ``warn`` values as
-well as passing down a callable as defined in the `PyArrow documentation
-<https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
-#pyarrow.csv.ParseOptions.invalid_row_handler>`_
+The implementation supports both specifying ``skip``, ``error`` and ``warn``
+values as well as passing down a callable as defined in the `PyArrow
+documentation <https://arrow.apache.org/docs/python/generated/pyarrow.csv.
+ParseOptions.html#pyarrow.csv.ParseOptions.invalid_row_handler>`_
 
 *Example Usage*
 

From 4569f78967740112e7195955cfbb506a36d273f7 Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Wed, 23 Aug 2023 00:51:05 +0530
Subject: [PATCH 06/14] Condense What's New

---
 doc/source/whatsnew/v2.2.0.rst | 32 +++++++-------------------------
 1 file changed, 7 insertions(+), 25 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 32aaecfb37283..533a56d28bdbd 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -18,31 +18,13 @@ Enhancements
 
 PyArrow engine support for handling malformed lines in CSV files
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Prior to this release, the ability to handle malformed lines in CSV files was
-limited to the ``python`` engine with the use of the ``on_bad_lines`` parameter.
-This release brings that capability to the `PyArrow <https://arrow.apache
-.org/docs/python/index.html>`_ engine as well.
-
-The implementation supports both specifying ``skip``, ``error`` and ``warn``
-values as well as passing down a callable as defined in the `PyArrow
-documentation <https://arrow.apache.org/docs/python/generated/pyarrow.csv.
-ParseOptions.html#pyarrow.csv.ParseOptions.invalid_row_handler>`_
-
-*Example Usage*
-
-.. ipython:: python
-
-    from io import StringIO
-
-    bad_csv = """a,b,c
-    acol1,bcol1,ccol1
-    acol2,ccol2
-    """
-    df_arrow = pd.read_csv(StringIO(bad_csv), engine='pyarrow',
-                           dtype_backend='pyarrow', on_bad_lines='skip')
-    df_arrow
-
-
+This release brings the capability to handle malformed lines in CSV files to
+the the `PyArrow <https://arrow.apache.org/docs/python/index.html>`_ engine
+using the ``on_bad_lines`` parameter.The implementation supports both specifying
+``skip``, ``error`` and ``warn`` values as well as passing down a callable as
+defined in the `PyArrow documentation <https://arrow.apache
+.org/docs/python/generated/pyarrow.csv.
+ParseOptions.html#pyarrow.csv.ParseOptions.invalid_row_handler>`_.
 
 .. _whatsnew_220.enhancements.enhancement2:
 

From 696d935b7f979d3454dcfda69831291ad5ef2ee6 Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Wed, 23 Aug 2023 02:25:53 +0530
Subject: [PATCH 07/14] Move to "Other Enhancements"

---
 doc/source/whatsnew/v2.2.0.rst | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 533a56d28bdbd..a5da66f209f28 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -16,15 +16,6 @@ Enhancements
 
 .. _whatsnew_220.enhancements.pyarrow_on_bad_lines:
 
-PyArrow engine support for handling malformed lines in CSV files
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-This release brings the capability to handle malformed lines in CSV files to
-the the `PyArrow <https://arrow.apache.org/docs/python/index.html>`_ engine
-using the ``on_bad_lines`` parameter.The implementation supports both specifying
-``skip``, ``error`` and ``warn`` values as well as passing down a callable as
-defined in the `PyArrow documentation <https://arrow.apache
-.org/docs/python/generated/pyarrow.csv.
-ParseOptions.html#pyarrow.csv.ParseOptions.invalid_row_handler>`_.
 
 .. _whatsnew_220.enhancements.enhancement2:
 
@@ -35,8 +26,8 @@ enhancement2
 
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
--
--
+- Addition of the capability to handle malformed lines in CSV files to the the `PyArrow <https://arrow.apache.org/docs/python/index.html>`_ engine using the ``on_bad_lines`` parameter. (:issue:`54480`)
+
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_220.notable_bug_fixes:

From 178468a7fd7542ce834e8e595a821b9f71f8eac0 Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Thu, 24 Aug 2023 00:44:06 +0530
Subject: [PATCH 08/14] Refactor tests in "test_read_errors" to work with added
 capabilities

---
 .../io/parser/common/test_read_errors.py      | 99 ++++++++++++++-----
 1 file changed, 77 insertions(+), 22 deletions(-)

diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index 492b4d5ec058e..e048707e3ed30 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -1,5 +1,5 @@
 """
-Tests that work on both the Python and C engines but do not have a
+Tests that work on both the Python, C and PyArrow engines but do not have a
 specific classification into the other test modules.
 """
 import codecs
@@ -9,18 +9,21 @@
 from pathlib import Path
 
 import numpy as np
+from pyarrow import ArrowInvalid
 import pytest
 
 from pandas.compat import PY311
 from pandas.errors import (
     EmptyDataError,
     ParserError,
+    ParserWarning,
 )
 
 from pandas import DataFrame
 import pandas._testing as tm
 
-pytestmark = pytest.mark.usefixtures("pyarrow_skip")
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
 
 
 def test_empty_decimal_marker(all_parsers):
@@ -32,10 +35,17 @@ def test_empty_decimal_marker(all_parsers):
     msg = "Only length-1 decimal markers supported"
     parser = all_parsers
 
+    if parser.engine == "pyarrow":
+        msg = (
+            "only single character unicode strings can be "
+            "converted to Py_UCS4, got length 0"
+        )
+
     with pytest.raises(ValueError, match=msg):
         parser.read_csv(StringIO(data), decimal="")
 
 
+@skip_pyarrow
 def test_bad_stream_exception(all_parsers, csv_dir_path):
     # see gh-13652
     #
@@ -56,6 +66,7 @@ def test_bad_stream_exception(all_parsers, csv_dir_path):
             parser.read_csv(stream)
 
 
+@skip_pyarrow
 def test_malformed(all_parsers):
     # see gh-6607
     parser = all_parsers
@@ -70,6 +81,7 @@ def test_malformed(all_parsers):
         parser.read_csv(StringIO(data), header=1, comment="#")
 
 
+@skip_pyarrow
 @pytest.mark.parametrize("nrows", [5, 3, None])
 def test_malformed_chunks(all_parsers, nrows):
     data = """ignore
@@ -89,6 +101,7 @@ def test_malformed_chunks(all_parsers, nrows):
             reader.read(nrows)
 
 
+@skip_pyarrow
 def test_catch_too_many_names(all_parsers):
     # see gh-5156
     data = """\
@@ -108,6 +121,7 @@ def test_catch_too_many_names(all_parsers):
         parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])
 
 
+@skip_pyarrow
 @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5])
 def test_raise_on_no_columns(all_parsers, nrows):
     parser = all_parsers
@@ -135,11 +149,16 @@ def test_suppress_error_output(all_parsers, capsys):
     data = "a\n1\n1,2,3\n4\n5,6,7"
     expected = DataFrame({"a": [1, 4]})
 
-    result = parser.read_csv(StringIO(data), on_bad_lines="skip")
-    tm.assert_frame_equal(result, expected)
+    if parser.engine == "pyarrow":
+        with tm.assert_produces_warning(False):
+            result = parser.read_csv(StringIO(data), on_bad_lines="skip")
+        tm.assert_frame_equal(result, expected)
+    else:
+        result = parser.read_csv(StringIO(data), on_bad_lines="skip")
+        tm.assert_frame_equal(result, expected)
 
-    captured = capsys.readouterr()
-    assert captured.err == ""
+        captured = capsys.readouterr()
+        assert captured.err == ""
 
 
 def test_error_bad_lines(all_parsers):
@@ -148,7 +167,13 @@ def test_error_bad_lines(all_parsers):
     data = "a\n1\n1,2,3\n4\n5,6,7"
 
     msg = "Expected 1 fields in line 3, saw 3"
-    with pytest.raises(ParserError, match=msg):
+    ex_type = ParserError
+
+    if parser.engine == "pyarrow":
+        msg = "CSV parse error: Expected 1 columns, got 3: 1,2,3"
+        ex_type = ArrowInvalid
+
+    with pytest.raises(ex_type, match=msg):
         parser.read_csv(StringIO(data), on_bad_lines="error")
 
 
@@ -158,12 +183,21 @@ def test_warn_bad_lines(all_parsers, capsys):
     data = "a\n1\n1,2,3\n4\n5,6,7"
     expected = DataFrame({"a": [1, 4]})
 
-    result = parser.read_csv(StringIO(data), on_bad_lines="warn")
-    tm.assert_frame_equal(result, expected)
+    if parser.engine == "pyarrow":
+        with tm.assert_produces_warning(
+            ParserWarning,
+            check_stacklevel=False,
+            match="Expected 1 columns, but found 3: 1,2,3",
+        ):
+            result = parser.read_csv(StringIO(data), on_bad_lines="warn")
+        tm.assert_frame_equal(result, expected)
+    else:
+        result = parser.read_csv(StringIO(data), on_bad_lines="warn")
+        tm.assert_frame_equal(result, expected)
 
-    captured = capsys.readouterr()
-    assert "Skipping line 3" in captured.err
-    assert "Skipping line 5" in captured.err
+        captured = capsys.readouterr()
+        assert "Skipping line 3" in captured.err
+        assert "Skipping line 5" in captured.err
 
 
 def test_read_csv_wrong_num_columns(all_parsers):
@@ -175,11 +209,17 @@ def test_read_csv_wrong_num_columns(all_parsers):
 """
     parser = all_parsers
     msg = "Expected 6 fields in line 3, saw 7"
+    ex_type = ParserError
 
-    with pytest.raises(ParserError, match=msg):
+    if parser.engine == "pyarrow":
+        msg = "Expected 6 columns, got 7: 6,7,8,9,10,11,12"
+        ex_type = ArrowInvalid
+
+    with pytest.raises(ex_type, match=msg):
         parser.read_csv(StringIO(data))
 
 
+@skip_pyarrow
 def test_null_byte_char(request, all_parsers):
     # see gh-2741
     data = "\x00,foo"
@@ -202,6 +242,7 @@ def test_null_byte_char(request, all_parsers):
             parser.read_csv(StringIO(data), names=names)
 
 
+@skip_pyarrow
 @pytest.mark.filterwarnings("always::ResourceWarning")
 def test_open_file(request, all_parsers):
     # GH 39024
@@ -235,13 +276,17 @@ def test_bad_header_uniform_error(all_parsers):
     parser = all_parsers
     data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n"
     msg = "Expected 2 fields in line 2, saw 4"
+    ex_type = ParserError
     if parser.engine == "c":
         msg = (
             "Could not construct index. Requested to use 1 "
             "number of columns, but 3 left to parse."
         )
+    elif parser.engine == "pyarrow":
+        ex_type = ArrowInvalid
+        msg = "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4"
 
-    with pytest.raises(ParserError, match=msg):
+    with pytest.raises(ex_type, match=msg):
         parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")
 
 
@@ -256,17 +301,27 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys):
 """
     expected = DataFrame({"1": "a", "2": ["b"] * 2})
 
-    result = parser.read_csv(StringIO(data), on_bad_lines="warn")
-    tm.assert_frame_equal(result, expected)
+    # pyarrow engine uses warnings instead of directly printing to stderr
+    if parser.engine == "pyarrow":
+        with tm.assert_produces_warning(
+            ParserWarning,
+            check_stacklevel=False,
+            match="Expected 2 columns, but found 3: a,b,c",
+        ):
+            result = parser.read_csv(StringIO(data), on_bad_lines="warn")
+        tm.assert_frame_equal(result, expected)
+    else:
+        result = parser.read_csv(StringIO(data), on_bad_lines="warn")
+        tm.assert_frame_equal(result, expected)
 
-    captured = capsys.readouterr()
-    if parser.engine == "c":
-        warn = """Skipping line 3: expected 2 fields, saw 3
+        captured = capsys.readouterr()
+        if parser.engine == "c":
+            warn = """Skipping line 3: expected 2 fields, saw 3
 Skipping line 4: expected 2 fields, saw 3
 
 """
-    else:
-        warn = """Skipping line 3: Expected 2 fields in line 3, saw 3
+        else:
+            warn = """Skipping line 3: Expected 2 fields in line 3, saw 3
 Skipping line 4: Expected 2 fields in line 4, saw 3
 """
-    assert captured.err == warn
+        assert captured.err == warn

From 3eb1af8d276f5d9a6444814849f03278eb191cb2 Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Thu, 24 Aug 2023 01:18:06 +0530
Subject: [PATCH 09/14] Conditionally import pyarrow error types

---
 pandas/tests/io/parser/common/test_read_errors.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index e048707e3ed30..091cbbc49a39a 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -9,7 +9,6 @@
 from pathlib import Path
 
 import numpy as np
-from pyarrow import ArrowInvalid
 import pytest
 
 from pandas.compat import PY311
@@ -22,6 +21,12 @@
 from pandas import DataFrame
 import pandas._testing as tm
 
+# PyArrow's error types are not available by default
+try:
+    from pyarrow import ArrowInvalid
+except ImportError:
+    pass
+
 xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
 skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
 

From 23ce4abd866606d997736cc5ab383791d75efa31 Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Sat, 26 Aug 2023 02:19:55 +0530
Subject: [PATCH 10/14] Revert changes in v2.2.0.rst > enhancements

---
 doc/source/whatsnew/v2.2.0.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index a5da66f209f28..93147f3b3674e 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -14,8 +14,10 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
-.. _whatsnew_220.enhancements.pyarrow_on_bad_lines:
+.. _whatsnew_220.enhancements.enhancement1:
 
+enhancement1
+^^^^^^^^^^^^
 
 .. _whatsnew_220.enhancements.enhancement2:
 

From 6cc4a7c84033fef4de1b178c54c5d6f620b1b020 Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Sat, 26 Aug 2023 02:35:07 +0530
Subject: [PATCH 11/14] Address review comments

---
 doc/source/whatsnew/v2.2.0.rst                  |  2 +-
 .../tests/io/parser/common/test_read_errors.py  | 17 +++++++----------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 93147f3b3674e..c8437f43a4aaa 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -28,7 +28,7 @@ enhancement2
 
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
-- Addition of the capability to handle malformed lines in CSV files to the the `PyArrow <https://arrow.apache.org/docs/python/index.html>`_ engine using the ``on_bad_lines`` parameter. (:issue:`54480`)
+- :func:`read_csv` now supports the ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
 
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index 091cbbc49a39a..24c8f50d475e8 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -1,5 +1,5 @@
 """
-Tests that work on both the Python, C and PyArrow engines but do not have a
+Tests that work on the Python, C and PyArrow engines but do not have a
 specific classification into the other test modules.
 """
 import codecs
@@ -21,12 +21,6 @@
 from pandas import DataFrame
 import pandas._testing as tm
 
-# PyArrow's error types are not available by default
-try:
-    from pyarrow import ArrowInvalid
-except ImportError:
-    pass
-
 xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
 skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
 
@@ -175,8 +169,9 @@ def test_error_bad_lines(all_parsers):
     ex_type = ParserError
 
     if parser.engine == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+        ex_type = pa.ArrowInvalid
         msg = "CSV parse error: Expected 1 columns, got 3: 1,2,3"
-        ex_type = ArrowInvalid
 
     with pytest.raises(ex_type, match=msg):
         parser.read_csv(StringIO(data), on_bad_lines="error")
@@ -217,8 +212,9 @@ def test_read_csv_wrong_num_columns(all_parsers):
     ex_type = ParserError
 
     if parser.engine == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+        ex_type = pa.ArrowInvalid
         msg = "Expected 6 columns, got 7: 6,7,8,9,10,11,12"
-        ex_type = ArrowInvalid
 
     with pytest.raises(ex_type, match=msg):
         parser.read_csv(StringIO(data))
@@ -288,7 +284,8 @@ def test_bad_header_uniform_error(all_parsers):
             "number of columns, but 3 left to parse."
         )
     elif parser.engine == "pyarrow":
-        ex_type = ArrowInvalid
+        pa = pytest.importorskip("pyarrow")
+        ex_type = pa.ArrowInvalid
         msg = "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4"
 
     with pytest.raises(ex_type, match=msg):

From 61dcaedcd3aa749b41caeb3d95abbebb84a62cb6 Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Fri, 1 Sep 2023 23:38:41 +0530
Subject: [PATCH 12/14] Address review comments

---
 pandas/io/parsers/arrow_parser_wrapper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index 2cf60cae07389..4fdd84438820d 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -89,8 +89,8 @@ def _get_pyarrow_options(self) -> None:
             in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines")
         }
 
-        if "on_bad_lines" in self.kwds:
-            on_bad_lines = self.kwds["on_bad_lines"]
+        on_bad_lines = self.kwds.get("on_bad_lines")
+        if on_bad_lines is not None:
             if callable(on_bad_lines):
                 self.parse_options["invalid_row_handler"] = on_bad_lines
             elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR:

From 1da74c7eb458ad7ecc232b691b3c0c2f2bb20746 Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Sat, 9 Sep 2023 03:15:20 +0530
Subject: [PATCH 13/14] Wrap ArrowInvalid with ParserError

---
 pandas/io/parsers/arrow_parser_wrapper.py     | 22 +++++++++++++------
 .../io/parser/common/test_read_errors.py      | 15 +++----------
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index 4fdd84438820d..6a07052dd0742 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -3,11 +3,16 @@
 from typing import TYPE_CHECKING
 import warnings
 
+from pyarrow import ArrowInvalid
+
 from pandas._config import using_pyarrow_string_dtype
 
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
-from pandas.errors import ParserWarning
+from pandas.errors import (
+    ParserError,
+    ParserWarning,
+)
 from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.inference import is_integer
@@ -217,12 +222,15 @@ def read(self) -> DataFrame:
         pyarrow_csv = import_optional_dependency("pyarrow.csv")
         self._get_pyarrow_options()
 
-        table = pyarrow_csv.read_csv(
-            self.src,
-            read_options=pyarrow_csv.ReadOptions(**self.read_options),
-            parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
-            convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
-        )
+        try:
+            table = pyarrow_csv.read_csv(
+                self.src,
+                read_options=pyarrow_csv.ReadOptions(**self.read_options),
+                parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
+                convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
+            )
+        except ArrowInvalid as e:
+            raise ParserError(e) from e
 
         dtype_backend = self.kwds["dtype_backend"]
 
diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index 24c8f50d475e8..6f9b6fcffbe9d 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -166,14 +166,11 @@ def test_error_bad_lines(all_parsers):
     data = "a\n1\n1,2,3\n4\n5,6,7"
 
     msg = "Expected 1 fields in line 3, saw 3"
-    ex_type = ParserError
 
     if parser.engine == "pyarrow":
-        pa = pytest.importorskip("pyarrow")
-        ex_type = pa.ArrowInvalid
         msg = "CSV parse error: Expected 1 columns, got 3: 1,2,3"
 
-    with pytest.raises(ex_type, match=msg):
+    with pytest.raises(ParserError, match=msg):
         parser.read_csv(StringIO(data), on_bad_lines="error")
 
 
@@ -209,14 +206,11 @@ def test_read_csv_wrong_num_columns(all_parsers):
 """
     parser = all_parsers
     msg = "Expected 6 fields in line 3, saw 7"
-    ex_type = ParserError
 
     if parser.engine == "pyarrow":
-        pa = pytest.importorskip("pyarrow")
-        ex_type = pa.ArrowInvalid
         msg = "Expected 6 columns, got 7: 6,7,8,9,10,11,12"
 
-    with pytest.raises(ex_type, match=msg):
+    with pytest.raises(ParserError, match=msg):
         parser.read_csv(StringIO(data))
 
 
@@ -277,18 +271,15 @@ def test_bad_header_uniform_error(all_parsers):
     parser = all_parsers
     data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n"
     msg = "Expected 2 fields in line 2, saw 4"
-    ex_type = ParserError
     if parser.engine == "c":
         msg = (
             "Could not construct index. Requested to use 1 "
             "number of columns, but 3 left to parse."
         )
     elif parser.engine == "pyarrow":
-        pa = pytest.importorskip("pyarrow")
-        ex_type = pa.ArrowInvalid
         msg = "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4"
 
-    with pytest.raises(ex_type, match=msg):
+    with pytest.raises(ParserError, match=msg):
         parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")
 
 

From e847e8cc215079df6710ea5508962f45dd28af55 Mon Sep 17 00:00:00 2001
From: amithkk <amithkumaran@gmail.com>
Date: Sun, 10 Sep 2023 02:28:46 +0530
Subject: [PATCH 14/14] Change ArrowInvalid to optional import

---
 pandas/io/parsers/arrow_parser_wrapper.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index 6a07052dd0742..765a4ffcd2cb9 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -3,8 +3,6 @@
 from typing import TYPE_CHECKING
 import warnings
 
-from pyarrow import ArrowInvalid
-
 from pandas._config import using_pyarrow_string_dtype
 
 from pandas._libs import lib
@@ -229,7 +227,7 @@ def read(self) -> DataFrame:
                 parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
                 convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
             )
-        except ArrowInvalid as e:
+        except pa.ArrowInvalid as e:
             raise ParserError(e) from e
 
         dtype_backend = self.kwds["dtype_backend"]