Skip to content

TST: de-xfail some pyarrow tests #55918

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.inference import is_integer

import pandas as pd
Expand Down Expand Up @@ -203,7 +204,13 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
# Ignore non-existent columns from dtype mapping
# like other parsers do
if isinstance(self.dtype, dict):
self.dtype = {k: v for k, v in self.dtype.items() if k in frame.columns}
self.dtype = {
k: pandas_dtype(v)
for k, v in self.dtype.items()
if k in frame.columns
}
else:
self.dtype = pandas_dtype(self.dtype)
try:
frame = frame.astype(self.dtype)
except TypeError as e:
Expand Down
72 changes: 63 additions & 9 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ def test_read_csv_local(all_parsers, csv1):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_1000_sep(all_parsers):
parser = all_parsers
data = """A|B|C
Expand All @@ -128,6 +127,12 @@ def test_1000_sep(all_parsers):
"""
expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})

if parser.engine == "pyarrow":
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), sep="|", thousands=",")
return

result = parser.read_csv(StringIO(data), sep="|", thousands=",")
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -161,7 +166,6 @@ def test_csv_mixed_type(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_read_csv_low_memory_no_rows_with_index(all_parsers):
# see gh-21141
parser = all_parsers
Expand All @@ -174,6 +178,13 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers):
2,2,3,4
3,3,4,5
"""

if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
return

result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
expected = DataFrame(columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -212,7 +223,6 @@ def test_read_csv_dataframe(all_parsers, csv1):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize("nrows", [3, 3.0])
def test_read_nrows(all_parsers, nrows):
# see gh-10476
Expand All @@ -230,11 +240,16 @@ def test_read_nrows(all_parsers, nrows):
)
parser = all_parsers

if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), nrows=nrows)
return

result = parser.read_csv(StringIO(data), nrows=nrows)
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
def test_read_nrows_bad(all_parsers, nrows):
data = """index,A,B,C,D
Expand All @@ -247,6 +262,8 @@ def test_read_nrows_bad(all_parsers, nrows):
"""
msg = r"'nrows' must be an integer >=0"
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"

with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), nrows=nrows)
Expand Down Expand Up @@ -277,7 +294,6 @@ def test_missing_trailing_delimiters(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_skip_initial_space(all_parsers):
data = (
'"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
Expand All @@ -289,6 +305,18 @@ def test_skip_initial_space(all_parsers):
)
parser = all_parsers

if parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
names=list(range(33)),
header=None,
na_values=["-9999.0"],
skipinitialspace=True,
)
return

result = parser.read_csv(
StringIO(data),
names=list(range(33)),
Expand Down Expand Up @@ -437,7 +465,6 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"kwargs,expected",
[
Expand Down Expand Up @@ -467,6 +494,12 @@ def test_trailing_spaces(all_parsers, kwargs, expected):
data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501
parser = all_parsers

if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
return

result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
tm.assert_frame_equal(result, expected)

Expand All @@ -488,7 +521,6 @@ def test_read_filepath_or_buffer(all_parsers):
parser.read_csv(filepath_or_buffer=b"input")


@xfail_pyarrow
@pytest.mark.parametrize("delim_whitespace", [True, False])
def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
# see gh-9710
Expand All @@ -501,6 +533,15 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
b\n"""

expected = DataFrame({"MyColumn": list("abab")})

if parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
return

result = parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
Expand Down Expand Up @@ -688,7 +729,6 @@ def test_first_row_bom_unquoted(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize("nrows", range(1, 6))
def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
# GH 28071
Expand All @@ -698,6 +738,15 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
)
csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
parser = all_parsers

if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False
)
return

df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
tm.assert_frame_equal(df, ref[:nrows])

Expand Down Expand Up @@ -731,11 +780,16 @@ def test_read_csv_names_not_accepting_sets(all_parsers):
parser.read_csv(StringIO(data), names=set("QAZ"))


@xfail_pyarrow
def test_read_table_delim_whitespace_default_sep(all_parsers):
    # GH: 35958
    parser = all_parsers
    buf = StringIO("a b c\n1 -2 -3\n4 5 6")

    if parser.engine == "pyarrow":
        # The pyarrow engine rejects delim_whitespace outright, so assert
        # the ValueError instead of comparing frames.
        msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_table(buf, delim_whitespace=True)
        return

    expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
    result = parser.read_table(buf, delim_whitespace=True)
    tm.assert_frame_equal(result, expected)
Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/io/parser/common/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,7 @@
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@xfail_pyarrow
@pytest.mark.parametrize(
"data,thousands,decimal",
[
Expand All @@ -42,6 +39,14 @@ def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
parser = all_parsers
expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})

if parser.engine == "pyarrow":
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), sep="|", thousands=thousands, decimal=decimal
)
return

result = parser.read_csv(
StringIO(data), sep="|", thousands=thousands, decimal=decimal
)
Expand Down
46 changes: 37 additions & 9 deletions pandas/tests/io/parser/common/test_file_buffer_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,14 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg, request):
# see gh-10728, gh-10548
parser = all_parsers

if parser.engine == "pyarrow" and "comment" in kwargs:
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return

if parser.engine == "pyarrow" and "\r" not in data:
mark = pytest.mark.xfail(reason="The 'comment' option is not supported")
mark = pytest.mark.xfail(reason="Mismatched exception type/message")
request.applymarker(mark)

if expected is None:
Expand Down Expand Up @@ -356,7 +362,6 @@ def test_read_csv_file_handle(all_parsers, io_class, encoding):
assert not handle.closed


@xfail_pyarrow # ValueError: The 'memory_map' option is not supported
def test_memory_map_compression(all_parsers, compression):
"""
Support memory map for compressed files.
Expand All @@ -369,19 +374,32 @@ def test_memory_map_compression(all_parsers, compression):
with tm.ensure_clean() as path:
expected.to_csv(path, index=False, compression=compression)

tm.assert_frame_equal(
parser.read_csv(path, memory_map=True, compression=compression),
expected,
)
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(path, memory_map=True, compression=compression)
return

result = parser.read_csv(path, memory_map=True, compression=compression)

tm.assert_frame_equal(
result,
expected,
)


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_context_manager(all_parsers, datapath):
# make sure that opened files are closed
parser = all_parsers

path = datapath("io", "data", "csv", "iris.csv")

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(path, chunksize=1)
return

reader = parser.read_csv(path, chunksize=1)
assert not reader.handles.handle.closed
try:
Expand All @@ -392,12 +410,17 @@ def test_context_manager(all_parsers, datapath):
assert reader.handles.handle.closed


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_context_manageri_user_provided(all_parsers, datapath):
# make sure that user-provided handles are not closed
parser = all_parsers

with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(path, chunksize=1)
return

reader = parser.read_csv(path, chunksize=1)
assert not reader.handles.handle.closed
try:
Expand All @@ -417,7 +440,6 @@ def test_file_descriptor_leak(all_parsers, using_copy_on_write):
parser.read_csv(path)


@xfail_pyarrow # ValueError: The 'memory_map' option is not supported
def test_memory_map(all_parsers, csv_dir_path):
mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
parser = all_parsers
Expand All @@ -426,5 +448,11 @@ def test_memory_map(all_parsers, csv_dir_path):
{"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]}
)

if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(mmap_file, memory_map=True)
return

result = parser.read_csv(mmap_file, memory_map=True)
tm.assert_frame_equal(result, expected)
24 changes: 16 additions & 8 deletions pandas/tests/io/parser/common/test_ints.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,8 @@ def test_int64_min_issues(all_parsers):
tm.assert_frame_equal(result, expected)


# ValueError: The 'converters' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
def test_int64_overflow(all_parsers, conv):
def test_int64_overflow(all_parsers, conv, request):
data = """ID
00013007854817840016671868
00013007854817840016749251
Expand All @@ -143,6 +141,10 @@ def test_int64_overflow(all_parsers, conv):
if conv is None:
# 13007854817840016671868 > UINT64_MAX, so this
# will overflow and return object as the dtype.
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(reason="parses to float64")
request.applymarker(mark)

result = parser.read_csv(StringIO(data))
expected = DataFrame(
[
Expand All @@ -161,13 +163,19 @@ def test_int64_overflow(all_parsers, conv):
# 13007854817840016671868 > UINT64_MAX, so attempts
# to cast to either int64 or uint64 will result in
# an OverflowError being raised.
msg = (
"(Python int too large to convert to C long)|"
"(long too big to convert)|"
"(int too big to convert)"
msg = "|".join(
[
"Python int too large to convert to C long",
"long too big to convert",
"int too big to convert",
]
)
err = OverflowError
if parser.engine == "pyarrow":
err = ValueError
msg = "The 'converters' option is not supported with the 'pyarrow' engine"

with pytest.raises(OverflowError, match=msg):
with pytest.raises(err, match=msg):
parser.read_csv(StringIO(data), converters={"ID": conv})


Expand Down
Loading