From f22ff466b510d13b323c5e483cdeecbbf739dd4e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 8 Feb 2020 20:24:35 -0800 Subject: [PATCH 01/35] add arrow engine to read_csv --- pandas/io/parsers.py | 132 +++++++++++++++++++++++++++++++------------ 1 file changed, 97 insertions(+), 35 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 84a8b5b2a94fe..f5c00f3f7d137 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,6 +20,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -165,9 +166,10 @@ to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. -engine : {{'c', 'python'}}, optional - Parser engine to use. The C engine is faster while the python engine is - currently more feature-complete. +engine : {{'c', 'python', 'arrow'}}, optional + Parser engine to use. The C and arrow engines are faster while the python engine is + currently more feature-complete. The arrow engine requires ``pyarrow`` + as a dependency however. converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. @@ -506,7 +508,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } - _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, @@ -520,6 +521,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} +_arrow_unsupported = {"skipfooter", "low_memory", "float_precision"} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} @@ -705,7 +707,6 @@ def read_fwf( infer_nrows=100, **kwds, ): - r""" Read a table of fixed-width formatted lines into DataFrame. 
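For orientation before the next hunk: a minimal sketch of how the option added by this patch is meant to be exercised. The file name is hypothetical, and at this point in the series the engine is still spelled "arrow" (a later patch renames it to "pyarrow"); without pyarrow installed, the read fails with an ImportError at read time rather than at import time.

    # Hypothetical usage sketch for the engine added in PATCH 01/35.
    import pandas as pd

    df = pd.read_csv("data.csv", engine="arrow")  # requires pyarrow
    print(df.head())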
@@ -879,7 +880,8 @@ def __init__(self, f, engine=None, **kwds): self._make_engine(self.engine) def close(self): - self._engine.close() + if self.engine != "arrow": + self._engine.close() def _get_options_with_defaults(self, engine): kwds = self.orig_options @@ -945,16 +947,16 @@ def _clean_options(self, options, engine): delim_whitespace = options["delim_whitespace"] # C engine not supported yet - if engine == "c": + if engine == "c" or engine == "arrow": if options["skipfooter"] > 0: - fallback_reason = "the 'c' engine does not support skipfooter" + fallback_reason = f"the {engine} engine does not support skipfooter" engine = "python" encoding = sys.getfilesystemencoding() or "utf-8" if sep is None and not delim_whitespace: - if engine == "c": + if engine == "c" or engine == "arrow": fallback_reason = ( - "the 'c' engine does not support " + f"the {engine} engine does not support " "sep=None with delim_whitespace=False" ) engine = "python" @@ -1081,14 +1083,20 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python parsers + # c-engine, so only need for python parser if engine != "c": - if is_integer(skiprows): - skiprows = list(range(skiprows)) - if skiprows is None: - skiprows = set() - elif not callable(skiprows): - skiprows = set(skiprows) + if engine == "arrow": + if not is_integer(skiprows) and skiprows is not None: + raise ValueError( + "skiprows argument must be integer when using arrow engine" + ) + else: + if is_integer(skiprows): + skiprows = list(range(skiprows)) + if skiprows is None: + skiprows = set() + elif not callable(skiprows): + skiprows = set(skiprows) # put stuff back result["names"] = names @@ -1109,6 +1117,8 @@ def __next__(self): def _make_engine(self, engine="c"): if engine == "c": self._engine = CParserWrapper(self.f, **self.options) + elif engine == "arrow": + self._engine = ArrowParserWrapper(self.f, **self.options) else: if engine == "python": klass = PythonParser @@ -1125,29 +1135,32 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = _validate_integer("nrows", nrows) - ret = self._engine.read(nrows) + if self.engine == "arrow": + return self._engine.read(nrows) + else: + nrows = _validate_integer("nrows", nrows) + ret = self._engine.read(nrows) - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 else: - new_rows = 0 - else: - new_rows = len(index) + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows - if self.squeeze and len(df.columns) == 1: - return df[df.columns[0]].copy() - return df + if self.squeeze and len(df.columns) == 1: + return df[df.columns[0]].copy() + return df def _create_index(self, ret): index, columns, col_dict = ret @@ -2135,6 +2148,56 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values 
+class ArrowParserWrapper(ParserBase): + """ + + """ + + def __init__(self, src, **kwds): + self.kwds = kwds + self.src = src + kwds = kwds.copy() + + ParserBase.__init__(self, kwds) + + # #2442 + kwds["allow_leading_cols"] = self.index_col is not False + + # GH20529, validate usecol arg before TextReader + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + kwds["usecols"] = self.usecols + + self.names = kwds["names"] + + def read(self, nrows=None): + pyarrow = import_optional_dependency( + "pyarrow.csv", extra="pyarrow is required to use arrow engine" + ) + nrows = _validate_integer("nrows", nrows) + table = pyarrow.read_csv( + self.src, + read_options=pyarrow.ReadOptions( + skip_rows=self.kwds.get("skiprows"), column_names=self.names + ), + parse_options=pyarrow.ParseOptions( + delimiter=self.kwds.get("delimiter"), + quote_char=self.kwds.get("quotechar"), + ), + convert_options=pyarrow.ConvertOptions( + include_columns=self.usecols, column_types=self.kwds.get("dtype") + ), + ) + if nrows: + table = table[:nrows] + table_width = len(table.column_names) + if self.names is None: + if self.prefix: + self.names = [f"{self.prefix}{i}" for i in range(table_width)] + if self.names: + table = table.rename_columns(self.names) + return table.to_pandas() + + def TextParser(*args, **kwds): """ Converts lists of lists/tuples into DataFrames with proper type inference @@ -3336,7 +3399,6 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): - if na_values is None: if keep_default_na: na_values = STR_NA_VALUES From 8ae43e44cdbec134771173b69a5d4c1a2400504f Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 8 Feb 2020 21:01:26 -0800 Subject: [PATCH 02/35] fix failing test --- pandas/io/parsers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f5c00f3f7d137..75da1d991dc9b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1135,7 +1135,7 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - if self.engine == "arrow": + if isinstance(self._engine, ArrowParserWrapper): return self._engine.read(nrows) else: nrows = _validate_integer("nrows", nrows) @@ -2165,9 +2165,6 @@ def __init__(self, src, **kwds): # GH20529, validate usecol arg before TextReader self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - kwds["usecols"] = self.usecols - - self.names = kwds["names"] def read(self, nrows=None): pyarrow = import_optional_dependency( From 09074df84e42eec3e7f7dd1ae7c710af53b386cc Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 10:01:55 -0800 Subject: [PATCH 03/35] formatting and revert unnecessary change --- pandas/io/parsers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 75da1d991dc9b..ad60b223daa06 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -167,7 +167,7 @@ If converters are specified, they will be applied INSTEAD of dtype conversion. engine : {{'c', 'python', 'arrow'}}, optional - Parser engine to use. The C and arrow engines are faster while the python engine is + Parser engine to use. The C and arrow engines are faster, while the python engine is currently more feature-complete. The arrow engine requires ``pyarrow`` as a dependency however. 
converters : dict, optional @@ -508,6 +508,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } + _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, From 6be276db8c7c5e1384bfb45591534176d2f6bfe5 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 10:07:03 -0800 Subject: [PATCH 04/35] remove bloat and more formatting changes --- pandas/io/parsers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ad60b223daa06..6d8764fef385c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -881,8 +881,7 @@ def __init__(self, f, engine=None, **kwds): self._make_engine(self.engine) def close(self): - if self.engine != "arrow": - self._engine.close() + self._engine.close() def _get_options_with_defaults(self, engine): kwds = self.orig_options @@ -1089,7 +1088,7 @@ def _clean_options(self, options, engine): if engine == "arrow": if not is_integer(skiprows) and skiprows is not None: raise ValueError( - "skiprows argument must be integer when using arrow engine" + "skiprows argument must be an integer when using engine='arrow'" ) else: if is_integer(skiprows): From df4fa7e2ac359f7e25031f8f92d312049972d1ec Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 10:25:25 -0800 Subject: [PATCH 05/35] Whatsnew --- doc/source/whatsnew/v1.1.0.rst | 4 +++- pandas/io/parsers.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 920919755dc23..2c4f5dcfbcde8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -42,7 +42,9 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) -- +- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing + if pyarrow>0.11 is installed. However, the pyarrow engine is less feature-complete than its "c" or + "python" counterparts. - .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6d8764fef385c..938bafa780d89 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -708,6 +708,7 @@ def read_fwf( infer_nrows=100, **kwds, ): + r""" Read a table of fixed-width formatted lines into DataFrame. 
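The wrapper reaches pyarrow through ``import_optional_dependency`` rather than a module-level import, which is why pandas keeps working when pyarrow is absent. A simplified sketch of that pattern, not pandas' exact helper (the real one in ``pandas.compat._optional`` also enforces minimum versions, which PATCH 20 later in this series extends to submodule imports):

    # Simplified sketch of the lazy optional-import pattern; version checks omitted.
    import importlib

    def import_optional_dependency(name: str, extra: str = ""):
        try:
            return importlib.import_module(name)
        except ImportError:
            raise ImportError(
                f"Missing optional dependency '{name}'. {extra}"
            ) from None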
@@ -3396,6 +3397,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): + if na_values is None: if keep_default_na: na_values = STR_NA_VALUES From ecaf3fd036d38dfd34e5d9a5de45304dbdfacca4 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 16:35:32 -0800 Subject: [PATCH 06/35] Get tests up and running --- pandas/io/parsers.py | 12 +++++++----- pandas/tests/io/parser/conftest.py | 12 ++++++++++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 371660b19b171..43272ef2cf600 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -508,7 +508,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } - _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, @@ -522,7 +521,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} -_arrow_unsupported = {"skipfooter", "low_memory", "float_precision"} +_arrow_unsupported = {"skipfooter", "low_memory", "float_precision", "chunksize"} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} @@ -708,7 +707,6 @@ def read_fwf( infer_nrows=100, **kwds, ): - r""" Read a table of fixed-width formatted lines into DataFrame. @@ -947,7 +945,12 @@ def _clean_options(self, options, engine): sep = options["delimiter"] delim_whitespace = options["delim_whitespace"] - # C engine not supported yet + # arrow engine not supported yet + if engine == "arrow": + if options["chunksize"] is not None: + fallback_reason = f"the arrow engine does not support chunksize" + engine = "python" + # C and arrow engine not supported yet if engine == "c" or engine == "arrow": if options["skipfooter"] > 0: fallback_reason = f"the {engine} engine does not support skipfooter" @@ -3401,7 +3404,6 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): - if na_values is None: if keep_default_na: na_values = STR_NA_VALUES diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 15967e3be176a..751db1d22e8ae 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -44,6 +44,11 @@ class PythonParser(BaseParser): float_precision_choices = [None] +class ArrowParser(BaseParser): + engine = "arrow" + float_precision_choices = [None] + + @pytest.fixture def csv_dir_path(datapath): """ @@ -63,14 +68,17 @@ def csv1(csv_dir_path): _cParserHighMemory = CParserHighMemory() _cParserLowMemory = CParserLowMemory() _pythonParser = PythonParser() +_arrowParser = ArrowParser() _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_all_parsers = [*_c_parsers_only, *_py_parsers_only] +_arrow_parsers_only = [_arrowParser] +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_arrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids] +_arrow_parser_ids = ["arrow"] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_arrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) From b3c328723bb997a675e31cd8db84d77d75afa4f7 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 10 Feb 2020 07:26:58 -0800 Subject: [PATCH 07/35] Some fixes --- pandas/io/parsers.py | 45 ++++++++++++++++++++++---------------------- 1 file 
changed, 22 insertions(+), 23 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 43272ef2cf600..d3f40a6b9df2b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -947,7 +947,7 @@ def _clean_options(self, options, engine): # arrow engine not supported yet if engine == "arrow": - if options["chunksize"] is not None: + if self.chunksize is not None: fallback_reason = f"the arrow engine does not support chunksize" engine = "python" # C and arrow engine not supported yet @@ -1087,10 +1087,11 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python parser + # c-engine, so only need for python and arrow parsers if engine != "c": if engine == "arrow": if not is_integer(skiprows) and skiprows is not None: + # pyarrow expects skiprows to be passed as an integer raise ValueError( "skiprows argument must be an integer when using engine='arrow'" ) @@ -1131,7 +1132,7 @@ def _make_engine(self, engine="c"): else: raise ValueError( f"Unknown engine: {engine} (valid options " - 'are "c", "python", or "python-fwf")' + 'are "c", "python", "arrow", or "python-fwf")' ) self._engine = klass(self.f, **self.options) @@ -1139,32 +1140,31 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - if isinstance(self._engine, ArrowParserWrapper): + nrows = _validate_integer("nrows", nrows) + if self.engine == "arrow": return self._engine.read(nrows) - else: - nrows = _validate_integer("nrows", nrows) - ret = self._engine.read(nrows) + ret = self._engine.read(nrows) - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) - else: - new_rows = 0 + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) else: - new_rows = len(index) + new_rows = 0 + else: + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows - if self.squeeze and len(df.columns) == 1: - return df[df.columns[0]].copy() - return df + if self.squeeze and len(df.columns) == 1: + return df[df.columns[0]].copy() + return df def _create_index(self, ret): index, columns, col_dict = ret @@ -2178,7 +2178,6 @@ def read(self, nrows=None): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use arrow engine" ) - nrows = _validate_integer("nrows", nrows) table = pyarrow.read_csv( self.src, read_options=pyarrow.ReadOptions( From 474baf4c83ee28330ef38b426f09617d2f8cfc9e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 10 Feb 2020 20:35:38 -0800 Subject: [PATCH 08/35] Add asvs and xfail some tests --- asv_bench/benchmarks/io/csv.py | 10 ++++++++++ pandas/io/parsers.py | 8 +++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9bcd125f56bbb..89c81a937090b 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -254,6 +254,16 @@ def time_read_csv_python_engine(self, sep, 
decimal, float_precision): names=list("abc"), ) + def time_read_csv_arrow_engine(self, sep, decimal, float_precision): + read_csv( + self.data(self.StringIO_input), + sep=sep, + header=None, + engine="arrow", + float_precision=None, + names=list("abc"), + ) + class ReadCSVCategorical(BaseIO): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d3f40a6b9df2b..dd2155d2d735b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -521,7 +521,13 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} -_arrow_unsupported = {"skipfooter", "low_memory", "float_precision", "chunksize"} +_arrow_unsupported = { + "skipfooter", + "low_memory", + "float_precision", + "chunksize", + "comment", +} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} From 2cd993771b6c07a8144c8472c710e164410c8e37 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 19 Feb 2020 16:57:52 -0800 Subject: [PATCH 09/35] address comments --- asv_bench/benchmarks/io/csv.py | 4 +- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/io/parsers.py | 63 +++++++++++++++++++----------- pandas/tests/io/parser/conftest.py | 14 +++---- 4 files changed, 50 insertions(+), 33 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 89c81a937090b..a4e6f94f326ba 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -254,12 +254,12 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_arrow_engine(self, sep, decimal, float_precision): + def time_read_csv_pyarrow_engine(self, sep, decimal, float_precision): read_csv( self.data(self.StringIO_input), sep=sep, header=None, - engine="arrow", + engine="pyarrow", float_precision=None, names=list("abc"), ) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index fc0e486978ffb..297c561557053 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -43,7 +43,7 @@ Other enhancements - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing - if pyarrow>0.11 is installed. However, the pyarrow engine is less feature-complete than its "c" or + if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or "python" counterparts. - diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dd2155d2d735b..59678d675b0b1 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import BufferedIOBase, RawIOBase, StringIO, TextIOWrapper +from io import BufferedIOBase, BytesIO, RawIOBase, StringIO, TextIOWrapper import re import sys from textwrap import fill @@ -166,10 +166,11 @@ to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. -engine : {{'c', 'python', 'arrow'}}, optional - Parser engine to use. The C and arrow engines are faster, while the python engine is - currently more feature-complete. The arrow engine requires ``pyarrow`` +engine : {{'c', 'python', 'pyarrow'}}, optional + Parser engine to use. The C and pyarrow engines are faster, while the python engine + is currently more feature-complete. 
The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. + .. versionchanged(1.1) converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. @@ -521,9 +522,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} -_arrow_unsupported = { +_pyarrow_unsupported = { "skipfooter", - "low_memory", "float_precision", "chunksize", "comment", @@ -951,20 +951,29 @@ def _clean_options(self, options, engine): sep = options["delimiter"] delim_whitespace = options["delim_whitespace"] - # arrow engine not supported yet - if engine == "arrow": - if self.chunksize is not None: - fallback_reason = f"the arrow engine does not support chunksize" - engine = "python" - # C and arrow engine not supported yet - if engine == "c" or engine == "arrow": + # pyarrow engine not supported yet + if engine == "pyarrow": + for option in _pyarrow_unsupported: + if option != "chunksize" and option != "skipfooter": + if options[option] is not None: + fallback_reason = ( + f"the pyarrow engine does not support the {option} argumnet" + ) + engine = "python" + else: + if self.chunksize is not None: + fallback_reason = ( + "the pyarrow engine does not support using chunksize" + ) + # C and pyarrow engine not supported yet + if engine == "c" or "pyarrow": if options["skipfooter"] > 0: fallback_reason = f"the {engine} engine does not support skipfooter" engine = "python" encoding = sys.getfilesystemencoding() or "utf-8" if sep is None and not delim_whitespace: - if engine == "c" or engine == "arrow": + if engine == "c" or engine == "pyarrow": fallback_reason = ( f"the {engine} engine does not support " "sep=None with delim_whitespace=False" @@ -1093,13 +1102,14 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python and arrow parsers + # c-engine, so only need for python and pyarrow parsers if engine != "c": - if engine == "arrow": + if engine == "pyarrow": if not is_integer(skiprows) and skiprows is not None: # pyarrow expects skiprows to be passed as an integer raise ValueError( - "skiprows argument must be an integer when using engine='arrow'" + "skiprows argument must be an integer when using " + "engine='pyarrow'" ) else: if is_integer(skiprows): @@ -2164,7 +2174,7 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): class ArrowParserWrapper(ParserBase): """ - + Wrapper for the pyarrow engine for pd.read_csv() """ def __init__(self, src, **kwds): @@ -2174,12 +2184,13 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - # #2442 - kwds["allow_leading_cols"] = self.index_col is not False + encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" - # GH20529, validate usecol arg before TextReader self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + if isinstance(self.src, StringIO): + self.src = BytesIO(self.src.getvalue().encode(encoding)) + def read(self, nrows=None): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use arrow engine" @@ -2197,12 +2208,18 @@ def read(self, nrows=None): include_columns=self.usecols, column_types=self.kwds.get("dtype") ), ) - if nrows: - table = table[:nrows] + table_width = len(table.column_names) if self.names is None: if self.prefix: 
self.names = [f"{self.prefix}{i}" for i in range(table_width)] + elif self.header is not None: + if self.header == "infer": + header = 0 + else: + header = self.header + self.names = table[header] + del table[header] if self.names: table = table.rename_columns(self.names) return table.to_pandas() diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 751db1d22e8ae..327f87303aeb0 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -44,8 +44,8 @@ class PythonParser(BaseParser): float_precision_choices = [None] -class ArrowParser(BaseParser): - engine = "arrow" +class PyArrowParser(BaseParser): + engine = "pyarrow" float_precision_choices = [None] @@ -68,17 +68,17 @@ def csv1(csv_dir_path): _cParserHighMemory = CParserHighMemory() _cParserLowMemory = CParserLowMemory() _pythonParser = PythonParser() -_arrowParser = ArrowParser() +_pyarrowParser = PyArrowParser() _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_arrow_parsers_only = [_arrowParser] -_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_arrow_parsers_only] +_pyarrow_parsers_only = [_pyarrowParser] +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_arrow_parser_ids = ["arrow"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_arrow_parser_ids] +_pyarrow_parser_ids = ["pyarrow"] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) From 3d15a5660d7779eb7638875a33882b3e9103b190 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 20 Feb 2020 10:57:11 -0800 Subject: [PATCH 10/35] fix typo --- pandas/io/parsers.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 59678d675b0b1..4d31ca3230df6 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1138,7 +1138,7 @@ def __next__(self): def _make_engine(self, engine="c"): if engine == "c": self._engine = CParserWrapper(self.f, **self.options) - elif engine == "arrow": + elif engine == "pyarrow": self._engine = ArrowParserWrapper(self.f, **self.options) else: if engine == "python": @@ -1157,7 +1157,7 @@ def _failover_to_python(self): def read(self, nrows=None): nrows = _validate_integer("nrows", nrows) - if self.engine == "arrow": + if self.engine == "pyarrow": return self._engine.read(nrows) ret = self._engine.read(nrows) @@ -2208,21 +2208,19 @@ def read(self, nrows=None): include_columns=self.usecols, column_types=self.kwds.get("dtype") ), ) - + frame = table.to_pandas() table_width = len(table.column_names) if self.names is None: if self.prefix: self.names = [f"{self.prefix}{i}" for i in range(table_width)] - elif self.header is not None: - if self.header == "infer": - header = 0 - else: - header = self.header - self.names = table[header] - del table[header] + elif self.header is not None and self.header != "infer": + header = self.header + self.names = frame.iloc[header] + frame = frame.drop(header, axis=0) + if self.names: - table = table.rename_columns(self.names) - return table.to_pandas() + frame = frame.rename(self.names, axis="columns") + return frame def TextParser(*args, **kwds): From 98aa134d85044ab84adade39f66639777d971eed Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 29 Feb 2020 08:59:43 -0800 Subject: [PATCH 11/35] some fixes --- pandas/io/parsers.py | 7 ++++--- 1 file 
changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3ab847ebd7e04..dbd55f2015d1c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2195,7 +2195,9 @@ def read(self, nrows=None): table = pyarrow.read_csv( self.src, read_options=pyarrow.ReadOptions( - skip_rows=self.kwds.get("skiprows"), column_names=self.names + skip_rows=self.kwds.get("skiprows"), + column_names=self.names, + autogenerate_column_names=True if self.header != 0 else False, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), @@ -2215,8 +2217,7 @@ def read(self, nrows=None): self.names = frame.iloc[header] frame = frame.drop(header, axis=0) - if self.names: - frame = frame.rename(self.names, axis="columns") + frame = frame.rename(zip(frame.names, self.names), axis="columns") return frame From b9c6d2c0a2b177c12c94b30f7c1395d77d1d0242 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 4 Apr 2020 19:42:14 -0700 Subject: [PATCH 12/35] Fix bug --- pandas/io/parsers.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dbd55f2015d1c..ac7658d5b3772 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -526,6 +526,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "float_precision", "chunksize", "comment", + "nrows", } _python_unsupported = {"low_memory", "float_precision"} @@ -952,7 +953,11 @@ def _clean_options(self, options, engine): # pyarrow engine not supported yet if engine == "pyarrow": for option in _pyarrow_unsupported: - if option != "chunksize" and option != "skipfooter": + if ( + option != "chunksize" + and option != "skipfooter" + and option != "nrows" + ): if options[option] is not None: fallback_reason = ( f"the pyarrow engine does not support the {option} argumnet" @@ -963,6 +968,10 @@ def _clean_options(self, options, engine): fallback_reason = ( "the pyarrow engine does not support using chunksize" ) + if self.nrows is not None: + fallback_reason = ( + "the pyarrow engine does not support using skipfooter" + ) # C and pyarrow engine not supported yet if engine == "c" or "pyarrow": if options["skipfooter"] > 0: @@ -2171,7 +2180,7 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): class ArrowParserWrapper(ParserBase): """ - Wrapper for the pyarrow engine for pd.read_csv() + Wrapper for the pyarrow engine for read_csv() """ def __init__(self, src, **kwds): @@ -2208,16 +2217,22 @@ def read(self, nrows=None): ), ) frame = table.to_pandas() - table_width = len(table.column_names) + num_cols = len(frame.columns) if self.names is None: if self.prefix: - self.names = [f"{self.prefix}{i}" for i in range(table_width)] - elif self.header is not None and self.header != "infer": + self.names = [f"{self.prefix}{i}" for i in range(num_cols)] + frame = frame.rename( + dict(zip(frame.columns, self.names), axis="columns") + ) + elif self.header != 0: header = self.header self.names = frame.iloc[header] frame = frame.drop(header, axis=0) - - frame = frame.rename(zip(frame.names, self.names), axis="columns") + frame = frame.rename( + dict(zip(frame.columns, self.names), axis="columns") + ) + if self.kwds.get("squeeze"): + frame = frame.squeeze() return frame From 7f891a64d8887d69ca435d6b7093a81239ca95f3 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 10 Apr 2020 11:02:05 -0700 Subject: [PATCH 13/35] New benchmark and fix more tests --- asv_bench/benchmarks/io/csv.py | 37 ++++++++++------- pandas/io/parsers.py | 
73 ++++++++++++++++++++++------------ 2 files changed, 71 insertions(+), 39 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index a4e6f94f326ba..047fc1fe5f7f7 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -10,7 +10,6 @@ class ToCSV(BaseIO): - fname = "__test__.csv" params = ["wide", "long", "mixed"] param_names = ["kind"] @@ -43,7 +42,6 @@ def time_frame(self, kind): class ToCSVDatetime(BaseIO): - fname = "__test__.csv" def setup(self): @@ -55,7 +53,6 @@ def time_frame_date_formatting(self): class ToCSVDatetimeBig(BaseIO): - fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] @@ -83,7 +80,6 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): - params = ([True, False], ["custom", "iso8601", "ymd"]) param_names = ["infer_datetime_format", "format"] @@ -108,7 +104,6 @@ def time_read_csv(self, infer_datetime_format, format): class ReadCSVConcatDatetime(StringIORewind): - iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): @@ -126,7 +121,6 @@ def time_read_csv(self): class ReadCSVConcatDatetimeBadDateValue(StringIORewind): - params = (["nan", "0", ""],) param_names = ["bad_date_value"] @@ -144,7 +138,6 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): - fname = "__test__.csv" params = [None, 10000] param_names = ["skiprows"] @@ -190,7 +183,6 @@ def time_read_uint64_na_values(self): class ReadCSVThousands(BaseIO): - fname = "__test__.csv" params = ([",", "|"], [None, ","]) param_names = ["sep", "thousands"] @@ -222,7 +214,6 @@ def time_comment(self): class ReadCSVFloatPrecision(StringIORewind): - params = ([",", ";"], [".", "_"], [None, "high", "round_trip"]) param_names = ["sep", "decimal", "float_precision"] @@ -254,19 +245,38 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_pyarrow_engine(self, sep, decimal, float_precision): + def time_read_csv_arrow(self, sep): + read_csv( + self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + ) + + +class ReadCSVEngine(StringIORewind): + def setup(self): + data = ["A,B,C"] + (["1,2,3"] * 100000) + self.StringIO_input = StringIO("\n".join(data)) + + def time_read_csv_c(self, sep): + read_csv( + self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + ) + + def time_read_csv_arrow(self, sep): + read_csv( + self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + ) + + def time_read_csv_python_engine(self, sep): read_csv( self.data(self.StringIO_input), sep=sep, header=None, - engine="pyarrow", - float_precision=None, + engine="python", names=list("abc"), ) class ReadCSVCategorical(BaseIO): - fname = "__test__.csv" def setup(self): @@ -335,7 +345,6 @@ def time_read_csv_cached(self, do_cache): class ReadCSVMemoryGrowth(BaseIO): - chunksize = 20 num_rows = 1000 fname = "__test__.csv" diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f17c1008e29a5..175dccf0633df 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import BytesIO, StringIO, TextIOWrapper +from io import StringIO, TextIOBase, TextIOWrapper import itertools import re import sys @@ -172,7 +172,7 @@ Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. - .. versionchanged(1.1) + .. 
versionchanged:: (1.1) converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. @@ -1167,27 +1167,28 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = _validate_integer("nrows", nrows) if self.engine == "pyarrow": - return self._engine.read(nrows) - ret = self._engine.read(nrows) + df = self._engine.read() + else: + nrows = _validate_integer("nrows", nrows) + ret = self._engine.read(nrows) - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 else: - new_rows = 0 - else: - new_rows = len(index) + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows if self.squeeze and len(df.columns) == 1: return df[df.columns[0]].copy() @@ -2231,6 +2232,19 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values +class BytesIOWrapper: + def __init__(self, string_buffer, encoding="utf-8"): + self.string_buffer = string_buffer + self.encoding = encoding + + def __getattr__(self, attr): + return getattr(self.string_buffer, attr) + + def read(self, size=-1): + content = self.string_buffer.read(size) + return content.encode(self.encoding) + + class ArrowParserWrapper(ParserBase): """ Wrapper for the pyarrow engine for read_csv() @@ -2247,10 +2261,10 @@ def __init__(self, src, **kwds): self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - if isinstance(self.src, StringIO): - self.src = BytesIO(self.src.getvalue().encode(encoding)) + if isinstance(self.src, TextIOBase): + self.src = BytesIOWrapper(self.src, encoding=encoding) - def read(self, nrows=None): + def read(self): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use arrow engine" ) @@ -2259,7 +2273,9 @@ def read(self, nrows=None): read_options=pyarrow.ReadOptions( skip_rows=self.kwds.get("skiprows"), column_names=self.names, - autogenerate_column_names=True if self.header != 0 else False, + autogenerate_column_names=True + if self.header != 0 or self.kwds.get("skiprows") != set() + else False, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), @@ -2277,15 +2293,22 @@ def read(self, nrows=None): frame = frame.rename( dict(zip(frame.columns, self.names), axis="columns") ) - elif self.header != 0: + elif self.header is not None and self.header != 0: header = self.header self.names = frame.iloc[header] frame = frame.drop(header, axis=0) frame = frame.rename( - dict(zip(frame.columns, self.names), axis="columns") + columns=dict(zip(frame.columns, self.names), axis="columns") ) - if self.kwds.get("squeeze"): - frame = frame.squeeze() + elif self.header is None: + self.names = range(len(frame.columns)) + frame = frame.rename( + columns=dict(zip(frame.columns, self.names), axis="columns") + ) + + index_col = self.kwds.get("index_col")[0] # flatten list w/ 1 elem + if index_col is not None: + 
frame.set_index(frame.columns[index_col], drop=True, inplace=True) return frame From 23425f7be4840ac48ff35058ae9a64d064628537 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 10 Apr 2020 15:27:33 -0700 Subject: [PATCH 14/35] More cleanups --- asv_bench/benchmarks/io/csv.py | 22 +++++++--------------- doc/source/whatsnew/v1.1.0.rst | 6 +++--- pandas/io/parsers.py | 7 +++---- 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 047fc1fe5f7f7..b7d7c4e8c120a 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -245,7 +245,7 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_arrow(self, sep): + def time_read_csv_arrow(self, sep, decimal, float_precision): read_csv( self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), ) @@ -256,23 +256,15 @@ def setup(self): data = ["A,B,C"] + (["1,2,3"] * 100000) self.StringIO_input = StringIO("\n".join(data)) - def time_read_csv_c(self, sep): - read_csv( - self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), - ) + def time_read_csv_c(self): + read_csv(self.data(self.StringIO_input)) - def time_read_csv_arrow(self, sep): - read_csv( - self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), - ) + def time_read_csv_arrow(self): + read_csv(self.data(self.StringIO_input), engine="pyarrow") - def time_read_csv_python_engine(self, sep): + def time_read_csv_python_engine(self): read_csv( - self.data(self.StringIO_input), - sep=sep, - header=None, - engine="python", - names=list("abc"), + self.data(self.StringIO_input), engine="python", ) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4c44e35169ba7..b60a79a239628 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -88,9 +88,6 @@ Other enhancements - :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) -- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing - if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or - "python" counterparts. (:issue:`23697`) .. --------------------------------------------------------------------------- @@ -412,6 +409,9 @@ I/O - Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) - Bug in :meth:`read_csv` was raising an ``IndexError`` when header=None and 2 extra data columns - Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`) +- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing + if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or + "python" counterparts. 
(:issue:`23697`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 175dccf0633df..455b7f748102d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -172,6 +172,7 @@ Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. + .. versionchanged:: (1.1) converters : dict, optional Dict of functions for converting values in certain columns. Keys can either @@ -2266,16 +2267,14 @@ def __init__(self, src, **kwds): def read(self): pyarrow = import_optional_dependency( - "pyarrow.csv", extra="pyarrow is required to use arrow engine" + "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) table = pyarrow.read_csv( self.src, read_options=pyarrow.ReadOptions( skip_rows=self.kwds.get("skiprows"), column_names=self.names, - autogenerate_column_names=True - if self.header != 0 or self.kwds.get("skiprows") != set() - else False, + autogenerate_column_names=True if self.header != 0 else False, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), From 01c03942b61f4ab38cf4712c4d078a52c4f27939 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 10 Apr 2020 19:46:34 -0700 Subject: [PATCH 15/35] Formatting fixes and typo correction --- asv_bench/benchmarks/io/csv.py | 9 +++++++++ doc/source/whatsnew/v1.1.0.rst | 2 ++ pandas/io/parsers.py | 6 +++--- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index b7d7c4e8c120a..8dec39091e322 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -10,6 +10,7 @@ class ToCSV(BaseIO): + fname = "__test__.csv" params = ["wide", "long", "mixed"] param_names = ["kind"] @@ -42,6 +43,7 @@ def time_frame(self, kind): class ToCSVDatetime(BaseIO): + fname = "__test__.csv" def setup(self): @@ -53,6 +55,7 @@ def time_frame_date_formatting(self): class ToCSVDatetimeBig(BaseIO): + fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] @@ -80,6 +83,7 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): + params = ([True, False], ["custom", "iso8601", "ymd"]) param_names = ["infer_datetime_format", "format"] @@ -104,6 +108,7 @@ def time_read_csv(self, infer_datetime_format, format): class ReadCSVConcatDatetime(StringIORewind): + iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): @@ -121,6 +126,7 @@ def time_read_csv(self): class ReadCSVConcatDatetimeBadDateValue(StringIORewind): + params = (["nan", "0", ""],) param_names = ["bad_date_value"] @@ -138,6 +144,7 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): + fname = "__test__.csv" params = [None, 10000] param_names = ["skiprows"] @@ -183,6 +190,7 @@ def time_read_uint64_na_values(self): class ReadCSVThousands(BaseIO): + fname = "__test__.csv" params = ([",", "|"], [None, ","]) param_names = ["sep", "thousands"] @@ -214,6 +222,7 @@ def time_comment(self): class ReadCSVFloatPrecision(StringIORewind): + params = ([",", ";"], [".", "_"], [None, "high", "round_trip"]) param_names = ["sep", "decimal", "float_precision"] diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 690df648ceada..1704f3c096801 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -527,6 +527,8 @@ I/O - :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing if pyarrow>0.13 is 
installed. However, the pyarrow engine is less feature-complete than its "c" or "python" counterparts. (:issue:`23697`) + + Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 455b7f748102d..0cf148366cc1c 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -975,7 +975,7 @@ def _clean_options(self, options, engine): ) if self.nrows is not None: fallback_reason = ( - "the pyarrow engine does not support using skipfooter" + "the pyarrow engine does not support using nrows" ) # C and pyarrow engine not supported yet if engine == "c" or "pyarrow": @@ -2305,9 +2305,9 @@ def read(self): columns=dict(zip(frame.columns, self.names), axis="columns") ) - index_col = self.kwds.get("index_col")[0] # flatten list w/ 1 elem + index_col = self.kwds.get("index_col") # need to flatten since returns list if index_col is not None: - frame.set_index(frame.columns[index_col], drop=True, inplace=True) + frame.set_index(frame.columns[index_col[0]], drop=True, inplace=True) return frame From ba5620ff84a14baa0814f96d2499b652a30afdd8 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 11 Apr 2020 17:22:45 -0700 Subject: [PATCH 16/35] skip pyarrow tests if not installed --- asv_bench/benchmarks/io/csv.py | 1 + pandas/tests/io/parser/conftest.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 8dec39091e322..fef4fee047862 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -346,6 +346,7 @@ def time_read_csv_cached(self, do_cache): class ReadCSVMemoryGrowth(BaseIO): + chunksize = 20 num_rows = 1000 fname = "__test__.csv" diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 327f87303aeb0..87a34d728bc60 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,4 +1,5 @@ import os +import pkgutil from typing import List, Optional import pytest @@ -73,12 +74,17 @@ def csv1(csv_dir_path): _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] _pyarrow_parsers_only = [_pyarrowParser] -_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] + +if pkgutil.find_loader("pyarrow"): + _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] + _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] +else: + _all_parsers = [*_c_parsers_only, *_py_parsers_only] + _all_parser_ids = [*_c_parser_ids, *_py_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) From 2570c823f28eb722435929dd86ccfdfb2ff1a37b Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 11 Apr 2020 17:31:51 -0700 Subject: [PATCH 17/35] Address comments --- pandas/io/parsers.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 0cf148366cc1c..235cefd82f2d5 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -173,7 +173,8 @@ is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. - .. versionchanged:: (1.1) + .. versionchanged:: 1.1 + The "pyarrow" engine was added. converters : dict, optional Dict of functions for converting values in certain columns. 
Keys can either be integers or column labels.
@@ -958,11 +959,7 @@ def _clean_options(self, options, engine):
         # pyarrow engine not supported yet
         if engine == "pyarrow":
             for option in _pyarrow_unsupported:
-                if (
-                    option != "chunksize"
-                    and option != "skipfooter"
-                    and option != "nrows"
-                ):
+                if option not in ["chunksize", "skipfooter", "nrows"]:
                     if options[option] is not None:
                         fallback_reason = (
                             f"the pyarrow engine does not support the {option} argument"
                         )
@@ -2274,11 +2271,12 @@ def read(self):
             read_options=pyarrow.ReadOptions(
                 skip_rows=self.kwds.get("skiprows"),
                 column_names=self.names,
-                autogenerate_column_names=True if self.header != 0 else False,
+                autogenerate_column_names=False if self.header == 0 else True,
             ),
             parse_options=pyarrow.ParseOptions(
                 delimiter=self.kwds.get("delimiter"),
                 quote_char=self.kwds.get("quotechar"),
+                ignore_empty_lines=self.kwds.get("skip_blank_lines"),
             ),
             convert_options=pyarrow.ConvertOptions(
                 include_columns=self.usecols, column_types=self.kwds.get("dtype")
             ),
         )
@@ -2289,21 +2287,15 @@ def read(self):
         if self.names is None:
             if self.prefix:
                 self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
-                frame = frame.rename(
-                    dict(zip(frame.columns, self.names), axis="columns")
-                )
+                frame.columns = self.names
             elif self.header is not None and self.header != 0:
                 header = self.header
                 self.names = frame.iloc[header]
                 frame = frame.drop(header, axis=0)
-                frame = frame.rename(
-                    columns=dict(zip(frame.columns, self.names), axis="columns")
-                )
+                frame.columns = self.names
             elif self.header is None:
-                self.names = range(len(frame.columns))
-                frame = frame.rename(
-                    columns=dict(zip(frame.columns, self.names), axis="columns")
-                )
+                self.names = range(num_cols)
+                frame.columns = self.names

         index_col = self.kwds.get("index_col")  # need to flatten since returns list
         if index_col is not None:
             frame.set_index(frame.columns[index_col[0]], drop=True, inplace=True)
         return frame

From b3a1f6628879b8df819c82bc75686d6fd89f42d2 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Tue, 14 Apr 2020 14:24:28 -0700
Subject: [PATCH 18/35] Get some more tests to pass

---
 asv_bench/benchmarks/io/csv.py        |  2 +-
 pandas/io/parsers.py                  | 45 ++++++++++++++++-----------
 pandas/tests/io/parser/test_common.py |  1 +
 3 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/asv_bench/benchmarks/io/csv.py index fef4fee047862..55bc8d35af432 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -262,7 +262,7 @@ class ReadCSVEngine(StringIORewind):
     def setup(self):
-        data = ["A,B,C"] + (["1,2,3"] * 100000)
+        data = ["A,B,C"] + (["1,2,3"] * 1000000)
         self.StringIO_input = StringIO("\n".join(data))

diff --git a/pandas/io/parsers.py index 235cefd82f2d5..444582cbe723c 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -531,6 +531,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
     "chunksize",
     "comment",
     "nrows",
+    "thousands",
 }
 _python_unsupported = {"low_memory", "float_precision"}
@@ -959,12 +960,11 @@ def _clean_options(self, options, engine):
         # pyarrow engine not supported yet
         if engine == "pyarrow":
             for option in _pyarrow_unsupported:
-                if option not in ["chunksize", "skipfooter", "nrows"]:
+                if option not in ["chunksize", "nrows"]:
                     if options[option] is not None:
                         fallback_reason = (
                             f"the pyarrow engine does not support the {option} argument"
                         )
-                        engine = "python"
             else:
                 if self.chunksize is not None:
                     fallback_reason = (
                         "the pyarrow engine does not support using chunksize"
                     )
                 if self.nrows is not None:
                     fallback_reason = (
                         "the pyarrow engine does not support using nrows"
                     )
-        # C and pyarrow engine not supported yet
-        if engine == "c" or "pyarrow":
+        # C engine not supported yet
+        if engine == "c":
             if options["skipfooter"] > 0:
-                fallback_reason = f"the {engine} engine does not support skipfooter"
+                fallback_reason = f"the 'c' engine does not support skipfooter"
                 engine = "python"
         encoding = sys.getfilesystemencoding() or "utf-8"
@@ -1157,7 +1157,7 @@ def _make_engine(self, engine="c"):
         else:
             raise ValueError(
                 f"Unknown engine: {engine} (valid options "
-                'are "c", "python", "arrow", or "python-fwf")'
+                'are "c", "python", "pyarrow", or "python-fwf")'
             )
         self._engine = klass(self.f, **self.options)
@@ -2266,13 +2266,24 @@ def read(self):
         pyarrow = import_optional_dependency(
             "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine"
         )
-        table = pyarrow.read_csv(
-            self.src,
-            read_options=pyarrow.ReadOptions(
+        try:
+            read_options = pyarrow.ReadOptions(
                 skip_rows=self.kwds.get("skiprows"),
                 autogenerate_column_names=False if self.header == 0 else True,
-            ),
+            )
+        except TypeError as e:
+            msg = "__init__() got an unexpected keyword argument"
+            if msg in str(e):
+                raise ImportError(
+                    "Pyarrow version >= 0.15.0 is needed in order "
+                    "to use skiprows kwarg with engine=pyarrow. "
+                    "Please upgrade Pyarrow or switch engines."
+                )
+            else:
+                raise e
+        table = pyarrow.read_csv(
+            self.src,
+            read_options=read_options,
             parse_options=pyarrow.ParseOptions(
                 delimiter=self.kwds.get("delimiter"),
                 quote_char=self.kwds.get("quotechar"),
@@ -2298,17 +2298,13 @@ def read(self):
         if self.names is None:
             if self.prefix:
                 self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
-            elif self.header is not None and self.header != 0:
-                self.names = frame.iloc[self.header]
-                frame = frame.drop(self.header, axis=0)
+            elif self.header is not None and self.header != 0:
+                self.names = frame.iloc[self.header]
+                frame = frame.drop(self.header, axis=0)
             elif self.header is None:
                 self.names = range(num_cols)
-        frame.columns = self.names
-        index_col = self.index_col  # need to flatten since returns list
+        frame.columns = self.names
+        index_col = self.index_col  # need to flatten since returns list
         if index_col is not None:
             frame.set_index(frame.columns[index_col[0]], drop=True, inplace=True)
         return frame

diff --git a/pandas/tests/io/parser/test_common.py index 5bf9587a6ca22..f27178cdc429f 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -63,6 +63,7 @@ def _set_noconvert_columns(self):
         "parse_dates": parse_dates,
         "delimiter": ",",
     }
+    parser.engine = "c"
     parser._engine = MyCParserWrapper(StringIO(data), **parser.options)
     result = parser.read()

From d46ceed07a5197cc24748e09a92c3b8199ce7fa3 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Thu, 16 Apr 2020 20:20:22 -0700
Subject: [PATCH 19/35] Fix some bugs and cleanups

---
 pandas/io/parsers.py | 113 ++++++++++++++++++++++++++++++-----------
 1 file changed, 85 insertions(+), 28 deletions(-)

diff --git a/pandas/io/parsers.py index 444582cbe723c..39ee43f905950 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -532,6 +532,24 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
     "comment",
     "nrows",
     "thousands",
+    "memory_map",
+    "dialect",
+    "warn_bad_lines",
+    "error_bad_lines",
+    "delim_whitespace",
+    "quoting",
+    "lineterminator",
+    "converters",
+    "decimal",
+    "iterator",
+    "cache_dates",
+    "dayfirst",
+    "keep_date_col",
+    "infer_datetime_format",
+    "verbose",
+    "skipinitialspace",
+    "date_parser",
 }
 _python_unsupported = {"low_memory", "float_precision"}
@@ -902,6 +920,16 @@ def _get_options_with_defaults(self, engine):
         for argname, default in _parser_defaults.items():
             value = kwds.get(argname, default)
+            if argname in _pyarrow_unsupported:
+                if engine == "pyarrow" and value != default:
+                    raise ValueError(
+                        f"The {repr(argname)} option is not supported with the "
+                        f"{repr(engine)} engine"
+                    )
+            if argname == "iterator" and engine == "pyarrow":
+                raise ValueError(
+                    "The iterator option is not supported with the pyarrow engine"
+                )
             # see gh-12935
             if argname == "mangle_dupe_cols" and not value:
                 raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
@@ -957,27 +985,10 @@ def _clean_options(self, options, engine):
         sep = options["delimiter"]
         delim_whitespace = options["delim_whitespace"]

-        # pyarrow engine not supported yet
-        if engine == "pyarrow":
-            for option in _pyarrow_unsupported:
-                if option not in ["chunksize", "nrows"]:
-                    if options[option] is not None:
-                        fallback_reason = (
-                            f"the pyarrow engine does not support the {option} argument"
-                        )
-            else:
-                if self.chunksize is not None:
-                    fallback_reason = (
-                        "the pyarrow engine does not support using chunksize"
-                    )
-                if self.nrows is not None:
-                    fallback_reason = (
-                        "the pyarrow engine does not support using nrows"
-                    )
         # C engine not supported yet
         if engine == "c":
             if options["skipfooter"] > 0:
                 fallback_reason = "the 'c' engine does not support skipfooter"
                 engine = "python"
@@ -2251,13 +2262,16 @@ class ArrowParserWrapper(ParserBase):
     def __init__(self, src, **kwds):
         self.kwds = kwds
         self.src = src
-        kwds = kwds.copy()
+        # kwds = kwds.copy()

         ParserBase.__init__(self, kwds)

         encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8"

         self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
+        self.na_values = _clean_na_values(
+            kwds["na_values"], keep_default_na=kwds["keep_default_na"]
+        )

         if isinstance(self.src, TextIOBase):
             self.src = BytesIOWrapper(self.src, encoding=encoding)
@@ -2268,8 +2282,7 @@ def read(self):
         )
         try:
             read_options = pyarrow.ReadOptions(
-                skip_rows=self.kwds.get("skiprows"), autogenerate_column_names=False if self.header == 0 else True,
+                skip_rows=self.kwds.get("skiprows"), autogenerate_column_names=True,
             )
         except TypeError as e:
             msg = "__init__() got an unexpected keyword argument"
             if msg in str(e):
                 raise ImportError(
                     "Pyarrow version >= 0.15.0 is needed in order "
                     "to use skiprows kwarg with engine=pyarrow. "
                     "Please upgrade Pyarrow or switch engines."
                 )
             else:
                 raise e
         table = pyarrow.read_csv(
             self.src,
             read_options=read_options,
             parse_options=pyarrow.ParseOptions(
                 delimiter=self.kwds.get("delimiter"),
                 quote_char=self.kwds.get("quotechar"),
+                escape_char=self.kwds.get("escapechar"),
                 ignore_empty_lines=self.kwds.get("skip_blank_lines"),
             ),
             convert_options=pyarrow.ConvertOptions(
-                include_columns=self.usecols,
+                include_columns=self.usecols,
+                null_values=self.kwds.get("na_values"),
+                true_values=self.kwds.get("true_values"),
+                false_values=self.kwds.get("false_values"),
             ),
         )
         frame = table.to_pandas()
         num_cols = len(frame.columns)
         if self.names is None:
             if self.prefix:
                 self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
             elif self.header is not None:
                 self.names = frame.iloc[self.header].tolist()
                 frame.drop(range(self.header + 1), axis=0, inplace=True)
                 frame.reset_index(drop=True, inplace=True)
             elif self.header is None:
                 self.names = range(num_cols)
         frame.columns = self.names
         if self.index_col is not None:
             index_col = [frame.columns[i] for i in self.index_col]
             frame.set_index(index_col, drop=True, inplace=True)
         if self.kwds.get("dtype") is not None:
             frame = frame.astype(self.kwds.get("dtype"))
         else:
             frame = frame.infer_objects()
         return frame

+    def _clean_na_values(na_values, keep_default_na=True):
+        if na_values is None:
+            if keep_default_na:
+                na_values = STR_NA_VALUES
+            else:
+                na_values = set()
+            na_fvalues = set()
+        elif isinstance(na_values, dict):
+            old_na_values = na_values.copy()
+            na_values = {}  # Prevent aliasing.
+
+            # Convert the values in the na_values dictionary
+            # into array-likes for further use. This is also
+            # where we append the default NaN values, provided
+            # that `keep_default_na=True`.
+            for k, v in old_na_values.items():
+                if not is_list_like(v):
+                    v = [v]
+
+                if keep_default_na:
+                    v = set(v) | STR_NA_VALUES
+
+                na_values[k] = v
+            na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
+        else:
+            if not is_list_like(na_values):
+                na_values = [na_values]
+            na_values = _stringify_na_values(na_values)
+            if keep_default_na:
+                na_values = na_values | STR_NA_VALUES
+
+            na_fvalues = _floatify_na_values(na_values)
+
+        return na_values, na_fvalues

From 637845922e829e9a6bc97c577b064935591f99ac Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Tue, 19 May 2020 20:40:57 -0700
Subject: [PATCH 20/35] Perform version checks for submodule imports too

---
 pandas/compat/_optional.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pandas/compat/_optional.py index 7e253a52a9c00..139641f300980 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -1,5 +1,6 @@
 import distutils.version
 import importlib
+import sys
 import types
 import warnings
@@ -92,10 +93,16 @@ def import_optional_dependency(
             raise ImportError(msg) from None
         else:
             return None
-
+    # Grab parent module if submodule being imported
+    parent = name.split(".")[0]
+    if parent != name:
+        name = parent
+        module_to_get = sys.modules[name]
+    else:
+        module_to_get = module
     minimum_version = VERSIONS.get(name)
     if minimum_version:
-        version = _get_version(module)
+        version = _get_version(module_to_get)
         if distutils.version.LooseVersion(version) < minimum_version:
             assert on_version in {"warn", "raise", "ignore"}
             msg = (

From 9d648821b047419b9541381ad50c419f9f571847 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Tue, 19 May 2020 20:44:52 -0700
Subject: [PATCH 21/35] Refresh with newer pyarrow

---
 asv_bench/benchmarks/io/csv.py |  19 ++++--
 pandas/io/parsers.py           | 116 +++++++++++++--------------------
 2 files changed, 59 insertions(+), 76 deletions(-)

diff --git a/asv_bench/benchmarks/io/csv.py
@@ class ReadCSVEngine(StringIORewind):
     def setup(self):
-        data = ["A,B,C"] + (["1,2,3"] * 1000000)
+        data = ["A,B,C"] + (["1,2,3"] * 100000)
         self.StringIO_input = StringIO("\n".join(data))
+        # simulate reading from file
+
self.BytesIO_input = self.StringIO_input.read().encode("utf-8") - def time_read_csv_c(self): + def time_read_stringcsv_c(self): read_csv(self.data(self.StringIO_input)) - def time_read_csv_arrow(self): + def time_read_stringcsv_arrow(self): read_csv(self.data(self.StringIO_input), engine="pyarrow") - def time_read_csv_python_engine(self): + def time_read_stringcsv_python_engine(self): read_csv( self.data(self.StringIO_input), engine="python", ) + def time_read_bytescsv_c(self): + read_csv(self.BytesIO_input) + + def time_read_bytescsv_arrow(self): + read_csv(self.BytesIO_input, engine="pyarrow") + + def time_read_bytescsv_python_engine(self): + read_csv(self.BytesIO_input, engine="python") + class ReadCSVCategorical(BaseIO): fname = "__test__.csv" diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 39ee43f905950..40dbfc4c4956d 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -170,7 +170,7 @@ of dtype conversion. engine : {{'c', 'python', 'pyarrow'}}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 + is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.15 as a dependency however. .. versionchanged:: 1.1 @@ -919,7 +919,6 @@ def _get_options_with_defaults(self, engine): for argname, default in _parser_defaults.items(): value = kwds.get(argname, default) - if argname in _pyarrow_unsupported: if engine == "pyarrow" and value != default: raise ValueError( @@ -928,7 +927,7 @@ def _get_options_with_defaults(self, engine): ) if argname == "iterator" and engine == "pyarrow": raise ValueError( - "The iterator option is not supported with the" "pyarrow engine" + "The iterator option is not supported with the pyarrow engine" ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: @@ -2262,17 +2261,22 @@ class ArrowParserWrapper(ParserBase): def __init__(self, src, **kwds): self.kwds = kwds self.src = src - # kwds = kwds.copy() ParserBase.__init__(self, kwds) encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - self.na_values = _clean_na_values( - kwds["na_values"], keep_default_na=kwds["keep_default_na"] + na_values = kwds["na_values"] + if isinstance(na_values, dict): + raise ValueError( + "The pyarrow engine doesn't support passing a dict for na_values" + ) + self.na_values = list( + _clean_na_values( + kwds["na_values"], keep_default_na=kwds["keep_default_na"] + )[0] ) - if isinstance(self.src, TextIOBase): self.src = BytesIOWrapper(self.src, encoding=encoding) @@ -2280,48 +2284,51 @@ def read(self): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) + kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} + # these are kwargs passed to pyarrow + parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} + convertoptions = { + "include_columns", + "null_values", + "true_values", + "false_values", + } + parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions} + convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} + read_options = pyarrow.ReadOptions(autogenerate_column_names=True) + headerexists = True if self.header is not None and self.header >= 0 else False try: - read_options = pyarrow.ReadOptions( - skip_rows=self.kwds.get("skiprows"), autogenerate_column_names=True, - ) + 
skiprows = self.kwds.get("skiprows") + if skiprows is not None: + read_options = pyarrow.ReadOptions(skip_rows=skiprows) + elif self.header >= 0: + read_options = pyarrow.ReadOptions(skip_rows=self.header) except TypeError as e: msg = "__init__() got an unexpected keyword argument" if msg in str(e): raise ImportError( - "Pyarrow version >= 0.15.0 is needed in order " - "to use skiprows kwarg with engine=pyarrow. " - "Please upgrade Pyarrow or switch engines." + "pyarrow version >= 0.15.0 is required to use " + "read_csv with engine='pyarrow'" ) - else: - raise e table = pyarrow.read_csv( self.src, read_options=read_options, - parse_options=pyarrow.ParseOptions( - delimiter=self.kwds.get("delimiter"), - quote_char=self.kwds.get("quotechar"), - escape_char=self.kwds.get("escapechar"), - ignore_empty_lines=self.kwds.get("skip_blank_lines"), - ), - convert_options=pyarrow.ConvertOptions( - include_columns=self.usecols, - null_values=self.kwds.get("na_values"), - true_values=self.kwds.get("true_values"), - false_values=self.kwds.get("false_values"), - ), + parse_options=pyarrow.ParseOptions(**parse_options), + convert_options=pyarrow.ConvertOptions(**convert_options), ) frame = table.to_pandas() num_cols = len(frame.columns) - if self.names is None: - if self.prefix: - self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - elif self.header is not None: - self.names = frame.iloc[self.header].tolist() - frame.drop(range(self.header + 1), axis=0, inplace=True) - frame.reset_index(drop=True, inplace=True) - elif self.header is None: - self.names = range(num_cols) - frame.columns = self.names + if not headerexists: + if self.names is None: + if self.prefix is not None: + self.names = [f"{self.prefix}{i}" for i in range(num_cols)] + # elif self.header is not None: + # self.names = frame.iloc[self.header].tolist() + # frame.drop(range(self.header + 1), axis=0, inplace=True) + # frame.reset_index(drop=True, inplace=True) + elif self.header is None: + self.names = range(num_cols) + frame.columns = self.names if self.index_col is not None: index_col = [frame.columns[i] for i in self.index_col] frame.set_index(index_col, drop=True, inplace=True) @@ -2331,41 +2338,6 @@ def read(self): frame = frame.infer_objects() return frame - def _clean_na_values(na_values, keep_default_na=True): - if na_values is None: - if keep_default_na: - na_values = STR_NA_VALUES - else: - na_values = set() - na_fvalues = set() - elif isinstance(na_values, dict): - old_na_values = na_values.copy() - na_values = {} # Prevent aliasing. - - # Convert the values in the na_values dictionary - # into array-likes for further use. This is also - # where we append the default NaN values, provided - # that `keep_default_na=True`. 
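The ``try``/``except TypeError`` above is a feature-detection probe: pyarrow older than 0.15.0 does not know ``skip_rows``, and the resulting ``TypeError`` is translated into an actionable ``ImportError``. A minimal sketch of the same pattern, with an illustrative factory and message rather than the pandas code:

    def build_options(factory, **kwargs):
        try:
            return factory(**kwargs)
        except TypeError as err:
            # an unknown keyword means the installed library predates the feature
            if "unexpected keyword argument" in str(err):
                raise ImportError(
                    "a newer version of the dependency is required for these options"
                ) from err
            raise

    print(build_options(dict, skip_rows=1))  # dict takes any kwargs: {'skip_rows': 1}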
- for k, v in old_na_values.items(): - if not is_list_like(v): - v = [v] - - if keep_default_na: - v = set(v) | STR_NA_VALUES - - na_values[k] = v - na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} - else: - if not is_list_like(na_values): - na_values = [na_values] - na_values = _stringify_na_values(na_values) - if keep_default_na: - na_values = na_values | STR_NA_VALUES - - na_fvalues = _floatify_na_values(na_values) - - return na_values, na_fvalues - def TextParser(*args, **kwds): """ From 93382b421cf62c2ad2f1ede65bd702e2912e8db6 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 21 May 2020 11:55:20 -0700 Subject: [PATCH 22/35] Start xfailing tests --- asv_bench/benchmarks/io/csv.py | 4 +-- pandas/io/parsers.py | 4 --- pandas/tests/io/parser/conftest.py | 19 ++++++++-- pandas/tests/io/parser/test_common.py | 42 +++++++++++----------- pandas/tests/io/parser/test_compression.py | 15 +++++--- 5 files changed, 50 insertions(+), 34 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 52d88d20b6d52..6e166ec315df6 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -1,4 +1,4 @@ -from io import StringIO +from io import BytesIO, StringIO import random import string @@ -265,7 +265,7 @@ def setup(self): data = ["A,B,C"] + (["1,2,3"] * 100000) self.StringIO_input = StringIO("\n".join(data)) # simulate reading from file - self.BytesIO_input = self.StringIO_input.read().encode("utf-8") + self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) def time_read_stringcsv_c(self): read_csv(self.data(self.StringIO_input)) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5bcd9253abb72..e64ca0651e7c7 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2322,10 +2322,6 @@ def read(self): if self.names is None: if self.prefix is not None: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - # elif self.header is not None: - # self.names = frame.iloc[self.header].tolist() - # frame.drop(range(self.header + 1), axis=0, inplace=True) - # frame.reset_index(drop=True, inplace=True) elif self.header is None: self.names = range(num_cols) frame.columns = self.names diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 87a34d728bc60..8f473bded9225 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,7 +1,8 @@ +import distutils.version import os -import pkgutil from typing import List, Optional +import pkg_resources import pytest from pandas import read_csv, read_table @@ -79,7 +80,10 @@ def csv1(csv_dir_path): _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -if pkgutil.find_loader("pyarrow"): +pyarrow_version = pkg_resources.get_distribution("pyarrow").version +if ( + distutils.version.LooseVersion(pyarrow_version) > "0.15.0" +): # TODO remove this if block once required pyarrow>0.15.0 _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] else: @@ -135,3 +139,14 @@ def encoding_fmt(request): Fixture for all possible string formats of a UTF encoding. """ return request.param + + +@pytest.fixture +def pyarrow_xfail(request): + """ + Fixture that xfails a test if the engine is pyarrow. 
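The collection gate above reads the installed version with ``pkg_resources`` and compares it via ``distutils.version.LooseVersion``. Note that the strict ``>`` also excludes 0.15.0 itself; a later commit below switches to ``pytest.importorskip``. A self-contained sketch of the comparison (version strings illustrative):

    import distutils.version

    def new_enough(installed: str, minimum: str = "0.15.0") -> bool:
        # LooseVersion compares dotted version strings component-wise
        return distutils.version.LooseVersion(installed) >= minimum

    print(new_enough("0.15.1"))  # True
    print(new_enough("0.14.0"))  # False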
+ """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + if parser.engine == "pyarrow": + pytest.xfail("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b6987dae5ed2b..e0b6d70b607d6 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -70,7 +70,7 @@ def _set_noconvert_columns(self): tm.assert_frame_equal(result, expected) -def test_empty_decimal_marker(all_parsers): +def test_empty_decimal_marker(all_parsers, pyarrow_xfail): data = """A|B|C 1|2,334|5 10|13|10. @@ -83,7 +83,7 @@ def test_empty_decimal_marker(all_parsers): parser.read_csv(StringIO(data), decimal="") -def test_bad_stream_exception(all_parsers, csv_dir_path): +def test_bad_stream_exception(all_parsers, csv_dir_path, pyarrow_xfail): # see gh-13652 # # This test validates that both the Python engine and C engine will @@ -169,7 +169,7 @@ def test_squeeze(all_parsers): assert not result._is_view -def test_malformed(all_parsers): +def test_malformed(all_parsers, pyarrow_xfail): # see gh-6607 parser = all_parsers data = """ignore @@ -184,7 +184,7 @@ def test_malformed(all_parsers): @pytest.mark.parametrize("nrows", [5, 3, None]) -def test_malformed_chunks(all_parsers, nrows): +def test_malformed_chunks(all_parsers, nrows, pyarrow_xfail): data = """ignore A,B,C skip @@ -203,7 +203,7 @@ def test_malformed_chunks(all_parsers, nrows): reader.read(nrows) -def test_unnamed_columns(all_parsers): +def test_unnamed_columns(all_parsers, pyarrow_xfail): data = """A,B,C,, 1,2,3,4,5 6,7,8,9,10 @@ -306,7 +306,7 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) -def test_read_csv_wrong_num_columns(all_parsers): +def test_read_csv_wrong_num_columns(all_parsers, pyarrow_xfail): # Too few columns. 
data = """A,B,C,D,E,F 1,2,3,4,5,6 @@ -422,7 +422,7 @@ def test_int_conversion(all_parsers): @pytest.mark.parametrize("nrows", [3, 3.0]) -def test_read_nrows(all_parsers, nrows): +def test_read_nrows(all_parsers, nrows, pyarrow_xfail): # see gh-10476 data = """index,A,B,C,D foo,2,3,4,5 @@ -443,7 +443,7 @@ def test_read_nrows(all_parsers, nrows): @pytest.mark.parametrize("nrows", [1.2, "foo", -1]) -def test_read_nrows_bad(all_parsers, nrows): +def test_read_nrows_bad(all_parsers, nrows, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -460,7 +460,7 @@ def test_read_nrows_bad(all_parsers, nrows): @pytest.mark.parametrize("index_col", [0, "index"]) -def test_read_chunksize_with_index(all_parsers, index_col): +def test_read_chunksize_with_index(all_parsers, index_col, pyarrow_xfail): parser = all_parsers data = """index,A,B,C,D foo,2,3,4,5 @@ -492,7 +492,7 @@ def test_read_chunksize_with_index(all_parsers, index_col): @pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) -def test_read_chunksize_bad(all_parsers, chunksize): +def test_read_chunksize_bad(all_parsers, chunksize, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -509,7 +509,7 @@ def test_read_chunksize_bad(all_parsers, chunksize): @pytest.mark.parametrize("chunksize", [2, 8]) -def test_read_chunksize_and_nrows(all_parsers, chunksize): +def test_read_chunksize_and_nrows(all_parsers, chunksize, pyarrow_xfail): # see gh-15755 data = """index,A,B,C,D foo,2,3,4,5 @@ -527,7 +527,7 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize): tm.assert_frame_equal(concat(reader), expected) -def test_read_chunksize_and_nrows_changing_size(all_parsers): +def test_read_chunksize_and_nrows_changing_size(all_parsers, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -549,7 +549,7 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers): reader.get_chunk(size=3) -def test_get_chunk_passed_chunksize(all_parsers): +def test_get_chunk_passed_chunksize(all_parsers, pyarrow_xfail): parser = all_parsers data = """A,B,C 1,2,3 @@ -565,7 +565,7 @@ def test_get_chunk_passed_chunksize(all_parsers): @pytest.mark.parametrize("kwargs", [dict(), dict(index_col=0)]) -def test_read_chunksize_compat(all_parsers, kwargs): +def test_read_chunksize_compat(all_parsers, kwargs, pyarrow_xfail): # see gh-12185 data = """index,A,B,C,D foo,2,3,4,5 @@ -582,7 +582,7 @@ def test_read_chunksize_compat(all_parsers, kwargs): tm.assert_frame_equal(concat(reader), result) -def test_read_chunksize_jagged_names(all_parsers): +def test_read_chunksize_jagged_names(all_parsers, pyarrow_xfail): # see gh-23509 parser = all_parsers data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) @@ -594,7 +594,7 @@ def test_read_chunksize_jagged_names(all_parsers): tm.assert_frame_equal(result, expected) -def test_read_data_list(all_parsers): +def test_read_data_list(all_parsers, pyarrow_xfail): parser = all_parsers kwargs = dict(index_col=0) data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" @@ -608,7 +608,7 @@ def test_read_data_list(all_parsers): tm.assert_frame_equal(result, expected) -def test_iterator(all_parsers): +def test_iterator(all_parsers, pyarrow_xfail): # see gh-6607 data = """index,A,B,C,D foo,2,3,4,5 @@ -631,7 +631,7 @@ def test_iterator(all_parsers): tm.assert_frame_equal(last_chunk, expected[3:]) -def test_iterator2(all_parsers): +def test_iterator2(all_parsers, pyarrow_xfail): parser = all_parsers data = """A,B,C foo,1,2,3 @@ -694,7 +694,7 @@ def test_reader_list_skiprows(all_parsers): tm.assert_frame_equal(chunks[0], 
expected[1:3]) -def test_iterator_stop_on_chunksize(all_parsers): +def test_iterator_stop_on_chunksize(all_parsers, pyarrow_xfail): # gh-3967: stopping iteration when chunksize is specified parser = all_parsers data = """A,B,C @@ -718,7 +718,7 @@ def test_iterator_stop_on_chunksize(all_parsers): @pytest.mark.parametrize( "kwargs", [dict(iterator=True, chunksize=1), dict(iterator=True), dict(chunksize=1)] ) -def test_iterator_skipfooter_errors(all_parsers, kwargs): +def test_iterator_skipfooter_errors(all_parsers, kwargs, pyarrow_xfail): msg = "'skipfooter' not supported for 'iteration'" parser = all_parsers data = "a\n1\n2" @@ -727,7 +727,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs): parser.read_csv(StringIO(data), skipfooter=1, **kwargs) -def test_nrows_skipfooter_errors(all_parsers): +def test_nrows_skipfooter_errors(all_parsers, pyarrow_xfail): msg = "'skipfooter' not supported with 'nrows'" data = "a\n1\n2\n3\n4\n5\n6" parser = all_parsers diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index b773664adda72..22bba9bd3f98a 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -29,7 +29,7 @@ def parser_and_data(all_parsers, csv1): @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) -def test_zip(parser_and_data, compression): +def test_zip(parser_and_data, compression, pyarrow_xfail): parser, data, expected = parser_and_data with tm.ensure_clean("test_file.zip") as path: @@ -46,7 +46,7 @@ def test_zip(parser_and_data, compression): @pytest.mark.parametrize("compression", ["zip", "infer"]) -def test_zip_error_multiple_files(parser_and_data, compression): +def test_zip_error_multiple_files(parser_and_data, compression, pyarrow_xfail): parser, data, expected = parser_and_data with tm.ensure_clean("combined_zip.zip") as path: @@ -60,7 +60,7 @@ def test_zip_error_multiple_files(parser_and_data, compression): parser.read_csv(path, compression=compression) -def test_zip_error_no_files(parser_and_data): +def test_zip_error_no_files(parser_and_data, pyarrow_xfail): parser, _, _ = parser_and_data with tm.ensure_clean() as path: @@ -71,7 +71,7 @@ def test_zip_error_no_files(parser_and_data): parser.read_csv(path, compression="zip") -def test_zip_error_invalid_zip(parser_and_data): +def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): parser, _, _ = parser_and_data with tm.ensure_clean() as path: @@ -86,6 +86,11 @@ def test_compression(parser_and_data, compression_only, buffer, filename): compress_type = compression_only ext = "gz" if compress_type == "gzip" else compress_type + pyarrow_unsupported_exts = {"bz2", "zip", "xz"} + if ext in pyarrow_unsupported_exts and parser.engine == "pyarrow": + # need to skip since this test will hang forever and not fail + pytest.skip(f"The pyarrow package doesn't come with {ext} support") + filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: @@ -141,7 +146,7 @@ def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) -def test_invalid_compression(all_parsers, invalid_compression): +def test_invalid_compression(all_parsers, invalid_compression, pyarrow_xfail): parser = all_parsers compress_kwargs = dict(compression=invalid_compression) From f1bb4e25c77f4b672ddd5dfc7afc2af51abc9e32 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 27 May 2020 10:57:57 -0700 Subject: [PATCH 
23/35] Get all tests to run & some fixes --- pandas/io/parsers.py | 37 ++++++++++++---------- pandas/tests/io/parser/conftest.py | 7 ++-- pandas/tests/io/parser/test_common.py | 4 +-- pandas/tests/io/parser/test_compression.py | 11 +++---- pandas/tests/io/parser/test_unsupported.py | 19 +++++++++++ 5 files changed, 50 insertions(+), 28 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e64ca0651e7c7..2f9e4ec11187e 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,7 +21,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer -from pandas.compat._optional import import_optional_dependency +from pandas.compat._optional import import_optional_dependency, VERSIONS from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -444,7 +444,14 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Extract some of the arguments (pass chunksize on). iterator = kwds.get("iterator", False) - chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) + chunksize = kwds.get("chunksize", None) + if kwds.get("engine") == "pyarrow": # chunksize not supported for pyarrow + if iterator: + raise ValueError("The 'iterator' option is not supported with the 'pyarrow' engine") + if chunksize is not None: + raise ValueError("The 'chunksize' option is not supported with the 'pyarrow' engine") + else: + chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) nrows = kwds.get("nrows", None) # Check for duplicates in names. @@ -830,6 +837,9 @@ def __init__(self, f, engine=None, **kwds): self._engine_specified = kwds.get("engine_specified", engine_specified) if kwds.get("dialect") is not None: + if engine == "pyarrow": + raise ValueError("The 'dialect' option is not supported with the 'pyarrow' engine") + dialect = kwds["dialect"] if dialect in csv.list_dialects(): dialect = csv.get_dialect(dialect) @@ -923,11 +933,11 @@ def _get_options_with_defaults(self, engine): if engine == "pyarrow" and value != default: raise ValueError( f"The {repr(argname)} option is not supported with the " - f"{repr(engine)} engine" + f"'pyarrow' engine" ) if argname == "iterator" and engine == "pyarrow": raise ValueError( - "The iterator option is not supported with the pyarrow engine" + "The iterator option is not supported with the 'pyarrow' engine" ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: @@ -2281,6 +2291,7 @@ def __init__(self, src, **kwds): self.src = BytesIOWrapper(self.src, encoding=encoding) def read(self): + VERSIONS["pyarrow"] = "0.15.0" pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) @@ -2297,19 +2308,11 @@ def read(self): convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} read_options = pyarrow.ReadOptions(autogenerate_column_names=True) headerexists = True if self.header is not None and self.header >= 0 else False - try: - skiprows = self.kwds.get("skiprows") - if skiprows is not None: - read_options = pyarrow.ReadOptions(skip_rows=skiprows) - elif self.header >= 0: - read_options = pyarrow.ReadOptions(skip_rows=self.header) - except TypeError as e: - msg = "__init__() got an unexpected keyword argument" - if msg in str(e): - raise ImportError( - "pyarrow version >= 0.15.0 is required to use " - "read_csv with engine='pyarrow'" - ) + skiprows = self.kwds.get("skiprows") + if skiprows is not None: + read_options = pyarrow.ReadOptions(skip_rows=skiprows) + 
elif headerexists: + read_options = pyarrow.ReadOptions(skip_rows=self.header) table = pyarrow.read_csv( self.src, read_options=read_options, diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 8f473bded9225..09379ac1b6922 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -80,10 +80,13 @@ def csv1(csv_dir_path): _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -pyarrow_version = pkg_resources.get_distribution("pyarrow").version +try: + pyarrow_version = pkg_resources.get_distribution("pyarrow").version +except pkg_resources.DistributionNotFound: + pyarrow_version = None if ( distutils.version.LooseVersion(pyarrow_version) > "0.15.0" -): # TODO remove this if block once required pyarrow>0.15.0 +): _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] else: diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e0b6d70b607d6..f35da606110fe 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1517,7 +1517,7 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): ), ], ) -def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): +def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xfail): # see gh-12493 parser = all_parsers @@ -2082,7 +2082,7 @@ def test_read_table_equivalency_to_read_csv(all_parsers): tm.assert_frame_equal(result, expected) -def test_first_row_bom(all_parsers): +def test_first_row_bom(all_parsers, pyarrow_xfail): # see gh-26545 parser = all_parsers data = '''\ufeff"Head1" "Head2" "Head3"''' diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 22bba9bd3f98a..2c5f1b61370a5 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -81,16 +81,11 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): @pytest.mark.parametrize("filename", [None, "test.{ext}"]) -def test_compression(parser_and_data, compression_only, buffer, filename): +def test_compression(parser_and_data, compression_only, buffer, filename, pyarrow_xfail): parser, data, expected = parser_and_data compress_type = compression_only ext = "gz" if compress_type == "gzip" else compress_type - pyarrow_unsupported_exts = {"bz2", "zip", "xz"} - if ext in pyarrow_unsupported_exts and parser.engine == "pyarrow": - # need to skip since this test will hang forever and not fail - pytest.skip(f"The pyarrow package doesn't come with {ext} support") - filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: @@ -118,6 +113,8 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): expected = parser.read_csv(csv1, **kwargs) kwargs["compression"] = "infer" + if ext == "bz2": + pytest.xfail("pyarrow wheels don't have bz2 codec support") if buffer: with open(csv1) as f: result = parser.read_csv(f, **kwargs) @@ -128,7 +125,7 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): +def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail): # see gh-18071, gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) diff --git a/pandas/tests/io/parser/test_unsupported.py 
b/pandas/tests/io/parser/test_unsupported.py index 267fae760398a..44865d61d1b05 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -121,3 +121,22 @@ def read(self): with pytest.raises(ValueError, match=msg): read_csv(NoNextBuffer(data), engine=python_engine) + + def test_pyarrow_engine(self): + from pandas.io.parsers import _pyarrow_unsupported as pa_unsupported + + data = """1,2,3,, + 1,2,3,4, + 1,2,3,4,5 + 1,2,,, + 1,2,3,4,""" + + for default in pa_unsupported: + msg = ( + f"The {repr(default)} option is not " + f"supported with the 'pyarrow' engine" + ) + print(default) + kwargs = {default: object()} + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="pyarrow", **kwargs) From 7876b4ef795150510837f74538fdc10b1c38333e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 29 May 2020 15:57:58 -0700 Subject: [PATCH 24/35] Lint and CI --- pandas/io/parsers.py | 15 +++++++++++---- pandas/tests/io/parser/conftest.py | 6 ++---- pandas/tests/io/parser/test_common.py | 2 +- pandas/tests/io/parser/test_compression.py | 8 ++++++-- pandas/tests/io/parser/test_dtypes.py | 2 +- pandas/tests/io/parser/test_unsupported.py | 1 - 6 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2f9e4ec11187e..f1a89da794849 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,7 +21,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer -from pandas.compat._optional import import_optional_dependency, VERSIONS +from pandas.compat._optional import VERSIONS, import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -447,9 +447,13 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): chunksize = kwds.get("chunksize", None) if kwds.get("engine") == "pyarrow": # chunksize not supported for pyarrow if iterator: - raise ValueError("The 'iterator' option is not supported with the 'pyarrow' engine") + raise ValueError( + "The 'iterator' option is not supported with the 'pyarrow' engine" + ) if chunksize is not None: - raise ValueError("The 'chunksize' option is not supported with the 'pyarrow' engine") + raise ValueError( + "The 'chunksize' option is not supported with the 'pyarrow' engine" + ) else: chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) nrows = kwds.get("nrows", None) @@ -557,6 +561,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skipinitialspace", "date_parser", "cache_dates", + "parse_dates", } _python_unsupported = {"low_memory", "float_precision"} @@ -838,7 +843,9 @@ def __init__(self, f, engine=None, **kwds): if kwds.get("dialect") is not None: if engine == "pyarrow": - raise ValueError("The 'dialect' option is not supported with the 'pyarrow' engine") + raise ValueError( + "The 'dialect' option is not supported with the 'pyarrow' engine" + ) dialect = kwds["dialect"] if dialect in csv.list_dialects(): diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 09379ac1b6922..9aa23bd739d24 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -83,10 +83,8 @@ def csv1(csv_dir_path): try: pyarrow_version = pkg_resources.get_distribution("pyarrow").version except pkg_resources.DistributionNotFound: - pyarrow_version = None -if ( - distutils.version.LooseVersion(pyarrow_version) > "0.15.0" -): + pyarrow_version = "0" # represents 
pyarrow not found +if distutils.version.LooseVersion(pyarrow_version) > "0.15.0": _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] else: diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index f35da606110fe..96410f626952b 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1561,7 +1561,7 @@ def test_trailing_spaces(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -def test_raise_on_sep_with_delim_whitespace(all_parsers): +def test_raise_on_sep_with_delim_whitespace(all_parsers, pyarrow_xfail): # see gh-6607 data = "a b c\n1 2 3" parser = all_parsers diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 2c5f1b61370a5..ecc35dd6644c8 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -81,7 +81,9 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): @pytest.mark.parametrize("filename", [None, "test.{ext}"]) -def test_compression(parser_and_data, compression_only, buffer, filename, pyarrow_xfail): +def test_compression( + parser_and_data, compression_only, buffer, filename, pyarrow_xfail +): parser, data, expected = parser_and_data compress_type = compression_only @@ -125,7 +127,9 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail): +def test_compression_utf_encoding( + all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail +): # see gh-18071, gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index d1ed85cc6f466..626d4febd7ddf 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -403,7 +403,7 @@ def test_empty_with_multi_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) -def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): +def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfail): parser = all_parsers data = "one,one" diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 44865d61d1b05..2e6165619f318 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -136,7 +136,6 @@ def test_pyarrow_engine(self): f"The {repr(default)} option is not " f"supported with the 'pyarrow' engine" ) - print(default) kwargs = {default: object()} with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="pyarrow", **kwargs) From 008acab51559e76c1646bd659146d6b79081b99d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 3 Jun 2020 14:20:56 -0700 Subject: [PATCH 25/35] parse_dates support and fixups of some tests --- asv_bench/benchmarks/io/csv.py | 2 +- pandas/io/parsers.py | 8 +++----- pandas/tests/io/parser/test_unsupported.py | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 6e166ec315df6..f2462184abb37 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -262,7 +262,7 @@ def time_read_csv_arrow(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): 
def setup(self): - data = ["A,B,C"] + (["1,2,3"] * 100000) + data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 1000000) self.StringIO_input = StringIO("\n".join(data)) # simulate reading from file self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f1a89da794849..24aff9ddba376 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -555,13 +555,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "iterator", "cache_dates", "dayfirst", - "keep_date_col", "infer_datetime_format", "verbose", "skipinitialspace", - "date_parser", "cache_dates", - "parse_dates", } _python_unsupported = {"low_memory", "float_precision"} @@ -2338,10 +2335,11 @@ def read(self): if self.index_col is not None: index_col = [frame.columns[i] for i in self.index_col] frame.set_index(index_col, drop=True, inplace=True) + + frame.columns, frame = self._do_date_conversions(frame.columns, frame) + if self.kwds.get("dtype") is not None: frame = frame.astype(self.kwds.get("dtype")) - else: - frame = frame.infer_objects() return frame diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 2e6165619f318..d2ae4c160d519 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -132,6 +132,7 @@ def test_pyarrow_engine(self): 1,2,3,4,""" for default in pa_unsupported: + print(default) msg = ( f"The {repr(default)} option is not " f"supported with the 'pyarrow' engine" From 2dddae747d4d612ab8e78761bd058ff76a13a5eb Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 12 Jun 2020 21:33:34 -0700 Subject: [PATCH 26/35] Date parsing fixes and address comments --- asv_bench/benchmarks/io/csv.py | 68 +++++++++---------- doc/source/user_guide/io.rst | 8 ++- doc/source/whatsnew/v1.1.0.rst | 6 +- pandas/io/parsers.py | 102 ++++++++++++++++++++++++----- pandas/tests/io/parser/conftest.py | 16 ++--- 5 files changed, 130 insertions(+), 70 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index f2462184abb37..3681cd4df481f 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): fname = "__test__.csv" - params = [None, 10000] - param_names = ["skiprows"] + params = ([None, 10000], ["c", "pyarrow"]) + param_names = ["skiprows", "engine"] - def setup(self, skiprows): + def setup(self, skiprows, engine): N = 20000 index = tm.makeStringIndex(N) df = DataFrame( @@ -164,8 +164,8 @@ def setup(self, skiprows): ) df.to_csv(self.fname) - def time_skipprows(self, skiprows): - read_csv(self.fname, skiprows=skiprows) + def time_skipprows(self, skiprows, engine): + read_csv(self.fname, skiprows=skiprows, engine=engine) class ReadUint64Integers(StringIORewind): @@ -261,31 +261,20 @@ def time_read_csv_arrow(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): - def setup(self): - data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 1000000) + params = ["c", "python", "pyarrow"] + param_names = ["engine"] + + def setup(self, engine): + data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000) self.StringIO_input = StringIO("\n".join(data)) # simulate reading from file self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) - def time_read_stringcsv_c(self): - read_csv(self.data(self.StringIO_input)) - - def time_read_stringcsv_arrow(self): - read_csv(self.data(self.StringIO_input), engine="pyarrow") - - 
def time_read_stringcsv_python_engine(self): - read_csv( - self.data(self.StringIO_input), engine="python", - ) - - def time_read_bytescsv_c(self): - read_csv(self.BytesIO_input) - - def time_read_bytescsv_arrow(self): - read_csv(self.BytesIO_input, engine="pyarrow") + def time_read_stringcsv(self, engine): + read_csv(self.data(self.StringIO_input), engine=engine) - def time_read_bytescsv_python_engine(self): - read_csv(self.BytesIO_input, engine="python") + def time_read_bytescsv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) class ReadCSVCategorical(BaseIO): @@ -305,7 +294,10 @@ def time_convert_direct(self): class ReadCSVParseDates(StringIORewind): - def setup(self): + params = ["c", "pyarrow", "python"] + param_names = ["engine"] + + def setup(self, engine): data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n @@ -316,18 +308,20 @@ def setup(self): data = data.format(*two_cols) self.StringIO_input = StringIO(data) - def time_multiple_date(self): + def time_multiple_date(self, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]], ) - def time_baseline(self): + def time_baseline(self, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, parse_dates=[1], @@ -336,17 +330,18 @@ def time_baseline(self): class ReadCSVCachedParseDates(StringIORewind): - params = ([True, False],) - param_names = ["do_cache"] + params = ([True, False], ["c", "pyarrow", "python"]) + param_names = ["do_cache", "engine"] - def setup(self, do_cache): + def setup(self, do_cache, engine): data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10 self.StringIO_input = StringIO(data) - def time_read_csv_cached(self, do_cache): + def time_read_csv_cached(self, do_cache, engine): try: read_csv( self.data(self.StringIO_input), + engine=engine, header=None, parse_dates=[0], cache_dates=do_cache, @@ -376,22 +371,23 @@ def mem_parser_chunks(self): class ReadCSVParseSpecialDate(StringIORewind): - params = (["mY", "mdY", "hm"],) - param_names = ["value"] + params = (["mY", "mdY", "hm"], ["c", "pyarrow", "python"]) + param_names = ["value", "engine"] objects = { "mY": "01-2019\n10-2019\n02/2000\n", "mdY": "12/02/2010\n", "hm": "21:34\n", } - def setup(self, value): + def setup(self, value, engine): count_elem = 10000 data = self.objects[value] * count_elem self.StringIO_input = StringIO(data) - def time_read_special_date(self, value): + def time_read_special_date(self, value, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, names=["Date"], diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index df6b44ac654ce..9ff714a8211bb 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -160,9 +160,11 @@ dtype : Type name or dict of column -> type, default ``None`` (unsupported with ``engine='python'``). Use `str` or `object` together with suitable ``na_values`` settings to preserve and not interpret dtype. -engine : {``'c'``, ``'python'``} - Parser engine to use. The C engine is faster while the Python engine is - currently more feature-complete. +engine : {``'c'``, ``'pyarrow'``,``'python'``} + Parser engine to use. 
In terms of performance, the pyarrow engine,
+    which requires pyarrow>=0.15.0, is faster than the C engine, which
+    is faster than the Python engine. However, the pyarrow and C engines
+    are currently less feature-complete than their Python counterpart.
 converters : dict, default ``None``
     Dict of functions for converting values in certain columns. Keys can
     either be integers or column labels.
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 44a56e0818ae8..dee66257f2d56 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -288,6 +288,9 @@ Other enhancements
 - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`).
 - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`)
 - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`).
+- :func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing
+  if pyarrow>=0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or
+  "python" counterparts. See the :doc:`I/O docs ` for more info. (:issue:`23697`)

 .. ---------------------------------------------------------------------------

@@ -901,9 +904,6 @@ I/O
 - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`)
 - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`)
 - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`)
-- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing
-  if pyarrow>0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or
-  "python" counterparts. (:issue:`23697`)

 Plotting
 ^^^^^^^^
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 24aff9ddba376..d8ef6488dc02a 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -170,9 +170,8 @@
     of dtype conversion.
 engine : {{'c', 'python', 'pyarrow'}}, optional
     Parser engine to use. The C and pyarrow engines are faster, while the python engine
-    is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.15
+    is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15
     as a dependency however.
-
     .. versionchanged:: 1.1

         The "pyarrow" engine was added.
 converters : dict, optional
@@ -445,7 +444,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):

     # Extract some of the arguments (pass chunksize on).
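From the user's side, the behaviour documented above looks roughly like this (a sketch assuming pyarrow >= 0.15 is installed; the data is illustrative):

    from io import StringIO

    import pandas as pd

    data = "a,b,c\n1,2,3\n4,5,6"
    df = pd.read_csv(StringIO(data), engine="pyarrow")

    # unsupported options raise instead of silently falling back to another engine
    try:
        pd.read_csv(StringIO(data), engine="pyarrow", chunksize=1)
    except ValueError as err:
        print(err)  # The 'chunksize' option is not supported with the 'pyarrow' engine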
iterator = kwds.get("iterator", False) chunksize = kwds.get("chunksize", None) - if kwds.get("engine") == "pyarrow": # chunksize not supported for pyarrow + # chunksize and iterator not supported for pyarrow + if kwds.get("engine") == "pyarrow": if iterator: raise ValueError( "The 'iterator' option is not supported with the 'pyarrow' engine" @@ -523,6 +523,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } + _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, @@ -553,12 +554,11 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "converters", "decimal", "iterator", - "cache_dates", "dayfirst", "infer_datetime_format", "verbose", "skipinitialspace", - "cache_dates", + "low_memory", } _python_unsupported = {"low_memory", "float_precision"} @@ -939,10 +939,6 @@ def _get_options_with_defaults(self, engine): f"The {repr(argname)} option is not supported with the " f"'pyarrow' engine" ) - if argname == "iterator" and engine == "pyarrow": - raise ValueError( - "The iterator option is not supported with the 'pyarrow' engine" - ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: raise ValueError("Setting mangle_dupe_cols=False is not supported yet") @@ -2255,14 +2251,18 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): class BytesIOWrapper: - def __init__(self, string_buffer, encoding="utf-8"): + """ + Allows the pyarrow engine for read_csv() to read from string buffers + """ + + def __init__(self, string_buffer: StringIO, encoding: str = "utf-8"): self.string_buffer = string_buffer self.encoding = encoding - def __getattr__(self, attr): + def __getattr__(self, attr: str): return getattr(self.string_buffer, attr) - def read(self, size=-1): + def read(self, size: int = -1): content = self.string_buffer.read(size) return content.encode(self.encoding) @@ -2332,16 +2332,85 @@ def read(self): elif self.header is None: self.names = range(num_cols) frame.columns = self.names - if self.index_col is not None: - index_col = [frame.columns[i] for i in self.index_col] - frame.set_index(index_col, drop=True, inplace=True) - frame.columns, frame = self._do_date_conversions(frame.columns, frame) + frame = self._date_conversion( + frame, self._date_conv, self.parse_dates, keep_date_col=self.keep_date_col + ) + + if self.index_col is not None: + for i, item in enumerate(self.index_col): + if is_integer(item): + self.index_col[i] = frame.columns[item] + frame.set_index(self.index_col, drop=True, inplace=True) if self.kwds.get("dtype") is not None: frame = frame.astype(self.kwds.get("dtype")) return frame + def _date_conversion( + self, data, converter, parse_spec, keep_date_col=False, + ): + + orig_names = data.columns + columns = list(data.columns) + + date_cols = set() + + if parse_spec is None or isinstance(parse_spec, bool): + return data, columns + + if isinstance(parse_spec, list): + # list of column lists + for colspec in parse_spec: + if is_scalar(colspec): + if isinstance(colspec, int) and colspec not in data: + colspec = orig_names[colspec] + data[colspec] = converter(data[colspec].values) + else: + new_name, col, old_names = self._try_convert_dates( + converter, colspec, data, orig_names + ) + if new_name in data: + raise ValueError(f"New date column already in dict {new_name}") + data[new_name] = col + date_cols.update(old_names) + + elif isinstance(parse_spec, dict): + # dict of new name to column list + for new_name, colspec in parse_spec.items(): + if new_name in data: + raise ValueError(f"Date column 
{new_name} already in dict") + + _, col, old_names = self._try_convert_dates( + converter, colspec, data, orig_names + ) + + data[new_name] = col + date_cols.update(old_names) + + if not keep_date_col: + data = data.drop(date_cols, axis=1) + + return data + + def _try_convert_dates(self, parser, colspec, data, columns): + colset = set(columns) + colnames = [] + + for c in colspec: + if c in colset: + colnames.append(c) + elif isinstance(c, int) and c not in columns: + colnames.append(columns[c]) + else: + colnames.append(c) + + new_name = "_".join(str(x) for x in colnames) + to_parse = [data[c].values for c in colnames if c in data] + + new_col = parser(*to_parse) + return new_name, new_col, colnames + def TextParser(*args, **kwds): """ @@ -3548,6 +3617,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): + if na_values is None: if keep_default_na: na_values = STR_NA_VALUES diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 9aa23bd739d24..11710fda521f1 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,8 +1,6 @@ -import distutils.version import os from typing import List, Optional -import pkg_resources import pytest from pandas import read_csv, read_table @@ -80,16 +78,8 @@ def csv1(csv_dir_path): _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -try: - pyarrow_version = pkg_resources.get_distribution("pyarrow").version -except pkg_resources.DistributionNotFound: - pyarrow_version = "0" # represents pyarrow not found -if distutils.version.LooseVersion(pyarrow_version) > "0.15.0": - _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] - _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] -else: - _all_parsers = [*_c_parsers_only, *_py_parsers_only] - _all_parser_ids = [*_c_parser_ids, *_py_parser_ids] +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) @@ -97,6 +87,8 @@ def all_parsers(request): """ Fixture all of the CSV parsers. 
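For intuition, ``_try_convert_dates`` above joins the requested columns under a combined name and parses the concatenated strings once; for ``parse_dates=[["date", "time"]]`` the effect is roughly the following (simplified, illustrative sketch, not the parser code):

    import pandas as pd

    df = pd.DataFrame({"date": ["2020-01-01"], "time": ["12:30:00"], "x": [1]})
    colnames = ["date", "time"]
    new_name = "_".join(colnames)  # "date_time"
    df[new_name] = pd.to_datetime(df["date"] + " " + df["time"])
    print(df[new_name])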
""" + if request.param.engine == "pyarrow": + pytest.importorskip("pyarrow", "0.15.0") return request.param From 88e200a108985baa5ac05e5c07287b8971ea091d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 29 Jun 2020 11:04:49 -0700 Subject: [PATCH 27/35] Clean/Address comments/Update docs --- asv_bench/benchmarks/io/csv.py | 2 +- doc/source/whatsnew/v1.1.0.rst | 11 ++- pandas/compat/_optional.py | 16 ++-- pandas/io/parsers.py | 108 ++++++----------------- pandas/tests/test_optional_dependency.py | 7 +- 5 files changed, 51 insertions(+), 93 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 3681cd4df481f..8792fff5300d3 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -294,7 +294,7 @@ def time_convert_direct(self): class ReadCSVParseDates(StringIORewind): - params = ["c", "pyarrow", "python"] + params = ["c", "python"] param_names = ["engine"] def setup(self, engine): diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7c0a707c964c5..d54935c2bdc08 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -245,6 +245,14 @@ If needed you can adjust the bins with the argument ``offset`` (a Timedelta) tha For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. +.. _whatsnew_110.enhancements.read_csv_pyarrow_engine_support: + +read_csv() now accepts pyarrow as an engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines +with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) + .. _whatsnew_110.enhancements.other: @@ -293,9 +301,6 @@ Other enhancements - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). - :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). -- :func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing - if pyarrow>=0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or - "python" counterparts. See the :doc:`I/O docs ` for more info. (:issue:`23697`) .. --------------------------------------------------------------------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index ed025ec36dafd..f65d53c05257c 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -2,6 +2,7 @@ import importlib import sys import types +from typing import Optional import warnings # Update install.rst when updating versions! @@ -46,7 +47,11 @@ def _get_version(module: types.ModuleType) -> str: def import_optional_dependency( - name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise" + name: str, + extra: str = "", + raise_on_missing: bool = True, + on_version: str = "raise", + min_version: Optional[str] = None, ): """ Import an optional dependency. @@ -58,8 +63,7 @@ def import_optional_dependency( Parameters ---------- name : str - The module name. This should be top-level only, so that the - version may be checked. + The module name. extra : str Additional text to include in the ImportError message. raise_on_missing : bool, default True @@ -73,6 +77,8 @@ def import_optional_dependency( * ignore: Return the module, even if the version is too old. 
It's expected that users validate the version locally when using ``on_version="ignore"`` (see ``io/html.py``) + min_version : str, default None + Specify a minimum version that is different from the global pandas + minimum version required. Returns ------- @@ -93,14 +99,14 @@ def import_optional_dependency( raise ImportError(msg) from None else: return None - # Grab parent module if submodule being imported + # Handle submodules: if we have a submodule, grab the parent module from sys.modules parent = name.split(".")[0] if parent != name: name = parent module_to_get = sys.modules[name] else: module_to_get = module - minimum_version = VERSIONS.get(name) + minimum_version = min_version if min_version is not None else VERSIONS.get(name) if minimum_version: version = _get_version(module_to_get) if distutils.version.LooseVersion(version) < minimum_version: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3563a1ea0f04e..ebaefafd8b5b8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,7 +21,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer, Union -from pandas.compat._optional import VERSIONS, import_optional_dependency +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -172,6 +172,7 @@ Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. + .. versionchanged:: 1.1 The "pyarrow" engine was added. converters : dict, optional @@ -1015,7 +1016,7 @@ def _clean_options(self, options, engine): elif engine not in ("python", "python-fwf"): # wait until regex engine integrated fallback_reason = ( - "the 'c' engine does not support " + f"the '{engine}' engine does not support " "regex separators (separators > 1 char and " r"different from '\s+' are interpreted as regex)" ) @@ -2302,9 +2303,10 @@ def __init__(self, src, **kwds): self.src = BytesIOWrapper(self.src, encoding=encoding) def read(self): - VERSIONS["pyarrow"] = "0.15.0" pyarrow = import_optional_dependency( - "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" + "pyarrow.csv", + min_version="0.15.0", + extra="pyarrow is required to use the pyarrow engine", ) kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow @@ -2315,15 +2317,26 @@ def read(self): "true_values", "false_values", } + # map pandas argument names to their pyarrow equivalents + kwdscopy["include_columns"] = kwdscopy.get("usecols") + kwdscopy["null_values"] = kwdscopy.get("na_values") + kwdscopy["escape_char"] = kwdscopy.get("escapechar") + kwdscopy["ignore_empty_lines"] = kwdscopy.get("skip_blank_lines") + parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions} convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} - read_options = pyarrow.ReadOptions(autogenerate_column_names=True) - headerexists = True if self.header is not None and self.header >= 0 else False + headerexists = self.header is not None + read_options = {} + skiprows = self.kwds.get("skiprows") - if skiprows is not None: - read_options = pyarrow.ReadOptions(skip_rows=skiprows) - elif headerexists: - read_options = pyarrow.ReadOptions(skip_rows=self.header) + if headerexists: + read_options["skip_rows"] = self.header + read_options["autogenerate_column_names"] = False + else: + if skiprows is not None: + read_options["skip_rows"] = skiprows
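+ # no header row: let pyarrow autogenerate column names (f0, f1, ...); + # they are replaced with default integer names after the table is read +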
read_options["autogenerate_column_names"] = True + read_options = pyarrow.ReadOptions(**read_options) table = pyarrow.read_csv( self.src, read_options=read_options, @@ -2339,11 +2352,8 @@ def read(self): elif self.header is None: self.names = range(num_cols) frame.columns = self.names - - frame = self._date_conversion( - frame, self._date_conv, self.parse_dates, keep_date_col=self.keep_date_col - ) - + # we only need the frame not the names + frame.columns, frame = self._do_date_conversions(frame.columns, frame) if self.index_col is not None: for i, item in enumerate(self.index_col): if is_integer(item): @@ -2354,70 +2364,6 @@ def read(self): frame = frame.astype(self.kwds.get("dtype")) return frame - def _date_conversion( - self, data, converter, parse_spec, keep_date_col=False, - ): - - orig_names = data.columns - columns = list(data.columns) - - date_cols = set() - - if parse_spec is None or isinstance(parse_spec, bool): - return data, columns - - if isinstance(parse_spec, list): - # list of column lists - for colspec in parse_spec: - if is_scalar(colspec): - if isinstance(colspec, int) and colspec not in data: - colspec = orig_names[colspec] - data[colspec] = converter(data[colspec].values) - else: - new_name, col, old_names = self._try_convert_dates( - converter, colspec, data, orig_names - ) - if new_name in data: - raise ValueError(f"New date column already in dict {new_name}") - data[new_name] = col - date_cols.update(old_names) - - elif isinstance(parse_spec, dict): - # dict of new name to column list - for new_name, colspec in parse_spec.items(): - if new_name in data: - raise ValueError(f"Date column {new_name} already in dict") - - _, col, old_names = self._try_convert_dates( - converter, colspec, data, orig_names - ) - - data[new_name] = col - date_cols.update(old_names) - - if not keep_date_col: - data = data.drop(date_cols, axis=1) - - return data - - def _try_convert_dates(self, parser, colspec, data, columns): - colset = set(columns) - colnames = [] - - for c in colspec: - if c in colset: - colnames.append(c) - elif isinstance(c, int) and c not in columns: - colnames.append(columns[c]) - else: - colnames.append(c) - - new_name = "_".join(str(x) for x in colnames) - to_parse = [data[c].values for c in colnames if c in data] - - new_col = parser(*to_parse) - return new_name, new_col, colnames - def TextParser(*args, **kwds): """ @@ -3568,7 +3514,7 @@ def _isindex(colspec): colspec = orig_names[colspec] if _isindex(colspec): continue - data_dict[colspec] = converter(data_dict[colspec]) + data_dict[colspec] = converter(np.array(data_dict[colspec])) else: new_name, col, old_names = _try_convert_dates( converter, colspec, data_dict, orig_names @@ -3617,7 +3563,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): colnames.append(c) new_name = "_".join(str(x) for x in colnames) - to_parse = [data_dict[c] for c in colnames if c in data_dict] + to_parse = [np.array(data_dict[c]) for c in colnames if c in data_dict] new_col = parser(*to_parse) return new_name, new_col, colnames diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index e5ed69b7703b1..61dbd81e2cee5 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -27,14 +27,15 @@ def test_bad_version(monkeypatch): module = types.ModuleType(name) module.__version__ = "0.9.0" sys.modules[name] = module - monkeypatch.setitem(VERSIONS, name, "1.0.0") match = "Pandas requires .*1.0.0.* of .fakemodule.*'0.9.0'" with 
pytest.raises(ImportError, match=match): - import_optional_dependency("fakemodule") + import_optional_dependency("fakemodule", min_version="1.0.0") with tm.assert_produces_warning(UserWarning): - result = import_optional_dependency("fakemodule", on_version="warn") + result = import_optional_dependency( + "fakemodule", min_version="1.0.0", on_version="warn" + ) assert result is None module.__version__ = "1.0.0" # exact match is OK From ede279925c591f42a1585d0aae9e186a3b936cd0 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 29 Jun 2020 11:08:18 -0700 Subject: [PATCH 28/35] Fix typo Co-authored-by: Joris Van den Bossche --- pandas/io/parsers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b3127d4f84cd8..de2a833e51ea0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -173,6 +173,7 @@ is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. + .. versionchanged:: 1.1 The "pyarrow" engine was added. converters : dict, optional From e8eff08c8b939539ecbe6e9466f9248722fd0927 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 8 Jul 2020 16:46:46 -0700 Subject: [PATCH 29/35] Fix doc failures --- doc/source/user_guide/io.rst | 21 ++++++++++++++------- pandas/io/parsers.py | 1 - 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 2fcffcd814195..e4da778ee7378 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -160,9 +160,9 @@ dtype : Type name or dict of column -> type, default ``None`` (unsupported with ``engine='python'``). Use `str` or `object` together with suitable ``na_values`` settings to preserve and not interpret dtype. -engine : {``'c'``, ``'pyarrow'``,``'python'``} +engine : {``'c'``, ``'pyarrow'``, ``'python'``} Parser engine to use. In terms of performance, the pyarrow engine, which requires ``pyarrow`` >= 0.15.0, is faster than the C engine, which is faster than the python engine. However, the pyarrow and C engines are currently less feature complete than their Python counterpart. converters : dict, default ``None`` @@ -1621,11 +1621,18 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: Specifying the parser engine '''''''''''''''''''''''''''' -Under the hood pandas uses a fast and efficient parser implemented in C as well -as a Python implementation which is currently more feature-complete. Where -possible pandas uses the C parser (specified as ``engine='c'``), but may fall -back to Python if C-unsupported options are specified. Currently, C-unsupported -options include: +Currently, pandas supports three engines: the C engine, the Python engine, +and an optional pyarrow engine (which requires ``pyarrow`` >= 0.15). In terms of performance, +the pyarrow engine is fastest, followed by the C and Python engines. However, +the pyarrow engine is much less robust than the C engine, which in turn lacks a +couple of features present in the Python parser. + +Where possible pandas uses the C parser (specified as ``engine='c'``), but may fall +back to Python if C-unsupported options are specified. If options unsupported by the +pyarrow engine are specified while using ``engine='pyarrow'``, the parser will raise an error +(a full list of unsupported options is available at ``pandas.io.parsers._pyarrow_unsupported``).
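+For example, a minimal sketch of selecting each engine (the data here is
+illustrative, and the last call assumes ``pyarrow`` >= 0.15 is installed):
+
+.. code-block:: python
+
+   from io import StringIO
+
+   import pandas as pd
+
+   data = "a,b,c\n1,2,3\n4,5,6"
+
+   pd.read_csv(StringIO(data))                    # C engine (the default)
+   pd.read_csv(StringIO(data), engine="python")   # most feature-complete
+   pd.read_csv(StringIO(data), engine="pyarrow")  # fastest on multicore machines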
+ +Currently, C-unsupported options include: * ``sep`` other than a single character (e.g. regex separators) * ``skipfooter`` diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index de2a833e51ea0..b3127d4f84cd8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -173,7 +173,6 @@ is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. - .. versionchanged:: 1.1 The "pyarrow" engine was added. converters : dict, optional From 55139ee19a512c3bd83b3c07caa4c44a92a49a59 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 22 Oct 2020 16:35:14 +0100 Subject: [PATCH 30/35] wip --- pandas/tests/io/parser/conftest.py | 14 +++++- pandas/tests/io/parser/test_comment.py | 2 + pandas/tests/io/parser/test_common.py | 64 ++++++++++++++++++++++++-- 3 files changed, 76 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 05fae470f5a88..a179c1b82baae 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -142,4 +142,16 @@ def pyarrow_xfail(request): if "all_parsers" in request.fixturenames: parser = request.getfixturevalue("all_parsers") if parser.engine == "pyarrow": - pytest.xfail("pyarrow doesn't support this.") + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.node.add_marker(mark) + + +@pytest.fixture +def pyarrow_skip(request): + """ + Fixture that skips a test if the engine is pyarrow. + """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + if parser.engine == "pyarrow": + pytest.skip("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index 60e32d7c27200..a9a03f006668b 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -10,6 +10,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("na_values", [None, ["NaN"]]) def test_comment(all_parsers, na_values): diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 753189ea7c8d2..1295f0061f808 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -23,6 +23,9 @@ from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + def test_override_set_noconvert_columns(): # see gh-17351 @@ -84,7 +87,8 @@ def test_empty_decimal_marker(all_parsers, pyarrow_xfail): parser.read_csv(StringIO(data), decimal="") -def test_bad_stream_exception(all_parsers, csv_dir_path, pyarrow_xfail): +@skip_pyarrow +def test_bad_stream_exception(all_parsers, csv_dir_path): # see gh-13652 # # This test validates that both the Python engine and C engine will @@ -139,6 +143,7 @@ def test_read_csv_local(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_1000_sep(all_parsers): parser = all_parsers data = """A|B|C @@ -232,6 +237,7 @@ def test_csv_mixed_type(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_low_memory_no_rows_with_index(all_parsers): # see gh-21141 parser = all_parsers @@ -280,6 +286,7 @@ def test_read_csv_dataframe(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_no_index_name(all_parsers, 
csv_dir_path): parser = all_parsers csv2 = os.path.join(csv_dir_path, "test2.csv") @@ -348,6 +355,7 @@ def test_read_duplicate_index_explicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_duplicate_index_implicit(all_parsers): data = """A,B,C,D foo,2,3,4,5 @@ -728,7 +736,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs, pyarrow_xfail): parser.read_csv(StringIO(data), skipfooter=1, **kwargs) -def test_nrows_skipfooter_errors(all_parsers, pyarrow_xfail): +def test_nrows_skipfooter_errors(all_parsers): msg = "'skipfooter' not supported with 'nrows'" data = "a\n1\n2\n3\n4\n5\n6" parser = all_parsers @@ -799,6 +807,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D @@ -823,6 +832,7 @@ def test_multi_index_no_level_names(all_parsers, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multi_index_no_level_names_implicit(all_parsers): parser = all_parsers data = """A,B,C,D @@ -856,6 +866,7 @@ def test_multi_index_no_level_names_implicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected,header", [ @@ -877,6 +888,7 @@ def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_unnamed_index(all_parsers): parser = all_parsers data = """ id c0 c1 c2 @@ -939,6 +951,7 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) +@xfail_pyarrow def test_path_path_lib(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -946,6 +959,7 @@ def test_path_path_lib(all_parsers): tm.assert_frame_equal(df, result) +@xfail_pyarrow def test_path_local_path(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -955,6 +969,7 @@ def test_path_local_path(all_parsers): tm.assert_frame_equal(df, result) +@xfail_pyarrow def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError @@ -968,6 +983,7 @@ def test_nonexistent_path(all_parsers): assert path == e.value.filename +@xfail_pyarrow @td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 @@ -990,6 +1006,7 @@ def test_no_permission(all_parsers): assert path == e.value.filename +@xfail_pyarrow def test_missing_trailing_delimiters(all_parsers): parser = all_parsers data = """A,B,C,D @@ -1005,6 +1022,7 @@ def test_missing_trailing_delimiters(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_skip_initial_space(all_parsers): data = ( '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' @@ -1065,6 +1083,7 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -1168,6 +1187,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): assert df.a.dtype == object +@skip_pyarrow @pytest.mark.parametrize("sep", [" ", r"\s+"]) def test_integer_overflow_bug(all_parsers, sep): # see gh-2601 @@ -1179,6 +1199,7 @@ def test_integer_overflow_bug(all_parsers, sep): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_catch_too_many_names(all_parsers): # see gh-5156 data = """\ @@ -1198,6 +1219,7 @@ def 
test_catch_too_many_names(all_parsers): parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) +@xfail_pyarrow def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = all_parsers @@ -1218,6 +1240,7 @@ def test_chunk_begins_with_newline_whitespace(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index(all_parsers): # see gh-10184 data = "x,y" @@ -1228,6 +1251,7 @@ def test_empty_with_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_multi_index(all_parsers): # see gh-10467 data = "x,y,z" @@ -1240,6 +1264,7 @@ def test_empty_with_multi_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers @@ -1251,6 +1276,7 @@ def test_empty_with_reversed_multi_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_float_parser(all_parsers): # see gh-9565 parser = all_parsers @@ -1272,6 +1298,7 @@ def test_scientific_no_exponent(all_parsers): tm.assert_frame_equal(df_roundtrip, df) +@xfail_pyarrow @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) def test_int64_overflow(all_parsers, conv): data = """ID @@ -1315,6 +1342,7 @@ def test_int64_overflow(all_parsers, conv): parser.read_csv(StringIO(data), converters={"ID": conv}) +@xfail_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) @@ -1328,6 +1356,7 @@ def test_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) @@ -1341,6 +1370,7 @@ def test_outside_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 @@ -1353,6 +1383,7 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("iterator", [True, False]) def test_empty_with_nrows_chunksize(all_parsers, iterator): # see gh-9535 @@ -1370,6 +1401,7 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected,msg", [ @@ -1477,6 +1509,7 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) def test_uneven_lines_with_usecols(all_parsers, usecols): # see gh-12203 @@ -1531,6 +1564,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xf tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -1562,7 +1596,7 @@ def test_trailing_spaces(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -def test_raise_on_sep_with_delim_whitespace(all_parsers, pyarrow_xfail): +def test_raise_on_sep_with_delim_whitespace(all_parsers): # see gh-6607 data = "a b c\n1 2 3" parser = all_parsers @@ -1571,6 +1605,7 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers, pyarrow_xfail): parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) +@xfail_pyarrow @pytest.mark.parametrize("delim_whitespace", [True, 
False]) def test_single_char_leading_whitespace(all_parsers, delim_whitespace): # see gh-9710 @@ -1589,6 +1624,7 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "sep,skip_blank_lines,exp_data", [ @@ -1628,6 +1664,7 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_whitespace_lines(all_parsers): parser = all_parsers data = """ @@ -1643,6 +1680,7 @@ def test_whitespace_lines(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -1671,6 +1709,7 @@ def test_whitespace_regex_separator(all_parsers, data, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_verbose_read(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1694,6 +1733,7 @@ def test_verbose_read(all_parsers, capsys): assert captured.out == "Filled 3 NA values in column a\n" +@xfail_pyarrow def test_verbose_read2(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1735,6 +1775,7 @@ def test_iteration_open_handle(all_parsers): tm.assert_series_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,thousands,decimal", [ @@ -1766,6 +1807,7 @@ def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_euro_decimal_format(all_parsers): parser = all_parsers data = """Id;Number1;Number2;Text1;Text2;Number3 @@ -1785,6 +1827,7 @@ def test_euro_decimal_format(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_inf_parsing(all_parsers, na_filter): parser = all_parsers @@ -1808,6 +1851,7 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_infinity_parsing(all_parsers, na_filter): parser = all_parsers @@ -1825,6 +1869,7 @@ def test_infinity_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -1835,6 +1880,7 @@ def test_raise_on_no_columns(all_parsers, nrows): parser.read_csv(StringIO(data)) +@xfail_pyarrow @td.check_file_leaks def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") @@ -1848,6 +1894,7 @@ def test_memory_map(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_null_byte_char(all_parsers): # see gh-2741 data = "\x00,foo" @@ -1864,6 +1911,7 @@ def test_null_byte_char(all_parsers): parser.read_csv(StringIO(data), names=names) +@xfail_pyarrow def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers @@ -1985,6 +2033,7 @@ def seek(self, pos, whence=0): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [dict(), dict(error_bad_lines=True)], # Default is True. # Explicitly pass in. 
@@ -2003,6 +2052,7 @@ def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): parser.read_csv(StringIO(data), **kwargs) +@xfail_pyarrow def test_warn_bad_lines(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2017,6 +2067,7 @@ def test_warn_bad_lines(all_parsers, capsys): assert "Skipping line 5" in captured.err +@xfail_pyarrow def test_suppress_error_output(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2045,6 +2096,7 @@ def test_filename_with_special_chars(all_parsers, filename): tm.assert_frame_equal(result, df) +@xfail_pyarrow def test_read_csv_memory_growth_chunksize(all_parsers): # see gh-24805 # @@ -2127,6 +2179,7 @@ def test_first_row_bom(all_parsers, pyarrow_xfail): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_first_row_bom_unquoted(all_parsers): # see gh-36343 parser = all_parsers @@ -2147,6 +2200,7 @@ def test_integer_precision(all_parsers): tm.assert_series_equal(result, expected) +@xfail_pyarrow def test_file_descriptor_leak(all_parsers): # GH 31488 @@ -2160,6 +2214,7 @@ def test(): td.check_file_leaks(test)() +@xfail_pyarrow @pytest.mark.parametrize("nrows", range(1, 6)) def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): # GH 28071 @@ -2173,6 +2228,7 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): tm.assert_frame_equal(df, ref[:nrows]) +@xfail_pyarrow def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] @@ -2203,6 +2259,7 @@ def test_read_csv_with_use_inf_as_na(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_table_delim_whitespace_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") @@ -2244,6 +2301,7 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) +@xfail_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" From c1aeecf20a519d3ae5b198097a4746291942c936 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 22 Oct 2020 20:27:33 +0100 Subject: [PATCH 31/35] more xfails and skips --- pandas/tests/io/parser/test_common.py | 8 ++--- pandas/tests/io/parser/test_compression.py | 5 +++- pandas/tests/io/parser/test_converters.py | 2 ++ pandas/tests/io/parser/test_dialect.py | 2 ++ pandas/tests/io/parser/test_dtypes.py | 25 ++++++++++++++++ pandas/tests/io/parser/test_encoding.py | 13 ++++++++ pandas/tests/io/parser/test_header.py | 18 +++++++++++ pandas/tests/io/parser/test_index_col.py | 11 +++++++ pandas/tests/io/parser/test_mangle_dupes.py | 6 ++++ pandas/tests/io/parser/test_multi_thread.py | 2 ++ pandas/tests/io/parser/test_na_values.py | 24 +++++++++++++++ pandas/tests/io/parser/test_parse_dates.py | 33 +++++++++++++++++++++ pandas/tests/io/parser/test_quoting.py | 10 +++++++ pandas/tests/io/parser/test_skiprows.py | 13 ++++++++ pandas/tests/io/parser/test_usecols.py | 25 ++++++++++++++++ 15 files changed, 192 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 1295f0061f808..cbf474ad5e5c6 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1240,7 +1240,7 @@ def test_chunk_begins_with_newline_whitespace(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_index(all_parsers): # see gh-10184 data = "x,y" @@ -1264,7 +1264,7 @@ def test_empty_with_multi_index(all_parsers): 
tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers @@ -1869,7 +1869,7 @@ def test_infinity_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -2301,7 +2301,7 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) -@xfail_pyarrow +@skip_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index ecc35dd6644c8..e23b91373f611 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -11,6 +11,8 @@ import pandas as pd import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + @pytest.fixture(params=[True, False]) def buffer(request): @@ -80,6 +82,7 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): parser.read_csv(f, compression="zip") +@skip_pyarrow @pytest.mark.parametrize("filename", [None, "test.{ext}"]) def test_compression( parser_and_data, compression_only, buffer, filename, pyarrow_xfail @@ -147,7 +150,7 @@ def test_compression_utf_encoding( @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) -def test_invalid_compression(all_parsers, invalid_compression, pyarrow_xfail): +def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers compress_kwargs = dict(compression=invalid_compression) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 88b400d9a11df..a70fe847b6ae9 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -12,6 +12,8 @@ from pandas import DataFrame, Index import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + def test_converters_type_must_be_dict(all_parsers): parser = all_parsers diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index cc65def0fd096..7a65e46ba670f 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -13,6 +13,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.fixture def custom_dialect(): diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 1ba6f0ea0a342..8e6462767513a 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -16,7 +16,11 @@ from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) def test_dtype_all_columns(all_parsers, dtype, check_orig): @@ -43,6 +47,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_dtype_all_columns_empty(all_parsers): # see gh-12048 parser = all_parsers @@ -52,6 +57,7 @@ def test_dtype_all_columns_empty(all_parsers): 
tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -70,6 +76,7 @@ def test_dtype_per_column(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_invalid_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -83,6 +90,7 @@ def test_invalid_dtype_per_column(all_parsers): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) +@xfail_pyarrow @pytest.mark.parametrize( "dtype", [ @@ -109,6 +117,7 @@ def test_categorical_dtype(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@skip_pyarrow @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) def test_categorical_dtype_single(all_parsers, dtype): # see gh-10153 @@ -124,6 +133,7 @@ def test_categorical_dtype_single(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -142,6 +152,7 @@ def test_categorical_dtype_unsorted(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -160,6 +171,7 @@ def test_categorical_dtype_missing(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers): # see gh-18186 @@ -187,6 +199,7 @@ def test_categorical_dtype_latin1(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -201,6 +214,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -219,6 +233,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -320,6 +335,7 @@ def test_categorical_coerces_timestamp(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} @@ -361,6 +377,7 @@ def test_categorical_unexpected_categories(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_pass_dtype(all_parsers): parser = all_parsers @@ -374,6 +391,7 @@ def test_empty_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index_pass_dtype(all_parsers): parser = all_parsers @@ -388,6 +406,7 @@ def test_empty_with_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_multi_index_pass_dtype(all_parsers): parser = all_parsers @@ -416,6 +435,7 @@ def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfai tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): parser = all_parsers @@ -429,6 +449,7 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): # see gh-9424 parser = all_parsers @@ -457,6 +478,7 
@@ def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) +@xfail_pyarrow def test_raise_on_passed_int_dtype_with_nas(all_parsers): # see gh-2631 parser = all_parsers @@ -474,6 +496,7 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers): parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) +@xfail_pyarrow def test_dtype_with_converters(all_parsers): parser = all_parsers data = """a,b @@ -489,6 +512,7 @@ def test_dtype_with_converters(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "dtype,expected", [ @@ -553,6 +577,7 @@ def test_numeric_dtype(all_parsers, dtype): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_boolean_dtype(all_parsers): parser = all_parsers data = "\n".join( diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 876696ecdad9c..eac906601876b 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -13,7 +13,11 @@ from pandas import DataFrame, read_csv import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_bytes_io_input(all_parsers): encoding = "cp1255" parser = all_parsers @@ -25,6 +29,7 @@ def test_bytes_io_input(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_unicode(all_parsers): parser = all_parsers data = BytesIO("\u0141aski, Jan;1".encode()) @@ -34,6 +39,7 @@ def test_read_csv_unicode(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("sep", [",", "\t"]) @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) def test_utf16_bom_skiprows(all_parsers, sep, encoding): @@ -68,6 +74,7 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_utf16_example(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers @@ -75,6 +82,7 @@ def test_utf16_example(all_parsers, csv_dir_path): assert len(result) == 50 +@xfail_pyarrow def test_unicode_encoding(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "unicode_series.csv") parser = all_parsers @@ -87,6 +95,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path): assert got == expected +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -120,6 +129,7 @@ def _encode_data_with_bom(_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): # see gh-13549 expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) @@ -132,6 +142,7 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "file_path,encoding", [ @@ -163,6 +174,7 @@ def test_binary_mode_file_buffers( tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize("pass_encoding", [True, False]) def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): # see gh-24130 @@ -179,6 +191,7 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_encoding_named_temp_file(all_parsers): # see gh-31819 parser = all_parsers diff --git 
a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 4cd110136d7b0..34eaf6ae306b4 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -14,7 +14,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_read_with_bad_header(all_parsers): parser = all_parsers msg = r"but only \d+ lines in file" @@ -82,6 +86,7 @@ def test_no_header_prefix(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_with_index_col(all_parsers): parser = all_parsers data = """foo,1,2,3 @@ -119,6 +124,7 @@ def test_header_not_first_line(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_multi_index(all_parsers): parser = all_parsers expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) @@ -184,6 +190,7 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("names", ["first", "second"]) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -231,6 +238,7 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -277,6 +285,7 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -324,6 +333,7 @@ def test_header_multi_index_common_format3(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers expected = DataFrame( @@ -344,6 +354,7 @@ def test_header_multi_index_common_format_malformed1(all_parsers): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( @@ -365,6 +376,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers expected = DataFrame( @@ -385,6 +397,7 @@ def test_header_multi_index_common_format_malformed3(all_parsers): tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize( "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] ) @@ -397,6 +410,7 @@ def test_header_names_backward_compat(all_parsers, data, header): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("kwargs", [dict(), dict(index_col=False)]) def test_read_only_header_no_rows(all_parsers, kwargs): # See gh-7773 @@ -442,6 +456,7 @@ def test_non_int_header(all_parsers, header): parser.read_csv(StringIO(data), header=header) +@xfail_pyarrow def test_singleton_header(all_parsers): # see gh-7757 data = """a,b,c\n0,1,2\n1,2,3""" @@ -452,6 +467,7 @@ def test_singleton_header(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -498,6 +514,7 @@ def test_mangles_multi_index(all_parsers, data, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [None, [0]]) @pytest.mark.parametrize( "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] @@ -541,6 +558,7 @@ def 
test_multi_index_unnamed(all_parsers, index_col, columns): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_multiindex_columns(all_parsers): # GH#6051 parser = all_parsers diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 4d64f2bf411bd..a0a4fdbc25d49 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -11,7 +11,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@skip_pyarrow @pytest.mark.parametrize("with_header", [True, False]) def test_index_col_named(all_parsers, with_header): parser = all_parsers @@ -66,6 +70,7 @@ def test_index_col_is_true(all_parsers): parser.read_csv(StringIO(data), index_col=True) +@xfail_pyarrow def test_infer_index_col(all_parsers): data = """A,B,C foo,1,2,3 @@ -83,6 +88,7 @@ def test_infer_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "index_col,kwargs", [ @@ -127,6 +133,7 @@ def test_index_col_empty_data(all_parsers, index_col, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index_col_false(all_parsers): # see gh-10413 data = "x,y" @@ -137,6 +144,7 @@ def test_empty_with_index_col_false(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "index_names", [ @@ -161,6 +169,7 @@ def test_multi_index_naming(all_parsers, index_names): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multi_index_naming_not_all_at_beginning(all_parsers): parser = all_parsers data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" @@ -175,6 +184,7 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_multi_index_level_names_empty(all_parsers): # GH 10984 parser = all_parsers @@ -186,6 +196,7 @@ def test_no_multi_index_level_names_empty(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_with_index_col(all_parsers): # GH 33476 parser = all_parsers diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 5c4e642115798..cc88a1d974767 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -10,7 +10,10 @@ from pandas import DataFrame import pandas._testing as tm +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)]) def test_basic(all_parsers, kwargs): # TODO: add test for condition "mangle_dupe_cols=False" @@ -24,6 +27,7 @@ def test_basic(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_basic_names(all_parsers): # See gh-7160 parser = all_parsers @@ -44,6 +48,7 @@ def test_basic_names_raise(all_parsers): parser.read_csv(StringIO(data), names=["a", "b", "a"]) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -111,6 +116,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) +@xfail_pyarrow def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 orig_key = "0" diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index d50560c684084..06f14e28435ef 100644 --- 
a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -12,6 +12,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + def _construct_dataframe(num_rows): """ diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 9f86bbd65640e..9e7a445234a45 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -12,7 +12,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_string_nas(all_parsers): parser = all_parsers data = """A,B,C @@ -28,6 +32,7 @@ def test_string_nas(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_detect_string_na(all_parsers): parser = all_parsers data = """A,B @@ -42,6 +47,7 @@ def test_detect_string_na(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "na_values", [ @@ -79,6 +85,7 @@ def test_non_string_na_values(all_parsers, data, na_values): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_default_na_values(all_parsers): _NA_VALUES = { "-1.#IND", @@ -126,6 +133,7 @@ def f(i, v): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): parser = all_parsers @@ -159,6 +167,7 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_na_value_dict(all_parsers): data = """A,B,C foo,bar,NA @@ -177,6 +186,7 @@ def test_na_value_dict(all_parsers): tm.assert_frame_equal(df, expected) +@xfail_pyarrow @pytest.mark.parametrize( "index_col,expected", [ @@ -210,6 +220,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -297,6 +308,7 @@ def test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" @@ -308,6 +320,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 # @@ -319,6 +332,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): tm.assert_frame_equal(df, expected) +@xfail_pyarrow @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 @@ -348,6 +362,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "na_filter,row_data", [ @@ -369,6 +384,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_na_trailing_columns(all_parsers): parser = all_parsers data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax @@ -396,6 +412,7 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "na_values,row_data", [ @@ -414,6 +431,7 @@ def test_na_values_scalar(all_parsers, na_values, 
row_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_na_values_dict_aliasing(all_parsers): parser = all_parsers na_values = {"a": 2, "b": 1} @@ -429,6 +447,7 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) +@xfail_pyarrow def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = "a\nfoo\n1" @@ -440,6 +459,7 @@ def test_na_values_dict_col_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -469,6 +489,7 @@ def test_empty_na_values_no_default_with_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] ) @@ -497,6 +518,7 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): # see gh-20377 @@ -512,6 +534,7 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data, na_values", [ @@ -540,6 +563,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): ) +@xfail_pyarrow def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 662659982c0b3..722170c9b76df 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -34,7 +34,10 @@ else: date_strategy = st.datetimes() +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 # @@ -56,6 +59,7 @@ def test_separator_date_conflict(all_parsers): tm.assert_frame_equal(df, expected) +@xfail_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col_custom(all_parsers, keep_date_col): data = """\ @@ -199,6 +203,7 @@ def date_parser(*date_cols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("container", [list, tuple, Index, Series]) @pytest.mark.parametrize("dim", [1, 2]) def test_concat_date_col_fail(container, dim): @@ -211,6 +216,7 @@ def test_concat_date_col_fail(container, dim): parsing.concat_date_cols(date_cols) +@xfail_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col(all_parsers, keep_date_col): data = """\ @@ -370,6 +376,7 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -434,6 +441,7 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multiple_date_col_timestamp_parse(all_parsers): parser = all_parsers data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 @@ -468,6 +476,7 @@ def test_multiple_date_col_timestamp_parse(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multiple_date_cols_with_header(all_parsers): parser = all_parsers data = """\ @@ -637,6 +646,7 @@ def test_date_parser_int_bug(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_nat_parse(all_parsers): # see gh-3062 parser = 
all_parsers @@ -652,6 +662,7 @@ def test_nat_parse(all_parsers): tm.assert_frame_equal(result, df) +@xfail_pyarrow def test_csv_custom_parser(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -666,6 +677,7 @@ def test_csv_custom_parser(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -679,6 +691,7 @@ def test_parse_dates_implicit_first_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_parse_dates_string(all_parsers): data = """date,A,B,C 20090101,a,1,2 @@ -723,6 +736,7 @@ def test_yy_format_with_year_first(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): data = "a,b,c\n01/01/2010,1,15/02/2010" @@ -739,6 +753,7 @@ def test_parse_dates_column_list(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_parse_dates(all_parsers, index_col): data = """index1,index2,A,B,C @@ -784,6 +799,7 @@ def test_multi_index_parse_dates(all_parsers, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("kwargs", [dict(dayfirst=True), dict(day_first=True)]) def test_parse_dates_custom_euro_format(all_parsers, kwargs): parser = all_parsers @@ -828,6 +844,7 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): ) +@xfail_pyarrow def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers @@ -841,6 +858,7 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is pytz.utc +@xfail_pyarrow @pytest.mark.parametrize( "parse_dates,index_col", [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], @@ -941,6 +959,7 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multiple_date_cols_chunked(all_parsers): parser = all_parsers data = """\ @@ -1033,6 +1052,7 @@ def test_multiple_date_cols_chunked(all_parsers): tm.assert_frame_equal(chunks[2], expected[4:]) +@xfail_pyarrow def test_multiple_date_col_named_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1056,6 +1076,7 @@ def test_multiple_date_col_named_index_compat(all_parsers): tm.assert_frame_equal(with_indices, with_names) +@xfail_pyarrow def test_multiple_date_col_multiple_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1123,6 +1144,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value): ) +@xfail_pyarrow def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers @@ -1135,6 +1157,7 @@ def test_parse_dates_empty_string(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1174,6 +1197,7 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1202,6 +1226,7 @@ def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warni tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1290,6 +1315,7 @@ def test_parse_date_time(all_parsers, 
data, kwargs, expected, date_parser, warni tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]), @@ -1312,6 +1338,7 @@ def test_parse_date_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1343,6 +1370,7 @@ def test_parse_date_all_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1374,6 +1402,7 @@ def test_datetime_fractional_seconds(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." @@ -1392,6 +1421,7 @@ def test_generic(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_date_parser_resolution_if_not_ns(all_parsers): # see gh-10245 parser = all_parsers @@ -1489,6 +1519,7 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], @@ -1500,6 +1531,7 @@ def test_invalid_parse_delimited_date(all_parsers, date_string): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1565,6 +1597,7 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti assert result == expected +@xfail_pyarrow @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", [ diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 14773dfbea20e..8b010df470386 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -13,7 +13,11 @@ from pandas import DataFrame import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,msg", [ @@ -33,6 +37,7 @@ def test_bad_quote_char(all_parsers, kwargs, msg): parser.read_csv(StringIO(data), **kwargs) +@xfail_pyarrow @pytest.mark.parametrize( "quoting,msg", [ @@ -57,6 +62,7 @@ def test_quote_char_basic(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) def test_quote_char_various(all_parsers, quote_char): parser = all_parsers @@ -69,6 +75,7 @@ def test_quote_char_various(all_parsers, quote_char): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) @pytest.mark.parametrize("quote_char", ["", None]) def test_null_quote_char(all_parsers, quoting, quote_char): @@ -88,6 +95,7 @@ def test_null_quote_char(all_parsers, quoting, quote_char): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,exp_data", [ @@ -114,6 +122,7 @@ def test_quoting_various(all_parsers, kwargs, exp_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] ) @@ -137,6 +146,7 @@ def test_quotechar_unicode(all_parsers, quotechar): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("balanced", [True, False]) def 
 def test_unbalanced_quoting(all_parsers, balanced):
     # see gh-22789.
diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py
index fdccef1127c7e..732f2eb18fdd9 100644
--- a/pandas/tests/io/parser/test_skiprows.py
+++ b/pandas/tests/io/parser/test_skiprows.py
@@ -14,7 +14,10 @@
 from pandas import DataFrame, Index
 import pandas._testing as tm
 
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+
 
+@xfail_pyarrow
 @pytest.mark.parametrize("skiprows", [list(range(6)), 6])
 def test_skip_rows_bug(all_parsers, skiprows):
     # see gh-505
@@ -42,6 +45,7 @@ def test_skip_rows_bug(all_parsers, skiprows):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_deep_skip_rows(all_parsers):
     # see gh-4382
     parser = all_parsers
@@ -57,6 +61,7 @@ def test_deep_skip_rows(all_parsers):
     tm.assert_frame_equal(result, condensed_result)
 
 
+@xfail_pyarrow
 def test_skip_rows_blank(all_parsers):
     # see gh-9832
     parser = all_parsers
@@ -83,6 +88,7 @@ def test_skip_rows_blank(all_parsers):
     tm.assert_frame_equal(data, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,kwargs,expected",
     [
@@ -123,6 +129,7 @@ def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_skip_row_with_quote(all_parsers):
     # see gh-12775 and gh-10911
     parser = all_parsers
@@ -138,6 +145,7 @@ def test_skip_row_with_quote(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,exp_data",
     [
@@ -173,6 +181,7 @@ def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "line_terminator", ["\n", "\r\n", "\r"]  # "LF"  # "CRLF"  # "CR"
 )
@@ -209,6 +218,7 @@ def test_skiprows_lineterminator(all_parsers, line_terminator):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_skiprows_infield_quote(all_parsers):
     # see gh-14459
     parser = all_parsers
@@ -219,6 +229,7 @@ def test_skiprows_infield_quote(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "kwargs,expected",
     [
@@ -234,6 +245,7 @@ def test_skip_rows_callable(all_parsers, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_skip_rows_skip_all(all_parsers):
     parser = all_parsers
     data = "a\n1\n2\n3\n4\n5"
@@ -243,6 +255,7 @@ def test_skip_rows_skip_all(all_parsers):
         parser.read_csv(StringIO(data), skiprows=lambda x: True)
 
 
+@xfail_pyarrow
 def test_skip_rows_bad_callable(all_parsers):
     msg = "by zero"
     parser = all_parsers
diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py
index 7e9c9866a666d..0f2e5882439f8 100644
--- a/pandas/tests/io/parser/test_usecols.py
+++ b/pandas/tests/io/parser/test_usecols.py
@@ -12,6 +12,9 @@
 from pandas import DataFrame, Index
 import pandas._testing as tm
 
+skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+
 _msg_validate_usecols_arg = (
     "'usecols' must either be list-like "
     "of all strings, all unicode, all "
@@ -22,6 +25,7 @@
 )
 
 
+@skip_pyarrow
 def test_raise_on_mixed_dtype_usecols(all_parsers):
     # See gh-12678
     data = """a,b,c
@@ -35,6 +39,7 @@ def test_raise_on_mixed_dtype_usecols(all_parsers):
         parser.read_csv(StringIO(data), usecols=usecols)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
 def test_usecols(all_parsers, usecols):
     data = """\
@@ -50,6 +55,7 @@ def test_usecols(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_with_names(all_parsers):
     data = """\
 a,b,c
@@ -65,6 +71,7 @@ def test_usecols_with_names(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
 )
@@ -81,6 +88,7 @@ def test_usecols_relative_to_names(all_parsers, names, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_relative_to_names2(all_parsers):
     # see gh-5766
     data = """\
@@ -97,6 +105,7 @@ def test_usecols_relative_to_names2(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_name_length_conflict(all_parsers):
     data = """\
 1,2,3
@@ -125,6 +134,7 @@ def test_usecols_single_string(all_parsers):
         parser.read_csv(StringIO(data), usecols="foo")
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
 )
@@ -138,6 +148,7 @@ def test_usecols_index_col_false(all_parsers, data):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize("index_col", ["b", 0])
 @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
 def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
@@ -164,6 +175,7 @@ def test_usecols_index_col_conflict2(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_implicit_index_col(all_parsers):
     # see gh-2654
     parser = all_parsers
@@ -174,6 +186,7 @@ def test_usecols_implicit_index_col(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_regex_sep(all_parsers):
     # see gh-2733
     parser = all_parsers
@@ -184,6 +197,7 @@ def test_usecols_regex_sep(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_with_whitespace(all_parsers):
     parser = all_parsers
     data = "a  b  c\n4  apple  bat  5.7\n8  orange  cow  10"
@@ -193,6 +207,7 @@ def test_usecols_with_whitespace(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize(
     "usecols,expected",
     [
@@ -212,6 +227,7 @@ def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
 def test_usecols_with_parse_dates(all_parsers, usecols):
     # see gh-9755
@@ -230,6 +246,7 @@ def test_usecols_with_parse_dates(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 def test_usecols_with_parse_dates2(all_parsers):
     # see gh-13604
     parser = all_parsers
@@ -290,6 +307,7 @@ def test_usecols_with_parse_dates3(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_with_parse_dates4(all_parsers):
     data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
     usecols = list("abcdefghij")
@@ -313,6 +331,7 @@ def test_usecols_with_parse_dates4(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
 @pytest.mark.parametrize(
     "names",
@@ -406,6 +425,7 @@ def test_usecols_with_multi_byte_characters(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_empty_usecols(all_parsers):
     data = "a,b,c\n1,2,3\n4,5,6"
     expected = DataFrame()
@@ -426,6 +446,7 @@ def test_np_array_usecols(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "usecols,expected",
     [
@@ -458,6 +479,7 @@ def test_callable_usecols(all_parsers, usecols, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
 def test_incomplete_first_row(all_parsers, usecols):
     # see gh-6710
@@ -470,6 +492,7 @@ def test_incomplete_first_row(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,usecols,kwargs,expected",
     [
@@ -502,6 +525,7 @@ def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize(
     "usecols,kwargs,expected,msg",
     [
@@ -558,6 +582,7 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
 def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
     if all_parsers.engine != "c":

From b53a620b8fb77e1ab804a18e01662d85cf653bf7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 28 Oct 2020 04:07:45 +0000
Subject: [PATCH 32/35] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 asv_bench/benchmarks/io/csv.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 8792fff5300d3..c1fad1efde082 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -256,7 +256,10 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
 
     def time_read_csv_arrow(self, sep, decimal, float_precision):
         read_csv(
-            self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"),
+            self.data(self.StringIO_input),
+            sep=sep,
+            header=None,
+            names=list("abc"),
         )
 

From f13113d37ccad7f16d493931dac876d4cd246d96 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Wed, 28 Oct 2020 10:39:52 -0700
Subject: [PATCH 33/35] Fix typos
---
 pandas/compat/_optional.py                 | 30 +++++++++++-----------
 pandas/io/parsers.py                       | 10 +++-----
 pandas/tests/io/parser/test_unsupported.py |  3 ++-
 3 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 6f00c8ddb37af..6569b077069e2 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -45,6 +45,7 @@
     "pandas_gbq": "pandas-gbq",
     "sqlalchemy": "SQLAlchemy",
     "jinja2": "Jinja2",
+    "pyarrow.csv": "pyarrow",
 }
 
 
@@ -119,23 +120,22 @@ def import_optional_dependency(
     # Handle submodules: if we have submodule, grab parent module from sys.modules
     parent = name.split(".")[0]
     if parent != name:
-        name = parent
-        module_to_get = sys.modules[name]
+        install_name = parent
+        module_to_get = sys.modules[install_name]
     else:
         module_to_get = module
     minimum_version = min_version if min_version is not None else VERSIONS.get(name)
-    if minimum_version:
-        version = _get_version(module_to_get)
-        if distutils.version.LooseVersion(version) < minimum_version:
-            assert on_version in {"warn", "raise", "ignore"}
-            msg = (
-                f"Pandas requires version '{minimum_version}' or newer of '{name}' "
-                f"(version '{version}' currently installed)."
-            )
-            if on_version == "warn":
-                warnings.warn(msg, UserWarning)
-                return None
-            elif on_version == "raise":
-                raise ImportError(msg)
+    version = _get_version(module_to_get)
+    if distutils.version.LooseVersion(version) < minimum_version:
+        assert on_version in {"warn", "raise", "ignore"}
+        msg = (
+            f"Pandas requires version '{minimum_version}' or newer of '{name}' "
+            f"(version '{version}' currently installed)."
+        )
+        if on_version == "warn":
+            warnings.warn(msg, UserWarning)
+            return None
+        elif on_version == "raise":
+            raise ImportError(msg)
 
     return module
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 03a70615591a1..75c1d7b06b635 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -839,7 +839,7 @@ def __init__(self, f, engine=None, **kwds):
             if engine == "pyarrow":
                 raise ValueError(
                     "The 'dialect' option is not supported with the 'pyarrow' engine"
-
+                )
             kwds = _merge_with_dialect_properties(dialect, kwds)
 
         if kwds.get("header", "infer") == "infer":
@@ -2223,11 +2223,7 @@ def __init__(self, src, **kwds):
             self.src = BytesIOWrapper(self.src, encoding=encoding)
 
     def read(self):
-        pyarrow = import_optional_dependency(
-            "pyarrow.csv",
-            min_version="0.15.0",
-            extra="pyarrow is required to use the pyarrow engine",
-        )
+        pyarrow = import_optional_dependency("pyarrow.csv", min_version="0.15.0")
         kwdscopy = {k: v for k, v in self.kwds.items() if v is not None}
         # these are kwargs passed to pyarrow
         parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"}
@@ -3434,7 +3430,7 @@ def _isindex(colspec):
                 colspec = orig_names[colspec]
             if _isindex(colspec):
                 continue
-            data_dict[colspec] = converter(np.array(data_dict[colspec]))
+            data_dict[colspec] = converter(np.asarray(data_dict[colspec]))
         else:
             new_name, col, old_names = _try_convert_dates(
                 converter, colspec, data_dict, orig_names
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
index d2ae4c160d519..6e9cdacd40586 100644
--- a/pandas/tests/io/parser/test_unsupported.py
+++ b/pandas/tests/io/parser/test_unsupported.py
@@ -132,11 +132,12 @@ def test_pyarrow_engine(self):
         1,2,3,4,"""
 
         for default in pa_unsupported:
-            print(default)
             msg = (
                 f"The {repr(default)} option is not "
                 f"supported with the 'pyarrow' engine"
             )
             kwargs = {default: object()}
+            if default == "dialect":
+                kwargs[default] = "excel"  # test a random dialect
             with pytest.raises(ValueError, match=msg):
                 read_csv(StringIO(data), engine="pyarrow", **kwargs)

From f9ce2e46838a0aec07d180dc8e909573b5408918 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Wed, 28 Oct 2020 11:47:47 -0700
Subject: [PATCH 34/35] Doc fixes and more typo fixes
---
 doc/source/whatsnew/v1.1.0.rst |  8 --------
 doc/source/whatsnew/v1.2.0.rst |  6 ++++++
 pandas/compat/_optional.py     | 23 ++++++++++++-----------
 pandas/io/parsers.py           |  7 ++++---
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index a0383d7248624..50443f8810e5f 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -270,14 +270,6 @@ change, as ``fsspec`` will still bring in the same packages as before.
 
 .. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/
 
-
-read_csv() now accepts pyarrow as an engine
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines
-with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`)
-
-
 .. _whatsnew_110.enhancements.other:
 
 Other enhancements
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index f1f24ab7a101b..16b0324acaf6c 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -203,6 +203,12 @@ example where the index name is preserved:
 The same is true for :class:`MultiIndex`, but the logic is applied separately
 on a level-by-level basis.
 
+read_csv() now accepts pyarrow as an engine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines
+with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`)
+
 .. _whatsnew_120.enhancements.other:
 
 Other enhancements
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 6569b077069e2..a6a14fcbee757 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -126,16 +126,17 @@ def import_optional_dependency(
         module_to_get = module
     minimum_version = min_version if min_version is not None else VERSIONS.get(name)
     version = _get_version(module_to_get)
-    if distutils.version.LooseVersion(version) < minimum_version:
-        assert on_version in {"warn", "raise", "ignore"}
-        msg = (
-            f"Pandas requires version '{minimum_version}' or newer of '{name}' "
-            f"(version '{version}' currently installed)."
-        )
-        if on_version == "warn":
-            warnings.warn(msg, UserWarning)
-            return None
-        elif on_version == "raise":
-            raise ImportError(msg)
+    if minimum_version:
+        if distutils.version.LooseVersion(version) < minimum_version:
+            assert on_version in {"warn", "raise", "ignore"}
+            msg = (
+                f"Pandas requires version '{minimum_version}' or newer of '{name}' "
+                f"(version '{version}' currently installed)."
+            )
+            if on_version == "warn":
+                warnings.warn(msg, UserWarning)
+                return None
+            elif on_version == "raise":
+                raise ImportError(msg)
 
     return module
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 75c1d7b06b635..5c70e31aca041 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -165,10 +165,11 @@
     of dtype conversion.
 engine : {{'c', 'python', 'pyarrow'}}, optional
     Parser engine to use. The C and pyarrow engines are faster, while the python engine
-    is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15
-    as a dependency however.
+    is currently more feature-complete. The pyarrow engine also supports multithreading,
+    something that is not present in the C or python engines. However, it requires
+    ``pyarrow`` >= 0.15 as a dependency.
 
-    .. versionchanged:: 1.1
+    .. versionchanged:: 1.2
 
        The "pyarrow" engine was added.
 converters : dict, optional
     Dict of functions for converting values in certain columns. Keys can either

From 4158d6af395ba4335a59001010621ae0479abf48 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Mon, 2 Nov 2020 09:59:01 -0800
Subject: [PATCH 35/35] Green?
---
 pandas/compat/_optional.py             | 2 +-
 pandas/tests/io/parser/test_dialect.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index a6a14fcbee757..28741c1560543 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -125,8 +125,8 @@ def import_optional_dependency(
     else:
         module_to_get = module
     minimum_version = min_version if min_version is not None else VERSIONS.get(name)
-    version = _get_version(module_to_get)
     if minimum_version:
+        version = _get_version(module_to_get)
         if distutils.version.LooseVersion(version) < minimum_version:
             assert on_version in {"warn", "raise", "ignore"}
             msg = (
diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py
index 7a65e46ba670f..afdd7548ed0dd 100644
--- a/pandas/tests/io/parser/test_dialect.py
+++ b/pandas/tests/io/parser/test_dialect.py
@@ -13,7 +13,7 @@
 from pandas import DataFrame
 import pandas._testing as tm
 
-pytestmark = pytest.mark.usefixtures("pyarrow_xfail")
+pytestmark = pytest.mark.usefixtures("pyarrow_skip")
 
 
 @pytest.fixture
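Reviewer note: a minimal sketch of what this series enables, assuming a build of
this branch together with pyarrow >= 0.15; the inline CSV payload and variable
names are illustrative, not taken from the patches themselves::

    from io import StringIO

    import pandas as pd

    data = "a,b,c\n1,2,3\n4,5,6"

    # The new engine dispatches to pyarrow.csv for parsing, which can use
    # multiple threads, per the engine docstring added above.
    df = pd.read_csv(StringIO(data), engine="pyarrow")

    # Options the pyarrow engine does not support are rejected up front with
    # a ValueError, as exercised by test_unsupported.py in PATCH 33.
    try:
        pd.read_csv(StringIO(data), engine="pyarrow", dialect="excel")
    except ValueError as err:
        # "The 'dialect' option is not supported with the 'pyarrow' engine"
        print(err)

The skip/xfail fixtures threaded through the test-file diffs above serve the
same purpose at the test-suite level: paths the new engine cannot take yet are
marked rather than silently falling back.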