From f22ff466b510d13b323c5e483cdeecbbf739dd4e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 8 Feb 2020 20:24:35 -0800 Subject: [PATCH 01/35] add arrow engine to read_csv --- pandas/io/parsers.py | 132 +++++++++++++++++++++++++++++++------------ 1 file changed, 97 insertions(+), 35 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 84a8b5b2a94fe..f5c00f3f7d137 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,6 +20,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -165,9 +166,10 @@ to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. -engine : {{'c', 'python'}}, optional - Parser engine to use. The C engine is faster while the python engine is - currently more feature-complete. +engine : {{'c', 'python', 'arrow'}}, optional + Parser engine to use. The C and arrow engines are faster while the python engine is + currently more feature-complete. The arrow engine requires ``pyarrow`` + as a dependency however. converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. @@ -506,7 +508,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } - _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, @@ -520,6 +521,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} +_arrow_unsupported = {"skipfooter", "low_memory", "float_precision"} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} @@ -705,7 +707,6 @@ def read_fwf( infer_nrows=100, **kwds, ): - r""" Read a table of fixed-width formatted lines into DataFrame. 
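For orientation before the next hunk: a minimal sketch of how the option added by this patch is meant to be exercised. The file name is hypothetical, and at this point in the series the engine is still spelled "arrow" (a later patch renames it to "pyarrow"); without pyarrow installed, the read fails with an ImportError at read time rather than at import time.

    # Hypothetical usage sketch for the engine added in PATCH 01/35.
    import pandas as pd

    df = pd.read_csv("data.csv", engine="arrow")  # requires pyarrow
    print(df.head())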
@@ -879,7 +880,8 @@ def __init__(self, f, engine=None, **kwds): self._make_engine(self.engine) def close(self): - self._engine.close() + if self.engine != "arrow": + self._engine.close() def _get_options_with_defaults(self, engine): kwds = self.orig_options @@ -945,16 +947,16 @@ def _clean_options(self, options, engine): delim_whitespace = options["delim_whitespace"] # C engine not supported yet - if engine == "c": + if engine == "c" or engine == "arrow": if options["skipfooter"] > 0: - fallback_reason = "the 'c' engine does not support skipfooter" + fallback_reason = f"the {engine} engine does not support skipfooter" engine = "python" encoding = sys.getfilesystemencoding() or "utf-8" if sep is None and not delim_whitespace: - if engine == "c": + if engine == "c" or engine == "arrow": fallback_reason = ( - "the 'c' engine does not support " + f"the {engine} engine does not support " "sep=None with delim_whitespace=False" ) engine = "python" @@ -1081,14 +1083,20 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python parsers + # c-engine, so only need for python parser if engine != "c": - if is_integer(skiprows): - skiprows = list(range(skiprows)) - if skiprows is None: - skiprows = set() - elif not callable(skiprows): - skiprows = set(skiprows) + if engine == "arrow": + if not is_integer(skiprows) and skiprows is not None: + raise ValueError( + "skiprows argument must be integer when using arrow engine" + ) + else: + if is_integer(skiprows): + skiprows = list(range(skiprows)) + if skiprows is None: + skiprows = set() + elif not callable(skiprows): + skiprows = set(skiprows) # put stuff back result["names"] = names @@ -1109,6 +1117,8 @@ def __next__(self): def _make_engine(self, engine="c"): if engine == "c": self._engine = CParserWrapper(self.f, **self.options) + elif engine == "arrow": + self._engine = ArrowParserWrapper(self.f, **self.options) else: if engine == "python": klass = PythonParser @@ -1125,29 +1135,32 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = _validate_integer("nrows", nrows) - ret = self._engine.read(nrows) + if self.engine == "arrow": + return self._engine.read(nrows) + else: + nrows = _validate_integer("nrows", nrows) + ret = self._engine.read(nrows) - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 else: - new_rows = 0 - else: - new_rows = len(index) + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows - if self.squeeze and len(df.columns) == 1: - return df[df.columns[0]].copy() - return df + if self.squeeze and len(df.columns) == 1: + return df[df.columns[0]].copy() + return df def _create_index(self, ret): index, columns, col_dict = ret @@ -2135,6 +2148,56 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values 
+class ArrowParserWrapper(ParserBase): + """ + + """ + + def __init__(self, src, **kwds): + self.kwds = kwds + self.src = src + kwds = kwds.copy() + + ParserBase.__init__(self, kwds) + + # #2442 + kwds["allow_leading_cols"] = self.index_col is not False + + # GH20529, validate usecol arg before TextReader + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + kwds["usecols"] = self.usecols + + self.names = kwds["names"] + + def read(self, nrows=None): + pyarrow = import_optional_dependency( + "pyarrow.csv", extra="pyarrow is required to use arrow engine" + ) + nrows = _validate_integer("nrows", nrows) + table = pyarrow.read_csv( + self.src, + read_options=pyarrow.ReadOptions( + skip_rows=self.kwds.get("skiprows"), column_names=self.names + ), + parse_options=pyarrow.ParseOptions( + delimiter=self.kwds.get("delimiter"), + quote_char=self.kwds.get("quotechar"), + ), + convert_options=pyarrow.ConvertOptions( + include_columns=self.usecols, column_types=self.kwds.get("dtype") + ), + ) + if nrows: + table = table[:nrows] + table_width = len(table.column_names) + if self.names is None: + if self.prefix: + self.names = [f"{self.prefix}{i}" for i in range(table_width)] + if self.names: + table = table.rename_columns(self.names) + return table.to_pandas() + + def TextParser(*args, **kwds): """ Converts lists of lists/tuples into DataFrames with proper type inference @@ -3336,7 +3399,6 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): - if na_values is None: if keep_default_na: na_values = STR_NA_VALUES From 8ae43e44cdbec134771173b69a5d4c1a2400504f Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 8 Feb 2020 21:01:26 -0800 Subject: [PATCH 02/35] fix failing test --- pandas/io/parsers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f5c00f3f7d137..75da1d991dc9b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1135,7 +1135,7 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - if self.engine == "arrow": + if isinstance(self._engine, ArrowParserWrapper): return self._engine.read(nrows) else: nrows = _validate_integer("nrows", nrows) @@ -2165,9 +2165,6 @@ def __init__(self, src, **kwds): # GH20529, validate usecol arg before TextReader self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - kwds["usecols"] = self.usecols - - self.names = kwds["names"] def read(self, nrows=None): pyarrow = import_optional_dependency( From 09074df84e42eec3e7f7dd1ae7c710af53b386cc Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 10:01:55 -0800 Subject: [PATCH 03/35] formatting and revert unnecessary change --- pandas/io/parsers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 75da1d991dc9b..ad60b223daa06 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -167,7 +167,7 @@ If converters are specified, they will be applied INSTEAD of dtype conversion. engine : {{'c', 'python', 'arrow'}}, optional - Parser engine to use. The C and arrow engines are faster while the python engine is + Parser engine to use. The C and arrow engines are faster, while the python engine is currently more feature-complete. The arrow engine requires ``pyarrow`` as a dependency however. 
converters : dict, optional @@ -508,6 +508,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } + _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, From 6be276db8c7c5e1384bfb45591534176d2f6bfe5 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 10:07:03 -0800 Subject: [PATCH 04/35] remove bloat and more formatting changes --- pandas/io/parsers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ad60b223daa06..6d8764fef385c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -881,8 +881,7 @@ def __init__(self, f, engine=None, **kwds): self._make_engine(self.engine) def close(self): - if self.engine != "arrow": - self._engine.close() + self._engine.close() def _get_options_with_defaults(self, engine): kwds = self.orig_options @@ -1089,7 +1088,7 @@ def _clean_options(self, options, engine): if engine == "arrow": if not is_integer(skiprows) and skiprows is not None: raise ValueError( - "skiprows argument must be integer when using arrow engine" + "skiprows argument must be an integer when using engine='arrow'" ) else: if is_integer(skiprows): From df4fa7e2ac359f7e25031f8f92d312049972d1ec Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 10:25:25 -0800 Subject: [PATCH 05/35] Whatsnew --- doc/source/whatsnew/v1.1.0.rst | 4 +++- pandas/io/parsers.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 920919755dc23..2c4f5dcfbcde8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -42,7 +42,9 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) -- +- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing + if pyarrow>0.11 is installed. However, the pyarrow engine is less feature-complete than its "c" or + "python" counterparts. - .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6d8764fef385c..938bafa780d89 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -708,6 +708,7 @@ def read_fwf( infer_nrows=100, **kwds, ): + r""" Read a table of fixed-width formatted lines into DataFrame. 
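The wrapper reaches pyarrow through ``import_optional_dependency`` rather than a module-level import, which is why pandas keeps working when pyarrow is absent. A simplified sketch of that pattern, not pandas' exact helper (the real one in ``pandas.compat._optional`` also enforces minimum versions, which PATCH 20 later in this series extends to submodule imports):

    # Simplified sketch of the lazy optional-import pattern; version checks omitted.
    import importlib

    def import_optional_dependency(name: str, extra: str = ""):
        try:
            return importlib.import_module(name)
        except ImportError:
            raise ImportError(
                f"Missing optional dependency '{name}'. {extra}"
            ) from None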
@@ -3396,6 +3397,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): + if na_values is None: if keep_default_na: na_values = STR_NA_VALUES From ecaf3fd036d38dfd34e5d9a5de45304dbdfacca4 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 9 Feb 2020 16:35:32 -0800 Subject: [PATCH 06/35] Get tests up and running --- pandas/io/parsers.py | 12 +++++++----- pandas/tests/io/parser/conftest.py | 12 ++++++++++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 371660b19b171..43272ef2cf600 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -508,7 +508,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } - _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, @@ -522,7 +521,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} -_arrow_unsupported = {"skipfooter", "low_memory", "float_precision"} +_arrow_unsupported = {"skipfooter", "low_memory", "float_precision", "chunksize"} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} @@ -708,7 +707,6 @@ def read_fwf( infer_nrows=100, **kwds, ): - r""" Read a table of fixed-width formatted lines into DataFrame. @@ -947,7 +945,12 @@ def _clean_options(self, options, engine): sep = options["delimiter"] delim_whitespace = options["delim_whitespace"] - # C engine not supported yet + # arrow engine not supported yet + if engine == "arrow": + if options["chunksize"] is not None: + fallback_reason = f"the arrow engine does not support chunksize" + engine = "python" + # C and arrow engine not supported yet if engine == "c" or engine == "arrow": if options["skipfooter"] > 0: fallback_reason = f"the {engine} engine does not support skipfooter" @@ -3401,7 +3404,6 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): - if na_values is None: if keep_default_na: na_values = STR_NA_VALUES diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 15967e3be176a..751db1d22e8ae 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -44,6 +44,11 @@ class PythonParser(BaseParser): float_precision_choices = [None] +class ArrowParser(BaseParser): + engine = "arrow" + float_precision_choices = [None] + + @pytest.fixture def csv_dir_path(datapath): """ @@ -63,14 +68,17 @@ def csv1(csv_dir_path): _cParserHighMemory = CParserHighMemory() _cParserLowMemory = CParserLowMemory() _pythonParser = PythonParser() +_arrowParser = ArrowParser() _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_all_parsers = [*_c_parsers_only, *_py_parsers_only] +_arrow_parsers_only = [_arrowParser] +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_arrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids] +_arrow_parser_ids = ["arrow"] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_arrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) From b3c328723bb997a675e31cd8db84d77d75afa4f7 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 10 Feb 2020 07:26:58 -0800 Subject: [PATCH 07/35] Some fixes --- pandas/io/parsers.py | 45 ++++++++++++++++++++++---------------------- 1 file 
changed, 22 insertions(+), 23 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 43272ef2cf600..d3f40a6b9df2b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -947,7 +947,7 @@ def _clean_options(self, options, engine): # arrow engine not supported yet if engine == "arrow": - if options["chunksize"] is not None: + if self.chunksize is not None: fallback_reason = f"the arrow engine does not support chunksize" engine = "python" # C and arrow engine not supported yet @@ -1087,10 +1087,11 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python parser + # c-engine, so only need for python and arrow parsers if engine != "c": if engine == "arrow": if not is_integer(skiprows) and skiprows is not None: + # pyarrow expects skiprows to be passed as an integer raise ValueError( "skiprows argument must be an integer when using engine='arrow'" ) @@ -1131,7 +1132,7 @@ def _make_engine(self, engine="c"): else: raise ValueError( f"Unknown engine: {engine} (valid options " - 'are "c", "python", or "python-fwf")' + 'are "c", "python", "arrow", or "python-fwf")' ) self._engine = klass(self.f, **self.options) @@ -1139,32 +1140,31 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - if isinstance(self._engine, ArrowParserWrapper): + nrows = _validate_integer("nrows", nrows) + if self.engine == "arrow": return self._engine.read(nrows) - else: - nrows = _validate_integer("nrows", nrows) - ret = self._engine.read(nrows) + ret = self._engine.read(nrows) - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) - else: - new_rows = 0 + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) else: - new_rows = len(index) + new_rows = 0 + else: + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows - if self.squeeze and len(df.columns) == 1: - return df[df.columns[0]].copy() - return df + if self.squeeze and len(df.columns) == 1: + return df[df.columns[0]].copy() + return df def _create_index(self, ret): index, columns, col_dict = ret @@ -2178,7 +2178,6 @@ def read(self, nrows=None): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use arrow engine" ) - nrows = _validate_integer("nrows", nrows) table = pyarrow.read_csv( self.src, read_options=pyarrow.ReadOptions( From 474baf4c83ee28330ef38b426f09617d2f8cfc9e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 10 Feb 2020 20:35:38 -0800 Subject: [PATCH 08/35] Add asvs and xfail some tests --- asv_bench/benchmarks/io/csv.py | 10 ++++++++++ pandas/io/parsers.py | 8 +++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9bcd125f56bbb..89c81a937090b 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -254,6 +254,16 @@ def time_read_csv_python_engine(self, sep, 
decimal, float_precision): names=list("abc"), ) + def time_read_csv_arrow_engine(self, sep, decimal, float_precision): + read_csv( + self.data(self.StringIO_input), + sep=sep, + header=None, + engine="arrow", + float_precision=None, + names=list("abc"), + ) + class ReadCSVCategorical(BaseIO): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d3f40a6b9df2b..dd2155d2d735b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -521,7 +521,13 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} -_arrow_unsupported = {"skipfooter", "low_memory", "float_precision", "chunksize"} +_arrow_unsupported = { + "skipfooter", + "low_memory", + "float_precision", + "chunksize", + "comment", +} _python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults: Dict[str, Any] = {} From 2cd993771b6c07a8144c8472c710e164410c8e37 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 19 Feb 2020 16:57:52 -0800 Subject: [PATCH 09/35] address comments --- asv_bench/benchmarks/io/csv.py | 4 +- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/io/parsers.py | 63 +++++++++++++++++++----------- pandas/tests/io/parser/conftest.py | 14 +++---- 4 files changed, 50 insertions(+), 33 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 89c81a937090b..a4e6f94f326ba 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -254,12 +254,12 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_arrow_engine(self, sep, decimal, float_precision): + def time_read_csv_pyarrow_engine(self, sep, decimal, float_precision): read_csv( self.data(self.StringIO_input), sep=sep, header=None, - engine="arrow", + engine="pyarrow", float_precision=None, names=list("abc"), ) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index fc0e486978ffb..297c561557053 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -43,7 +43,7 @@ Other enhancements - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing - if pyarrow>0.11 is installed. However, the pyarrow engine is less feature-complete than its "c" or + if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or "python" counterparts. - diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dd2155d2d735b..59678d675b0b1 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import BufferedIOBase, RawIOBase, StringIO, TextIOWrapper +from io import BufferedIOBase, BytesIO, RawIOBase, StringIO, TextIOWrapper import re import sys from textwrap import fill @@ -166,10 +166,11 @@ to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. -engine : {{'c', 'python', 'arrow'}}, optional - Parser engine to use. The C and arrow engines are faster, while the python engine is - currently more feature-complete. The arrow engine requires ``pyarrow`` +engine : {{'c', 'python', 'pyarrow'}}, optional + Parser engine to use. The C and pyarrow engines are faster, while the python engine + is currently more feature-complete. 
The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. + .. versionchanged(1.1) converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. @@ -521,9 +522,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} -_arrow_unsupported = { +_pyarrow_unsupported = { "skipfooter", - "low_memory", "float_precision", "chunksize", "comment", @@ -951,20 +951,29 @@ def _clean_options(self, options, engine): sep = options["delimiter"] delim_whitespace = options["delim_whitespace"] - # arrow engine not supported yet - if engine == "arrow": - if self.chunksize is not None: - fallback_reason = f"the arrow engine does not support chunksize" - engine = "python" - # C and arrow engine not supported yet - if engine == "c" or engine == "arrow": + # pyarrow engine not supported yet + if engine == "pyarrow": + for option in _pyarrow_unsupported: + if option != "chunksize" and option != "skipfooter": + if options[option] is not None: + fallback_reason = ( + f"the pyarrow engine does not support the {option} argumnet" + ) + engine = "python" + else: + if self.chunksize is not None: + fallback_reason = ( + "the pyarrow engine does not support using chunksize" + ) + # C and pyarrow engine not supported yet + if engine == "c" or "pyarrow": if options["skipfooter"] > 0: fallback_reason = f"the {engine} engine does not support skipfooter" engine = "python" encoding = sys.getfilesystemencoding() or "utf-8" if sep is None and not delim_whitespace: - if engine == "c" or engine == "arrow": + if engine == "c" or engine == "pyarrow": fallback_reason = ( f"the {engine} engine does not support " "sep=None with delim_whitespace=False" @@ -1093,13 +1102,14 @@ def _clean_options(self, options, engine): na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the - # c-engine, so only need for python and arrow parsers + # c-engine, so only need for python and pyarrow parsers if engine != "c": - if engine == "arrow": + if engine == "pyarrow": if not is_integer(skiprows) and skiprows is not None: # pyarrow expects skiprows to be passed as an integer raise ValueError( - "skiprows argument must be an integer when using engine='arrow'" + "skiprows argument must be an integer when using " + "engine='pyarrow'" ) else: if is_integer(skiprows): @@ -2164,7 +2174,7 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): class ArrowParserWrapper(ParserBase): """ - + Wrapper for the pyarrow engine for pd.read_csv() """ def __init__(self, src, **kwds): @@ -2174,12 +2184,13 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - # #2442 - kwds["allow_leading_cols"] = self.index_col is not False + encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" - # GH20529, validate usecol arg before TextReader self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + if isinstance(self.src, StringIO): + self.src = BytesIO(self.src.getvalue().encode(encoding)) + def read(self, nrows=None): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use arrow engine" @@ -2197,12 +2208,18 @@ def read(self, nrows=None): include_columns=self.usecols, column_types=self.kwds.get("dtype") ), ) - if nrows: - table = table[:nrows] + table_width = len(table.column_names) if self.names is None: if self.prefix: 
self.names = [f"{self.prefix}{i}" for i in range(table_width)] + elif self.header is not None: + if self.header == "infer": + header = 0 + else: + header = self.header + self.names = table[header] + del table[header] if self.names: table = table.rename_columns(self.names) return table.to_pandas() diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 751db1d22e8ae..327f87303aeb0 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -44,8 +44,8 @@ class PythonParser(BaseParser): float_precision_choices = [None] -class ArrowParser(BaseParser): - engine = "arrow" +class PyArrowParser(BaseParser): + engine = "pyarrow" float_precision_choices = [None] @@ -68,17 +68,17 @@ def csv1(csv_dir_path): _cParserHighMemory = CParserHighMemory() _cParserLowMemory = CParserLowMemory() _pythonParser = PythonParser() -_arrowParser = ArrowParser() +_pyarrowParser = PyArrowParser() _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_arrow_parsers_only = [_arrowParser] -_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_arrow_parsers_only] +_pyarrow_parsers_only = [_pyarrowParser] +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_arrow_parser_ids = ["arrow"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_arrow_parser_ids] +_pyarrow_parser_ids = ["pyarrow"] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) From 3d15a5660d7779eb7638875a33882b3e9103b190 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 20 Feb 2020 10:57:11 -0800 Subject: [PATCH 10/35] fix typo --- pandas/io/parsers.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 59678d675b0b1..4d31ca3230df6 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1138,7 +1138,7 @@ def __next__(self): def _make_engine(self, engine="c"): if engine == "c": self._engine = CParserWrapper(self.f, **self.options) - elif engine == "arrow": + elif engine == "pyarrow": self._engine = ArrowParserWrapper(self.f, **self.options) else: if engine == "python": @@ -1157,7 +1157,7 @@ def _failover_to_python(self): def read(self, nrows=None): nrows = _validate_integer("nrows", nrows) - if self.engine == "arrow": + if self.engine == "pyarrow": return self._engine.read(nrows) ret = self._engine.read(nrows) @@ -2208,21 +2208,19 @@ def read(self, nrows=None): include_columns=self.usecols, column_types=self.kwds.get("dtype") ), ) - + frame = table.to_pandas() table_width = len(table.column_names) if self.names is None: if self.prefix: self.names = [f"{self.prefix}{i}" for i in range(table_width)] - elif self.header is not None: - if self.header == "infer": - header = 0 - else: - header = self.header - self.names = table[header] - del table[header] + elif self.header is not None and self.header != "infer": + header = self.header + self.names = frame.iloc[header] + frame = frame.drop(header, axis=0) + if self.names: - table = table.rename_columns(self.names) - return table.to_pandas() + frame = frame.rename(self.names, axis="columns") + return frame def TextParser(*args, **kwds): From 98aa134d85044ab84adade39f66639777d971eed Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 29 Feb 2020 08:59:43 -0800 Subject: [PATCH 11/35] some fixes --- pandas/io/parsers.py | 7 ++++--- 1 file 
changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3ab847ebd7e04..dbd55f2015d1c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2195,7 +2195,9 @@ def read(self, nrows=None): table = pyarrow.read_csv( self.src, read_options=pyarrow.ReadOptions( - skip_rows=self.kwds.get("skiprows"), column_names=self.names + skip_rows=self.kwds.get("skiprows"), + column_names=self.names, + autogenerate_column_names=True if self.header != 0 else False, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), @@ -2215,8 +2217,7 @@ def read(self, nrows=None): self.names = frame.iloc[header] frame = frame.drop(header, axis=0) - if self.names: - frame = frame.rename(self.names, axis="columns") + frame = frame.rename(zip(frame.names, self.names), axis="columns") return frame From b9c6d2c0a2b177c12c94b30f7c1395d77d1d0242 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 4 Apr 2020 19:42:14 -0700 Subject: [PATCH 12/35] Fix bug --- pandas/io/parsers.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dbd55f2015d1c..ac7658d5b3772 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -526,6 +526,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "float_precision", "chunksize", "comment", + "nrows", } _python_unsupported = {"low_memory", "float_precision"} @@ -952,7 +953,11 @@ def _clean_options(self, options, engine): # pyarrow engine not supported yet if engine == "pyarrow": for option in _pyarrow_unsupported: - if option != "chunksize" and option != "skipfooter": + if ( + option != "chunksize" + and option != "skipfooter" + and option != "nrows" + ): if options[option] is not None: fallback_reason = ( f"the pyarrow engine does not support the {option} argumnet" @@ -963,6 +968,10 @@ def _clean_options(self, options, engine): fallback_reason = ( "the pyarrow engine does not support using chunksize" ) + if self.nrows is not None: + fallback_reason = ( + "the pyarrow engine does not support using skipfooter" + ) # C and pyarrow engine not supported yet if engine == "c" or "pyarrow": if options["skipfooter"] > 0: @@ -2171,7 +2180,7 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): class ArrowParserWrapper(ParserBase): """ - Wrapper for the pyarrow engine for pd.read_csv() + Wrapper for the pyarrow engine for read_csv() """ def __init__(self, src, **kwds): @@ -2208,16 +2217,22 @@ def read(self, nrows=None): ), ) frame = table.to_pandas() - table_width = len(table.column_names) + num_cols = len(frame.columns) if self.names is None: if self.prefix: - self.names = [f"{self.prefix}{i}" for i in range(table_width)] - elif self.header is not None and self.header != "infer": + self.names = [f"{self.prefix}{i}" for i in range(num_cols)] + frame = frame.rename( + dict(zip(frame.columns, self.names), axis="columns") + ) + elif self.header != 0: header = self.header self.names = frame.iloc[header] frame = frame.drop(header, axis=0) - - frame = frame.rename(zip(frame.names, self.names), axis="columns") + frame = frame.rename( + dict(zip(frame.columns, self.names), axis="columns") + ) + if self.kwds.get("squeeze"): + frame = frame.squeeze() return frame From 7f891a64d8887d69ca435d6b7093a81239ca95f3 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 10 Apr 2020 11:02:05 -0700 Subject: [PATCH 13/35] New benchmark and fix more tests --- asv_bench/benchmarks/io/csv.py | 37 ++++++++++------- pandas/io/parsers.py | 
73 ++++++++++++++++++++++------------ 2 files changed, 71 insertions(+), 39 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index a4e6f94f326ba..047fc1fe5f7f7 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -10,7 +10,6 @@ class ToCSV(BaseIO): - fname = "__test__.csv" params = ["wide", "long", "mixed"] param_names = ["kind"] @@ -43,7 +42,6 @@ def time_frame(self, kind): class ToCSVDatetime(BaseIO): - fname = "__test__.csv" def setup(self): @@ -55,7 +53,6 @@ def time_frame_date_formatting(self): class ToCSVDatetimeBig(BaseIO): - fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] @@ -83,7 +80,6 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): - params = ([True, False], ["custom", "iso8601", "ymd"]) param_names = ["infer_datetime_format", "format"] @@ -108,7 +104,6 @@ def time_read_csv(self, infer_datetime_format, format): class ReadCSVConcatDatetime(StringIORewind): - iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): @@ -126,7 +121,6 @@ def time_read_csv(self): class ReadCSVConcatDatetimeBadDateValue(StringIORewind): - params = (["nan", "0", ""],) param_names = ["bad_date_value"] @@ -144,7 +138,6 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): - fname = "__test__.csv" params = [None, 10000] param_names = ["skiprows"] @@ -190,7 +183,6 @@ def time_read_uint64_na_values(self): class ReadCSVThousands(BaseIO): - fname = "__test__.csv" params = ([",", "|"], [None, ","]) param_names = ["sep", "thousands"] @@ -222,7 +214,6 @@ def time_comment(self): class ReadCSVFloatPrecision(StringIORewind): - params = ([",", ";"], [".", "_"], [None, "high", "round_trip"]) param_names = ["sep", "decimal", "float_precision"] @@ -254,19 +245,38 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_pyarrow_engine(self, sep, decimal, float_precision): + def time_read_csv_arrow(self, sep): + read_csv( + self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + ) + + +class ReadCSVEngine(StringIORewind): + def setup(self): + data = ["A,B,C"] + (["1,2,3"] * 100000) + self.StringIO_input = StringIO("\n".join(data)) + + def time_read_csv_c(self, sep): + read_csv( + self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + ) + + def time_read_csv_arrow(self, sep): + read_csv( + self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), + ) + + def time_read_csv_python_engine(self, sep): read_csv( self.data(self.StringIO_input), sep=sep, header=None, - engine="pyarrow", - float_precision=None, + engine="python", names=list("abc"), ) class ReadCSVCategorical(BaseIO): - fname = "__test__.csv" def setup(self): @@ -335,7 +345,6 @@ def time_read_csv_cached(self, do_cache): class ReadCSVMemoryGrowth(BaseIO): - chunksize = 20 num_rows = 1000 fname = "__test__.csv" diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f17c1008e29a5..175dccf0633df 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import BytesIO, StringIO, TextIOWrapper +from io import StringIO, TextIOBase, TextIOWrapper import itertools import re import sys @@ -172,7 +172,7 @@ Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. - .. versionchanged(1.1) + .. 
versionchanged:: (1.1) converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. @@ -1167,27 +1167,28 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = _validate_integer("nrows", nrows) if self.engine == "pyarrow": - return self._engine.read(nrows) - ret = self._engine.read(nrows) + df = self._engine.read() + else: + nrows = _validate_integer("nrows", nrows) + ret = self._engine.read(nrows) - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) - if index is None: - if col_dict: - # Any column is actually fine: - new_rows = len(next(iter(col_dict.values()))) - index = RangeIndex(self._currow, self._currow + new_rows) + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 else: - new_rows = 0 - else: - new_rows = len(index) + new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + df = DataFrame(col_dict, columns=columns, index=index) - self._currow += new_rows + self._currow += new_rows if self.squeeze and len(df.columns) == 1: return df[df.columns[0]].copy() @@ -2231,6 +2232,19 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): return values +class BytesIOWrapper: + def __init__(self, string_buffer, encoding="utf-8"): + self.string_buffer = string_buffer + self.encoding = encoding + + def __getattr__(self, attr): + return getattr(self.string_buffer, attr) + + def read(self, size=-1): + content = self.string_buffer.read(size) + return content.encode(self.encoding) + + class ArrowParserWrapper(ParserBase): """ Wrapper for the pyarrow engine for read_csv() @@ -2247,10 +2261,10 @@ def __init__(self, src, **kwds): self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - if isinstance(self.src, StringIO): - self.src = BytesIO(self.src.getvalue().encode(encoding)) + if isinstance(self.src, TextIOBase): + self.src = BytesIOWrapper(self.src, encoding=encoding) - def read(self, nrows=None): + def read(self): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use arrow engine" ) @@ -2259,7 +2273,9 @@ def read(self, nrows=None): read_options=pyarrow.ReadOptions( skip_rows=self.kwds.get("skiprows"), column_names=self.names, - autogenerate_column_names=True if self.header != 0 else False, + autogenerate_column_names=True + if self.header != 0 or self.kwds.get("skiprows") != set() + else False, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), @@ -2277,15 +2293,22 @@ def read(self, nrows=None): frame = frame.rename( dict(zip(frame.columns, self.names), axis="columns") ) - elif self.header != 0: + elif self.header is not None and self.header != 0: header = self.header self.names = frame.iloc[header] frame = frame.drop(header, axis=0) frame = frame.rename( - dict(zip(frame.columns, self.names), axis="columns") + columns=dict(zip(frame.columns, self.names), axis="columns") ) - if self.kwds.get("squeeze"): - frame = frame.squeeze() + elif self.header is None: + self.names = range(len(frame.columns)) + frame = frame.rename( + columns=dict(zip(frame.columns, self.names), axis="columns") + ) + + index_col = self.kwds.get("index_col")[0] # flatten list w/ 1 elem + if index_col is not None: + 
frame.set_index(frame.columns[index_col], drop=True, inplace=True) return frame From 23425f7be4840ac48ff35058ae9a64d064628537 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 10 Apr 2020 15:27:33 -0700 Subject: [PATCH 14/35] More cleanups --- asv_bench/benchmarks/io/csv.py | 22 +++++++--------------- doc/source/whatsnew/v1.1.0.rst | 6 +++--- pandas/io/parsers.py | 7 +++---- 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 047fc1fe5f7f7..b7d7c4e8c120a 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -245,7 +245,7 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): names=list("abc"), ) - def time_read_csv_arrow(self, sep): + def time_read_csv_arrow(self, sep, decimal, float_precision): read_csv( self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), ) @@ -256,23 +256,15 @@ def setup(self): data = ["A,B,C"] + (["1,2,3"] * 100000) self.StringIO_input = StringIO("\n".join(data)) - def time_read_csv_c(self, sep): - read_csv( - self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), - ) + def time_read_csv_c(self): + read_csv(self.data(self.StringIO_input)) - def time_read_csv_arrow(self, sep): - read_csv( - self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"), - ) + def time_read_csv_arrow(self): + read_csv(self.data(self.StringIO_input), engine="pyarrow") - def time_read_csv_python_engine(self, sep): + def time_read_csv_python_engine(self): read_csv( - self.data(self.StringIO_input), - sep=sep, - header=None, - engine="python", - names=list("abc"), + self.data(self.StringIO_input), engine="python", ) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4c44e35169ba7..b60a79a239628 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -88,9 +88,6 @@ Other enhancements - :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) -- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing - if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or - "python" counterparts. (:issue:`23697`) .. --------------------------------------------------------------------------- @@ -412,6 +409,9 @@ I/O - Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) - Bug in :meth:`read_csv` was raising an ``IndexError`` when header=None and 2 extra data columns - Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`) +- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing + if pyarrow>0.13 is installed. However, the pyarrow engine is less feature-complete than its "c" or + "python" counterparts. 
(:issue:`23697`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 175dccf0633df..455b7f748102d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -172,6 +172,7 @@ Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. + .. versionchanged:: (1.1) converters : dict, optional Dict of functions for converting values in certain columns. Keys can either @@ -2266,16 +2267,14 @@ def __init__(self, src, **kwds): def read(self): pyarrow = import_optional_dependency( - "pyarrow.csv", extra="pyarrow is required to use arrow engine" + "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) table = pyarrow.read_csv( self.src, read_options=pyarrow.ReadOptions( skip_rows=self.kwds.get("skiprows"), column_names=self.names, - autogenerate_column_names=True - if self.header != 0 or self.kwds.get("skiprows") != set() - else False, + autogenerate_column_names=True if self.header != 0 else False, ), parse_options=pyarrow.ParseOptions( delimiter=self.kwds.get("delimiter"), From 01c03942b61f4ab38cf4712c4d078a52c4f27939 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 10 Apr 2020 19:46:34 -0700 Subject: [PATCH 15/35] Formatting fixes and typo correction --- asv_bench/benchmarks/io/csv.py | 9 +++++++++ doc/source/whatsnew/v1.1.0.rst | 2 ++ pandas/io/parsers.py | 6 +++--- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index b7d7c4e8c120a..8dec39091e322 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -10,6 +10,7 @@ class ToCSV(BaseIO): + fname = "__test__.csv" params = ["wide", "long", "mixed"] param_names = ["kind"] @@ -42,6 +43,7 @@ def time_frame(self, kind): class ToCSVDatetime(BaseIO): + fname = "__test__.csv" def setup(self): @@ -53,6 +55,7 @@ def time_frame_date_formatting(self): class ToCSVDatetimeBig(BaseIO): + fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] @@ -80,6 +83,7 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): + params = ([True, False], ["custom", "iso8601", "ymd"]) param_names = ["infer_datetime_format", "format"] @@ -104,6 +108,7 @@ def time_read_csv(self, infer_datetime_format, format): class ReadCSVConcatDatetime(StringIORewind): + iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): @@ -121,6 +126,7 @@ def time_read_csv(self): class ReadCSVConcatDatetimeBadDateValue(StringIORewind): + params = (["nan", "0", ""],) param_names = ["bad_date_value"] @@ -138,6 +144,7 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): + fname = "__test__.csv" params = [None, 10000] param_names = ["skiprows"] @@ -183,6 +190,7 @@ def time_read_uint64_na_values(self): class ReadCSVThousands(BaseIO): + fname = "__test__.csv" params = ([",", "|"], [None, ","]) param_names = ["sep", "thousands"] @@ -214,6 +222,7 @@ def time_comment(self): class ReadCSVFloatPrecision(StringIORewind): + params = ([",", ";"], [".", "_"], [None, "high", "round_trip"]) param_names = ["sep", "decimal", "float_precision"] diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 690df648ceada..1704f3c096801 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -527,6 +527,8 @@ I/O - :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing if pyarrow>0.13 is 
installed. However, the pyarrow engine is less feature-complete than its "c" or "python" counterparts. (:issue:`23697`) + + Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 455b7f748102d..0cf148366cc1c 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -975,7 +975,7 @@ def _clean_options(self, options, engine): ) if self.nrows is not None: fallback_reason = ( - "the pyarrow engine does not support using skipfooter" + "the pyarrow engine does not support using nrows" ) # C and pyarrow engine not supported yet if engine == "c" or "pyarrow": @@ -2305,9 +2305,9 @@ def read(self): columns=dict(zip(frame.columns, self.names), axis="columns") ) - index_col = self.kwds.get("index_col")[0] # flatten list w/ 1 elem + index_col = self.kwds.get("index_col") # need to flatten since returns list if index_col is not None: - frame.set_index(frame.columns[index_col], drop=True, inplace=True) + frame.set_index(frame.columns[index_col[0]], drop=True, inplace=True) return frame From ba5620ff84a14baa0814f96d2499b652a30afdd8 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 11 Apr 2020 17:22:45 -0700 Subject: [PATCH 16/35] skip pyarrow tests if not installed --- asv_bench/benchmarks/io/csv.py | 1 + pandas/tests/io/parser/conftest.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 8dec39091e322..fef4fee047862 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -346,6 +346,7 @@ def time_read_csv_cached(self, do_cache): class ReadCSVMemoryGrowth(BaseIO): + chunksize = 20 num_rows = 1000 fname = "__test__.csv" diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 327f87303aeb0..87a34d728bc60 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,4 +1,5 @@ import os +import pkgutil from typing import List, Optional import pytest @@ -73,12 +74,17 @@ def csv1(csv_dir_path): _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] _pyarrow_parsers_only = [_pyarrowParser] -_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] + +if pkgutil.find_loader("pyarrow"): + _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] + _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] +else: + _all_parsers = [*_c_parsers_only, *_py_parsers_only] + _all_parser_ids = [*_c_parser_ids, *_py_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) From 2570c823f28eb722435929dd86ccfdfb2ff1a37b Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 11 Apr 2020 17:31:51 -0700 Subject: [PATCH 17/35] Address comments --- pandas/io/parsers.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 0cf148366cc1c..235cefd82f2d5 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -173,7 +173,8 @@ is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 as a dependency however. - .. versionchanged:: (1.1) + .. versionchanged:: 1.1 + The "pyarrow" engine was added. converters : dict, optional Dict of functions for converting values in certain columns. 
Keys can either be integers or column labels.
@@ -958,11 +959,7 @@ def _clean_options(self, options, engine):
         # pyarrow engine not supported yet
         if engine == "pyarrow":
             for option in _pyarrow_unsupported:
-                if (
-                    option != "chunksize"
-                    and option != "skipfooter"
-                    and option != "nrows"
-                ):
+                if option not in ["chunksize", "skipfooter", "nrows"]:
                     if options[option] is not None:
                         fallback_reason = (
                             f"the pyarrow engine does not support the {option} argument"
                         )
@@ -2274,11 +2271,12 @@ def read(self):
             read_options=pyarrow.ReadOptions(
                 skip_rows=self.kwds.get("skiprows"),
                 column_names=self.names,
-                autogenerate_column_names=True if self.header != 0 else False,
+                autogenerate_column_names=False if self.header == 0 else True,
             ),
             parse_options=pyarrow.ParseOptions(
                 delimiter=self.kwds.get("delimiter"),
                 quote_char=self.kwds.get("quotechar"),
+                ignore_empty_lines=self.kwds.get("skip_blank_lines"),
             ),
             convert_options=pyarrow.ConvertOptions(
                 include_columns=self.usecols, column_types=self.kwds.get("dtype")
             ),
         )
@@ -2289,21 +2287,15 @@ def read(self):
         if self.names is None:
             if self.prefix:
                 self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
-                frame = frame.rename(
-                    dict(zip(frame.columns, self.names), axis="columns")
-                )
+                frame.columns = self.names
             elif self.header is not None and self.header != 0:
                 header = self.header
                 self.names = frame.iloc[header]
                 frame = frame.drop(header, axis=0)
-                frame = frame.rename(
-                    columns=dict(zip(frame.columns, self.names), axis="columns")
-                )
+                frame.columns = self.names
             elif self.header is None:
-                self.names = range(len(frame.columns))
-                frame = frame.rename(
-                    columns=dict(zip(frame.columns, self.names), axis="columns")
-                )
+                self.names = range(num_cols)
+                frame.columns = self.names

         index_col = self.kwds.get("index_col")  # need to flatten since returns list
         if index_col is not None:
             frame.set_index(frame.columns[index_col[0]], drop=True, inplace=True)
         return frame

From b3a1f6628879b8df819c82bc75686d6fd89f42d2 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Tue, 14 Apr 2020 14:24:28 -0700
Subject: [PATCH 18/35] Get some more tests to pass

---
 asv_bench/benchmarks/io/csv.py        |  2 +-
 pandas/io/parsers.py                  | 45 ++++++++++++++++-----------
 pandas/tests/io/parser/test_common.py |  1 +
 3 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/asv_bench/benchmarks/io/csv.py index fef4fee047862..55bc8d35af432 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -262,7 +262,7 @@ class ReadCSVEngine(StringIORewind):
     def setup(self):
-        data = ["A,B,C"] + (["1,2,3"] * 100000)
+        data = ["A,B,C"] + (["1,2,3"] * 1000000)
         self.StringIO_input = StringIO("\n".join(data))

diff --git a/pandas/io/parsers.py index 235cefd82f2d5..444582cbe723c 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -531,6 +531,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
     "chunksize",
     "comment",
     "nrows",
+    "thousands",
 }
 _python_unsupported = {"low_memory", "float_precision"}
@@ -959,12 +960,11 @@ def _clean_options(self, options, engine):
         # pyarrow engine not supported yet
         if engine == "pyarrow":
             for option in _pyarrow_unsupported:
-                if option not in ["chunksize", "skipfooter", "nrows"]:
+                if option not in ["chunksize", "nrows"]:
                     if options[option] is not None:
                         fallback_reason = (
                             f"the pyarrow engine does not support the {option} argument"
                         )
-                        engine = "python"
             else:
                 if self.chunksize is not None:
                     fallback_reason = (
                         "the pyarrow engine does not support using chunksize"
                     )
                 if self.nrows is not None:
                     fallback_reason = (
                         "the pyarrow engine does not support using nrows"
                     )
-        # C and pyarrow engine not supported yet
-        if engine == "c" or "pyarrow":
+        # C engine not supported yet
+        if engine == "c":
             if options["skipfooter"] > 0:
-                fallback_reason = f"the {engine} engine does not support skipfooter"
+                fallback_reason = f"the 'c' engine does not support skipfooter"
                 engine = "python"
         encoding = sys.getfilesystemencoding() or "utf-8"
@@ -1157,7 +1157,7 @@ def _make_engine(self, engine="c"):
         else:
             raise ValueError(
                 f"Unknown engine: {engine} (valid options "
-                'are "c", "python", "arrow", or "python-fwf")'
+                'are "c", "python", "pyarrow", or "python-fwf")'
             )
         self._engine = klass(self.f, **self.options)
@@ -2266,13 +2266,24 @@ def read(self):
         pyarrow = import_optional_dependency(
             "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine"
         )
-        table = pyarrow.read_csv(
-            self.src,
-            read_options=pyarrow.ReadOptions(
+        try:
+            read_options = pyarrow.ReadOptions(
                 skip_rows=self.kwds.get("skiprows"),
                 autogenerate_column_names=False if self.header == 0 else True,
-            ),
+            )
+        except TypeError as e:
+            msg = "__init__() got an unexpected keyword argument"
+            if msg in str(e):
+                raise ImportError(
+                    "Pyarrow version >= 0.15.0 is needed in order "
+                    "to use skiprows kwarg with engine=pyarrow. "
+                    "Please upgrade Pyarrow or switch engines."
+                )
+            else:
+                raise e
+        table = pyarrow.read_csv(
+            self.src,
+            read_options=read_options,
             parse_options=pyarrow.ParseOptions(
                 delimiter=self.kwds.get("delimiter"),
                 quote_char=self.kwds.get("quotechar"),
@@ -2298,17 +2298,13 @@ def read(self):
         if self.names is None:
             if self.prefix:
                 self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
-            elif self.header is not None and self.header != 0:
-                self.names = frame.iloc[self.header]
-                frame = frame.drop(self.header, axis=0)
+            elif self.header is not None and self.header != 0:
+                self.names = frame.iloc[self.header]
+                frame = frame.drop(self.header, axis=0)
             elif self.header is None:
                 self.names = range(num_cols)
-        frame.columns = self.names
-        index_col = self.index_col  # need to flatten since returns list
+        frame.columns = self.names
+        index_col = self.index_col  # need to flatten since returns list
         if index_col is not None:
             frame.set_index(frame.columns[index_col[0]], drop=True, inplace=True)
         return frame

diff --git a/pandas/tests/io/parser/test_common.py index 5bf9587a6ca22..f27178cdc429f 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -63,6 +63,7 @@ def _set_noconvert_columns(self):
         "parse_dates": parse_dates,
         "delimiter": ",",
     }
+    parser.engine = "c"
     parser._engine = MyCParserWrapper(StringIO(data), **parser.options)
     result = parser.read()

From d46ceed07a5197cc24748e09a92c3b8199ce7fa3 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Thu, 16 Apr 2020 20:20:22 -0700
Subject: [PATCH 19/35] Fix some bugs and cleanups

---
 pandas/io/parsers.py | 113 ++++++++++++++++++++++++++++++-----------
 1 file changed, 85 insertions(+), 28 deletions(-)

diff --git a/pandas/io/parsers.py index 444582cbe723c..39ee43f905950 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -532,6 +532,24 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
     "comment",
     "nrows",
     "thousands",
+    "memory_map",
+    "dialect",
+    "warn_bad_lines",
+    "error_bad_lines",
+    "delim_whitespace",
+    "quoting",
+    "lineterminator",
+    "converters",
+    "decimal",
+    "iterator",
+    "cache_dates",
+    "dayfirst",
+    "keep_date_col",
+    "infer_datetime_format",
+    "verbose",
+    "skipinitialspace",
+    "date_parser",
 }
 _python_unsupported = {"low_memory", "float_precision"}
@@ -902,6 +920,16 @@ def _get_options_with_defaults(self, engine):
         for argname, default in _parser_defaults.items():
             value = kwds.get(argname, default)
+            if argname in _pyarrow_unsupported:
+                if engine == "pyarrow" and value != default:
+                    raise ValueError(
+                        f"The {repr(argname)} option is not supported with the "
+                        f"{repr(engine)} engine"
+                    )
+            if argname == "iterator" and engine == "pyarrow":
+                raise ValueError(
+                    "The iterator option is not supported with the pyarrow engine"
+                )
             # see gh-12935
             if argname == "mangle_dupe_cols" and not value:
                 raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
@@ -957,27 +985,10 @@ def _clean_options(self, options, engine):
         sep = options["delimiter"]
         delim_whitespace = options["delim_whitespace"]

-        # pyarrow engine not supported yet
-        if engine == "pyarrow":
-            for option in _pyarrow_unsupported:
-                if option not in ["chunksize", "nrows"]:
-                    if options[option] is not None:
-                        fallback_reason = (
-                            f"the pyarrow engine does not support the {option} argument"
-                        )
-            else:
-                if self.chunksize is not None:
-                    fallback_reason = (
-                        "the pyarrow engine does not support using chunksize"
-                    )
-                if self.nrows is not None:
-                    fallback_reason = (
-                        "the pyarrow engine does not support using nrows"
-                    )
         # C engine not supported yet
         if engine == "c":
             if options["skipfooter"] > 0:
                 fallback_reason = "the 'c' engine does not support skipfooter"
                 engine = "python"
@@ -2251,13 +2262,16 @@ class ArrowParserWrapper(ParserBase):
     def __init__(self, src, **kwds):
         self.kwds = kwds
         self.src = src
-        kwds = kwds.copy()
+        # kwds = kwds.copy()

         ParserBase.__init__(self, kwds)

         encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8"

         self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
+        self.na_values = _clean_na_values(
+            kwds["na_values"], keep_default_na=kwds["keep_default_na"]
+        )

         if isinstance(self.src, TextIOBase):
             self.src = BytesIOWrapper(self.src, encoding=encoding)
@@ -2268,8 +2282,7 @@ def read(self):
         )
         try:
             read_options = pyarrow.ReadOptions(
-                skip_rows=self.kwds.get("skiprows"), autogenerate_column_names=False if self.header == 0 else True,
+                skip_rows=self.kwds.get("skiprows"), autogenerate_column_names=True,
             )
         except TypeError as e:
             msg = "__init__() got an unexpected keyword argument"
             if msg in str(e):
                 raise ImportError(
                     "Pyarrow version >= 0.15.0 is needed in order "
                     "to use skiprows kwarg with engine=pyarrow. "
                     "Please upgrade Pyarrow or switch engines."
                 )
             else:
                 raise e
         table = pyarrow.read_csv(
             self.src,
             read_options=read_options,
             parse_options=pyarrow.ParseOptions(
                 delimiter=self.kwds.get("delimiter"),
                 quote_char=self.kwds.get("quotechar"),
+                escape_char=self.kwds.get("escapechar"),
                 ignore_empty_lines=self.kwds.get("skip_blank_lines"),
             ),
             convert_options=pyarrow.ConvertOptions(
-                include_columns=self.usecols,
+                include_columns=self.usecols,
+                null_values=self.kwds.get("na_values"),
+                true_values=self.kwds.get("true_values"),
+                false_values=self.kwds.get("false_values"),
             ),
         )
         frame = table.to_pandas()
         num_cols = len(frame.columns)
         if self.names is None:
             if self.prefix:
                 self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
             elif self.header is not None:
                 self.names = frame.iloc[self.header].tolist()
                 frame.drop(range(self.header + 1), axis=0, inplace=True)
                 frame.reset_index(drop=True, inplace=True)
             elif self.header is None:
                 self.names = range(num_cols)
         frame.columns = self.names
         if self.index_col is not None:
             index_col = [frame.columns[i] for i in self.index_col]
             frame.set_index(index_col, drop=True, inplace=True)
         if self.kwds.get("dtype") is not None:
             frame = frame.astype(self.kwds.get("dtype"))
         else:
             frame = frame.infer_objects()
         return frame

+    def _clean_na_values(na_values, keep_default_na=True):
+        if na_values is None:
+            if keep_default_na:
+                na_values = STR_NA_VALUES
+            else:
+                na_values = set()
+            na_fvalues = set()
+        elif isinstance(na_values, dict):
+            old_na_values = na_values.copy()
+            na_values = {}  # Prevent aliasing.
+
+            # Convert the values in the na_values dictionary
+            # into array-likes for further use. This is also
+            # where we append the default NaN values, provided
+            # that `keep_default_na=True`.
+            for k, v in old_na_values.items():
+                if not is_list_like(v):
+                    v = [v]
+
+                if keep_default_na:
+                    v = set(v) | STR_NA_VALUES
+
+                na_values[k] = v
+            na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
+        else:
+            if not is_list_like(na_values):
+                na_values = [na_values]
+            na_values = _stringify_na_values(na_values)
+            if keep_default_na:
+                na_values = na_values | STR_NA_VALUES
+
+            na_fvalues = _floatify_na_values(na_values)
+
+        return na_values, na_fvalues

From 637845922e829e9a6bc97c577b064935591f99ac Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Tue, 19 May 2020 20:40:57 -0700
Subject: [PATCH 20/35] Perform version checks for submodule imports too

---
 pandas/compat/_optional.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pandas/compat/_optional.py index 7e253a52a9c00..139641f300980 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -1,5 +1,6 @@
 import distutils.version
 import importlib
+import sys
 import types
 import warnings
@@ -92,10 +93,16 @@ def import_optional_dependency(
             raise ImportError(msg) from None
         else:
             return None
-
+    # Grab parent module if submodule being imported
+    parent = name.split(".")[0]
+    if parent != name:
+        name = parent
+        module_to_get = sys.modules[name]
+    else:
+        module_to_get = module
     minimum_version = VERSIONS.get(name)
     if minimum_version:
-        version = _get_version(module)
+        version = _get_version(module_to_get)
         if distutils.version.LooseVersion(version) < minimum_version:
             assert on_version in {"warn", "raise", "ignore"}
             msg = (

From 9d648821b047419b9541381ad50c419f9f571847 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Tue, 19 May 2020 20:44:52 -0700
Subject: [PATCH 21/35] Refresh with newer pyarrow

---
 asv_bench/benchmarks/io/csv.py |  19 ++++--
 pandas/io/parsers.py           | 116 +++++++++++++--------------------
 2 files changed, 59 insertions(+), 76 deletions(-)

diff --git a/asv_bench/benchmarks/io/csv.py
@@ class ReadCSVEngine(StringIORewind):
     def setup(self):
-        data = ["A,B,C"] + (["1,2,3"] * 1000000)
+        data = ["A,B,C"] + (["1,2,3"] * 100000)
         self.StringIO_input = StringIO("\n".join(data))
+        # simulate reading from file
+
self.BytesIO_input = self.StringIO_input.read().encode("utf-8") - def time_read_csv_c(self): + def time_read_stringcsv_c(self): read_csv(self.data(self.StringIO_input)) - def time_read_csv_arrow(self): + def time_read_stringcsv_arrow(self): read_csv(self.data(self.StringIO_input), engine="pyarrow") - def time_read_csv_python_engine(self): + def time_read_stringcsv_python_engine(self): read_csv( self.data(self.StringIO_input), engine="python", ) + def time_read_bytescsv_c(self): + read_csv(self.BytesIO_input) + + def time_read_bytescsv_arrow(self): + read_csv(self.BytesIO_input, engine="pyarrow") + + def time_read_bytescsv_python_engine(self): + read_csv(self.BytesIO_input, engine="python") + class ReadCSVCategorical(BaseIO): fname = "__test__.csv" diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 39ee43f905950..40dbfc4c4956d 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -170,7 +170,7 @@ of dtype conversion. engine : {{'c', 'python', 'pyarrow'}}, optional Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.13 + is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.15 as a dependency however. .. versionchanged:: 1.1 @@ -919,7 +919,6 @@ def _get_options_with_defaults(self, engine): for argname, default in _parser_defaults.items(): value = kwds.get(argname, default) - if argname in _pyarrow_unsupported: if engine == "pyarrow" and value != default: raise ValueError( @@ -928,7 +927,7 @@ def _get_options_with_defaults(self, engine): ) if argname == "iterator" and engine == "pyarrow": raise ValueError( - "The iterator option is not supported with the" "pyarrow engine" + "The iterator option is not supported with the pyarrow engine" ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: @@ -2262,17 +2261,22 @@ class ArrowParserWrapper(ParserBase): def __init__(self, src, **kwds): self.kwds = kwds self.src = src - # kwds = kwds.copy() ParserBase.__init__(self, kwds) encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8" self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - self.na_values = _clean_na_values( - kwds["na_values"], keep_default_na=kwds["keep_default_na"] + na_values = kwds["na_values"] + if isinstance(na_values, dict): + raise ValueError( + "The pyarrow engine doesn't support passing a dict for na_values" + ) + self.na_values = list( + _clean_na_values( + kwds["na_values"], keep_default_na=kwds["keep_default_na"] + )[0] ) - if isinstance(self.src, TextIOBase): self.src = BytesIOWrapper(self.src, encoding=encoding) @@ -2280,48 +2284,51 @@ def read(self): pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) + kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} + # these are kwargs passed to pyarrow + parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"} + convertoptions = { + "include_columns", + "null_values", + "true_values", + "false_values", + } + parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions} + convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} + read_options = pyarrow.ReadOptions(autogenerate_column_names=True) + headerexists = True if self.header is not None and self.header >= 0 else False try: - read_options = pyarrow.ReadOptions( - skip_rows=self.kwds.get("skiprows"), autogenerate_column_names=True, - ) + 
skiprows = self.kwds.get("skiprows") + if skiprows is not None: + read_options = pyarrow.ReadOptions(skip_rows=skiprows) + elif self.header >= 0: + read_options = pyarrow.ReadOptions(skip_rows=self.header) except TypeError as e: msg = "__init__() got an unexpected keyword argument" if msg in str(e): raise ImportError( - "Pyarrow version >= 0.15.0 is needed in order " - "to use skiprows kwarg with engine=pyarrow. " - "Please upgrade Pyarrow or switch engines." + "pyarrow version >= 0.15.0 is required to use " + "read_csv with engine='pyarrow'" ) - else: - raise e table = pyarrow.read_csv( self.src, read_options=read_options, - parse_options=pyarrow.ParseOptions( - delimiter=self.kwds.get("delimiter"), - quote_char=self.kwds.get("quotechar"), - escape_char=self.kwds.get("escapechar"), - ignore_empty_lines=self.kwds.get("skip_blank_lines"), - ), - convert_options=pyarrow.ConvertOptions( - include_columns=self.usecols, - null_values=self.kwds.get("na_values"), - true_values=self.kwds.get("true_values"), - false_values=self.kwds.get("false_values"), - ), + parse_options=pyarrow.ParseOptions(**parse_options), + convert_options=pyarrow.ConvertOptions(**convert_options), ) frame = table.to_pandas() num_cols = len(frame.columns) - if self.names is None: - if self.prefix: - self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - elif self.header is not None: - self.names = frame.iloc[self.header].tolist() - frame.drop(range(self.header + 1), axis=0, inplace=True) - frame.reset_index(drop=True, inplace=True) - elif self.header is None: - self.names = range(num_cols) - frame.columns = self.names + if not headerexists: + if self.names is None: + if self.prefix is not None: + self.names = [f"{self.prefix}{i}" for i in range(num_cols)] + # elif self.header is not None: + # self.names = frame.iloc[self.header].tolist() + # frame.drop(range(self.header + 1), axis=0, inplace=True) + # frame.reset_index(drop=True, inplace=True) + elif self.header is None: + self.names = range(num_cols) + frame.columns = self.names if self.index_col is not None: index_col = [frame.columns[i] for i in self.index_col] frame.set_index(index_col, drop=True, inplace=True) @@ -2331,41 +2338,6 @@ def read(self): frame = frame.infer_objects() return frame - def _clean_na_values(na_values, keep_default_na=True): - if na_values is None: - if keep_default_na: - na_values = STR_NA_VALUES - else: - na_values = set() - na_fvalues = set() - elif isinstance(na_values, dict): - old_na_values = na_values.copy() - na_values = {} # Prevent aliasing. - - # Convert the values in the na_values dictionary - # into array-likes for further use. This is also - # where we append the default NaN values, provided - # that `keep_default_na=True`. 
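The ``try``/``except TypeError`` above is a feature-detection probe: pyarrow older than 0.15.0 does not know ``skip_rows``, and the resulting ``TypeError`` is translated into an actionable ``ImportError``. A minimal sketch of the same pattern, with an illustrative factory and message rather than the pandas code:

    def build_options(factory, **kwargs):
        try:
            return factory(**kwargs)
        except TypeError as err:
            # an unknown keyword means the installed library predates the feature
            if "unexpected keyword argument" in str(err):
                raise ImportError(
                    "a newer version of the dependency is required for these options"
                ) from err
            raise

    print(build_options(dict, skip_rows=1))  # dict takes any kwargs: {'skip_rows': 1}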
- for k, v in old_na_values.items(): - if not is_list_like(v): - v = [v] - - if keep_default_na: - v = set(v) | STR_NA_VALUES - - na_values[k] = v - na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} - else: - if not is_list_like(na_values): - na_values = [na_values] - na_values = _stringify_na_values(na_values) - if keep_default_na: - na_values = na_values | STR_NA_VALUES - - na_fvalues = _floatify_na_values(na_values) - - return na_values, na_fvalues - def TextParser(*args, **kwds): """ From 93382b421cf62c2ad2f1ede65bd702e2912e8db6 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 21 May 2020 11:55:20 -0700 Subject: [PATCH 22/35] Start xfailing tests --- asv_bench/benchmarks/io/csv.py | 4 +-- pandas/io/parsers.py | 4 --- pandas/tests/io/parser/conftest.py | 19 ++++++++-- pandas/tests/io/parser/test_common.py | 42 +++++++++++----------- pandas/tests/io/parser/test_compression.py | 15 +++++--- 5 files changed, 50 insertions(+), 34 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 52d88d20b6d52..6e166ec315df6 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -1,4 +1,4 @@ -from io import StringIO +from io import BytesIO, StringIO import random import string @@ -265,7 +265,7 @@ def setup(self): data = ["A,B,C"] + (["1,2,3"] * 100000) self.StringIO_input = StringIO("\n".join(data)) # simulate reading from file - self.BytesIO_input = self.StringIO_input.read().encode("utf-8") + self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) def time_read_stringcsv_c(self): read_csv(self.data(self.StringIO_input)) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5bcd9253abb72..e64ca0651e7c7 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2322,10 +2322,6 @@ def read(self): if self.names is None: if self.prefix is not None: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - # elif self.header is not None: - # self.names = frame.iloc[self.header].tolist() - # frame.drop(range(self.header + 1), axis=0, inplace=True) - # frame.reset_index(drop=True, inplace=True) elif self.header is None: self.names = range(num_cols) frame.columns = self.names diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 87a34d728bc60..8f473bded9225 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,7 +1,8 @@ +import distutils.version import os -import pkgutil from typing import List, Optional +import pkg_resources import pytest from pandas import read_csv, read_table @@ -79,7 +80,10 @@ def csv1(csv_dir_path): _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -if pkgutil.find_loader("pyarrow"): +pyarrow_version = pkg_resources.get_distribution("pyarrow").version +if ( + distutils.version.LooseVersion(pyarrow_version) > "0.15.0" +): # TODO remove this if block once required pyarrow>0.15.0 _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] else: @@ -135,3 +139,14 @@ def encoding_fmt(request): Fixture for all possible string formats of a UTF encoding. """ return request.param + + +@pytest.fixture +def pyarrow_xfail(request): + """ + Fixture that xfails a test if the engine is pyarrow. 
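The collection gate above reads the installed version with ``pkg_resources`` and compares it via ``distutils.version.LooseVersion``. Note that the strict ``>`` also excludes 0.15.0 itself; a later commit below switches to ``pytest.importorskip``. A self-contained sketch of the comparison (version strings illustrative):

    import distutils.version

    def new_enough(installed: str, minimum: str = "0.15.0") -> bool:
        # LooseVersion compares dotted version strings component-wise
        return distutils.version.LooseVersion(installed) >= minimum

    print(new_enough("0.15.1"))  # True
    print(new_enough("0.14.0"))  # False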
+ """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + if parser.engine == "pyarrow": + pytest.xfail("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b6987dae5ed2b..e0b6d70b607d6 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -70,7 +70,7 @@ def _set_noconvert_columns(self): tm.assert_frame_equal(result, expected) -def test_empty_decimal_marker(all_parsers): +def test_empty_decimal_marker(all_parsers, pyarrow_xfail): data = """A|B|C 1|2,334|5 10|13|10. @@ -83,7 +83,7 @@ def test_empty_decimal_marker(all_parsers): parser.read_csv(StringIO(data), decimal="") -def test_bad_stream_exception(all_parsers, csv_dir_path): +def test_bad_stream_exception(all_parsers, csv_dir_path, pyarrow_xfail): # see gh-13652 # # This test validates that both the Python engine and C engine will @@ -169,7 +169,7 @@ def test_squeeze(all_parsers): assert not result._is_view -def test_malformed(all_parsers): +def test_malformed(all_parsers, pyarrow_xfail): # see gh-6607 parser = all_parsers data = """ignore @@ -184,7 +184,7 @@ def test_malformed(all_parsers): @pytest.mark.parametrize("nrows", [5, 3, None]) -def test_malformed_chunks(all_parsers, nrows): +def test_malformed_chunks(all_parsers, nrows, pyarrow_xfail): data = """ignore A,B,C skip @@ -203,7 +203,7 @@ def test_malformed_chunks(all_parsers, nrows): reader.read(nrows) -def test_unnamed_columns(all_parsers): +def test_unnamed_columns(all_parsers, pyarrow_xfail): data = """A,B,C,, 1,2,3,4,5 6,7,8,9,10 @@ -306,7 +306,7 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) -def test_read_csv_wrong_num_columns(all_parsers): +def test_read_csv_wrong_num_columns(all_parsers, pyarrow_xfail): # Too few columns. 
data = """A,B,C,D,E,F 1,2,3,4,5,6 @@ -422,7 +422,7 @@ def test_int_conversion(all_parsers): @pytest.mark.parametrize("nrows", [3, 3.0]) -def test_read_nrows(all_parsers, nrows): +def test_read_nrows(all_parsers, nrows, pyarrow_xfail): # see gh-10476 data = """index,A,B,C,D foo,2,3,4,5 @@ -443,7 +443,7 @@ def test_read_nrows(all_parsers, nrows): @pytest.mark.parametrize("nrows", [1.2, "foo", -1]) -def test_read_nrows_bad(all_parsers, nrows): +def test_read_nrows_bad(all_parsers, nrows, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -460,7 +460,7 @@ def test_read_nrows_bad(all_parsers, nrows): @pytest.mark.parametrize("index_col", [0, "index"]) -def test_read_chunksize_with_index(all_parsers, index_col): +def test_read_chunksize_with_index(all_parsers, index_col, pyarrow_xfail): parser = all_parsers data = """index,A,B,C,D foo,2,3,4,5 @@ -492,7 +492,7 @@ def test_read_chunksize_with_index(all_parsers, index_col): @pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) -def test_read_chunksize_bad(all_parsers, chunksize): +def test_read_chunksize_bad(all_parsers, chunksize, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -509,7 +509,7 @@ def test_read_chunksize_bad(all_parsers, chunksize): @pytest.mark.parametrize("chunksize", [2, 8]) -def test_read_chunksize_and_nrows(all_parsers, chunksize): +def test_read_chunksize_and_nrows(all_parsers, chunksize, pyarrow_xfail): # see gh-15755 data = """index,A,B,C,D foo,2,3,4,5 @@ -527,7 +527,7 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize): tm.assert_frame_equal(concat(reader), expected) -def test_read_chunksize_and_nrows_changing_size(all_parsers): +def test_read_chunksize_and_nrows_changing_size(all_parsers, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -549,7 +549,7 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers): reader.get_chunk(size=3) -def test_get_chunk_passed_chunksize(all_parsers): +def test_get_chunk_passed_chunksize(all_parsers, pyarrow_xfail): parser = all_parsers data = """A,B,C 1,2,3 @@ -565,7 +565,7 @@ def test_get_chunk_passed_chunksize(all_parsers): @pytest.mark.parametrize("kwargs", [dict(), dict(index_col=0)]) -def test_read_chunksize_compat(all_parsers, kwargs): +def test_read_chunksize_compat(all_parsers, kwargs, pyarrow_xfail): # see gh-12185 data = """index,A,B,C,D foo,2,3,4,5 @@ -582,7 +582,7 @@ def test_read_chunksize_compat(all_parsers, kwargs): tm.assert_frame_equal(concat(reader), result) -def test_read_chunksize_jagged_names(all_parsers): +def test_read_chunksize_jagged_names(all_parsers, pyarrow_xfail): # see gh-23509 parser = all_parsers data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) @@ -594,7 +594,7 @@ def test_read_chunksize_jagged_names(all_parsers): tm.assert_frame_equal(result, expected) -def test_read_data_list(all_parsers): +def test_read_data_list(all_parsers, pyarrow_xfail): parser = all_parsers kwargs = dict(index_col=0) data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" @@ -608,7 +608,7 @@ def test_read_data_list(all_parsers): tm.assert_frame_equal(result, expected) -def test_iterator(all_parsers): +def test_iterator(all_parsers, pyarrow_xfail): # see gh-6607 data = """index,A,B,C,D foo,2,3,4,5 @@ -631,7 +631,7 @@ def test_iterator(all_parsers): tm.assert_frame_equal(last_chunk, expected[3:]) -def test_iterator2(all_parsers): +def test_iterator2(all_parsers, pyarrow_xfail): parser = all_parsers data = """A,B,C foo,1,2,3 @@ -694,7 +694,7 @@ def test_reader_list_skiprows(all_parsers): tm.assert_frame_equal(chunks[0], 
expected[1:3]) -def test_iterator_stop_on_chunksize(all_parsers): +def test_iterator_stop_on_chunksize(all_parsers, pyarrow_xfail): # gh-3967: stopping iteration when chunksize is specified parser = all_parsers data = """A,B,C @@ -718,7 +718,7 @@ def test_iterator_stop_on_chunksize(all_parsers): @pytest.mark.parametrize( "kwargs", [dict(iterator=True, chunksize=1), dict(iterator=True), dict(chunksize=1)] ) -def test_iterator_skipfooter_errors(all_parsers, kwargs): +def test_iterator_skipfooter_errors(all_parsers, kwargs, pyarrow_xfail): msg = "'skipfooter' not supported for 'iteration'" parser = all_parsers data = "a\n1\n2" @@ -727,7 +727,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs): parser.read_csv(StringIO(data), skipfooter=1, **kwargs) -def test_nrows_skipfooter_errors(all_parsers): +def test_nrows_skipfooter_errors(all_parsers, pyarrow_xfail): msg = "'skipfooter' not supported with 'nrows'" data = "a\n1\n2\n3\n4\n5\n6" parser = all_parsers diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index b773664adda72..22bba9bd3f98a 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -29,7 +29,7 @@ def parser_and_data(all_parsers, csv1): @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) -def test_zip(parser_and_data, compression): +def test_zip(parser_and_data, compression, pyarrow_xfail): parser, data, expected = parser_and_data with tm.ensure_clean("test_file.zip") as path: @@ -46,7 +46,7 @@ def test_zip(parser_and_data, compression): @pytest.mark.parametrize("compression", ["zip", "infer"]) -def test_zip_error_multiple_files(parser_and_data, compression): +def test_zip_error_multiple_files(parser_and_data, compression, pyarrow_xfail): parser, data, expected = parser_and_data with tm.ensure_clean("combined_zip.zip") as path: @@ -60,7 +60,7 @@ def test_zip_error_multiple_files(parser_and_data, compression): parser.read_csv(path, compression=compression) -def test_zip_error_no_files(parser_and_data): +def test_zip_error_no_files(parser_and_data, pyarrow_xfail): parser, _, _ = parser_and_data with tm.ensure_clean() as path: @@ -71,7 +71,7 @@ def test_zip_error_no_files(parser_and_data): parser.read_csv(path, compression="zip") -def test_zip_error_invalid_zip(parser_and_data): +def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): parser, _, _ = parser_and_data with tm.ensure_clean() as path: @@ -86,6 +86,11 @@ def test_compression(parser_and_data, compression_only, buffer, filename): compress_type = compression_only ext = "gz" if compress_type == "gzip" else compress_type + pyarrow_unsupported_exts = {"bz2", "zip", "xz"} + if ext in pyarrow_unsupported_exts and parser.engine == "pyarrow": + # need to skip since this test will hang forever and not fail + pytest.skip(f"The pyarrow package doesn't come with {ext} support") + filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: @@ -141,7 +146,7 @@ def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) -def test_invalid_compression(all_parsers, invalid_compression): +def test_invalid_compression(all_parsers, invalid_compression, pyarrow_xfail): parser = all_parsers compress_kwargs = dict(compression=invalid_compression) From f1bb4e25c77f4b672ddd5dfc7afc2af51abc9e32 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 27 May 2020 10:57:57 -0700 Subject: [PATCH 
23/35] Get all tests to run & some fixes --- pandas/io/parsers.py | 37 ++++++++++++---------- pandas/tests/io/parser/conftest.py | 7 ++-- pandas/tests/io/parser/test_common.py | 4 +-- pandas/tests/io/parser/test_compression.py | 11 +++---- pandas/tests/io/parser/test_unsupported.py | 19 +++++++++++ 5 files changed, 50 insertions(+), 28 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e64ca0651e7c7..2f9e4ec11187e 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,7 +21,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer -from pandas.compat._optional import import_optional_dependency +from pandas.compat._optional import import_optional_dependency, VERSIONS from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -444,7 +444,14 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Extract some of the arguments (pass chunksize on). iterator = kwds.get("iterator", False) - chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) + chunksize = kwds.get("chunksize", None) + if kwds.get("engine") == "pyarrow": # chunksize not supported for pyarrow + if iterator: + raise ValueError("The 'iterator' option is not supported with the 'pyarrow' engine") + if chunksize is not None: + raise ValueError("The 'chunksize' option is not supported with the 'pyarrow' engine") + else: + chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) nrows = kwds.get("nrows", None) # Check for duplicates in names. @@ -830,6 +837,9 @@ def __init__(self, f, engine=None, **kwds): self._engine_specified = kwds.get("engine_specified", engine_specified) if kwds.get("dialect") is not None: + if engine == "pyarrow": + raise ValueError("The 'dialect' option is not supported with the 'pyarrow' engine") + dialect = kwds["dialect"] if dialect in csv.list_dialects(): dialect = csv.get_dialect(dialect) @@ -923,11 +933,11 @@ def _get_options_with_defaults(self, engine): if engine == "pyarrow" and value != default: raise ValueError( f"The {repr(argname)} option is not supported with the " - f"{repr(engine)} engine" + f"'pyarrow' engine" ) if argname == "iterator" and engine == "pyarrow": raise ValueError( - "The iterator option is not supported with the pyarrow engine" + "The iterator option is not supported with the 'pyarrow' engine" ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: @@ -2281,6 +2291,7 @@ def __init__(self, src, **kwds): self.src = BytesIOWrapper(self.src, encoding=encoding) def read(self): + VERSIONS["pyarrow"] = "0.15.0" pyarrow = import_optional_dependency( "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" ) @@ -2297,19 +2308,11 @@ def read(self): convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} read_options = pyarrow.ReadOptions(autogenerate_column_names=True) headerexists = True if self.header is not None and self.header >= 0 else False - try: - skiprows = self.kwds.get("skiprows") - if skiprows is not None: - read_options = pyarrow.ReadOptions(skip_rows=skiprows) - elif self.header >= 0: - read_options = pyarrow.ReadOptions(skip_rows=self.header) - except TypeError as e: - msg = "__init__() got an unexpected keyword argument" - if msg in str(e): - raise ImportError( - "pyarrow version >= 0.15.0 is required to use " - "read_csv with engine='pyarrow'" - ) + skiprows = self.kwds.get("skiprows") + if skiprows is not None: + read_options = pyarrow.ReadOptions(skip_rows=skiprows) + 
elif headerexists: + read_options = pyarrow.ReadOptions(skip_rows=self.header) table = pyarrow.read_csv( self.src, read_options=read_options, diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 8f473bded9225..09379ac1b6922 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -80,10 +80,13 @@ def csv1(csv_dir_path): _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -pyarrow_version = pkg_resources.get_distribution("pyarrow").version +try: + pyarrow_version = pkg_resources.get_distribution("pyarrow").version +except pkg_resources.DistributionNotFound: + pyarrow_version = None if ( distutils.version.LooseVersion(pyarrow_version) > "0.15.0" -): # TODO remove this if block once required pyarrow>0.15.0 +): _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] else: diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e0b6d70b607d6..f35da606110fe 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1517,7 +1517,7 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): ), ], ) -def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): +def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xfail): # see gh-12493 parser = all_parsers @@ -2082,7 +2082,7 @@ def test_read_table_equivalency_to_read_csv(all_parsers): tm.assert_frame_equal(result, expected) -def test_first_row_bom(all_parsers): +def test_first_row_bom(all_parsers, pyarrow_xfail): # see gh-26545 parser = all_parsers data = '''\ufeff"Head1" "Head2" "Head3"''' diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 22bba9bd3f98a..2c5f1b61370a5 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -81,16 +81,11 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): @pytest.mark.parametrize("filename", [None, "test.{ext}"]) -def test_compression(parser_and_data, compression_only, buffer, filename): +def test_compression(parser_and_data, compression_only, buffer, filename, pyarrow_xfail): parser, data, expected = parser_and_data compress_type = compression_only ext = "gz" if compress_type == "gzip" else compress_type - pyarrow_unsupported_exts = {"bz2", "zip", "xz"} - if ext in pyarrow_unsupported_exts and parser.engine == "pyarrow": - # need to skip since this test will hang forever and not fail - pytest.skip(f"The pyarrow package doesn't come with {ext} support") - filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: @@ -118,6 +113,8 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): expected = parser.read_csv(csv1, **kwargs) kwargs["compression"] = "infer" + if ext == "bz2": + pytest.xfail("pyarrow wheels don't have bz2 codec support") if buffer: with open(csv1) as f: result = parser.read_csv(f, **kwargs) @@ -128,7 +125,7 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): +def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail): # see gh-18071, gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) diff --git a/pandas/tests/io/parser/test_unsupported.py 
b/pandas/tests/io/parser/test_unsupported.py index 267fae760398a..44865d61d1b05 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -121,3 +121,22 @@ def read(self): with pytest.raises(ValueError, match=msg): read_csv(NoNextBuffer(data), engine=python_engine) + + def test_pyarrow_engine(self): + from pandas.io.parsers import _pyarrow_unsupported as pa_unsupported + + data = """1,2,3,, + 1,2,3,4, + 1,2,3,4,5 + 1,2,,, + 1,2,3,4,""" + + for default in pa_unsupported: + msg = ( + f"The {repr(default)} option is not " + f"supported with the 'pyarrow' engine" + ) + print(default) + kwargs = {default: object()} + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="pyarrow", **kwargs) From 7876b4ef795150510837f74538fdc10b1c38333e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 29 May 2020 15:57:58 -0700 Subject: [PATCH 24/35] Lint and CI --- pandas/io/parsers.py | 15 +++++++++++---- pandas/tests/io/parser/conftest.py | 6 ++---- pandas/tests/io/parser/test_common.py | 2 +- pandas/tests/io/parser/test_compression.py | 8 ++++++-- pandas/tests/io/parser/test_dtypes.py | 2 +- pandas/tests/io/parser/test_unsupported.py | 1 - 6 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2f9e4ec11187e..f1a89da794849 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,7 +21,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer -from pandas.compat._optional import import_optional_dependency, VERSIONS +from pandas.compat._optional import VERSIONS, import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -447,9 +447,13 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): chunksize = kwds.get("chunksize", None) if kwds.get("engine") == "pyarrow": # chunksize not supported for pyarrow if iterator: - raise ValueError("The 'iterator' option is not supported with the 'pyarrow' engine") + raise ValueError( + "The 'iterator' option is not supported with the 'pyarrow' engine" + ) if chunksize is not None: - raise ValueError("The 'chunksize' option is not supported with the 'pyarrow' engine") + raise ValueError( + "The 'chunksize' option is not supported with the 'pyarrow' engine" + ) else: chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) nrows = kwds.get("nrows", None) @@ -557,6 +561,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skipinitialspace", "date_parser", "cache_dates", + "parse_dates", } _python_unsupported = {"low_memory", "float_precision"} @@ -838,7 +843,9 @@ def __init__(self, f, engine=None, **kwds): if kwds.get("dialect") is not None: if engine == "pyarrow": - raise ValueError("The 'dialect' option is not supported with the 'pyarrow' engine") + raise ValueError( + "The 'dialect' option is not supported with the 'pyarrow' engine" + ) dialect = kwds["dialect"] if dialect in csv.list_dialects(): diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 09379ac1b6922..9aa23bd739d24 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -83,10 +83,8 @@ def csv1(csv_dir_path): try: pyarrow_version = pkg_resources.get_distribution("pyarrow").version except pkg_resources.DistributionNotFound: - pyarrow_version = None -if ( - distutils.version.LooseVersion(pyarrow_version) > "0.15.0" -): + pyarrow_version = "0" # represents 
pyarrow not found +if distutils.version.LooseVersion(pyarrow_version) > "0.15.0": _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] else: diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index f35da606110fe..96410f626952b 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1561,7 +1561,7 @@ def test_trailing_spaces(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -def test_raise_on_sep_with_delim_whitespace(all_parsers): +def test_raise_on_sep_with_delim_whitespace(all_parsers, pyarrow_xfail): # see gh-6607 data = "a b c\n1 2 3" parser = all_parsers diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 2c5f1b61370a5..ecc35dd6644c8 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -81,7 +81,9 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): @pytest.mark.parametrize("filename", [None, "test.{ext}"]) -def test_compression(parser_and_data, compression_only, buffer, filename, pyarrow_xfail): +def test_compression( + parser_and_data, compression_only, buffer, filename, pyarrow_xfail +): parser, data, expected = parser_and_data compress_type = compression_only @@ -125,7 +127,9 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail): +def test_compression_utf_encoding( + all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail +): # see gh-18071, gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index d1ed85cc6f466..626d4febd7ddf 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -403,7 +403,7 @@ def test_empty_with_multi_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) -def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): +def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfail): parser = all_parsers data = "one,one" diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 44865d61d1b05..2e6165619f318 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -136,7 +136,6 @@ def test_pyarrow_engine(self): f"The {repr(default)} option is not " f"supported with the 'pyarrow' engine" ) - print(default) kwargs = {default: object()} with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="pyarrow", **kwargs) From 008acab51559e76c1646bd659146d6b79081b99d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 3 Jun 2020 14:20:56 -0700 Subject: [PATCH 25/35] parse_dates support and fixups of some tests --- asv_bench/benchmarks/io/csv.py | 2 +- pandas/io/parsers.py | 8 +++----- pandas/tests/io/parser/test_unsupported.py | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 6e166ec315df6..f2462184abb37 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -262,7 +262,7 @@ def time_read_csv_arrow(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): 
def setup(self): - data = ["A,B,C"] + (["1,2,3"] * 100000) + data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 1000000) self.StringIO_input = StringIO("\n".join(data)) # simulate reading from file self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f1a89da794849..24aff9ddba376 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -555,13 +555,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "iterator", "cache_dates", "dayfirst", - "keep_date_col", "infer_datetime_format", "verbose", "skipinitialspace", - "date_parser", "cache_dates", - "parse_dates", } _python_unsupported = {"low_memory", "float_precision"} @@ -2338,10 +2335,11 @@ def read(self): if self.index_col is not None: index_col = [frame.columns[i] for i in self.index_col] frame.set_index(index_col, drop=True, inplace=True) + + frame.columns, frame = self._do_date_conversions(frame.columns, frame) + if self.kwds.get("dtype") is not None: frame = frame.astype(self.kwds.get("dtype")) - else: - frame = frame.infer_objects() return frame diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 2e6165619f318..d2ae4c160d519 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -132,6 +132,7 @@ def test_pyarrow_engine(self): 1,2,3,4,""" for default in pa_unsupported: + print(default) msg = ( f"The {repr(default)} option is not " f"supported with the 'pyarrow' engine" From 2dddae747d4d612ab8e78761bd058ff76a13a5eb Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 12 Jun 2020 21:33:34 -0700 Subject: [PATCH 26/35] Date parsing fixes and address comments --- asv_bench/benchmarks/io/csv.py | 68 +++++++++---------- doc/source/user_guide/io.rst | 8 ++- doc/source/whatsnew/v1.1.0.rst | 6 +- pandas/io/parsers.py | 102 ++++++++++++++++++++++++----- pandas/tests/io/parser/conftest.py | 16 ++--- 5 files changed, 130 insertions(+), 70 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index f2462184abb37..3681cd4df481f 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): fname = "__test__.csv" - params = [None, 10000] - param_names = ["skiprows"] + params = ([None, 10000], ["c", "pyarrow"]) + param_names = ["skiprows", "engine"] - def setup(self, skiprows): + def setup(self, skiprows, engine): N = 20000 index = tm.makeStringIndex(N) df = DataFrame( @@ -164,8 +164,8 @@ def setup(self, skiprows): ) df.to_csv(self.fname) - def time_skipprows(self, skiprows): - read_csv(self.fname, skiprows=skiprows) + def time_skipprows(self, skiprows, engine): + read_csv(self.fname, skiprows=skiprows, engine=engine) class ReadUint64Integers(StringIORewind): @@ -261,31 +261,20 @@ def time_read_csv_arrow(self, sep, decimal, float_precision): class ReadCSVEngine(StringIORewind): - def setup(self): - data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 1000000) + params = ["c", "python", "pyarrow"] + param_names = ["engine"] + + def setup(self, engine): + data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000) self.StringIO_input = StringIO("\n".join(data)) # simulate reading from file self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) - def time_read_stringcsv_c(self): - read_csv(self.data(self.StringIO_input)) - - def time_read_stringcsv_arrow(self): - read_csv(self.data(self.StringIO_input), engine="pyarrow") - - 
def time_read_stringcsv_python_engine(self): - read_csv( - self.data(self.StringIO_input), engine="python", - ) - - def time_read_bytescsv_c(self): - read_csv(self.BytesIO_input) - - def time_read_bytescsv_arrow(self): - read_csv(self.BytesIO_input, engine="pyarrow") + def time_read_stringcsv(self, engine): + read_csv(self.data(self.StringIO_input), engine=engine) - def time_read_bytescsv_python_engine(self): - read_csv(self.BytesIO_input, engine="python") + def time_read_bytescsv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) class ReadCSVCategorical(BaseIO): @@ -305,7 +294,10 @@ def time_convert_direct(self): class ReadCSVParseDates(StringIORewind): - def setup(self): + params = ["c", "pyarrow", "python"] + param_names = ["engine"] + + def setup(self, engine): data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n @@ -316,18 +308,20 @@ def setup(self): data = data.format(*two_cols) self.StringIO_input = StringIO(data) - def time_multiple_date(self): + def time_multiple_date(self, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]], ) - def time_baseline(self): + def time_baseline(self, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, parse_dates=[1], @@ -336,17 +330,18 @@ def time_baseline(self): class ReadCSVCachedParseDates(StringIORewind): - params = ([True, False],) - param_names = ["do_cache"] + params = ([True, False], ["c", "pyarrow", "python"]) + param_names = ["do_cache", "engine"] - def setup(self, do_cache): + def setup(self, do_cache, engine): data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10 self.StringIO_input = StringIO(data) - def time_read_csv_cached(self, do_cache): + def time_read_csv_cached(self, do_cache, engine): try: read_csv( self.data(self.StringIO_input), + engine=engine, header=None, parse_dates=[0], cache_dates=do_cache, @@ -376,22 +371,23 @@ def mem_parser_chunks(self): class ReadCSVParseSpecialDate(StringIORewind): - params = (["mY", "mdY", "hm"],) - param_names = ["value"] + params = (["mY", "mdY", "hm"], ["c", "pyarrow", "python"]) + param_names = ["value", "engine"] objects = { "mY": "01-2019\n10-2019\n02/2000\n", "mdY": "12/02/2010\n", "hm": "21:34\n", } - def setup(self, value): + def setup(self, value, engine): count_elem = 10000 data = self.objects[value] * count_elem self.StringIO_input = StringIO(data) - def time_read_special_date(self, value): + def time_read_special_date(self, value, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, names=["Date"], diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index df6b44ac654ce..9ff714a8211bb 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -160,9 +160,11 @@ dtype : Type name or dict of column -> type, default ``None`` (unsupported with ``engine='python'``). Use `str` or `object` together with suitable ``na_values`` settings to preserve and not interpret dtype. -engine : {``'c'``, ``'python'``} - Parser engine to use. The C engine is faster while the Python engine is - currently more feature-complete. +engine : {``'c'``, ``'pyarrow'``,``'python'``} + Parser engine to use. 
In terms of performance, the pyarrow engine,
+    which requires pyarrow>=0.15.0, is faster than the C engine, which
+    is faster than the Python engine. However, the pyarrow and C engines
+    are currently less feature-complete than their Python counterpart.
 converters : dict, default ``None``
     Dict of functions for converting values in certain columns. Keys can
     either be integers or column labels.
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 44a56e0818ae8..dee66257f2d56 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -288,6 +288,9 @@ Other enhancements
 - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`).
 - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`)
 - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`).
+- :func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing
+  if pyarrow>=0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or
+  "python" counterparts. See the :doc:`I/O docs ` for more info. (:issue:`23697`)

 .. ---------------------------------------------------------------------------

@@ -901,9 +904,6 @@ I/O
 - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`)
 - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`)
 - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`)
-- :func:`pandas.read_csv` now accepts engine="arrow" as an argument, allowing for faster csv parsing
-  if pyarrow>0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or
-  "python" counterparts. (:issue:`23697`)

 Plotting
 ^^^^^^^^
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 24aff9ddba376..d8ef6488dc02a 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -170,9 +170,8 @@
     of dtype conversion.
 engine : {{'c', 'python', 'pyarrow'}}, optional
     Parser engine to use. The C and pyarrow engines are faster, while the python engine
-    is currently more feature-complete. The pyarrow engine requires ``pyarrow`` > 0.15
+    is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15
     as a dependency however.
-
     .. versionchanged:: 1.1

         The "pyarrow" engine was added.
 converters : dict, optional
@@ -445,7 +444,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):

     # Extract some of the arguments (pass chunksize on).
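From the user's side, the behaviour documented above looks roughly like this (a sketch assuming pyarrow >= 0.15 is installed; the data is illustrative):

    from io import StringIO

    import pandas as pd

    data = "a,b,c\n1,2,3\n4,5,6"
    df = pd.read_csv(StringIO(data), engine="pyarrow")

    # unsupported options raise instead of silently falling back to another engine
    try:
        pd.read_csv(StringIO(data), engine="pyarrow", chunksize=1)
    except ValueError as err:
        print(err)  # The 'chunksize' option is not supported with the 'pyarrow' engine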
iterator = kwds.get("iterator", False) chunksize = kwds.get("chunksize", None) - if kwds.get("engine") == "pyarrow": # chunksize not supported for pyarrow + # chunksize and iterator not supported for pyarrow + if kwds.get("engine") == "pyarrow": if iterator: raise ValueError( "The 'iterator' option is not supported with the 'pyarrow' engine" @@ -523,6 +523,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "skip_blank_lines": True, } + _c_parser_defaults = { "delim_whitespace": False, "na_filter": True, @@ -553,12 +554,11 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "converters", "decimal", "iterator", - "cache_dates", "dayfirst", "infer_datetime_format", "verbose", "skipinitialspace", - "cache_dates", + "low_memory", } _python_unsupported = {"low_memory", "float_precision"} @@ -939,10 +939,6 @@ def _get_options_with_defaults(self, engine): f"The {repr(argname)} option is not supported with the " f"'pyarrow' engine" ) - if argname == "iterator" and engine == "pyarrow": - raise ValueError( - "The iterator option is not supported with the 'pyarrow' engine" - ) # see gh-12935 if argname == "mangle_dupe_cols" and not value: raise ValueError("Setting mangle_dupe_cols=False is not supported yet") @@ -2255,14 +2251,18 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True): class BytesIOWrapper: - def __init__(self, string_buffer, encoding="utf-8"): + """ + Allows the pyarrow engine for read_csv() to read from string buffers + """ + + def __init__(self, string_buffer: StringIO, encoding: str = "utf-8"): self.string_buffer = string_buffer self.encoding = encoding - def __getattr__(self, attr): + def __getattr__(self, attr: str): return getattr(self.string_buffer, attr) - def read(self, size=-1): + def read(self, size: int = -1): content = self.string_buffer.read(size) return content.encode(self.encoding) @@ -2332,16 +2332,85 @@ def read(self): elif self.header is None: self.names = range(num_cols) frame.columns = self.names - if self.index_col is not None: - index_col = [frame.columns[i] for i in self.index_col] - frame.set_index(index_col, drop=True, inplace=True) - frame.columns, frame = self._do_date_conversions(frame.columns, frame) + frame = self._date_conversion( + frame, self._date_conv, self.parse_dates, keep_date_col=self.keep_date_col + ) + + if self.index_col is not None: + for i, item in enumerate(self.index_col): + if is_integer(item): + self.index_col[i] = frame.columns[item] + frame.set_index(self.index_col, drop=True, inplace=True) if self.kwds.get("dtype") is not None: frame = frame.astype(self.kwds.get("dtype")) return frame + def _date_conversion( + self, data, converter, parse_spec, keep_date_col=False, + ): + + orig_names = data.columns + columns = list(data.columns) + + date_cols = set() + + if parse_spec is None or isinstance(parse_spec, bool): + return data, columns + + if isinstance(parse_spec, list): + # list of column lists + for colspec in parse_spec: + if is_scalar(colspec): + if isinstance(colspec, int) and colspec not in data: + colspec = orig_names[colspec] + data[colspec] = converter(data[colspec].values) + else: + new_name, col, old_names = self._try_convert_dates( + converter, colspec, data, orig_names + ) + if new_name in data: + raise ValueError(f"New date column already in dict {new_name}") + data[new_name] = col + date_cols.update(old_names) + + elif isinstance(parse_spec, dict): + # dict of new name to column list + for new_name, colspec in parse_spec.items(): + if new_name in data: + raise ValueError(f"Date column 
{new_name} already in dict") + + _, col, old_names = self._try_convert_dates( + converter, colspec, data, orig_names + ) + + data[new_name] = col + date_cols.update(old_names) + + if not keep_date_col: + data = data.drop(date_cols, axis=1) + + return data + + def _try_convert_dates(self, parser, colspec, data, columns): + colset = set(columns) + colnames = [] + + for c in colspec: + if c in colset: + colnames.append(c) + elif isinstance(c, int) and c not in columns: + colnames.append(columns[c]) + else: + colnames.append(c) + + new_name = "_".join(str(x) for x in colnames) + to_parse = [data[c].values for c in colnames if c in data] + + new_col = parser(*to_parse) + return new_name, new_col, colnames + def TextParser(*args, **kwds): """ @@ -3548,6 +3617,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): + if na_values is None: if keep_default_na: na_values = STR_NA_VALUES diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 9aa23bd739d24..11710fda521f1 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,8 +1,6 @@ -import distutils.version import os from typing import List, Optional -import pkg_resources import pytest from pandas import read_csv, read_table @@ -80,16 +78,8 @@ def csv1(csv_dir_path): _c_parser_ids = ["c_high", "c_low"] _pyarrow_parser_ids = ["pyarrow"] -try: - pyarrow_version = pkg_resources.get_distribution("pyarrow").version -except pkg_resources.DistributionNotFound: - pyarrow_version = "0" # represents pyarrow not found -if distutils.version.LooseVersion(pyarrow_version) > "0.15.0": - _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] - _all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] -else: - _all_parsers = [*_c_parsers_only, *_py_parsers_only] - _all_parser_ids = [*_c_parser_ids, *_py_parser_ids] +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) @@ -97,6 +87,8 @@ def all_parsers(request): """ Fixture all of the CSV parsers. 
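For intuition, ``_try_convert_dates`` above joins the requested columns under a combined name and parses the concatenated strings once; for ``parse_dates=[["date", "time"]]`` the effect is roughly the following (simplified, illustrative sketch, not the parser code):

    import pandas as pd

    df = pd.DataFrame({"date": ["2020-01-01"], "time": ["12:30:00"], "x": [1]})
    colnames = ["date", "time"]
    new_name = "_".join(colnames)  # "date_time"
    df[new_name] = pd.to_datetime(df["date"] + " " + df["time"])
    print(df[new_name])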
""" + if request.param.engine == "pyarrow": + pytest.importorskip("pyarrow", "0.15.0") return request.param From 88e200a108985baa5ac05e5c07287b8971ea091d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 29 Jun 2020 11:04:49 -0700 Subject: [PATCH 27/35] Clean/Address comments/Update docs --- asv_bench/benchmarks/io/csv.py | 2 +- doc/source/whatsnew/v1.1.0.rst | 11 ++- pandas/compat/_optional.py | 16 ++-- pandas/io/parsers.py | 108 ++++++----------------- pandas/tests/test_optional_dependency.py | 7 +- 5 files changed, 51 insertions(+), 93 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 3681cd4df481f..8792fff5300d3 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -294,7 +294,7 @@ def time_convert_direct(self): class ReadCSVParseDates(StringIORewind): - params = ["c", "pyarrow", "python"] + params = ["c", "python"] param_names = ["engine"] def setup(self, engine): diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7c0a707c964c5..d54935c2bdc08 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -245,6 +245,14 @@ If needed you can adjust the bins with the argument ``offset`` (a Timedelta) tha For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. +.. _whatsnew_110.enhancements.read_csv_pyarrow_engine_support: + +read_csv() now accepts pyarrow as an engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines +with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) + .. _whatsnew_110.enhancements.other: @@ -293,9 +301,6 @@ Other enhancements - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). - :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). -- :func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing - if pyarrow>=0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or - "python" counterparts. See the :doc:`I/O docs ` for more info. (:issue:`23697`) .. --------------------------------------------------------------------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index ed025ec36dafd..f65d53c05257c 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -2,6 +2,7 @@ import importlib import sys import types +from typing import Optional import warnings # Update install.rst when updating versions! @@ -46,7 +47,11 @@ def _get_version(module: types.ModuleType) -> str: def import_optional_dependency( - name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise" + name: str, + extra: str = "", + raise_on_missing: bool = True, + on_version: str = "raise", + min_version: Optional[str] = None, ): """ Import an optional dependency. @@ -58,8 +63,7 @@ def import_optional_dependency( Parameters ---------- name : str - The module name. This should be top-level only, so that the - version may be checked. + The module name. extra : str Additional text to include in the ImportError message. raise_on_missing : bool, default True @@ -73,6 +77,8 @@ def import_optional_dependency( * ignore: Return the module, even if the version is too old. 
It's expected that users validate the version locally when using ``on_version="ignore"`` (see ``io/html.py``) + min_version : str, default None + Specify a minimum version that is different from the global pandas + minimum version required. Returns ------- @@ -93,14 +99,14 @@ def import_optional_dependency( raise ImportError(msg) from None else: return None - # Grab parent module if submodule being imported + # Handle submodules: if we have a submodule, grab the parent module from sys.modules parent = name.split(".")[0] if parent != name: name = parent module_to_get = sys.modules[name] else: module_to_get = module - minimum_version = VERSIONS.get(name) + minimum_version = min_version if min_version is not None else VERSIONS.get(name) if minimum_version: version = _get_version(module_to_get) if distutils.version.LooseVersion(version) < minimum_version: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3563a1ea0f04e..ebaefafd8b5b8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,7 +21,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import FilePathOrBuffer, Union -from pandas.compat._optional import VERSIONS, import_optional_dependency +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -172,6 +172,7 @@ Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. + .. versionchanged:: 1.1 The "pyarrow" engine was added. converters : dict, optional @@ -1015,7 +1016,7 @@ def _clean_options(self, options, engine): elif engine not in ("python", "python-fwf"): # wait until regex engine integrated fallback_reason = ( - "the 'c' engine does not support " + f"the '{engine}' engine does not support " "regex separators (separators > 1 char and " r"different from '\s+' are interpreted as regex)" ) @@ -2302,9 +2303,10 @@ def __init__(self, src, **kwds): self.src = BytesIOWrapper(self.src, encoding=encoding) def read(self): - VERSIONS["pyarrow"] = "0.15.0" pyarrow = import_optional_dependency( - "pyarrow.csv", extra="pyarrow is required to use the pyarrow engine" + "pyarrow.csv", + min_version="0.15.0", + extra="pyarrow is required to use the pyarrow engine", ) kwdscopy = {k: v for k, v in self.kwds.items() if v is not None} # these are kwargs passed to pyarrow @@ -2315,15 +2317,26 @@ def read(self): "true_values", "false_values", } + # map pandas argument names to their pyarrow equivalents + kwdscopy["include_columns"] = kwdscopy.get("usecols") + kwdscopy["null_values"] = kwdscopy.get("na_values") + kwdscopy["escape_char"] = kwdscopy.get("escapechar") + kwdscopy["ignore_empty_lines"] = kwdscopy.get("skip_blank_lines") + parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions} convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions} - read_options = pyarrow.ReadOptions(autogenerate_column_names=True) - headerexists = True if self.header is not None and self.header >= 0 else False + headerexists = self.header is not None + read_options = {} + skiprows = self.kwds.get("skiprows") - if skiprows is not None: - read_options = pyarrow.ReadOptions(skip_rows=skiprows) - elif headerexists: - read_options = pyarrow.ReadOptions(skip_rows=self.header) + if headerexists: + read_options["skip_rows"] = self.header + read_options["autogenerate_column_names"] = False + else: + if skiprows is not None: + read_options["skip_rows"] = skiprows
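+ # no header row: let pyarrow autogenerate column names (f0, f1, ...); + # they are replaced with default integer names after the table is read +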
read_options["autogenerate_column_names"] = True + read_options = pyarrow.ReadOptions(**read_options) table = pyarrow.read_csv( self.src, read_options=read_options, @@ -2339,11 +2352,8 @@ def read(self): elif self.header is None: self.names = range(num_cols) frame.columns = self.names - - frame = self._date_conversion( - frame, self._date_conv, self.parse_dates, keep_date_col=self.keep_date_col - ) - + # we only need the frame not the names + frame.columns, frame = self._do_date_conversions(frame.columns, frame) if self.index_col is not None: for i, item in enumerate(self.index_col): if is_integer(item): @@ -2354,70 +2364,6 @@ def read(self): frame = frame.astype(self.kwds.get("dtype")) return frame - def _date_conversion( - self, data, converter, parse_spec, keep_date_col=False, - ): - - orig_names = data.columns - columns = list(data.columns) - - date_cols = set() - - if parse_spec is None or isinstance(parse_spec, bool): - return data, columns - - if isinstance(parse_spec, list): - # list of column lists - for colspec in parse_spec: - if is_scalar(colspec): - if isinstance(colspec, int) and colspec not in data: - colspec = orig_names[colspec] - data[colspec] = converter(data[colspec].values) - else: - new_name, col, old_names = self._try_convert_dates( - converter, colspec, data, orig_names - ) - if new_name in data: - raise ValueError(f"New date column already in dict {new_name}") - data[new_name] = col - date_cols.update(old_names) - - elif isinstance(parse_spec, dict): - # dict of new name to column list - for new_name, colspec in parse_spec.items(): - if new_name in data: - raise ValueError(f"Date column {new_name} already in dict") - - _, col, old_names = self._try_convert_dates( - converter, colspec, data, orig_names - ) - - data[new_name] = col - date_cols.update(old_names) - - if not keep_date_col: - data = data.drop(date_cols, axis=1) - - return data - - def _try_convert_dates(self, parser, colspec, data, columns): - colset = set(columns) - colnames = [] - - for c in colspec: - if c in colset: - colnames.append(c) - elif isinstance(c, int) and c not in columns: - colnames.append(columns[c]) - else: - colnames.append(c) - - new_name = "_".join(str(x) for x in colnames) - to_parse = [data[c].values for c in colnames if c in data] - - new_col = parser(*to_parse) - return new_name, new_col, colnames - def TextParser(*args, **kwds): """ @@ -3568,7 +3514,7 @@ def _isindex(colspec): colspec = orig_names[colspec] if _isindex(colspec): continue - data_dict[colspec] = converter(data_dict[colspec]) + data_dict[colspec] = converter(np.array(data_dict[colspec])) else: new_name, col, old_names = _try_convert_dates( converter, colspec, data_dict, orig_names @@ -3617,7 +3563,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): colnames.append(c) new_name = "_".join(str(x) for x in colnames) - to_parse = [data_dict[c] for c in colnames if c in data_dict] + to_parse = [np.array(data_dict[c]) for c in colnames if c in data_dict] new_col = parser(*to_parse) return new_name, new_col, colnames diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index e5ed69b7703b1..61dbd81e2cee5 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -27,14 +27,15 @@ def test_bad_version(monkeypatch): module = types.ModuleType(name) module.__version__ = "0.9.0" sys.modules[name] = module - monkeypatch.setitem(VERSIONS, name, "1.0.0") match = "Pandas requires .*1.0.0.* of .fakemodule.*'0.9.0'" with 
pytest.raises(ImportError, match=match): - import_optional_dependency("fakemodule") + import_optional_dependency("fakemodule", min_version="1.0.0") with tm.assert_produces_warning(UserWarning): - result = import_optional_dependency("fakemodule", on_version="warn") + result = import_optional_dependency( + "fakemodule", min_version="1.0.0", on_version="warn" + ) assert result is None module.__version__ = "1.0.0" # exact match is OK From ede279925c591f42a1585d0aae9e186a3b936cd0 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 29 Jun 2020 11:08:18 -0700 Subject: [PATCH 28/35] Fix typo Co-authored-by: Joris Van den Bossche --- pandas/io/parsers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b3127d4f84cd8..de2a833e51ea0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -173,6 +173,7 @@ is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. + .. versionchanged:: 1.1 The "pyarrow" engine was added. converters : dict, optional From e8eff08c8b939539ecbe6e9466f9248722fd0927 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 8 Jul 2020 16:46:46 -0700 Subject: [PATCH 29/35] Fix doc failures --- doc/source/user_guide/io.rst | 21 ++++++++++++++------- pandas/io/parsers.py | 1 - 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 2fcffcd814195..e4da778ee7378 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -160,9 +160,9 @@ dtype : Type name or dict of column -> type, default ``None`` (unsupported with ``engine='python'``). Use `str` or `object` together with suitable ``na_values`` settings to preserve and not interpret dtype. -engine : {``'c'``, ``'pyarrow'``,``'python'``} +engine : {``'c'``, ``'pyarrow'``, ``'python'``} Parser engine to use. In terms of performance, the pyarrow engine, which requires ``pyarrow`` >= 0.15.0, is faster than the C engine, which is faster than the python engine. However, the pyarrow and C engines are currently less feature complete than their Python counterpart. converters : dict, default ``None`` @@ -1621,11 +1621,18 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: Specifying the parser engine '''''''''''''''''''''''''''' -Under the hood pandas uses a fast and efficient parser implemented in C as well -as a Python implementation which is currently more feature-complete. Where -possible pandas uses the C parser (specified as ``engine='c'``), but may fall -back to Python if C-unsupported options are specified. Currently, C-unsupported -options include: +Currently, pandas supports three engines: the C engine, the Python engine, +and an optional pyarrow engine (which requires ``pyarrow`` >= 0.15). In terms of performance, +the pyarrow engine is fastest, followed by the C and Python engines. However, +the pyarrow engine is much less robust than the C engine, which in turn lacks a +couple of features present in the Python parser. + +Where possible pandas uses the C parser (specified as ``engine='c'``), but may fall +back to Python if C-unsupported options are specified. If options unsupported by the +pyarrow engine are specified while using ``engine='pyarrow'``, the parser will raise an error +(a full list of unsupported options is available at ``pandas.io.parsers._pyarrow_unsupported``).
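+For example, a minimal sketch of selecting each engine (the data here is
+illustrative, and the last call assumes ``pyarrow`` >= 0.15 is installed):
+
+.. code-block:: python
+
+   from io import StringIO
+
+   import pandas as pd
+
+   data = "a,b,c\n1,2,3\n4,5,6"
+
+   pd.read_csv(StringIO(data))                    # C engine (the default)
+   pd.read_csv(StringIO(data), engine="python")   # most feature-complete
+   pd.read_csv(StringIO(data), engine="pyarrow")  # fastest on multicore machines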
+ +Currently, C-unsupported options include: * ``sep`` other than a single character (e.g. regex separators) * ``skipfooter`` diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index de2a833e51ea0..b3127d4f84cd8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -173,7 +173,6 @@ is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15 as a dependency however. - .. versionchanged:: 1.1 The "pyarrow" engine was added. converters : dict, optional From 55139ee19a512c3bd83b3c07caa4c44a92a49a59 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 22 Oct 2020 16:35:14 +0100 Subject: [PATCH 30/35] wip --- pandas/tests/io/parser/conftest.py | 14 +++++- pandas/tests/io/parser/test_comment.py | 2 + pandas/tests/io/parser/test_common.py | 64 ++++++++++++++++++++++++-- 3 files changed, 76 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 05fae470f5a88..a179c1b82baae 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -142,4 +142,16 @@ def pyarrow_xfail(request): if "all_parsers" in request.fixturenames: parser = request.getfixturevalue("all_parsers") if parser.engine == "pyarrow": - pytest.xfail("pyarrow doesn't support this.") + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.node.add_marker(mark) + + +@pytest.fixture +def pyarrow_skip(request): + """ + Fixture that skips a test if the engine is pyarrow. + """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + if parser.engine == "pyarrow": + pytest.skip("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index 60e32d7c27200..a9a03f006668b 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -10,6 +10,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("na_values", [None, ["NaN"]]) def test_comment(all_parsers, na_values): diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 753189ea7c8d2..1295f0061f808 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -23,6 +23,9 @@ from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + def test_override_set_noconvert_columns(): # see gh-17351 @@ -84,7 +87,8 @@ def test_empty_decimal_marker(all_parsers, pyarrow_xfail): parser.read_csv(StringIO(data), decimal="") -def test_bad_stream_exception(all_parsers, csv_dir_path, pyarrow_xfail): +@skip_pyarrow +def test_bad_stream_exception(all_parsers, csv_dir_path): # see gh-13652 # # This test validates that both the Python engine and C engine will @@ -139,6 +143,7 @@ def test_read_csv_local(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_1000_sep(all_parsers): parser = all_parsers data = """A|B|C @@ -232,6 +237,7 @@ def test_csv_mixed_type(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_low_memory_no_rows_with_index(all_parsers): # see gh-21141 parser = all_parsers @@ -280,6 +286,7 @@ def test_read_csv_dataframe(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_no_index_name(all_parsers, 
csv_dir_path): parser = all_parsers csv2 = os.path.join(csv_dir_path, "test2.csv") @@ -348,6 +355,7 @@ def test_read_duplicate_index_explicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_duplicate_index_implicit(all_parsers): data = """A,B,C,D foo,2,3,4,5 @@ -728,7 +736,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs, pyarrow_xfail): parser.read_csv(StringIO(data), skipfooter=1, **kwargs) -def test_nrows_skipfooter_errors(all_parsers, pyarrow_xfail): +def test_nrows_skipfooter_errors(all_parsers): msg = "'skipfooter' not supported with 'nrows'" data = "a\n1\n2\n3\n4\n5\n6" parser = all_parsers @@ -799,6 +807,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D @@ -823,6 +832,7 @@ def test_multi_index_no_level_names(all_parsers, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multi_index_no_level_names_implicit(all_parsers): parser = all_parsers data = """A,B,C,D @@ -856,6 +866,7 @@ def test_multi_index_no_level_names_implicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected,header", [ @@ -877,6 +888,7 @@ def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_unnamed_index(all_parsers): parser = all_parsers data = """ id c0 c1 c2 @@ -939,6 +951,7 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) +@xfail_pyarrow def test_path_path_lib(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -946,6 +959,7 @@ def test_path_path_lib(all_parsers): tm.assert_frame_equal(df, result) +@xfail_pyarrow def test_path_local_path(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -955,6 +969,7 @@ def test_path_local_path(all_parsers): tm.assert_frame_equal(df, result) +@xfail_pyarrow def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError @@ -968,6 +983,7 @@ def test_nonexistent_path(all_parsers): assert path == e.value.filename +@xfail_pyarrow @td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 @@ -990,6 +1006,7 @@ def test_no_permission(all_parsers): assert path == e.value.filename +@xfail_pyarrow def test_missing_trailing_delimiters(all_parsers): parser = all_parsers data = """A,B,C,D @@ -1005,6 +1022,7 @@ def test_missing_trailing_delimiters(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_skip_initial_space(all_parsers): data = ( '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' @@ -1065,6 +1083,7 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -1168,6 +1187,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): assert df.a.dtype == object +@skip_pyarrow @pytest.mark.parametrize("sep", [" ", r"\s+"]) def test_integer_overflow_bug(all_parsers, sep): # see gh-2601 @@ -1179,6 +1199,7 @@ def test_integer_overflow_bug(all_parsers, sep): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_catch_too_many_names(all_parsers): # see gh-5156 data = """\ @@ -1198,6 +1219,7 @@ def 
test_catch_too_many_names(all_parsers): parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) +@xfail_pyarrow def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = all_parsers @@ -1218,6 +1240,7 @@ def test_chunk_begins_with_newline_whitespace(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index(all_parsers): # see gh-10184 data = "x,y" @@ -1228,6 +1251,7 @@ def test_empty_with_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_multi_index(all_parsers): # see gh-10467 data = "x,y,z" @@ -1240,6 +1264,7 @@ def test_empty_with_multi_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers @@ -1251,6 +1276,7 @@ def test_empty_with_reversed_multi_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_float_parser(all_parsers): # see gh-9565 parser = all_parsers @@ -1272,6 +1298,7 @@ def test_scientific_no_exponent(all_parsers): tm.assert_frame_equal(df_roundtrip, df) +@xfail_pyarrow @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) def test_int64_overflow(all_parsers, conv): data = """ID @@ -1315,6 +1342,7 @@ def test_int64_overflow(all_parsers, conv): parser.read_csv(StringIO(data), converters={"ID": conv}) +@xfail_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) @@ -1328,6 +1356,7 @@ def test_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) @@ -1341,6 +1370,7 @@ def test_outside_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 @@ -1353,6 +1383,7 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("iterator", [True, False]) def test_empty_with_nrows_chunksize(all_parsers, iterator): # see gh-9535 @@ -1370,6 +1401,7 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected,msg", [ @@ -1477,6 +1509,7 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) def test_uneven_lines_with_usecols(all_parsers, usecols): # see gh-12203 @@ -1531,6 +1564,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xf tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -1562,7 +1596,7 @@ def test_trailing_spaces(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -def test_raise_on_sep_with_delim_whitespace(all_parsers, pyarrow_xfail): +def test_raise_on_sep_with_delim_whitespace(all_parsers): # see gh-6607 data = "a b c\n1 2 3" parser = all_parsers @@ -1571,6 +1605,7 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers, pyarrow_xfail): parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) +@xfail_pyarrow @pytest.mark.parametrize("delim_whitespace", [True, 
False]) def test_single_char_leading_whitespace(all_parsers, delim_whitespace): # see gh-9710 @@ -1589,6 +1624,7 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "sep,skip_blank_lines,exp_data", [ @@ -1628,6 +1664,7 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_whitespace_lines(all_parsers): parser = all_parsers data = """ @@ -1643,6 +1680,7 @@ def test_whitespace_lines(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -1671,6 +1709,7 @@ def test_whitespace_regex_separator(all_parsers, data, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_verbose_read(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1694,6 +1733,7 @@ def test_verbose_read(all_parsers, capsys): assert captured.out == "Filled 3 NA values in column a\n" +@xfail_pyarrow def test_verbose_read2(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1735,6 +1775,7 @@ def test_iteration_open_handle(all_parsers): tm.assert_series_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,thousands,decimal", [ @@ -1766,6 +1807,7 @@ def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_euro_decimal_format(all_parsers): parser = all_parsers data = """Id;Number1;Number2;Text1;Text2;Number3 @@ -1785,6 +1827,7 @@ def test_euro_decimal_format(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_inf_parsing(all_parsers, na_filter): parser = all_parsers @@ -1808,6 +1851,7 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_infinity_parsing(all_parsers, na_filter): parser = all_parsers @@ -1825,6 +1869,7 @@ def test_infinity_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -1835,6 +1880,7 @@ def test_raise_on_no_columns(all_parsers, nrows): parser.read_csv(StringIO(data)) +@xfail_pyarrow @td.check_file_leaks def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") @@ -1848,6 +1894,7 @@ def test_memory_map(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_null_byte_char(all_parsers): # see gh-2741 data = "\x00,foo" @@ -1864,6 +1911,7 @@ def test_null_byte_char(all_parsers): parser.read_csv(StringIO(data), names=names) +@xfail_pyarrow def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers @@ -1985,6 +2033,7 @@ def seek(self, pos, whence=0): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [dict(), dict(error_bad_lines=True)], # Default is True. # Explicitly pass in. 
@@ -2003,6 +2052,7 @@ def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): parser.read_csv(StringIO(data), **kwargs) +@xfail_pyarrow def test_warn_bad_lines(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2017,6 +2067,7 @@ def test_warn_bad_lines(all_parsers, capsys): assert "Skipping line 5" in captured.err +@xfail_pyarrow def test_suppress_error_output(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2045,6 +2096,7 @@ def test_filename_with_special_chars(all_parsers, filename): tm.assert_frame_equal(result, df) +@xfail_pyarrow def test_read_csv_memory_growth_chunksize(all_parsers): # see gh-24805 # @@ -2127,6 +2179,7 @@ def test_first_row_bom(all_parsers, pyarrow_xfail): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_first_row_bom_unquoted(all_parsers): # see gh-36343 parser = all_parsers @@ -2147,6 +2200,7 @@ def test_integer_precision(all_parsers): tm.assert_series_equal(result, expected) +@xfail_pyarrow def test_file_descriptor_leak(all_parsers): # GH 31488 @@ -2160,6 +2214,7 @@ def test(): td.check_file_leaks(test)() +@xfail_pyarrow @pytest.mark.parametrize("nrows", range(1, 6)) def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): # GH 28071 @@ -2173,6 +2228,7 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): tm.assert_frame_equal(df, ref[:nrows]) +@xfail_pyarrow def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] @@ -2203,6 +2259,7 @@ def test_read_csv_with_use_inf_as_na(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_table_delim_whitespace_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") @@ -2244,6 +2301,7 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) +@xfail_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" From c1aeecf20a519d3ae5b198097a4746291942c936 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 22 Oct 2020 20:27:33 +0100 Subject: [PATCH 31/35] more xfails and skips --- pandas/tests/io/parser/test_common.py | 8 ++--- pandas/tests/io/parser/test_compression.py | 5 +++- pandas/tests/io/parser/test_converters.py | 2 ++ pandas/tests/io/parser/test_dialect.py | 2 ++ pandas/tests/io/parser/test_dtypes.py | 25 ++++++++++++++++ pandas/tests/io/parser/test_encoding.py | 13 ++++++++ pandas/tests/io/parser/test_header.py | 18 +++++++++++ pandas/tests/io/parser/test_index_col.py | 11 +++++++ pandas/tests/io/parser/test_mangle_dupes.py | 6 ++++ pandas/tests/io/parser/test_multi_thread.py | 2 ++ pandas/tests/io/parser/test_na_values.py | 24 +++++++++++++++ pandas/tests/io/parser/test_parse_dates.py | 33 +++++++++++++++++++++ pandas/tests/io/parser/test_quoting.py | 10 +++++++ pandas/tests/io/parser/test_skiprows.py | 13 ++++++++ pandas/tests/io/parser/test_usecols.py | 25 ++++++++++++++++ 15 files changed, 192 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 1295f0061f808..cbf474ad5e5c6 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1240,7 +1240,7 @@ def test_chunk_begins_with_newline_whitespace(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_index(all_parsers): # see gh-10184 data = "x,y" @@ -1264,7 +1264,7 @@ def test_empty_with_multi_index(all_parsers): 
tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers @@ -1869,7 +1869,7 @@ def test_infinity_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -2301,7 +2301,7 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) -@xfail_pyarrow +@skip_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index ecc35dd6644c8..e23b91373f611 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -11,6 +11,8 @@ import pandas as pd import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + @pytest.fixture(params=[True, False]) def buffer(request): @@ -80,6 +82,7 @@ def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): parser.read_csv(f, compression="zip") +@skip_pyarrow @pytest.mark.parametrize("filename", [None, "test.{ext}"]) def test_compression( parser_and_data, compression_only, buffer, filename, pyarrow_xfail @@ -147,7 +150,7 @@ def test_compression_utf_encoding( @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) -def test_invalid_compression(all_parsers, invalid_compression, pyarrow_xfail): +def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers compress_kwargs = dict(compression=invalid_compression) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 88b400d9a11df..a70fe847b6ae9 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -12,6 +12,8 @@ from pandas import DataFrame, Index import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + def test_converters_type_must_be_dict(all_parsers): parser = all_parsers diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index cc65def0fd096..7a65e46ba670f 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -13,6 +13,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.fixture def custom_dialect(): diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 1ba6f0ea0a342..8e6462767513a 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -16,7 +16,11 @@ from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) def test_dtype_all_columns(all_parsers, dtype, check_orig): @@ -43,6 +47,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_dtype_all_columns_empty(all_parsers): # see gh-12048 parser = all_parsers @@ -52,6 +57,7 @@ def test_dtype_all_columns_empty(all_parsers): 
tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -70,6 +76,7 @@ def test_dtype_per_column(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_invalid_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -83,6 +90,7 @@ def test_invalid_dtype_per_column(all_parsers): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) +@xfail_pyarrow @pytest.mark.parametrize( "dtype", [ @@ -109,6 +117,7 @@ def test_categorical_dtype(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@skip_pyarrow @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) def test_categorical_dtype_single(all_parsers, dtype): # see gh-10153 @@ -124,6 +133,7 @@ def test_categorical_dtype_single(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -142,6 +152,7 @@ def test_categorical_dtype_unsorted(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -160,6 +171,7 @@ def test_categorical_dtype_missing(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers): # see gh-18186 @@ -187,6 +199,7 @@ def test_categorical_dtype_latin1(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -201,6 +214,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -219,6 +233,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -320,6 +335,7 @@ def test_categorical_coerces_timestamp(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} @@ -361,6 +377,7 @@ def test_categorical_unexpected_categories(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_pass_dtype(all_parsers): parser = all_parsers @@ -374,6 +391,7 @@ def test_empty_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index_pass_dtype(all_parsers): parser = all_parsers @@ -388,6 +406,7 @@ def test_empty_with_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_multi_index_pass_dtype(all_parsers): parser = all_parsers @@ -416,6 +435,7 @@ def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfai tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): parser = all_parsers @@ -429,6 +449,7 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): # see gh-9424 parser = all_parsers @@ -457,6 +478,7 
@@ def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) +@xfail_pyarrow def test_raise_on_passed_int_dtype_with_nas(all_parsers): # see gh-2631 parser = all_parsers @@ -474,6 +496,7 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers): parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) +@xfail_pyarrow def test_dtype_with_converters(all_parsers): parser = all_parsers data = """a,b @@ -489,6 +512,7 @@ def test_dtype_with_converters(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "dtype,expected", [ @@ -553,6 +577,7 @@ def test_numeric_dtype(all_parsers, dtype): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_boolean_dtype(all_parsers): parser = all_parsers data = "\n".join( diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 876696ecdad9c..eac906601876b 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -13,7 +13,11 @@ from pandas import DataFrame, read_csv import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_bytes_io_input(all_parsers): encoding = "cp1255" parser = all_parsers @@ -25,6 +29,7 @@ def test_bytes_io_input(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_unicode(all_parsers): parser = all_parsers data = BytesIO("\u0141aski, Jan;1".encode()) @@ -34,6 +39,7 @@ def test_read_csv_unicode(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("sep", [",", "\t"]) @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) def test_utf16_bom_skiprows(all_parsers, sep, encoding): @@ -68,6 +74,7 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_utf16_example(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers @@ -75,6 +82,7 @@ def test_utf16_example(all_parsers, csv_dir_path): assert len(result) == 50 +@xfail_pyarrow def test_unicode_encoding(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "unicode_series.csv") parser = all_parsers @@ -87,6 +95,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path): assert got == expected +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -120,6 +129,7 @@ def _encode_data_with_bom(_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): # see gh-13549 expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) @@ -132,6 +142,7 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "file_path,encoding", [ @@ -163,6 +174,7 @@ def test_binary_mode_file_buffers( tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize("pass_encoding", [True, False]) def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): # see gh-24130 @@ -179,6 +191,7 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_encoding_named_temp_file(all_parsers): # see gh-31819 parser = all_parsers diff --git 
a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 4cd110136d7b0..34eaf6ae306b4 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -14,7 +14,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_read_with_bad_header(all_parsers): parser = all_parsers msg = r"but only \d+ lines in file" @@ -82,6 +86,7 @@ def test_no_header_prefix(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_with_index_col(all_parsers): parser = all_parsers data = """foo,1,2,3 @@ -119,6 +124,7 @@ def test_header_not_first_line(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_multi_index(all_parsers): parser = all_parsers expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) @@ -184,6 +190,7 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("names", ["first", "second"]) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -231,6 +238,7 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -277,6 +285,7 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -324,6 +333,7 @@ def test_header_multi_index_common_format3(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers expected = DataFrame( @@ -344,6 +354,7 @@ def test_header_multi_index_common_format_malformed1(all_parsers): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( @@ -365,6 +376,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers expected = DataFrame( @@ -385,6 +397,7 @@ def test_header_multi_index_common_format_malformed3(all_parsers): tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize( "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] ) @@ -397,6 +410,7 @@ def test_header_names_backward_compat(all_parsers, data, header): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("kwargs", [dict(), dict(index_col=False)]) def test_read_only_header_no_rows(all_parsers, kwargs): # See gh-7773 @@ -442,6 +456,7 @@ def test_non_int_header(all_parsers, header): parser.read_csv(StringIO(data), header=header) +@xfail_pyarrow def test_singleton_header(all_parsers): # see gh-7757 data = """a,b,c\n0,1,2\n1,2,3""" @@ -452,6 +467,7 @@ def test_singleton_header(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -498,6 +514,7 @@ def test_mangles_multi_index(all_parsers, data, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [None, [0]]) @pytest.mark.parametrize( "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] @@ -541,6 +558,7 @@ def 
test_multi_index_unnamed(all_parsers, index_col, columns): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_multiindex_columns(all_parsers): # GH#6051 parser = all_parsers diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 4d64f2bf411bd..a0a4fdbc25d49 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -11,7 +11,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@skip_pyarrow @pytest.mark.parametrize("with_header", [True, False]) def test_index_col_named(all_parsers, with_header): parser = all_parsers @@ -66,6 +70,7 @@ def test_index_col_is_true(all_parsers): parser.read_csv(StringIO(data), index_col=True) +@xfail_pyarrow def test_infer_index_col(all_parsers): data = """A,B,C foo,1,2,3 @@ -83,6 +88,7 @@ def test_infer_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "index_col,kwargs", [ @@ -127,6 +133,7 @@ def test_index_col_empty_data(all_parsers, index_col, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index_col_false(all_parsers): # see gh-10413 data = "x,y" @@ -137,6 +144,7 @@ def test_empty_with_index_col_false(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "index_names", [ @@ -161,6 +169,7 @@ def test_multi_index_naming(all_parsers, index_names): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multi_index_naming_not_all_at_beginning(all_parsers): parser = all_parsers data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" @@ -175,6 +184,7 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_multi_index_level_names_empty(all_parsers): # GH 10984 parser = all_parsers @@ -186,6 +196,7 @@ def test_no_multi_index_level_names_empty(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_with_index_col(all_parsers): # GH 33476 parser = all_parsers diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 5c4e642115798..cc88a1d974767 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -10,7 +10,10 @@ from pandas import DataFrame import pandas._testing as tm +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)]) def test_basic(all_parsers, kwargs): # TODO: add test for condition "mangle_dupe_cols=False" @@ -24,6 +27,7 @@ def test_basic(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_basic_names(all_parsers): # See gh-7160 parser = all_parsers @@ -44,6 +48,7 @@ def test_basic_names_raise(all_parsers): parser.read_csv(StringIO(data), names=["a", "b", "a"]) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -111,6 +116,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) +@xfail_pyarrow def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 orig_key = "0" diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index d50560c684084..06f14e28435ef 100644 --- 
a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -12,6 +12,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + def _construct_dataframe(num_rows): """ diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 9f86bbd65640e..9e7a445234a45 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -12,7 +12,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_string_nas(all_parsers): parser = all_parsers data = """A,B,C @@ -28,6 +32,7 @@ def test_string_nas(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_detect_string_na(all_parsers): parser = all_parsers data = """A,B @@ -42,6 +47,7 @@ def test_detect_string_na(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "na_values", [ @@ -79,6 +85,7 @@ def test_non_string_na_values(all_parsers, data, na_values): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_default_na_values(all_parsers): _NA_VALUES = { "-1.#IND", @@ -126,6 +133,7 @@ def f(i, v): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): parser = all_parsers @@ -159,6 +167,7 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_na_value_dict(all_parsers): data = """A,B,C foo,bar,NA @@ -177,6 +186,7 @@ def test_na_value_dict(all_parsers): tm.assert_frame_equal(df, expected) +@xfail_pyarrow @pytest.mark.parametrize( "index_col,expected", [ @@ -210,6 +220,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -297,6 +308,7 @@ def test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" @@ -308,6 +320,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 # @@ -319,6 +332,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): tm.assert_frame_equal(df, expected) +@xfail_pyarrow @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 @@ -348,6 +362,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "na_filter,row_data", [ @@ -369,6 +384,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_na_trailing_columns(all_parsers): parser = all_parsers data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax @@ -396,6 +412,7 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "na_values,row_data", [ @@ -414,6 +431,7 @@ def test_na_values_scalar(all_parsers, na_values, 
row_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_na_values_dict_aliasing(all_parsers): parser = all_parsers na_values = {"a": 2, "b": 1} @@ -429,6 +447,7 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) +@xfail_pyarrow def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = "a\nfoo\n1" @@ -440,6 +459,7 @@ def test_na_values_dict_col_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -469,6 +489,7 @@ def test_empty_na_values_no_default_with_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] ) @@ -497,6 +518,7 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): # see gh-20377 @@ -512,6 +534,7 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data, na_values", [ @@ -540,6 +563,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): ) +@xfail_pyarrow def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 662659982c0b3..722170c9b76df 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -34,7 +34,10 @@ else: date_strategy = st.datetimes() +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 # @@ -56,6 +59,7 @@ def test_separator_date_conflict(all_parsers): tm.assert_frame_equal(df, expected) +@xfail_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col_custom(all_parsers, keep_date_col): data = """\ @@ -199,6 +203,7 @@ def date_parser(*date_cols): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("container", [list, tuple, Index, Series]) @pytest.mark.parametrize("dim", [1, 2]) def test_concat_date_col_fail(container, dim): @@ -211,6 +216,7 @@ def test_concat_date_col_fail(container, dim): parsing.concat_date_cols(date_cols) +@xfail_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col(all_parsers, keep_date_col): data = """\ @@ -370,6 +376,7 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -434,6 +441,7 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multiple_date_col_timestamp_parse(all_parsers): parser = all_parsers data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 @@ -468,6 +476,7 @@ def test_multiple_date_col_timestamp_parse(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multiple_date_cols_with_header(all_parsers): parser = all_parsers data = """\ @@ -637,6 +646,7 @@ def test_date_parser_int_bug(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_nat_parse(all_parsers): # see gh-3062 parser = 
all_parsers @@ -652,6 +662,7 @@ def test_nat_parse(all_parsers): tm.assert_frame_equal(result, df) +@xfail_pyarrow def test_csv_custom_parser(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -666,6 +677,7 @@ def test_csv_custom_parser(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -679,6 +691,7 @@ def test_parse_dates_implicit_first_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_parse_dates_string(all_parsers): data = """date,A,B,C 20090101,a,1,2 @@ -723,6 +736,7 @@ def test_yy_format_with_year_first(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): data = "a,b,c\n01/01/2010,1,15/02/2010" @@ -739,6 +753,7 @@ def test_parse_dates_column_list(all_parsers, parse_dates): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_parse_dates(all_parsers, index_col): data = """index1,index2,A,B,C @@ -784,6 +799,7 @@ def test_multi_index_parse_dates(all_parsers, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("kwargs", [dict(dayfirst=True), dict(day_first=True)]) def test_parse_dates_custom_euro_format(all_parsers, kwargs): parser = all_parsers @@ -828,6 +844,7 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): ) +@xfail_pyarrow def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers @@ -841,6 +858,7 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is pytz.utc +@xfail_pyarrow @pytest.mark.parametrize( "parse_dates,index_col", [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], @@ -941,6 +959,7 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multiple_date_cols_chunked(all_parsers): parser = all_parsers data = """\ @@ -1033,6 +1052,7 @@ def test_multiple_date_cols_chunked(all_parsers): tm.assert_frame_equal(chunks[2], expected[4:]) +@xfail_pyarrow def test_multiple_date_col_named_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1056,6 +1076,7 @@ def test_multiple_date_col_named_index_compat(all_parsers): tm.assert_frame_equal(with_indices, with_names) +@xfail_pyarrow def test_multiple_date_col_multiple_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1123,6 +1144,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value): ) +@xfail_pyarrow def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers @@ -1135,6 +1157,7 @@ def test_parse_dates_empty_string(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1174,6 +1197,7 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1202,6 +1226,7 @@ def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warni tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), @@ -1290,6 +1315,7 @@ def test_parse_date_time(all_parsers, 
data, kwargs, expected, date_parser, warni tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]), @@ -1312,6 +1338,7 @@ def test_parse_date_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1343,6 +1370,7 @@ def test_parse_date_all_fields(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_parser, warning", ( @@ -1374,6 +1402,7 @@ def test_datetime_fractional_seconds(all_parsers, date_parser, warning): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." @@ -1392,6 +1421,7 @@ def test_generic(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_date_parser_resolution_if_not_ns(all_parsers): # see gh-10245 parser = all_parsers @@ -1489,6 +1519,7 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], @@ -1500,6 +1531,7 @@ def test_invalid_parse_delimited_date(all_parsers, date_string): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1565,6 +1597,7 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti assert result == expected +@xfail_pyarrow @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", [ diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 14773dfbea20e..8b010df470386 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -13,7 +13,11 @@ from pandas import DataFrame import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,msg", [ @@ -33,6 +37,7 @@ def test_bad_quote_char(all_parsers, kwargs, msg): parser.read_csv(StringIO(data), **kwargs) +@xfail_pyarrow @pytest.mark.parametrize( "quoting,msg", [ @@ -57,6 +62,7 @@ def test_quote_char_basic(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) def test_quote_char_various(all_parsers, quote_char): parser = all_parsers @@ -69,6 +75,7 @@ def test_quote_char_various(all_parsers, quote_char): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) @pytest.mark.parametrize("quote_char", ["", None]) def test_null_quote_char(all_parsers, quoting, quote_char): @@ -88,6 +95,7 @@ def test_null_quote_char(all_parsers, quoting, quote_char): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,exp_data", [ @@ -114,6 +122,7 @@ def test_quoting_various(all_parsers, kwargs, exp_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] ) @@ -137,6 +146,7 @@ def test_quotechar_unicode(all_parsers, quotechar): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("balanced", [True, False]) def 
 def test_unbalanced_quoting(all_parsers, balanced):
     # see gh-22789.
diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py
index fdccef1127c7e..732f2eb18fdd9 100644
--- a/pandas/tests/io/parser/test_skiprows.py
+++ b/pandas/tests/io/parser/test_skiprows.py
@@ -14,7 +14,10 @@
 from pandas import DataFrame, Index
 import pandas._testing as tm
 
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+
 
+@xfail_pyarrow
 @pytest.mark.parametrize("skiprows", [list(range(6)), 6])
 def test_skip_rows_bug(all_parsers, skiprows):
     # see gh-505
@@ -42,6 +45,7 @@ def test_skip_rows_bug(all_parsers, skiprows):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_deep_skip_rows(all_parsers):
     # see gh-4382
     parser = all_parsers
@@ -57,6 +61,7 @@ def test_deep_skip_rows(all_parsers):
     tm.assert_frame_equal(result, condensed_result)
 
 
+@xfail_pyarrow
 def test_skip_rows_blank(all_parsers):
     # see gh-9832
     parser = all_parsers
@@ -83,6 +88,7 @@ def test_skip_rows_blank(all_parsers):
     tm.assert_frame_equal(data, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,kwargs,expected",
     [
@@ -123,6 +129,7 @@ def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_skip_row_with_quote(all_parsers):
     # see gh-12775 and gh-10911
     parser = all_parsers
@@ -138,6 +145,7 @@ def test_skip_row_with_quote(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,exp_data",
     [
@@ -173,6 +181,7 @@ def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "line_terminator", ["\n", "\r\n", "\r"]  # "LF"  # "CRLF"  # "CR"
 )
@@ -209,6 +218,7 @@ def test_skiprows_lineterminator(all_parsers, line_terminator):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_skiprows_infield_quote(all_parsers):
     # see gh-14459
     parser = all_parsers
@@ -219,6 +229,7 @@ def test_skiprows_infield_quote(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "kwargs,expected",
     [
@@ -234,6 +245,7 @@ def test_skip_rows_callable(all_parsers, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_skip_rows_skip_all(all_parsers):
     parser = all_parsers
     data = "a\n1\n2\n3\n4\n5"
@@ -243,6 +255,7 @@ def test_skip_rows_skip_all(all_parsers):
         parser.read_csv(StringIO(data), skiprows=lambda x: True)
 
 
+@xfail_pyarrow
 def test_skip_rows_bad_callable(all_parsers):
     msg = "by zero"
     parser = all_parsers
diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py
index 7e9c9866a666d..0f2e5882439f8 100644
--- a/pandas/tests/io/parser/test_usecols.py
+++ b/pandas/tests/io/parser/test_usecols.py
@@ -12,6 +12,9 @@
 from pandas import DataFrame, Index
 import pandas._testing as tm
 
+skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+
 _msg_validate_usecols_arg = (
     "'usecols' must either be list-like "
     "of all strings, all unicode, all "
@@ -22,6 +25,7 @@
 )
 
 
+@skip_pyarrow
 def test_raise_on_mixed_dtype_usecols(all_parsers):
     # See gh-12678
     data = """a,b,c
@@ -35,6 +39,7 @@ def test_raise_on_mixed_dtype_usecols(all_parsers):
         parser.read_csv(StringIO(data), usecols=usecols)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
 def test_usecols(all_parsers, usecols):
     data = """\
@@ -50,6 +55,7 @@ def test_usecols(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_with_names(all_parsers):
     data = """\
 a,b,c
@@ -65,6 +71,7 @@ def test_usecols_with_names(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
 )
@@ -81,6 +88,7 @@ def test_usecols_relative_to_names(all_parsers, names, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_relative_to_names2(all_parsers):
     # see gh-5766
     data = """\
@@ -97,6 +105,7 @@ def test_usecols_relative_to_names2(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_name_length_conflict(all_parsers):
     data = """\
 1,2,3
@@ -125,6 +134,7 @@ def test_usecols_single_string(all_parsers):
         parser.read_csv(StringIO(data), usecols="foo")
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
 )
@@ -138,6 +148,7 @@ def test_usecols_index_col_false(all_parsers, data):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize("index_col", ["b", 0])
 @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
 def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
@@ -164,6 +175,7 @@ def test_usecols_index_col_conflict2(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_implicit_index_col(all_parsers):
     # see gh-2654
     parser = all_parsers
@@ -174,6 +186,7 @@ def test_usecols_implicit_index_col(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_regex_sep(all_parsers):
     # see gh-2733
     parser = all_parsers
@@ -184,6 +197,7 @@ def test_usecols_regex_sep(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_with_whitespace(all_parsers):
     parser = all_parsers
     data = "a  b  c\n4  apple  bat  5.7\n8  orange  cow  10"
@@ -193,6 +207,7 @@ def test_usecols_with_whitespace(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize(
     "usecols,expected",
     [
@@ -212,6 +227,7 @@ def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
 def test_usecols_with_parse_dates(all_parsers, usecols):
     # see gh-9755
@@ -230,6 +246,7 @@ def test_usecols_with_parse_dates(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 def test_usecols_with_parse_dates2(all_parsers):
     # see gh-13604
     parser = all_parsers
@@ -290,6 +307,7 @@ def test_usecols_with_parse_dates3(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_usecols_with_parse_dates4(all_parsers):
     data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
     usecols = list("abcdefghij")
@@ -313,6 +331,7 @@ def test_usecols_with_parse_dates4(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
 @pytest.mark.parametrize(
     "names",
@@ -406,6 +425,7 @@ def test_usecols_with_multi_byte_characters(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_empty_usecols(all_parsers):
     data = "a,b,c\n1,2,3\n4,5,6"
     expected = DataFrame()
@@ -426,6 +446,7 @@ def test_np_array_usecols(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "usecols,expected",
     [
@@ -458,6 +479,7 @@ def test_callable_usecols(all_parsers, usecols, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
 def test_incomplete_first_row(all_parsers, usecols):
     # see gh-6710
@@ -470,6 +492,7 @@ def test_incomplete_first_row(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,usecols,kwargs,expected",
     [
@@ -502,6 +525,7 @@ def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
 @pytest.mark.parametrize(
     "usecols,kwargs,expected,msg",
     [
@@ -558,6 +582,7 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
 def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
     if all_parsers.engine != "c":

From b53a620b8fb77e1ab804a18e01662d85cf653bf7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 28 Oct 2020 04:07:45 +0000
Subject: [PATCH 32/35] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 asv_bench/benchmarks/io/csv.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 8792fff5300d3..c1fad1efde082 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -256,7 +256,10 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
 
     def time_read_csv_arrow(self, sep, decimal, float_precision):
         read_csv(
-            self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"),
+            self.data(self.StringIO_input),
+            sep=sep,
+            header=None,
+            names=list("abc"),
         )
 

From f13113d37ccad7f16d493931dac876d4cd246d96 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Wed, 28 Oct 2020 10:39:52 -0700
Subject: [PATCH 33/35] Fix typos
---
 pandas/compat/_optional.py                 | 30 +++++++++++-----------
 pandas/io/parsers.py                       | 10 +++-----
 pandas/tests/io/parser/test_unsupported.py |  3 ++-
 3 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 6f00c8ddb37af..6569b077069e2 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -45,6 +45,7 @@
     "pandas_gbq": "pandas-gbq",
     "sqlalchemy": "SQLAlchemy",
     "jinja2": "Jinja2",
+    "pyarrow.csv": "pyarrow",
 }
 
 
@@ -119,23 +120,22 @@ def import_optional_dependency(
     # Handle submodules: if we have submodule, grab parent module from sys.modules
     parent = name.split(".")[0]
     if parent != name:
-        name = parent
-        module_to_get = sys.modules[name]
+        install_name = parent
+        module_to_get = sys.modules[install_name]
     else:
         module_to_get = module
     minimum_version = min_version if min_version is not None else VERSIONS.get(name)
-    if minimum_version:
-        version = _get_version(module_to_get)
-        if distutils.version.LooseVersion(version) < minimum_version:
-            assert on_version in {"warn", "raise", "ignore"}
-            msg = (
-                f"Pandas requires version '{minimum_version}' or newer of '{name}' "
-                f"(version '{version}' currently installed)."
-            )
-            if on_version == "warn":
-                warnings.warn(msg, UserWarning)
-                return None
-            elif on_version == "raise":
-                raise ImportError(msg)
+    version = _get_version(module_to_get)
+    if distutils.version.LooseVersion(version) < minimum_version:
+        assert on_version in {"warn", "raise", "ignore"}
+        msg = (
+            f"Pandas requires version '{minimum_version}' or newer of '{name}' "
+            f"(version '{version}' currently installed)."
+        )
+        if on_version == "warn":
+            warnings.warn(msg, UserWarning)
+            return None
+        elif on_version == "raise":
+            raise ImportError(msg)
 
     return module
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 03a70615591a1..75c1d7b06b635 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -839,7 +839,7 @@ def __init__(self, f, engine=None, **kwds):
             if engine == "pyarrow":
                 raise ValueError(
                     "The 'dialect' option is not supported with the 'pyarrow' engine"
-
+                )
             kwds = _merge_with_dialect_properties(dialect, kwds)
 
         if kwds.get("header", "infer") == "infer":
@@ -2223,11 +2223,7 @@ def __init__(self, src, **kwds):
             self.src = BytesIOWrapper(self.src, encoding=encoding)
 
     def read(self):
-        pyarrow = import_optional_dependency(
-            "pyarrow.csv",
-            min_version="0.15.0",
-            extra="pyarrow is required to use the pyarrow engine",
-        )
+        pyarrow = import_optional_dependency("pyarrow.csv", min_version="0.15.0")
         kwdscopy = {k: v for k, v in self.kwds.items() if v is not None}
         # these are kwargs passed to pyarrow
         parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"}
@@ -3434,7 +3430,7 @@ def _isindex(colspec):
                 colspec = orig_names[colspec]
             if _isindex(colspec):
                 continue
-            data_dict[colspec] = converter(np.array(data_dict[colspec]))
+            data_dict[colspec] = converter(np.asarray(data_dict[colspec]))
         else:
             new_name, col, old_names = _try_convert_dates(
                 converter, colspec, data_dict, orig_names
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
index d2ae4c160d519..6e9cdacd40586 100644
--- a/pandas/tests/io/parser/test_unsupported.py
+++ b/pandas/tests/io/parser/test_unsupported.py
@@ -132,11 +132,12 @@ def test_pyarrow_engine(self):
         1,2,3,4,"""
 
         for default in pa_unsupported:
-            print(default)
             msg = (
                 f"The {repr(default)} option is not "
                 f"supported with the 'pyarrow' engine"
             )
             kwargs = {default: object()}
+            if default == "dialect":
+                kwargs[default] = "excel"  # test a random dialect
             with pytest.raises(ValueError, match=msg):
                 read_csv(StringIO(data), engine="pyarrow", **kwargs)

From f9ce2e46838a0aec07d180dc8e909573b5408918 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Wed, 28 Oct 2020 11:47:47 -0700
Subject: [PATCH 34/35] Doc fixes and more typo fixes
---
 doc/source/whatsnew/v1.1.0.rst |  8 --------
 doc/source/whatsnew/v1.2.0.rst |  6 ++++++
 pandas/compat/_optional.py     | 23 ++++++++++++-----------
 pandas/io/parsers.py           |  7 ++++---
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index a0383d7248624..50443f8810e5f 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -270,14 +270,6 @@ change, as ``fsspec`` will still bring in the same packages as before.
 
 .. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/
 
-
-read_csv() now accepts pyarrow as an engine
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines
-with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`)
-
-
 .. _whatsnew_110.enhancements.other:
 
 Other enhancements
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index f1f24ab7a101b..16b0324acaf6c 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -203,6 +203,12 @@ example where the index name is preserved:
 The same is true for :class:`MultiIndex`, but the logic is applied separately
 on a level-by-level basis.
 
+read_csv() now accepts pyarrow as an engine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines
+with pyarrow>=0.15 installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`)
+
 .. _whatsnew_120.enhancements.other:
 
 Other enhancements
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 6569b077069e2..a6a14fcbee757 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -126,16 +126,17 @@ def import_optional_dependency(
         module_to_get = module
     minimum_version = min_version if min_version is not None else VERSIONS.get(name)
     version = _get_version(module_to_get)
-    if distutils.version.LooseVersion(version) < minimum_version:
-        assert on_version in {"warn", "raise", "ignore"}
-        msg = (
-            f"Pandas requires version '{minimum_version}' or newer of '{name}' "
-            f"(version '{version}' currently installed)."
-        )
-        if on_version == "warn":
-            warnings.warn(msg, UserWarning)
-            return None
-        elif on_version == "raise":
-            raise ImportError(msg)
+    if minimum_version:
+        if distutils.version.LooseVersion(version) < minimum_version:
+            assert on_version in {"warn", "raise", "ignore"}
+            msg = (
+                f"Pandas requires version '{minimum_version}' or newer of '{name}' "
+                f"(version '{version}' currently installed)."
+            )
+            if on_version == "warn":
+                warnings.warn(msg, UserWarning)
+                return None
+            elif on_version == "raise":
+                raise ImportError(msg)
 
     return module
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 75c1d7b06b635..5c70e31aca041 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -165,10 +165,11 @@
     of dtype conversion.
 engine : {{'c', 'python', 'pyarrow'}}, optional
     Parser engine to use. The C and pyarrow engines are faster, while the python engine
-    is currently more feature-complete. The pyarrow engine requires ``pyarrow`` >= 0.15
-    as a dependency however.
+    is currently more feature-complete. The pyarrow engine also supports multithreading,
+    something that is not present in the C or python engines. However, it requires
+    ``pyarrow`` >= 0.15 as a dependency.
 
-    .. versionchanged:: 1.1
+    .. versionchanged:: 1.2
 
        The "pyarrow" engine was added.
 converters : dict, optional
     Dict of functions for converting values in certain columns. Keys can either

From 4158d6af395ba4335a59001010621ae0479abf48 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Mon, 2 Nov 2020 09:59:01 -0800
Subject: [PATCH 35/35] Green?
---
 pandas/compat/_optional.py             | 2 +-
 pandas/tests/io/parser/test_dialect.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index a6a14fcbee757..28741c1560543 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -125,8 +125,8 @@ def import_optional_dependency(
     else:
         module_to_get = module
     minimum_version = min_version if min_version is not None else VERSIONS.get(name)
-    version = _get_version(module_to_get)
     if minimum_version:
+        version = _get_version(module_to_get)
         if distutils.version.LooseVersion(version) < minimum_version:
             assert on_version in {"warn", "raise", "ignore"}
             msg = (
diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py
index 7a65e46ba670f..afdd7548ed0dd 100644
--- a/pandas/tests/io/parser/test_dialect.py
+++ b/pandas/tests/io/parser/test_dialect.py
@@ -13,7 +13,7 @@
 from pandas import DataFrame
 import pandas._testing as tm
 
-pytestmark = pytest.mark.usefixtures("pyarrow_xfail")
+pytestmark = pytest.mark.usefixtures("pyarrow_skip")
 
 
 @pytest.fixture
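Reviewer note: a minimal sketch of what this series enables, assuming a build of
this branch together with pyarrow >= 0.15; the inline CSV payload and variable
names are illustrative, not taken from the patches themselves::

    from io import StringIO

    import pandas as pd

    data = "a,b,c\n1,2,3\n4,5,6"

    # The new engine dispatches to pyarrow.csv for parsing, which can use
    # multiple threads, per the engine docstring added above.
    df = pd.read_csv(StringIO(data), engine="pyarrow")

    # Options the pyarrow engine does not support are rejected up front with
    # a ValueError, as exercised by test_unsupported.py in PATCH 33.
    try:
        pd.read_csv(StringIO(data), engine="pyarrow", dialect="excel")
    except ValueError as err:
        # "The 'dialect' option is not supported with the 'pyarrow' engine"
        print(err)

The skip/xfail fixtures threaded through the test-file diffs above serve the
same purpose at the test-suite level: paths the new engine cannot take yet are
marked rather than silently falling back.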