diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 9bcd125f56bbb..c1fad1efde082 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -1,4 +1,4 @@
-from io import StringIO
+from io import BytesIO, StringIO
 import random
 import string
@@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
 class ReadCSVSkipRows(BaseIO):
     fname = "__test__.csv"
-    params = [None, 10000]
-    param_names = ["skiprows"]
+    params = ([None, 10000], ["c", "pyarrow"])
+    param_names = ["skiprows", "engine"]

-    def setup(self, skiprows):
+    def setup(self, skiprows, engine):
         N = 20000
         index = tm.makeStringIndex(N)
         df = DataFrame(
@@ -164,8 +164,8 @@ def setup(self, skiprows):
         )
         df.to_csv(self.fname)

-    def time_skipprows(self, skiprows):
-        read_csv(self.fname, skiprows=skiprows)
+    def time_skipprows(self, skiprows, engine):
+        read_csv(self.fname, skiprows=skiprows, engine=engine)

 class ReadUint64Integers(StringIORewind):
@@ -254,9 +254,33 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
             names=list("abc"),
         )

+    def time_read_csv_arrow(self, sep, decimal, float_precision):
+        read_csv(
+            self.data(self.StringIO_input),
+            sep=sep,
+            header=None,
+            names=list("abc"),
+        )

-class ReadCSVCategorical(BaseIO):
+class ReadCSVEngine(StringIORewind):
+    params = ["c", "python", "pyarrow"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
+        data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000)
+        self.StringIO_input = StringIO("\n".join(data))
+        # simulate reading from file
+        self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8"))
+
+    def time_read_stringcsv(self, engine):
+        read_csv(self.data(self.StringIO_input), engine=engine)
+
+    def time_read_bytescsv(self, engine):
+        read_csv(self.data(self.BytesIO_input), engine=engine)
+
+
+class ReadCSVCategorical(BaseIO):
     fname = "__test__.csv"

     def setup(self):
@@ -273,7 +297,10 @@ def time_convert_direct(self):

 class ReadCSVParseDates(StringIORewind):
-    def setup(self):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
         data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
         {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
         {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
@@ -284,18 +311,20 @@ def setup(self):
         data = data.format(*two_cols)
         self.StringIO_input = StringIO(data)

-    def time_multiple_date(self):
+    def time_multiple_date(self, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             names=list(string.digits[:9]),
             parse_dates=[[1, 2], [1, 3]],
         )

-    def time_baseline(self):
+    def time_baseline(self, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             parse_dates=[1],
@@ -304,17 +333,18 @@ def time_baseline(self):

 class ReadCSVCachedParseDates(StringIORewind):
-    params = ([True, False],)
-    param_names = ["do_cache"]
+    params = ([True, False], ["c", "pyarrow", "python"])
+    param_names = ["do_cache", "engine"]

-    def setup(self, do_cache):
+    def setup(self, do_cache, engine):
         data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
         self.StringIO_input = StringIO(data)

-    def time_read_csv_cached(self, do_cache):
+    def time_read_csv_cached(self, do_cache, engine):
         try:
             read_csv(
                 self.data(self.StringIO_input),
+                engine=engine,
                 header=None,
                 parse_dates=[0],
                 cache_dates=do_cache,
@@ -344,22 +374,23 @@ def mem_parser_chunks(self):

 class ReadCSVParseSpecialDate(StringIORewind):
-    params = (["mY", "mdY", "hm"],)
-    param_names = ["value"]
+    params = (["mY", "mdY", "hm"], ["c", "pyarrow", "python"])
+    param_names = ["value", "engine"]
     objects = {
         "mY": "01-2019\n10-2019\n02/2000\n",
         "mdY": "12/02/2010\n",
         "hm": "21:34\n",
     }

-    def setup(self, value):
+    def setup(self, value, engine):
         count_elem = 10000
         data = self.objects[value] * count_elem
         self.StringIO_input = StringIO(data)

-    def time_read_special_date(self, value):
+    def time_read_special_date(self, value, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             names=["Date"],
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 1bd35131622ab..6f5f820776d09 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -158,9 +158,11 @@ dtype : Type name or dict of column -> type, default ``None``
     (unsupported with ``engine='python'``). Use ``str`` or ``object`` together
     with suitable ``na_values`` settings to preserve and
     not interpret dtype.
-engine : {``'c'``, ``'python'``}
-    Parser engine to use. The C engine is faster while the Python engine is
-    currently more feature-complete.
+engine : {``'c'``, ``'pyarrow'``, ``'python'``}
+    Parser engine to use. In terms of performance, the pyarrow engine,
+    which requires ``pyarrow`` >= 0.15.0, is faster than the C engine, which
+    is faster than the python engine. However, the pyarrow and C engines
+    are currently less feature-complete than their Python counterpart.
 converters : dict, default ``None``
     Dict of functions for converting values in certain columns. Keys can either
     be integers or column labels.
@@ -1600,11 +1602,18 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object:
 Specifying the parser engine
 ''''''''''''''''''''''''''''

-Under the hood pandas uses a fast and efficient parser implemented in C as well
-as a Python implementation which is currently more feature-complete. Where
-possible pandas uses the C parser (specified as ``engine='c'``), but may fall
-back to Python if C-unsupported options are specified. Currently, C-unsupported
-options include:
+Currently, pandas supports three engines: the C engine, the Python engine, and
+an optional pyarrow engine (requires ``pyarrow`` >= 0.15). In terms of
+performance, the pyarrow engine is fastest, followed by the C and Python
+engines. However, the pyarrow engine is much less robust than the C engine,
+which in turn lacks a couple of features present in the Python parser.
+
+Where possible pandas uses the C parser (specified as ``engine='c'``), but it
+may fall back to Python if C-unsupported options are specified. If
+pyarrow-unsupported options are specified while using ``engine='pyarrow'``,
+the parser errors out (a full list of unsupported options is available in
+``pandas.io.parsers._pyarrow_unsupported``).
+
+Currently, C-unsupported options include:

 * ``sep`` other than a single character (e.g. regex separators)
 * ``skipfooter``
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index e054ac830ce41..50443f8810e5f 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -252,6 +252,7 @@ If needed you can adjust the bins with the argument ``offset`` (a :class:`Timede

 For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`.
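[Editor's aside, not part of the patch: a minimal usage sketch of the engine
selection documented in the io.rst hunk above. The file name is hypothetical,
and pyarrow >= 0.15 must be installed.]

    import pandas as pd

    # The pyarrow engine is the fastest of the three on multicore machines,
    # but it supports fewer options than the C or Python engines.
    df = pd.read_csv("data.csv", engine="pyarrow")

    # Unlike C-unsupported options (which silently fall back to the Python
    # engine), pyarrow-unsupported options raise instead of falling back:
    # pd.read_csv("data.csv", engine="pyarrow", skipfooter=1)  # ValueError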
+
 fsspec now used for filesystem handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 62da3c0c5cddc..1225c7aef19ab 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -204,6 +204,12 @@ example where the index name is preserved:
 The same is true for :class:`MultiIndex`, but the logic is applied separately on a
 level-by-level basis.

+read_csv() now accepts pyarrow as an engine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`pandas.read_csv` now accepts ``engine="pyarrow"`` as an argument, allowing
+for faster CSV parsing on multicore machines with ``pyarrow`` >= 0.15 installed.
+See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)
+
 .. _whatsnew_120.enhancements.other:

 Other enhancements
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index d3c7888cac704..28741c1560543 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -1,6 +1,8 @@
 import distutils.version
 import importlib
+import sys
 import types
+from typing import Optional
 import warnings

 # Update install.rst when updating versions!
@@ -43,6 +45,7 @@
     "pandas_gbq": "pandas-gbq",
     "sqlalchemy": "SQLAlchemy",
     "jinja2": "Jinja2",
+    "pyarrow.csv": "pyarrow",
 }


@@ -58,7 +61,11 @@ def _get_version(module: types.ModuleType) -> str:


 def import_optional_dependency(
-    name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise"
+    name: str,
+    extra: str = "",
+    raise_on_missing: bool = True,
+    on_version: str = "raise",
+    min_version: Optional[str] = None,
 ):
     """
     Import an optional dependency.
@@ -70,8 +77,7 @@ def import_optional_dependency(
     Parameters
     ----------
     name : str
-        The module name. This should be top-level only, so that the
-        version may be checked.
+        The module name.
     extra : str
         Additional text to include in the ImportError message.
     raise_on_missing : bool, default True
@@ -85,6 +91,8 @@ def import_optional_dependency(
           It's expected that users validate the version locally when
           using ``on_version="ignore"`` (see. ``io/html.py``)
+    min_version : str, default None
+        Specify a minimum version different from the global pandas minimum.

     Returns
     -------
@@ -109,10 +117,16 @@ def import_optional_dependency(
             raise ImportError(msg) from None
         else:
             return None
-
-    minimum_version = VERSIONS.get(name)
+    # Handle submodules: if we have a submodule, grab the parent module from
+    # sys.modules and check the version on that instead.
+    parent = name.split(".")[0]
+    if parent != name:
+        install_name = parent
+        module_to_get = sys.modules[install_name]
+    else:
+        module_to_get = module
+    minimum_version = min_version if min_version is not None else VERSIONS.get(name)
     if minimum_version:
-        version = _get_version(module)
+        version = _get_version(module_to_get)
         if distutils.version.LooseVersion(version) < minimum_version:
             assert on_version in {"warn", "raise", "ignore"}
             msg = (
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 8d9787a9c8c9e..9ae07ad874140 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -32,6 +32,7 @@
 from pandas._libs.parsers import STR_NA_VALUES
 from pandas._libs.tslibs import parsing
 from pandas._typing import FilePathOrBuffer, StorageOptions, Union
+from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
     AbstractMethodError,
     EmptyDataError,
@@ -173,9 +174,14 @@
     to preserve and not interpret dtype.
     If converters are specified, they will be applied INSTEAD
     of dtype conversion.
-engine : {{'c', 'python'}}, optional
-    Parser engine to use. The C engine is faster while the python engine is
-    currently more feature-complete.
+engine : {{'c', 'python', 'pyarrow'}}, optional
+    Parser engine to use. The C and pyarrow engines are faster, while the python
+    engine is currently more feature-complete. The pyarrow engine also supports
+    multithreading, something that is not present in the C or python engines;
+    it requires ``pyarrow`` >= 0.15 as a dependency, however.
+
+    .. versionchanged:: 1.2
+       The "pyarrow" engine was added.
 converters : dict, optional
     Dict of functions for converting values in certain columns. Keys can either
     be integers or column labels.
@@ -445,7 +451,19 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):

     # Extract some of the arguments (pass chunksize on).
     iterator = kwds.get("iterator", False)
-    chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1)
+    chunksize = kwds.get("chunksize", None)
+    # chunksize and iterator not supported for pyarrow
+    if kwds.get("engine") == "pyarrow":
+        if iterator:
+            raise ValueError(
+                "The 'iterator' option is not supported with the 'pyarrow' engine"
+            )
+        if chunksize is not None:
+            raise ValueError(
+                "The 'chunksize' option is not supported with the 'pyarrow' engine"
+            )
+    else:
+        chunksize = validate_integer("chunksize", chunksize, 1)
     nrows = kwds.get("nrows", None)

     # Check for duplicates in names.
@@ -521,6 +539,29 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
 _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}

 _c_unsupported = {"skipfooter"}
+_pyarrow_unsupported = {
+    "skipfooter",
+    "float_precision",
+    "chunksize",
+    "comment",
+    "nrows",
+    "thousands",
+    "memory_map",
+    "dialect",
+    "warn_bad_lines",
+    "error_bad_lines",
+    "delim_whitespace",
+    "quoting",
+    "lineterminator",
+    "converters",
+    "decimal",
+    "iterator",
+    "dayfirst",
+    "infer_datetime_format",
+    "verbose",
+    "skipinitialspace",
+    "low_memory",
+}
 _python_unsupported = {"low_memory", "float_precision"}

 _deprecated_defaults: Dict[str, Any] = {}
@@ -788,6 +829,10 @@ def __init__(self, f, engine=None, **kwds):

         dialect = _extract_dialect(kwds)
         if dialect is not None:
+            if engine == "pyarrow":
+                raise ValueError(
+                    "The 'dialect' option is not supported with the 'pyarrow' engine"
+                )
             kwds = _merge_with_dialect_properties(dialect, kwds)

         if kwds.get("header", "infer") == "infer":
@@ -823,7 +868,12 @@ def _get_options_with_defaults(self, engine):

         for argname, default in _parser_defaults.items():
             value = kwds.get(argname, default)
-
+            if argname in _pyarrow_unsupported:
+                if engine == "pyarrow" and value != default:
+                    raise ValueError(
+                        f"The {repr(argname)} option is not supported with the "
+                        "'pyarrow' engine"
+                    )
             # see gh-12935
             if argname == "mangle_dupe_cols" and not value:
                 raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
@@ -883,9 +933,9 @@ def _clean_options(self, options, engine):
         delim_whitespace = options["delim_whitespace"]

         if sep is None and not delim_whitespace:
-            if engine == "c":
+            if engine in ("c", "pyarrow"):
                 fallback_reason = (
-                    "the 'c' engine does not support "
+                    f"the '{engine}' engine does not support "
                     "sep=None with delim_whitespace=False"
                 )
                 engine = "python"
@@ -896,7 +946,7 @@ def _clean_options(self, options, engine):
         elif engine not in ("python", "python-fwf"):
             # wait until regex engine integrated
             fallback_reason = (
-                "the 'c' engine does not support "
+                f"the '{engine}' engine does not support "
                 "regex separators (separators > 1 char and "
                 r"different from '\s+' are interpreted as regex)"
             )
@@ -1006,14 +1056,22 @@ def _clean_options(self, options, engine):
         na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

         # handle skiprows; this is internally handled by the
-        # c-engine, so only need for python parsers
+        # c-engine, so only need for python and pyarrow parsers
         if engine != "c":
-            if is_integer(skiprows):
-                skiprows = list(range(skiprows))
-            if skiprows is None:
-                skiprows = set()
-            elif not callable(skiprows):
-                skiprows = set(skiprows)
+            if engine == "pyarrow":
+                if not is_integer(skiprows) and skiprows is not None:
+                    # pyarrow expects skiprows to be passed as an integer
+                    raise ValueError(
+                        "skiprows argument must be an integer when using "
+                        "engine='pyarrow'"
+                    )
+            else:
+                if is_integer(skiprows):
+                    skiprows = list(range(skiprows))
+                if skiprows is None:
+                    skiprows = set()
+                elif not callable(skiprows):
+                    skiprows = set(skiprows)

         # put stuff back
         result["names"] = names
@@ -1035,6 +1093,7 @@ def _make_engine(self, engine="c"):
         mapping: Dict[str, Type[ParserBase]] = {
             "c": CParserWrapper,
             "python": PythonParser,
+            "pyarrow": ArrowParserWrapper,
             "python-fwf": FixedWidthFieldParser,
         }
         if engine not in mapping:
@@ -1048,22 +1107,25 @@ def _failover_to_python(self):
         raise AbstractMethodError(self)

     def read(self, nrows=None):
-        nrows = validate_integer("nrows", nrows)
-        index, columns, col_dict = self._engine.read(nrows)
-
-        if index is None:
-            if col_dict:
-                # Any column is actually fine:
-                new_rows = len(next(iter(col_dict.values())))
-                index = RangeIndex(self._currow, self._currow + new_rows)
-            else:
-                new_rows = 0
+        if self.engine == "pyarrow":
+            df = self._engine.read()
         else:
-            new_rows = len(index)
+            nrows = validate_integer("nrows", nrows)
+            index, columns, col_dict = self._engine.read(nrows)
+
+            if index is None:
+                if col_dict:
+                    # Any column is actually fine:
+                    new_rows = len(next(iter(col_dict.values())))
+                    index = RangeIndex(self._currow, self._currow + new_rows)
+                else:
+                    new_rows = 0
+            else:
+                new_rows = len(index)

-        df = DataFrame(col_dict, columns=columns, index=index)
+            df = DataFrame(col_dict, columns=columns, index=index)

-        self._currow += new_rows
+            self._currow += new_rows

         if self.squeeze and len(df.columns) == 1:
             return df[df.columns[0]].copy()
@@ -2156,6 +2218,109 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True):
         return values


+class BytesIOWrapper:
+    """
+    Allows the pyarrow engine for read_csv() to read from string buffers.
+    """
+
+    def __init__(self, string_buffer: StringIO, encoding: str = "utf-8"):
+        self.string_buffer = string_buffer
+        self.encoding = encoding
+
+    def __getattr__(self, attr: str):
+        return getattr(self.string_buffer, attr)
+
+    def read(self, size: int = -1):
+        content = self.string_buffer.read(size)
+        return content.encode(self.encoding)
+
+
+class ArrowParserWrapper(ParserBase):
+    """
+    Wrapper for the pyarrow engine for read_csv().
+    """
+
+    def __init__(self, src, **kwds):
+        self.kwds = kwds
+        self.src = src
+
+        ParserBase.__init__(self, kwds)
+
+        encoding = kwds.get("encoding") if kwds.get("encoding") is not None else "utf-8"
+
+        self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
+        na_values = kwds["na_values"]
+        if isinstance(na_values, dict):
+            raise ValueError(
+                "The pyarrow engine doesn't support passing a dict for na_values"
+            )
+        self.na_values = list(
+            _clean_na_values(
+                kwds["na_values"], keep_default_na=kwds["keep_default_na"]
+            )[0]
+        )
+        if isinstance(self.src, TextIOBase):
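+            # Editor's note (assumption based on the wrapper below):
+            # pyarrow.csv.read_csv consumes binary streams, so text buffers
+            # such as StringIO are wrapped in BytesIOWrapper, which re-encodes
+            # each str chunk to bytes on read().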
+            self.src = BytesIOWrapper(self.src, encoding=encoding)
+
+    def read(self):
+        pyarrow = import_optional_dependency("pyarrow.csv", min_version="0.15.0")
+        kwdscopy = {k: v for k, v in self.kwds.items() if v is not None}
+        # these are kwargs passed to pyarrow
+        parseoptions = {"delimiter", "quote_char", "escape_char", "ignore_empty_lines"}
+        convertoptions = {
+            "include_columns",
+            "null_values",
+            "true_values",
+            "false_values",
+        }
+        # rename some arguments to pass to pyarrow
+        kwdscopy["include_columns"] = kwdscopy.get("usecols")
+        kwdscopy["null_values"] = kwdscopy.get("na_values")
+        kwdscopy["escape_char"] = kwdscopy.get("escapechar")
+        kwdscopy["ignore_empty_lines"] = kwdscopy.get("skip_blank_lines")
+
+        parse_options = {k: v for k, v in kwdscopy.items() if k in parseoptions}
+        convert_options = {k: v for k, v in kwdscopy.items() if k in convertoptions}
+        headerexists = self.header is not None
+        read_options = {}
+
+        skiprows = self.kwds.get("skiprows")
+        if headerexists:
+            read_options["skip_rows"] = self.header
+            read_options["autogenerate_column_names"] = False
+        else:
+            if skiprows is not None:
+                read_options["skip_rows"] = skiprows
+            read_options["autogenerate_column_names"] = True
+        read_options = pyarrow.ReadOptions(**read_options)
+        table = pyarrow.read_csv(
+            self.src,
+            read_options=read_options,
+            parse_options=pyarrow.ParseOptions(**parse_options),
+            convert_options=pyarrow.ConvertOptions(**convert_options),
+        )
+        frame = table.to_pandas()
+        num_cols = len(frame.columns)
+        if not headerexists:
+            if self.names is None:
+                if self.prefix is not None:
+                    self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
+                elif self.header is None:
+                    self.names = range(num_cols)
+            frame.columns = self.names
+        # we only need the frame, not the names
+        frame.columns, frame = self._do_date_conversions(frame.columns, frame)
+        if self.index_col is not None:
+            for i, item in enumerate(self.index_col):
+                if is_integer(item):
+                    self.index_col[i] = frame.columns[item]
+            frame.set_index(self.index_col, drop=True, inplace=True)
+
+        if self.kwds.get("dtype") is not None:
+            frame = frame.astype(self.kwds.get("dtype"))
+        return frame
+
+
 def TextParser(*args, **kwds):
     """
     Converts lists of lists/tuples into DataFrames with proper type inference
@@ -3357,7 +3522,7 @@ def _isindex(colspec):
                 colspec = orig_names[colspec]
             if _isindex(colspec):
                 continue
-            data_dict[colspec] = converter(data_dict[colspec])
+            data_dict[colspec] = converter(np.asarray(data_dict[colspec]))
         else:
             new_name, col, old_names = _try_convert_dates(
                 converter, colspec, data_dict, orig_names
@@ -3406,7 +3571,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns):
             colnames.append(c)

     new_name = "_".join(str(x) for x in colnames)
-    to_parse = [data_dict[c] for c in colnames if c in data_dict]
+    to_parse = [np.array(data_dict[c]) for c in colnames if c in data_dict]

     new_col = parser(*to_parse)
     return new_name, new_col, colnames
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
index d03c85f65ea8d..a179c1b82baae 100644
--- a/pandas/tests/io/parser/conftest.py
+++ b/pandas/tests/io/parser/conftest.py
@@ -44,6 +44,11 @@ class PythonParser(BaseParser):
     float_precision_choices = [None]


+class PyArrowParser(BaseParser):
+    engine = "pyarrow"
+    float_precision_choices = [None]
+
+
 @pytest.fixture
 def csv_dir_path(datapath):
     """
@@ -63,14 +68,18 @@ def csv1(datapath):
 _cParserHighMemory = CParserHighMemory()
 _cParserLowMemory = CParserLowMemory()
 _pythonParser = PythonParser()
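+# Editor's note: a single pyarrow parser instance suffices here; the high/low
+# memory variants above apply only to the C engine (pyarrow does not support
+# the low_memory option).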
+_pyarrowParser = PyArrowParser() _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_all_parsers = [*_c_parsers_only, *_py_parsers_only] +_pyarrow_parsers_only = [_pyarrowParser] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_all_parser_ids = [*_c_parser_ids, *_py_parser_ids] +_pyarrow_parser_ids = ["pyarrow"] + +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) @@ -78,6 +87,8 @@ def all_parsers(request): """ Fixture all of the CSV parsers. """ + if request.param.engine == "pyarrow": + pytest.importorskip("pyarrow", "0.15.0") return request.param @@ -121,3 +132,26 @@ def encoding_fmt(request): Fixture for all possible string formats of a UTF encoding. """ return request.param + + +@pytest.fixture +def pyarrow_xfail(request): + """ + Fixture that xfails a test if the engine is pyarrow. + """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + if parser.engine == "pyarrow": + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.node.add_marker(mark) + + +@pytest.fixture +def pyarrow_skip(request): + """ + Fixture that skips a test if the engine is pyarrow. + """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + if parser.engine == "pyarrow": + pytest.skip("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index 60e32d7c27200..a9a03f006668b 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -10,6 +10,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("na_values", [None, ["NaN"]]) def test_comment(all_parsers, na_values): diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 8f63d06859f62..a6da6d720f929 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -23,6 +23,9 @@ from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + def test_override_set_noconvert_columns(): # see gh-17351 @@ -64,13 +67,14 @@ def _set_noconvert_columns(self): "parse_dates": parse_dates, "delimiter": ",", } + parser.engine = "c" parser._engine = MyCParserWrapper(StringIO(data), **parser.options) result = parser.read() tm.assert_frame_equal(result, expected) -def test_empty_decimal_marker(all_parsers): +def test_empty_decimal_marker(all_parsers, pyarrow_xfail): data = """A|B|C 1|2,334|5 10|13|10. 
@@ -83,6 +87,7 @@ def test_empty_decimal_marker(all_parsers): parser.read_csv(StringIO(data), decimal="") +@skip_pyarrow def test_bad_stream_exception(all_parsers, csv_dir_path): # see gh-13652 # @@ -138,6 +143,7 @@ def test_read_csv_local(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_1000_sep(all_parsers): parser = all_parsers data = """A|B|C @@ -169,7 +175,7 @@ def test_squeeze(all_parsers): assert not result._is_view -def test_malformed(all_parsers): +def test_malformed(all_parsers, pyarrow_xfail): # see gh-6607 parser = all_parsers data = """ignore @@ -184,7 +190,7 @@ def test_malformed(all_parsers): @pytest.mark.parametrize("nrows", [5, 3, None]) -def test_malformed_chunks(all_parsers, nrows): +def test_malformed_chunks(all_parsers, nrows, pyarrow_xfail): data = """ignore A,B,C skip @@ -203,7 +209,7 @@ def test_malformed_chunks(all_parsers, nrows): reader.read(nrows) -def test_unnamed_columns(all_parsers): +def test_unnamed_columns(all_parsers, pyarrow_xfail): data = """A,B,C,, 1,2,3,4,5 6,7,8,9,10 @@ -231,6 +237,7 @@ def test_csv_mixed_type(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_low_memory_no_rows_with_index(all_parsers): # see gh-21141 parser = all_parsers @@ -279,6 +286,7 @@ def test_read_csv_dataframe(all_parsers, csv1): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_no_index_name(all_parsers, csv_dir_path): parser = all_parsers csv2 = os.path.join(csv_dir_path, "test2.csv") @@ -306,7 +314,7 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) -def test_read_csv_wrong_num_columns(all_parsers): +def test_read_csv_wrong_num_columns(all_parsers, pyarrow_xfail): # Too few columns. data = """A,B,C,D,E,F 1,2,3,4,5,6 @@ -347,6 +355,7 @@ def test_read_duplicate_index_explicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_duplicate_index_implicit(all_parsers): data = """A,B,C,D foo,2,3,4,5 @@ -422,7 +431,7 @@ def test_int_conversion(all_parsers): @pytest.mark.parametrize("nrows", [3, 3.0]) -def test_read_nrows(all_parsers, nrows): +def test_read_nrows(all_parsers, nrows, pyarrow_xfail): # see gh-10476 data = """index,A,B,C,D foo,2,3,4,5 @@ -443,7 +452,7 @@ def test_read_nrows(all_parsers, nrows): @pytest.mark.parametrize("nrows", [1.2, "foo", -1]) -def test_read_nrows_bad(all_parsers, nrows): +def test_read_nrows_bad(all_parsers, nrows, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -460,7 +469,7 @@ def test_read_nrows_bad(all_parsers, nrows): @pytest.mark.parametrize("index_col", [0, "index"]) -def test_read_chunksize_with_index(all_parsers, index_col): +def test_read_chunksize_with_index(all_parsers, index_col, pyarrow_xfail): parser = all_parsers data = """index,A,B,C,D foo,2,3,4,5 @@ -492,7 +501,7 @@ def test_read_chunksize_with_index(all_parsers, index_col): @pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) -def test_read_chunksize_bad(all_parsers, chunksize): +def test_read_chunksize_bad(all_parsers, chunksize, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -509,7 +518,7 @@ def test_read_chunksize_bad(all_parsers, chunksize): @pytest.mark.parametrize("chunksize", [2, 8]) -def test_read_chunksize_and_nrows(all_parsers, chunksize): +def test_read_chunksize_and_nrows(all_parsers, chunksize, pyarrow_xfail): # see gh-15755 data = """index,A,B,C,D foo,2,3,4,5 @@ -527,7 +536,7 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize): 
tm.assert_frame_equal(concat(reader), expected) -def test_read_chunksize_and_nrows_changing_size(all_parsers): +def test_read_chunksize_and_nrows_changing_size(all_parsers, pyarrow_xfail): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 @@ -549,7 +558,7 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers): reader.get_chunk(size=3) -def test_get_chunk_passed_chunksize(all_parsers): +def test_get_chunk_passed_chunksize(all_parsers, pyarrow_xfail): parser = all_parsers data = """A,B,C 1,2,3 @@ -565,7 +574,7 @@ def test_get_chunk_passed_chunksize(all_parsers): @pytest.mark.parametrize("kwargs", [dict(), dict(index_col=0)]) -def test_read_chunksize_compat(all_parsers, kwargs): +def test_read_chunksize_compat(all_parsers, kwargs, pyarrow_xfail): # see gh-12185 data = """index,A,B,C,D foo,2,3,4,5 @@ -582,7 +591,7 @@ def test_read_chunksize_compat(all_parsers, kwargs): tm.assert_frame_equal(concat(reader), result) -def test_read_chunksize_jagged_names(all_parsers): +def test_read_chunksize_jagged_names(all_parsers, pyarrow_xfail): # see gh-23509 parser = all_parsers data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) @@ -594,7 +603,7 @@ def test_read_chunksize_jagged_names(all_parsers): tm.assert_frame_equal(result, expected) -def test_read_data_list(all_parsers): +def test_read_data_list(all_parsers, pyarrow_xfail): parser = all_parsers kwargs = dict(index_col=0) data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" @@ -608,7 +617,7 @@ def test_read_data_list(all_parsers): tm.assert_frame_equal(result, expected) -def test_iterator(all_parsers): +def test_iterator(all_parsers, pyarrow_xfail): # see gh-6607 data = """index,A,B,C,D foo,2,3,4,5 @@ -631,7 +640,7 @@ def test_iterator(all_parsers): tm.assert_frame_equal(last_chunk, expected[3:]) -def test_iterator2(all_parsers): +def test_iterator2(all_parsers, pyarrow_xfail): parser = all_parsers data = """A,B,C foo,1,2,3 @@ -694,7 +703,7 @@ def test_reader_list_skiprows(all_parsers): tm.assert_frame_equal(chunks[0], expected[1:3]) -def test_iterator_stop_on_chunksize(all_parsers): +def test_iterator_stop_on_chunksize(all_parsers, pyarrow_xfail): # gh-3967: stopping iteration when chunksize is specified parser = all_parsers data = """A,B,C @@ -718,7 +727,7 @@ def test_iterator_stop_on_chunksize(all_parsers): @pytest.mark.parametrize( "kwargs", [dict(iterator=True, chunksize=1), dict(iterator=True), dict(chunksize=1)] ) -def test_iterator_skipfooter_errors(all_parsers, kwargs): +def test_iterator_skipfooter_errors(all_parsers, kwargs, pyarrow_xfail): msg = "'skipfooter' not supported for iteration" parser = all_parsers data = "a\n1\n2" @@ -798,6 +807,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D @@ -822,6 +832,7 @@ def test_multi_index_no_level_names(all_parsers, index_col): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multi_index_no_level_names_implicit(all_parsers): parser = all_parsers data = """A,B,C,D @@ -855,6 +866,7 @@ def test_multi_index_no_level_names_implicit(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected,header", [ @@ -876,6 +888,7 @@ def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_no_unnamed_index(all_parsers): parser = all_parsers data = 
""" id c0 c1 c2 @@ -938,6 +951,7 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) +@xfail_pyarrow def test_path_path_lib(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -945,6 +959,7 @@ def test_path_path_lib(all_parsers): tm.assert_frame_equal(df, result) +@xfail_pyarrow def test_path_local_path(all_parsers): parser = all_parsers df = tm.makeDataFrame() @@ -954,6 +969,7 @@ def test_path_local_path(all_parsers): tm.assert_frame_equal(df, result) +@xfail_pyarrow def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError @@ -967,6 +983,7 @@ def test_nonexistent_path(all_parsers): assert path == e.value.filename +@xfail_pyarrow @td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 @@ -989,6 +1006,7 @@ def test_no_permission(all_parsers): assert path == e.value.filename +@xfail_pyarrow def test_missing_trailing_delimiters(all_parsers): parser = all_parsers data = """A,B,C,D @@ -1004,6 +1022,7 @@ def test_missing_trailing_delimiters(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_skip_initial_space(all_parsers): data = ( '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' @@ -1064,6 +1083,7 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -1167,6 +1187,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): assert df.a.dtype == object +@skip_pyarrow @pytest.mark.parametrize("sep", [" ", r"\s+"]) def test_integer_overflow_bug(all_parsers, sep): # see gh-2601 @@ -1178,6 +1199,7 @@ def test_integer_overflow_bug(all_parsers, sep): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_catch_too_many_names(all_parsers): # see gh-5156 data = """\ @@ -1197,6 +1219,7 @@ def test_catch_too_many_names(all_parsers): parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) +@xfail_pyarrow def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = all_parsers @@ -1217,6 +1240,7 @@ def test_chunk_begins_with_newline_whitespace(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_empty_with_index(all_parsers): # see gh-10184 data = "x,y" @@ -1227,6 +1251,7 @@ def test_empty_with_index(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_multi_index(all_parsers): # see gh-10467 data = "x,y,z" @@ -1239,6 +1264,7 @@ def test_empty_with_multi_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers @@ -1250,6 +1276,7 @@ def test_empty_with_reversed_multi_index(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_float_parser(all_parsers): # see gh-9565 parser = all_parsers @@ -1271,6 +1298,7 @@ def test_scientific_no_exponent(all_parsers): tm.assert_frame_equal(df_roundtrip, df) +@xfail_pyarrow @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) def test_int64_overflow(all_parsers, conv): data = """ID @@ -1314,6 +1342,7 @@ def test_int64_overflow(all_parsers, conv): parser.read_csv(StringIO(data), converters={"ID": conv}) +@xfail_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) @@ -1327,6 +1356,7 @@ def test_int64_uint64_range(all_parsers, val): 
tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) @@ -1340,6 +1370,7 @@ def test_outside_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 @@ -1352,6 +1383,7 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("iterator", [True, False]) def test_empty_with_nrows_chunksize(all_parsers, iterator): # see gh-9535 @@ -1369,6 +1401,7 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected,msg", [ @@ -1476,6 +1509,7 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) def test_uneven_lines_with_usecols(all_parsers, usecols): # see gh-12203 @@ -1517,7 +1551,7 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): ), ], ) -def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): +def test_read_empty_with_usecols(all_parsers, data, kwargs, expected, pyarrow_xfail): # see gh-12493 parser = all_parsers @@ -1530,6 +1564,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -1570,6 +1605,7 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers): parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) +@xfail_pyarrow @pytest.mark.parametrize("delim_whitespace", [True, False]) def test_single_char_leading_whitespace(all_parsers, delim_whitespace): # see gh-9710 @@ -1588,6 +1624,7 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "sep,skip_blank_lines,exp_data", [ @@ -1627,6 +1664,7 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_whitespace_lines(all_parsers): parser = all_parsers data = """ @@ -1642,6 +1680,7 @@ def test_whitespace_lines(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -1670,6 +1709,7 @@ def test_whitespace_regex_separator(all_parsers, data, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_verbose_read(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1693,6 +1733,7 @@ def test_verbose_read(all_parsers, capsys): assert captured.out == "Filled 3 NA values in column a\n" +@xfail_pyarrow def test_verbose_read2(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -1734,6 +1775,7 @@ def test_iteration_open_handle(all_parsers): tm.assert_series_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,thousands,decimal", [ @@ -1765,6 +1807,7 @@ def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_euro_decimal_format(all_parsers): parser = all_parsers data = """Id;Number1;Number2;Text1;Text2;Number3 @@ -1784,6 +1827,7 @@ def test_euro_decimal_format(all_parsers): 
tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_inf_parsing(all_parsers, na_filter): parser = all_parsers @@ -1807,6 +1851,7 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("na_filter", [True, False]) def test_infinity_parsing(all_parsers, na_filter): parser = all_parsers @@ -1824,6 +1869,7 @@ def test_infinity_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -1834,6 +1880,7 @@ def test_raise_on_no_columns(all_parsers, nrows): parser.read_csv(StringIO(data)) +@xfail_pyarrow @td.check_file_leaks def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") @@ -1847,6 +1894,7 @@ def test_memory_map(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_null_byte_char(all_parsers): # see gh-2741 data = "\x00,foo" @@ -1863,6 +1911,7 @@ def test_null_byte_char(all_parsers): parser.read_csv(StringIO(data), names=names) +@xfail_pyarrow def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers @@ -1984,6 +2033,7 @@ def seek(self, pos, whence=0): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [dict(), dict(error_bad_lines=True)], # Default is True. # Explicitly pass in. @@ -2002,6 +2052,7 @@ def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): parser.read_csv(StringIO(data), **kwargs) +@xfail_pyarrow def test_warn_bad_lines(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2016,6 +2067,7 @@ def test_warn_bad_lines(all_parsers, capsys): assert "Skipping line 5" in captured.err +@xfail_pyarrow def test_suppress_error_output(all_parsers, capsys): # see gh-15925 parser = all_parsers @@ -2044,6 +2096,7 @@ def test_filename_with_special_chars(all_parsers, filename): tm.assert_frame_equal(result, df) +@xfail_pyarrow def test_read_csv_memory_growth_chunksize(all_parsers): # see gh-24805 # @@ -2116,7 +2169,7 @@ def test_read_table_equivalency_to_read_csv(all_parsers): tm.assert_frame_equal(result, expected) -def test_first_row_bom(all_parsers): +def test_first_row_bom(all_parsers, pyarrow_xfail): # see gh-26545 parser = all_parsers data = '''\ufeff"Head1" "Head2" "Head3"''' @@ -2126,6 +2179,7 @@ def test_first_row_bom(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_first_row_bom_unquoted(all_parsers): # see gh-36343 parser = all_parsers @@ -2146,6 +2200,7 @@ def test_integer_precision(all_parsers): tm.assert_series_equal(result, expected) +@xfail_pyarrow def test_file_descriptor_leak(all_parsers): # GH 31488 @@ -2159,6 +2214,7 @@ def test(): td.check_file_leaks(test)() +@xfail_pyarrow @pytest.mark.parametrize("nrows", range(1, 6)) def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): # GH 28071 @@ -2172,6 +2228,7 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): tm.assert_frame_equal(df, ref[:nrows]) +@xfail_pyarrow def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] @@ -2202,6 +2259,7 @@ def test_read_csv_with_use_inf_as_na(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_table_delim_whitespace_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") @@ -2243,6 
+2301,7 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) +@skip_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 6e957313d8de8..f7ddcf0a71be3 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -11,6 +11,8 @@ import pandas as pd import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + @pytest.fixture(params=[True, False]) def buffer(request): @@ -29,7 +31,7 @@ def parser_and_data(all_parsers, csv1): @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) -def test_zip(parser_and_data, compression): +def test_zip(parser_and_data, compression, pyarrow_xfail): parser, data, expected = parser_and_data with tm.ensure_clean("test_file.zip") as path: @@ -46,7 +48,7 @@ def test_zip(parser_and_data, compression): @pytest.mark.parametrize("compression", ["zip", "infer"]) -def test_zip_error_multiple_files(parser_and_data, compression): +def test_zip_error_multiple_files(parser_and_data, compression, pyarrow_xfail): parser, data, expected = parser_and_data with tm.ensure_clean("combined_zip.zip") as path: @@ -60,7 +62,7 @@ def test_zip_error_multiple_files(parser_and_data, compression): parser.read_csv(path, compression=compression) -def test_zip_error_no_files(parser_and_data): +def test_zip_error_no_files(parser_and_data, pyarrow_xfail): parser, _, _ = parser_and_data with tm.ensure_clean() as path: @@ -71,7 +73,7 @@ def test_zip_error_no_files(parser_and_data): parser.read_csv(path, compression="zip") -def test_zip_error_invalid_zip(parser_and_data): +def test_zip_error_invalid_zip(parser_and_data, pyarrow_xfail): parser, _, _ = parser_and_data with tm.ensure_clean() as path: @@ -80,8 +82,11 @@ def test_zip_error_invalid_zip(parser_and_data): parser.read_csv(f, compression="zip") +@skip_pyarrow @pytest.mark.parametrize("filename", [None, "test.{ext}"]) -def test_compression(parser_and_data, compression_only, buffer, filename): +def test_compression( + parser_and_data, compression_only, buffer, filename, pyarrow_xfail +): parser, data, expected = parser_and_data compress_type = compression_only @@ -113,6 +118,8 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): expected = parser.read_csv(csv1, **kwargs) kwargs["compression"] = "infer" + if ext == "bz2": + pytest.xfail("pyarrow wheels don't have bz2 codec support") if buffer: with open(csv1) as f: result = parser.read_csv(f, **kwargs) @@ -123,7 +130,9 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): +def test_compression_utf_encoding( + all_parsers, csv_dir_path, utf_value, encoding_fmt, pyarrow_xfail +): # see gh-18071, gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 88b400d9a11df..a70fe847b6ae9 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -12,6 +12,8 @@ from pandas import DataFrame, Index import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_xfail") + def test_converters_type_must_be_dict(all_parsers): parser = all_parsers diff --git 
a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index cc65def0fd096..afdd7548ed0dd 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -13,6 +13,8 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.usefixtures("pyarrow_skip") + @pytest.fixture def custom_dialect(): diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 861aeba60cab7..8e6462767513a 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -16,7 +16,11 @@ from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) def test_dtype_all_columns(all_parsers, dtype, check_orig): @@ -43,6 +47,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_dtype_all_columns_empty(all_parsers): # see gh-12048 parser = all_parsers @@ -52,6 +57,7 @@ def test_dtype_all_columns_empty(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -70,6 +76,7 @@ def test_dtype_per_column(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_invalid_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -83,6 +90,7 @@ def test_invalid_dtype_per_column(all_parsers): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) +@xfail_pyarrow @pytest.mark.parametrize( "dtype", [ @@ -109,6 +117,7 @@ def test_categorical_dtype(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@skip_pyarrow @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) def test_categorical_dtype_single(all_parsers, dtype): # see gh-10153 @@ -124,6 +133,7 @@ def test_categorical_dtype_single(all_parsers, dtype): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -142,6 +152,7 @@ def test_categorical_dtype_unsorted(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -160,6 +171,7 @@ def test_categorical_dtype_missing(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers): # see gh-18186 @@ -187,6 +199,7 @@ def test_categorical_dtype_latin1(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_utf16(all_parsers, csv_dir_path): # see gh-10153 pth = os.path.join(csv_dir_path, "utf16_ex.txt") @@ -201,6 +214,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -219,6 +233,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): tm.assert_frame_equal(actual, expected) +@xfail_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -320,6 +335,7 @@ def 
test_categorical_coerces_timestamp(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} @@ -361,6 +377,7 @@ def test_categorical_unexpected_categories(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_pass_dtype(all_parsers): parser = all_parsers @@ -374,6 +391,7 @@ def test_empty_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index_pass_dtype(all_parsers): parser = all_parsers @@ -388,6 +406,7 @@ def test_empty_with_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_multi_index_pass_dtype(all_parsers): parser = all_parsers @@ -403,7 +422,7 @@ def test_empty_with_multi_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) -def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): +def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers, pyarrow_xfail): parser = all_parsers data = "one,one" @@ -416,6 +435,7 @@ def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): parser = all_parsers @@ -429,6 +449,7 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): # see gh-9424 parser = all_parsers @@ -457,6 +478,7 @@ def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) +@xfail_pyarrow def test_raise_on_passed_int_dtype_with_nas(all_parsers): # see gh-2631 parser = all_parsers @@ -474,6 +496,7 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers): parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) +@xfail_pyarrow def test_dtype_with_converters(all_parsers): parser = all_parsers data = """a,b @@ -489,6 +512,7 @@ def test_dtype_with_converters(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "dtype,expected", [ @@ -553,6 +577,7 @@ def test_numeric_dtype(all_parsers, dtype): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_boolean_dtype(all_parsers): parser = all_parsers data = "\n".join( diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index e74265da3e966..ca5a763b735c3 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -13,7 +13,11 @@ from pandas import DataFrame, read_csv import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_bytes_io_input(all_parsers): encoding = "cp1255" parser = all_parsers @@ -25,6 +29,7 @@ def test_bytes_io_input(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_unicode(all_parsers): parser = all_parsers data = BytesIO("\u0141aski, Jan;1".encode()) @@ -34,6 +39,7 @@ def test_read_csv_unicode(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("sep", [",", "\t"]) @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) def test_utf16_bom_skiprows(all_parsers, sep, 
encoding): @@ -68,6 +74,7 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_utf16_example(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers @@ -75,6 +82,7 @@ def test_utf16_example(all_parsers, csv_dir_path): assert len(result) == 50 +@xfail_pyarrow def test_unicode_encoding(all_parsers, csv_dir_path): path = os.path.join(csv_dir_path, "unicode_series.csv") parser = all_parsers @@ -87,6 +95,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path): assert got == expected +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -120,6 +129,7 @@ def _encode_data_with_bom(_data): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): # see gh-13549 expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) @@ -132,6 +142,7 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "file_path,encoding", [ @@ -166,6 +177,7 @@ def test_binary_mode_file_buffers( tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize("pass_encoding", [True, False]) def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): # see gh-24130 @@ -182,6 +194,7 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_encoding_named_temp_file(all_parsers): # see gh-31819 parser = all_parsers diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 4cd110136d7b0..34eaf6ae306b4 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -14,7 +14,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@xfail_pyarrow def test_read_with_bad_header(all_parsers): parser = all_parsers msg = r"but only \d+ lines in file" @@ -82,6 +86,7 @@ def test_no_header_prefix(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_with_index_col(all_parsers): parser = all_parsers data = """foo,1,2,3 @@ -119,6 +124,7 @@ def test_header_not_first_line(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_multi_index(all_parsers): parser = all_parsers expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) @@ -184,6 +190,7 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("names", ["first", "second"]) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -231,6 +238,7 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -277,6 +285,7 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -324,6 +333,7 @@ def test_header_multi_index_common_format3(all_parsers, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers expected = DataFrame( @@ -344,6 +354,7 @@ def test_header_multi_index_common_format_malformed1(all_parsers): tm.assert_frame_equal(expected, 
result) +@xfail_pyarrow def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( @@ -365,6 +376,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): tm.assert_frame_equal(expected, result) +@xfail_pyarrow def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers expected = DataFrame( @@ -385,6 +397,7 @@ def test_header_multi_index_common_format_malformed3(all_parsers): tm.assert_frame_equal(expected, result) +@skip_pyarrow @pytest.mark.parametrize( "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] ) @@ -397,6 +410,7 @@ def test_header_names_backward_compat(all_parsers, data, header): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize("kwargs", [dict(), dict(index_col=False)]) def test_read_only_header_no_rows(all_parsers, kwargs): # See gh-7773 @@ -442,6 +456,7 @@ def test_non_int_header(all_parsers, header): parser.read_csv(StringIO(data), header=header) +@xfail_pyarrow def test_singleton_header(all_parsers): # see gh-7757 data = """a,b,c\n0,1,2\n1,2,3""" @@ -452,6 +467,7 @@ def test_singleton_header(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -498,6 +514,7 @@ def test_mangles_multi_index(all_parsers, data, expected): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize("index_col", [None, [0]]) @pytest.mark.parametrize( "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] @@ -541,6 +558,7 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_read_csv_multiindex_columns(all_parsers): # GH#6051 parser = all_parsers diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 9c6cad4b41949..c75e4cf2f3b34 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -11,7 +11,11 @@ from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +@skip_pyarrow @pytest.mark.parametrize("with_header", [True, False]) def test_index_col_named(all_parsers, with_header): parser = all_parsers @@ -66,6 +70,7 @@ def test_index_col_is_true(all_parsers): parser.read_csv(StringIO(data), index_col=True) +@xfail_pyarrow def test_infer_index_col(all_parsers): data = """A,B,C foo,1,2,3 @@ -83,6 +88,7 @@ def test_infer_index_col(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow @pytest.mark.parametrize( "index_col,kwargs", [ @@ -127,6 +133,7 @@ def test_index_col_empty_data(all_parsers, index_col, kwargs): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_empty_with_index_col_false(all_parsers): # see gh-10413 data = "x,y" @@ -137,6 +144,7 @@ def test_empty_with_index_col_false(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow @pytest.mark.parametrize( "index_names", [ @@ -161,6 +169,7 @@ def test_multi_index_naming(all_parsers, index_names): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_multi_index_naming_not_all_at_beginning(all_parsers): parser = all_parsers data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" @@ -175,6 +184,7 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def 
diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py
index 5c4e642115798..cc88a1d974767 100644
--- a/pandas/tests/io/parser/test_mangle_dupes.py
+++ b/pandas/tests/io/parser/test_mangle_dupes.py
@@ -10,7 +10,10 @@
 from pandas import DataFrame
 import pandas._testing as tm

+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+

+@xfail_pyarrow
 @pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)])
 def test_basic(all_parsers, kwargs):
     # TODO: add test for condition "mangle_dupe_cols=False"
@@ -24,6 +27,7 @@ def test_basic(all_parsers, kwargs):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_basic_names(all_parsers):
     # See gh-7160
     parser = all_parsers
@@ -44,6 +48,7 @@ def test_basic_names_raise(all_parsers):
         parser.read_csv(StringIO(data), names=["a", "b", "a"])


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,expected",
     [
@@ -111,6 +116,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
         parser.read_csv(StringIO(data), names=names)


+@xfail_pyarrow
 def test_mangled_unnamed_placeholders(all_parsers):
     # xref gh-13017
     orig_key = "0"
diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py
index d50560c684084..06f14e28435ef 100644
--- a/pandas/tests/io/parser/test_multi_thread.py
+++ b/pandas/tests/io/parser/test_multi_thread.py
@@ -12,6 +12,8 @@
 from pandas import DataFrame
 import pandas._testing as tm

+pytestmark = pytest.mark.usefixtures("pyarrow_xfail")
+

 def _construct_dataframe(num_rows):
     """
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 9f86bbd65640e..9e7a445234a45 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -12,7 +12,11 @@
 from pandas import DataFrame, Index, MultiIndex
 import pandas._testing as tm

+skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+

+@xfail_pyarrow
 def test_string_nas(all_parsers):
     parser = all_parsers
     data = """A,B,C
@@ -28,6 +32,7 @@ def test_string_nas(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_detect_string_na(all_parsers):
     parser = all_parsers
     data = """A,B
@@ -42,6 +47,7 @@ def test_detect_string_na(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "na_values",
     [
@@ -79,6 +85,7 @@ def test_non_string_na_values(all_parsers, data, na_values):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_default_na_values(all_parsers):
     _NA_VALUES = {
         "-1.#IND",
@@ -126,6 +133,7 @@ def f(i, v):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("na_values", ["baz", ["baz"]])
 def test_custom_na_values(all_parsers, na_values):
     parser = all_parsers
@@ -159,6 +167,7 @@ def test_bool_na_values(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_na_value_dict(all_parsers):
     data = """A,B,C
 foo,bar,NA
@@ -177,6 +186,7 @@ def test_na_value_dict(all_parsers):
     tm.assert_frame_equal(df, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "index_col,expected",
     [
@@ -210,6 +220,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "kwargs,expected",
     [
@@ -297,6 +308,7 @@ def test_no_na_values_no_keep_default(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_no_keep_default_na_dict_na_values(all_parsers):
     # see gh-19227
     data = "a,b\n,2"
@@ -308,6 +320,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
     # see gh-19227
     #
@@ -319,6 +332,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
     tm.assert_frame_equal(df, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"])
 def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
     # see gh-19227
@@ -348,6 +362,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "na_filter,row_data",
     [
@@ -369,6 +384,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_na_trailing_columns(all_parsers):
     parser = all_parsers
     data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
@@ -396,6 +412,7 @@ def test_na_trailing_columns(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "na_values,row_data",
     [
@@ -414,6 +431,7 @@ def test_na_values_scalar(all_parsers, na_values, row_data):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_na_values_dict_aliasing(all_parsers):
     parser = all_parsers
     na_values = {"a": 2, "b": 1}
@@ -429,6 +447,7 @@ def test_na_values_dict_aliasing(all_parsers):
     tm.assert_dict_equal(na_values, na_values_copy)


+@xfail_pyarrow
 def test_na_values_dict_col_index(all_parsers):
     # see gh-14203
     data = "a\nfoo\n1"
@@ -440,6 +459,7 @@ def test_na_values_dict_col_index(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,kwargs,expected",
     [
@@ -469,6 +489,7 @@ def test_empty_na_values_no_default_with_index(all_parsers):
     tm.assert_frame_equal(result, expected)


+@skip_pyarrow
 @pytest.mark.parametrize(
     "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])]
 )
@@ -497,6 +518,7 @@ def test_inf_na_values_with_int_index(all_parsers):
     tm.assert_frame_equal(out, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("na_filter", [True, False])
 def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
     # see gh-20377
@@ -512,6 +534,7 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data, na_values",
     [
@@ -540,6 +563,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
         )


+@xfail_pyarrow
 def test_str_nan_dropped(all_parsers):
     # see gh-21131
     parser = all_parsers
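The test_multi_thread.py hunk above takes a different route: a module-level pytestmark instead of per-test decorators. pytest applies a module-level mark to every test collected from the file, which is the right granularity when no test in the module can work under the new engine. The two spellings below are equivalent (this assumes a pyarrow_xfail fixture is defined in a reachable conftest.py; the test name is made up for illustration):

import pytest

# Module-level: applies to every test collected from this file.
pytestmark = pytest.mark.usefixtures("pyarrow_xfail")


# Per-test: identical effect, applied one test at a time.
@pytest.mark.usefixtures("pyarrow_xfail")
def test_reads_csv_in_threads():
    assert True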
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 7a5203ca86520..ad2989cb09400 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -34,7 +34,10 @@
 else:
     date_strategy = st.datetimes()

+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+

+@xfail_pyarrow
 def test_separator_date_conflict(all_parsers):
     # Regression test for gh-4678
     #
@@ -56,6 +59,7 @@ def test_separator_date_conflict(all_parsers):
     tm.assert_frame_equal(df, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("keep_date_col", [True, False])
 def test_multiple_date_col_custom(all_parsers, keep_date_col):
     data = """\
@@ -199,6 +203,7 @@ def date_parser(*date_cols):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("container", [list, tuple, Index, Series])
 @pytest.mark.parametrize("dim", [1, 2])
 def test_concat_date_col_fail(container, dim):
@@ -211,6 +216,7 @@ def test_concat_date_col_fail(container, dim):
         parsing.concat_date_cols(date_cols)


+@xfail_pyarrow
 @pytest.mark.parametrize("keep_date_col", [True, False])
 def test_multiple_date_col(all_parsers, keep_date_col):
     data = """\
@@ -370,6 +376,7 @@ def test_date_col_as_index_col(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "date_parser, warning",
     ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]),
@@ -434,6 +441,7 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_multiple_date_col_timestamp_parse(all_parsers):
     parser = all_parsers
     data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
@@ -468,6 +476,7 @@ def test_multiple_date_col_timestamp_parse(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_multiple_date_cols_with_header(all_parsers):
     parser = all_parsers
     data = """\
@@ -637,6 +646,7 @@ def test_date_parser_int_bug(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_nat_parse(all_parsers):
     # see gh-3062
     parser = all_parsers
@@ -652,6 +662,7 @@ def test_nat_parse(all_parsers):
     tm.assert_frame_equal(result, df)


+@xfail_pyarrow
 def test_csv_custom_parser(all_parsers):
     data = """A,B,C
 20090101,a,1,2
@@ -666,6 +677,7 @@ def test_csv_custom_parser(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_parse_dates_implicit_first_col(all_parsers):
     data = """A,B,C
 20090101,a,1,2
@@ -679,6 +691,7 @@ def test_parse_dates_implicit_first_col(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_parse_dates_string(all_parsers):
     data = """date,A,B,C
 20090101,a,1,2
@@ -723,6 +736,7 @@ def test_yy_format_with_year_first(all_parsers, parse_dates):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]])
 def test_parse_dates_column_list(all_parsers, parse_dates):
     data = "a,b,c\n01/01/2010,1,15/02/2010"
@@ -739,6 +753,7 @@ def test_parse_dates_column_list(all_parsers, parse_dates):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
 def test_multi_index_parse_dates(all_parsers, index_col):
     data = """index1,index2,A,B,C
@@ -784,6 +799,7 @@ def test_multi_index_parse_dates(all_parsers, index_col):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("kwargs", [dict(dayfirst=True), dict(day_first=True)])
 def test_parse_dates_custom_euro_format(all_parsers, kwargs):
     parser = all_parsers
@@ -828,6 +844,7 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs):
     )


+@xfail_pyarrow
 def test_parse_tz_aware(all_parsers):
     # See gh-1693
     parser = all_parsers
@@ -841,6 +858,7 @@ def test_parse_tz_aware(all_parsers):
     assert result.index.tz is pytz.utc


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "parse_dates,index_col",
     [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)],
@@ -941,6 +959,7 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_multiple_date_cols_chunked(all_parsers):
     parser = all_parsers
     data = """\
@@ -1033,6 +1052,7 @@ def test_multiple_date_cols_chunked(all_parsers):
     tm.assert_frame_equal(chunks[2], expected[4:])


+@xfail_pyarrow
 def test_multiple_date_col_named_index_compat(all_parsers):
     parser = all_parsers
     data = """\
@@ -1056,6 +1076,7 @@ def test_multiple_date_col_named_index_compat(all_parsers):
     tm.assert_frame_equal(with_indices, with_names)


+@xfail_pyarrow
 def test_multiple_date_col_multiple_index_compat(all_parsers):
     parser = all_parsers
     data = """\
@@ -1123,6 +1144,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value):
     )


+@xfail_pyarrow
 def test_parse_dates_empty_string(all_parsers):
     # see gh-2263
     parser = all_parsers
@@ -1135,6 +1157,7 @@ def test_parse_dates_empty_string(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,kwargs,expected",
     [
@@ -1174,6 +1197,7 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "date_parser, warning",
     ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]),
@@ -1202,6 +1226,7 @@ def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warni
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "date_parser, warning",
     ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]),
@@ -1290,6 +1315,7 @@ def test_parse_date_time(all_parsers, data, kwargs, expected, date_parser, warni
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "date_parser, warning",
     ([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]),
@@ -1312,6 +1338,7 @@ def test_parse_date_fields(all_parsers, date_parser, warning):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "date_parser, warning",
     (
@@ -1343,6 +1370,7 @@ def test_parse_date_all_fields(all_parsers, date_parser, warning):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "date_parser, warning",
     (
@@ -1374,6 +1402,7 @@ def test_datetime_fractional_seconds(all_parsers, date_parser, warning):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_generic(all_parsers):
     parser = all_parsers
     data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
@@ -1392,6 +1421,7 @@ def test_generic(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_date_parser_resolution_if_not_ns(all_parsers):
     # see gh-10245
     parser = all_parsers
@@ -1489,6 +1519,7 @@ def test_parse_timezone(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "date_string",
     ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
@@ -1500,6 +1531,7 @@ def test_invalid_parse_delimited_date(all_parsers, date_string):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "date_string,dayfirst,expected",
     [
@@ -1565,6 +1597,7 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti
     assert result == expected


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "names, usecols, parse_dates, missing_cols",
     [
diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py
index 14773dfbea20e..8b010df470386 100644
--- a/pandas/tests/io/parser/test_quoting.py
+++ b/pandas/tests/io/parser/test_quoting.py
@@ -13,7 +13,11 @@
 from pandas import DataFrame
 import pandas._testing as tm

+skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+

+@xfail_pyarrow
 @pytest.mark.parametrize(
     "kwargs,msg",
     [
@@ -33,6 +37,7 @@ def test_bad_quote_char(all_parsers, kwargs, msg):
         parser.read_csv(StringIO(data), **kwargs)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "quoting,msg",
     [
@@ -57,6 +62,7 @@ def test_quote_char_basic(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
 def test_quote_char_various(all_parsers, quote_char):
     parser = all_parsers
@@ -69,6 +75,7 @@ def test_quote_char_various(all_parsers, quote_char):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
 @pytest.mark.parametrize("quote_char", ["", None])
 def test_null_quote_char(all_parsers, quoting, quote_char):
@@ -88,6 +95,7 @@ def test_null_quote_char(all_parsers, quoting, quote_char):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "kwargs,exp_data",
     [
@@ -114,6 +122,7 @@ def test_quoting_various(all_parsers, kwargs, exp_data):
     tm.assert_frame_equal(result, expected)


+@skip_pyarrow
 @pytest.mark.parametrize(
     "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])]
 )
@@ -137,6 +146,7 @@ def test_quotechar_unicode(all_parsers, quotechar):
     tm.assert_frame_equal(result, expected)


+@skip_pyarrow
 @pytest.mark.parametrize("balanced", [True, False])
 def test_unbalanced_quoting(all_parsers, balanced):
     # see gh-22789.
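Throughout these files the choice between the two markers follows the usual pytest convention: xfail still runs the test, records an expected failure, and reports XPASS once the pyarrow engine gains the feature, so the marker is easy to retire; skip never runs the test, which is the safer choice when the failure mode is a hang or a hard crash rather than a clean assertion error. A minimal self-contained illustration (test names and reasons invented for the example):

import pytest


@pytest.mark.xfail(reason="pyarrow does not support this yet")
def test_expected_failure():
    # Runs and is reported as XFAIL; reported as XPASS once it
    # starts passing, signalling the marker can be dropped.
    raise NotImplementedError


@pytest.mark.skip(reason="would hang under the pyarrow engine")
def test_never_executed():
    # Never runs at all, so a hang or hard crash cannot occur.
    while True:
        pass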
diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py
index fdccef1127c7e..732f2eb18fdd9 100644
--- a/pandas/tests/io/parser/test_skiprows.py
+++ b/pandas/tests/io/parser/test_skiprows.py
@@ -14,7 +14,10 @@
 from pandas import DataFrame, Index
 import pandas._testing as tm

+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+

+@xfail_pyarrow
 @pytest.mark.parametrize("skiprows", [list(range(6)), 6])
 def test_skip_rows_bug(all_parsers, skiprows):
     # see gh-505
@@ -42,6 +45,7 @@ def test_skip_rows_bug(all_parsers, skiprows):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_deep_skip_rows(all_parsers):
     # see gh-4382
     parser = all_parsers
@@ -57,6 +61,7 @@ def test_deep_skip_rows(all_parsers):
     tm.assert_frame_equal(result, condensed_result)


+@xfail_pyarrow
 def test_skip_rows_blank(all_parsers):
     # see gh-9832
     parser = all_parsers
@@ -83,6 +88,7 @@ def test_skip_rows_blank(all_parsers):
     tm.assert_frame_equal(data, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,kwargs,expected",
     [
@@ -123,6 +129,7 @@ def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_skip_row_with_quote(all_parsers):
     # see gh-12775 and gh-10911
     parser = all_parsers
@@ -138,6 +145,7 @@ def test_skip_row_with_quote(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,exp_data",
     [
@@ -173,6 +181,7 @@ def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "line_terminator", ["\n", "\r\n", "\r"]  # "LF"  # "CRLF"  # "CR"
 )
@@ -209,6 +218,7 @@ def test_skiprows_lineterminator(all_parsers, line_terminator):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_skiprows_infield_quote(all_parsers):
     # see gh-14459
     parser = all_parsers
@@ -219,6 +229,7 @@ def test_skiprows_infield_quote(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "kwargs,expected",
     [
@@ -234,6 +245,7 @@ def test_skip_rows_callable(all_parsers, kwargs, expected):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_skip_rows_skip_all(all_parsers):
     parser = all_parsers
     data = "a\n1\n2\n3\n4\n5"
@@ -243,6 +255,7 @@ def test_skip_rows_skip_all(all_parsers):
         parser.read_csv(StringIO(data), skiprows=lambda x: True)


+@xfail_pyarrow
 def test_skip_rows_bad_callable(all_parsers):
     msg = "by zero"
     parser = all_parsers
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
index 267fae760398a..6e9cdacd40586 100644
--- a/pandas/tests/io/parser/test_unsupported.py
+++ b/pandas/tests/io/parser/test_unsupported.py
@@ -121,3 +121,23 @@ def read(self):

         with pytest.raises(ValueError, match=msg):
             read_csv(NoNextBuffer(data), engine=python_engine)
+
+    def test_pyarrow_engine(self):
+        from pandas.io.parsers import _pyarrow_unsupported as pa_unsupported
+
+        data = """1,2,3,,
+        1,2,3,4,
+        1,2,3,4,5
+        1,2,,,
+        1,2,3,4,"""
+
+        for default in pa_unsupported:
+            msg = (
+                f"The {repr(default)} option is not "
+                f"supported with the 'pyarrow' engine"
+            )
+            kwargs = {default: object()}
+            if default == "dialect":
+                kwargs[default] = "excel"  # test a random dialect
+            with pytest.raises(ValueError, match=msg):
+                read_csv(StringIO(data), engine="pyarrow", **kwargs)
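The new test_pyarrow_engine test iterates over _pyarrow_unsupported, a private set in pandas.io.parsers introduced by this change, and asserts that each such option raises. A usage-level sketch of the behaviour being tested; skipfooter is only assumed here to be a member of that set, and the snippet requires a pandas build that ships the pyarrow engine:

from io import StringIO

import pandas as pd

try:
    # Passing an option the pyarrow engine cannot honour should raise.
    pd.read_csv(StringIO("a,b\n1,2\n3,4"), engine="pyarrow", skipfooter=1)
except ValueError as err:
    # Expected message shape, per the test above:
    # "The 'skipfooter' option is not supported with the 'pyarrow' engine"
    print(err)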
diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py
index 7e9c9866a666d..0f2e5882439f8 100644
--- a/pandas/tests/io/parser/test_usecols.py
+++ b/pandas/tests/io/parser/test_usecols.py
@@ -12,6 +12,9 @@
 from pandas import DataFrame, Index
 import pandas._testing as tm

+skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+
 _msg_validate_usecols_arg = (
     "'usecols' must either be list-like "
     "of all strings, all unicode, all "
@@ -22,6 +25,7 @@
 )


+@skip_pyarrow
 def test_raise_on_mixed_dtype_usecols(all_parsers):
     # See gh-12678
     data = """a,b,c
@@ -35,6 +39,7 @@ def test_raise_on_mixed_dtype_usecols(all_parsers):
         parser.read_csv(StringIO(data), usecols=usecols)


+@skip_pyarrow
 @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
 def test_usecols(all_parsers, usecols):
     data = """\
@@ -50,6 +55,7 @@ def test_usecols(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_usecols_with_names(all_parsers):
     data = """\
 a,b,c
@@ -65,6 +71,7 @@ def test_usecols_with_names(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
 )
@@ -81,6 +88,7 @@ def test_usecols_relative_to_names(all_parsers, names, usecols):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_usecols_relative_to_names2(all_parsers):
     # see gh-5766
     data = """\
@@ -97,6 +105,7 @@ def test_usecols_relative_to_names2(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_usecols_name_length_conflict(all_parsers):
     data = """\
 1,2,3
@@ -125,6 +134,7 @@ def test_usecols_single_string(all_parsers):
         parser.read_csv(StringIO(data), usecols="foo")


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
 )
@@ -138,6 +148,7 @@ def test_usecols_index_col_false(all_parsers, data):
     tm.assert_frame_equal(result, expected)


+@skip_pyarrow
 @pytest.mark.parametrize("index_col", ["b", 0])
 @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
 def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
@@ -164,6 +175,7 @@ def test_usecols_index_col_conflict2(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_usecols_implicit_index_col(all_parsers):
     # see gh-2654
     parser = all_parsers
@@ -174,6 +186,7 @@ def test_usecols_implicit_index_col(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_usecols_regex_sep(all_parsers):
     # see gh-2733
     parser = all_parsers
@@ -184,6 +197,7 @@ def test_usecols_regex_sep(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_usecols_with_whitespace(all_parsers):
     parser = all_parsers
     data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
@@ -193,6 +207,7 @@ def test_usecols_with_whitespace(all_parsers):
     tm.assert_frame_equal(result, expected)


+@skip_pyarrow
 @pytest.mark.parametrize(
     "usecols,expected",
     [
@@ -212,6 +227,7 @@ def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
 def test_usecols_with_parse_dates(all_parsers, usecols):
     # see gh-9755
@@ -230,6 +246,7 @@ def test_usecols_with_parse_dates(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)


+@skip_pyarrow
 def test_usecols_with_parse_dates2(all_parsers):
     # see gh-13604
     parser = all_parsers
@@ -290,6 +307,7 @@ def test_usecols_with_parse_dates3(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_usecols_with_parse_dates4(all_parsers):
     data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
     usecols = list("abcdefghij")
@@ -313,6 +331,7 @@ def test_usecols_with_parse_dates4(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
 @pytest.mark.parametrize(
     "names",
@@ -406,6 +425,7 @@ def test_usecols_with_multi_byte_characters(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_empty_usecols(all_parsers):
     data = "a,b,c\n1,2,3\n4,5,6"
     expected = DataFrame()
@@ -426,6 +446,7 @@ def test_np_array_usecols(all_parsers):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "usecols,expected",
     [
@@ -458,6 +479,7 @@ def test_callable_usecols(all_parsers, usecols, expected):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
 def test_incomplete_first_row(all_parsers, usecols):
     # see gh-6710
@@ -470,6 +492,7 @@ def test_incomplete_first_row(all_parsers, usecols):
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,usecols,kwargs,expected",
     [
@@ -502,6 +525,7 @@ def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
     tm.assert_frame_equal(result, expected)


+@skip_pyarrow
 @pytest.mark.parametrize(
     "usecols,kwargs,expected,msg",
     [
@@ -558,6 +582,7 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
 def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
     if all_parsers.engine != "c":
diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py
index e5ed69b7703b1..61dbd81e2cee5 100644
--- a/pandas/tests/test_optional_dependency.py
+++ b/pandas/tests/test_optional_dependency.py
@@ -27,14 +27,15 @@ def test_bad_version(monkeypatch):
     module = types.ModuleType(name)
     module.__version__ = "0.9.0"
     sys.modules[name] = module
-    monkeypatch.setitem(VERSIONS, name, "1.0.0")

     match = "Pandas requires .*1.0.0.* of .fakemodule.*'0.9.0'"
     with pytest.raises(ImportError, match=match):
-        import_optional_dependency("fakemodule")
+        import_optional_dependency("fakemodule", min_version="1.0.0")

     with tm.assert_produces_warning(UserWarning):
-        result = import_optional_dependency("fakemodule", on_version="warn")
+        result = import_optional_dependency(
+            "fakemodule", min_version="1.0.0", on_version="warn"
+        )
         assert result is None

     module.__version__ = "1.0.0"  # exact match is OK
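The test_optional_dependency.py hunk reflects a signature change: the minimum version is now passed as an explicit min_version argument instead of being looked up in the module-level VERSIONS mapping. import_optional_dependency is private pandas API, so the import path below (pandas.compat._optional) and the example version string are assumptions based on where the helper lives at the time of this patch:

from pandas.compat._optional import import_optional_dependency

# Raises ImportError when pyarrow is missing or older than min_version.
pa = import_optional_dependency("pyarrow", min_version="0.15.0")

# With on_version="warn", an old version produces a UserWarning and the
# function returns None instead of raising.
maybe_pa = import_optional_dependency(
    "pyarrow", min_version="0.15.0", on_version="warn"
)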