diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 1f6ba3af5bfe5..4b4366fa387bf 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -34,7 +34,6 @@ ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") def test_override_set_noconvert_columns(): @@ -515,8 +514,6 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): tm.assert_frame_equal(result, expected) -# Skip for now, actually only one test fails though, but its tricky to xfail -@skip_pyarrow @pytest.mark.parametrize( "sep,skip_blank_lines,exp_data", [ @@ -536,7 +533,7 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): ), ], ) -def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): +def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request): parser = all_parsers data = """\ A,B,C @@ -550,6 +547,12 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): if sep == r"\s+": data = data.replace(",", " ") + if parser.engine == "pyarrow": + mark = pytest.mark.xfail( + raises=ValueError, + reason="the 'pyarrow' engine does not support regex separators", + ) + request.applymarker(mark) result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines) expected = DataFrame(exp_data, columns=["A", "B", "C"]) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 1d5b0fec7a7c6..7df14043f478c 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -21,10 +21,6 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -# GH#43650: Some expected failures with the pyarrow engine can occasionally -# cause a deadlock instead, so we skip these instead of xfailing -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") 
- @pytest.mark.parametrize( "data,kwargs,expected", @@ -278,7 +274,8 @@ def test_empty_with_index(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +# CSV parse error: Empty CSV file or block: cannot infer number of columns +@xfail_pyarrow def test_empty_with_multi_index(all_parsers): # see gh-10467 data = "x,y,z" @@ -291,7 +288,8 @@ def test_empty_with_multi_index(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +# CSV parse error: Empty CSV file or block: cannot infer number of columns +@xfail_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 939fdbc159454..086b43be59823 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -17,9 +17,7 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -# GH#43650: Some expected failures with the pyarrow engine can occasionally -# cause a deadlock instead, so we skip these instead of xfailing -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") def test_int_conversion(all_parsers): @@ -102,12 +100,16 @@ def test_parse_integers_above_fp_precision(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow # Flaky @pytest.mark.parametrize("sep", [" ", r"\s+"]) def test_integer_overflow_bug(all_parsers, sep): # see gh-2601 data = "65248E10 11\n55555E55 22\n" parser = all_parsers + if parser.engine == "pyarrow" and sep != " ": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=None, sep=sep) + return result = parser.read_csv(StringIO(data), header=None, sep=sep) expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]]) @@ -124,7 +126,8 @@ def test_int64_min_issues(all_parsers): 
tm.assert_frame_equal(result, expected) -@skip_pyarrow +# ValueError: The 'converters' option is not supported with the 'pyarrow' engine +@xfail_pyarrow @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) def test_int64_overflow(all_parsers, conv): data = """ID @@ -168,7 +171,7 @@ def test_int64_overflow(all_parsers, conv): parser.read_csv(StringIO(data), converters={"ID": conv}) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) @@ -182,7 +185,7 @@ def test_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) @@ -196,7 +199,7 @@ def test_outside_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # gets float64 dtype instead of object @pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 8b612c8d40994..52ddb38192a6b 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -22,7 +22,6 @@ import pandas._testing as tm xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") def test_empty_decimal_marker(all_parsers): @@ -44,7 +43,6 @@ def test_empty_decimal_marker(all_parsers): parser.read_csv(StringIO(data), decimal="") -@skip_pyarrow def test_bad_stream_exception(all_parsers, csv_dir_path): # see gh-13652 # @@ -65,7 +63,7 @@ def test_bad_stream_exception(all_parsers, csv_dir_path): parser.read_csv(stream) 
-@skip_pyarrow +@xfail_pyarrow # ValueError: The 'comment' option is not supported def test_malformed(all_parsers): # see gh-6607 parser = all_parsers @@ -80,7 +78,7 @@ def test_malformed(all_parsers): parser.read_csv(StringIO(data), header=1, comment="#") -@skip_pyarrow +@xfail_pyarrow # ValueError: The 'iterator' option is not supported @pytest.mark.parametrize("nrows", [5, 3, None]) def test_malformed_chunks(all_parsers, nrows): data = """ignore @@ -100,7 +98,7 @@ def test_malformed_chunks(all_parsers, nrows): reader.read(nrows) -@skip_pyarrow +@xfail_pyarrow # does not raise def test_catch_too_many_names(all_parsers): # see gh-5156 data = """\ @@ -115,12 +113,17 @@ def test_catch_too_many_names(all_parsers): else "Number of passed names did not match " "number of header fields in the file" ) + depr_msg = "Passing a BlockManager to DataFrame is deprecated" + warn = None + if parser.engine == "pyarrow": + warn = DeprecationWarning - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) + with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -208,7 +211,6 @@ def test_read_csv_wrong_num_columns(all_parsers): parser.read_csv(StringIO(data)) -@skip_pyarrow def test_null_byte_char(request, all_parsers): # see gh-2741 data = "\x00,foo" @@ -226,12 +228,19 @@ def test_null_byte_char(request, all_parsers): out = parser.read_csv(StringIO(data), names=names) tm.assert_frame_equal(out, expected) else: - msg = "NULL byte detected" + if parser.engine == "pyarrow": + msg = ( + "CSV parse error: Empty CSV file or block: " + "cannot infer number of columns" + ) + else: + msg = "NULL 
byte detected" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), names=names) -@skip_pyarrow +# ValueError: the 'pyarrow' engine does not support sep=None with delim_whitespace=False +@xfail_pyarrow @pytest.mark.filterwarnings("always::ResourceWarning") def test_open_file(request, all_parsers): # GH 39024 diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 591defdde7df9..16ee8ab4106ef 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -279,19 +279,3 @@ def pyarrow_xfail(request): if parser.engine == "pyarrow": mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") request.applymarker(mark) - - -@pytest.fixture -def pyarrow_skip(request): - """ - Fixture that skips a test if the engine is pyarrow. - """ - if "all_parsers" in request.fixturenames: - parser = request.getfixturevalue("all_parsers") - elif "all_parsers_all_precisions" in request.fixturenames: - # Return value is tuple of (engine, precision) - parser = request.getfixturevalue("all_parsers_all_precisions")[0] - else: - return - if parser.engine == "pyarrow": - pytest.skip("pyarrow doesn't support this.") diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index d305c94b171f3..c7586bd9334ef 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -25,7 +25,6 @@ ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @xfail_pyarrow @@ -55,9 +54,8 @@ def test_categorical_dtype(all_parsers, dtype): tm.assert_frame_equal(actual, expected) -@skip_pyarrow # Flaky @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) -def test_categorical_dtype_single(all_parsers, dtype): +def test_categorical_dtype_single(all_parsers, dtype, request): # see gh-10153 parser = all_parsers data = """a,b,c @@ -67,6 
+65,13 @@ def test_categorical_dtype_single(all_parsers, dtype): expected = DataFrame( {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} ) + if parser.engine == "pyarrow": + mark = pytest.mark.xfail( + strict=False, + reason="Flaky test sometimes gives object dtype instead of Categorical", + ) + request.applymarker(mark) + actual = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(actual, expected) @@ -141,6 +146,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) +# ValueError: The 'chunksize' option is not supported with the 'pyarrow' engine @xfail_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 @@ -161,6 +167,7 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): tm.assert_frame_equal(actual, expected) +# ValueError: The 'chunksize' option is not supported with the 'pyarrow' engine @xfail_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 @@ -253,7 +260,6 @@ def test_categorical_coerces_numeric(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow # Flaky def test_categorical_coerces_datetime(all_parsers): parser = all_parsers dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 2ef7102543154..191d0de50b12f 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -17,8 +17,6 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - @pytest.fixture(params=[True, False]) def buffer(request): @@ -36,7 +34,6 @@ def parser_and_data(all_parsers, csv1): return parser, data, expected -@skip_pyarrow @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) def test_zip(parser_and_data, compression): 
parser, data, expected = parser_and_data @@ -54,7 +51,6 @@ def test_zip(parser_and_data, compression): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("compression", ["zip", "infer"]) def test_zip_error_multiple_files(parser_and_data, compression): parser, data, expected = parser_and_data @@ -70,7 +66,6 @@ def test_zip_error_multiple_files(parser_and_data, compression): parser.read_csv(path, compression=compression) -@skip_pyarrow def test_zip_error_no_files(parser_and_data): parser, _, _ = parser_and_data @@ -82,7 +77,6 @@ def test_zip_error_no_files(parser_and_data): parser.read_csv(path, compression="zip") -@skip_pyarrow def test_zip_error_invalid_zip(parser_and_data): parser, _, _ = parser_and_data @@ -92,7 +86,6 @@ def test_zip_error_invalid_zip(parser_and_data): parser.read_csv(f, compression="zip") -@skip_pyarrow @pytest.mark.parametrize("filename", [None, "test.{ext}"]) def test_compression( request, @@ -128,7 +121,6 @@ def test_compression( tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("ext", [None, "gz", "bz2"]) def test_infer_compression(all_parsers, csv1, buffer, ext): # see gh-9770 @@ -148,7 +140,6 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): # see gh-18071, gh-24130 parser = all_parsers @@ -166,7 +157,6 @@ def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers @@ -178,7 +168,6 @@ def test_invalid_compression(all_parsers, invalid_compression): parser.read_csv("test_file.zip", **compress_kwargs) -@skip_pyarrow def test_compression_tar_archive(all_parsers, csv_dir_path): parser = all_parsers 
path = os.path.join(csv_dir_path, "tar_csv.tar.gz") @@ -200,7 +189,6 @@ def test_ignore_compression_extension(all_parsers): tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df) -@skip_pyarrow def test_writes_tar_gz(all_parsers): parser = all_parsers data = DataFrame( diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 070027860b829..9e1200c142d6b 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -23,7 +23,6 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @@ -38,7 +37,7 @@ def test_bytes_io_input(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Empty CSV file or block def test_read_csv_unicode(all_parsers): parser = all_parsers data = BytesIO("\u0141aski, Jan;1".encode()) @@ -181,13 +180,16 @@ def test_binary_mode_file_buffers(all_parsers, file_path, encoding, datapath): tm.assert_frame_equal(expected, result) -@skip_pyarrow @pytest.mark.parametrize("pass_encoding", [True, False]) def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): # see gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) + if parser.engine == "pyarrow" and pass_encoding is True and utf_value in [16, 32]: + # FIXME: this is bad! 
+ pytest.skip("These cases freeze") + expected = DataFrame({"foo": ["bar"]}) with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f: @@ -198,7 +200,6 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_encoding_named_temp_file(all_parsers): # see gh-31819 parser = all_parsers diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index d059cc0c49db4..2edb389a0c830 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -22,11 +22,10 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -# TODO(1.4): Change me to xfails at release time -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_read_with_bad_header(all_parsers): parser = all_parsers msg = r"but only \d+ lines in file" @@ -80,7 +79,7 @@ def test_bool_header_arg(all_parsers, header): parser.read_csv(StringIO(data), header=header) -@skip_pyarrow +@xfail_pyarrow def test_header_with_index_col(all_parsers): parser = all_parsers data = """foo,1,2,3 @@ -118,7 +117,7 @@ def test_header_not_first_line(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_multi_index(all_parsers): parser = all_parsers expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) @@ -184,7 +183,7 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("_TestTuple", ["first", "second"]) -@skip_pyarrow +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -232,7 +231,7 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -279,7 +278,7 @@ 
def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow @pytest.mark.parametrize( "kwargs", [ @@ -327,7 +326,7 @@ def test_header_multi_index_common_format3(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers expected = DataFrame( @@ -348,7 +347,7 @@ def test_header_multi_index_common_format_malformed1(all_parsers): tm.assert_frame_equal(expected, result) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( @@ -370,7 +369,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): tm.assert_frame_equal(expected, result) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers expected = DataFrame( @@ -391,7 +390,7 @@ def test_header_multi_index_common_format_malformed3(all_parsers): tm.assert_frame_equal(expected, result) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_multi_index_blank_line(all_parsers): # GH 40442 parser = all_parsers @@ -403,20 +402,24 @@ def test_header_multi_index_blank_line(all_parsers): tm.assert_frame_equal(expected, result) -@skip_pyarrow @pytest.mark.parametrize( "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] ) -def test_header_names_backward_compat(all_parsers, data, header): +def test_header_names_backward_compat(all_parsers, data, header, request): # see gh-2539 parser = all_parsers + + if parser.engine == "pyarrow" and header is not None: + mark = pytest.mark.xfail(reason="mismatched index") + request.applymarker(mark) + expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"]) result = 
parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header) tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Empty CSV file or block: cannot infer @pytest.mark.parametrize("kwargs", [{}, {"index_col": False}]) def test_read_only_header_no_rows(all_parsers, kwargs): # See gh-7773 @@ -461,7 +464,7 @@ def test_non_int_header(all_parsers, header): parser.read_csv(StringIO(data), header=header) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_singleton_header(all_parsers): # see gh-7757 data = """a,b,c\n0,1,2\n1,2,3""" @@ -472,7 +475,7 @@ def test_singleton_header(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "data,expected", [ @@ -519,7 +522,7 @@ def test_mangles_multi_index(all_parsers, data, expected): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize("index_col", [None, [0]]) @pytest.mark.parametrize( "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] @@ -558,7 +561,7 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Expected 2 columns, got 3 def test_names_longer_than_header_but_equal_with_data_rows(all_parsers): # GH#38453 parser = all_parsers @@ -571,7 +574,7 @@ def test_names_longer_than_header_but_equal_with_data_rows(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_read_csv_multiindex_columns(all_parsers): # GH#6051 parser = all_parsers @@ -603,7 +606,7 @@ def test_read_csv_multiindex_columns(all_parsers): tm.assert_frame_equal(df2, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_read_csv_multi_header_length_check(all_parsers): # GH#43102 parser = 
all_parsers @@ -619,7 +622,7 @@ def test_read_csv_multi_header_length_check(all_parsers): parser.read_csv(StringIO(case), header=[0, 2]) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Expected 3 columns, got 2 def test_header_none_and_implicit_index(all_parsers): # GH#22144 parser = all_parsers @@ -631,7 +634,7 @@ def test_header_none_and_implicit_index(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # regex mismatch "CSV parse error: Expected 2 columns, got " def test_header_none_and_implicit_index_in_second_row(all_parsers): # GH#22144 parser = all_parsers @@ -640,7 +643,6 @@ def test_header_none_and_implicit_index_in_second_row(all_parsers): parser.read_csv(StringIO(data), names=["a", "b"], header=None) -@skip_pyarrow def test_header_none_and_on_bad_lines_skip(all_parsers): # GH#22144 parser = all_parsers @@ -652,7 +654,7 @@ def test_header_none_and_on_bad_lines_skip(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_missing_rows(all_parsers): # GH#47400 parser = all_parsers @@ -664,7 +666,8 @@ def test_header_missing_rows(all_parsers): parser.read_csv(StringIO(data), header=[0, 1, 2]) -@skip_pyarrow +# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine +@xfail_pyarrow def test_header_multiple_whitespaces(all_parsers): # GH#54931 parser = all_parsers @@ -676,7 +679,8 @@ def test_header_multiple_whitespaces(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine +@xfail_pyarrow def test_header_delim_whitespace(all_parsers): # GH#54918 parser = all_parsers diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 83dae12b472da..b938b129ac38d 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -19,8 +19,7 @@ 
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -# TODO(1.4): Change me to xfails at release time -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @pytest.mark.parametrize("with_header", [True, False]) @@ -77,7 +76,7 @@ def test_index_col_is_true(all_parsers): parser.read_csv(StringIO(data), index_col=True) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Expected 3 columns, got 4 def test_infer_index_col(all_parsers): data = """A,B,C foo,1,2,3 @@ -95,7 +94,7 @@ def test_infer_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize( "index_col,kwargs", [ @@ -144,7 +143,7 @@ def test_index_col_empty_data(all_parsers, index_col, kwargs): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_index_col_false(all_parsers): # see gh-10413 data = "x,y" @@ -155,7 +154,6 @@ def test_empty_with_index_col_false(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "index_names", [ @@ -166,9 +164,13 @@ def test_empty_with_index_col_false(all_parsers): ["NotReallyUnnamed", "Unnamed: 0"], ], ) -def test_multi_index_naming(all_parsers, index_names): +def test_multi_index_naming(all_parsers, index_names, request): parser = all_parsers + if parser.engine == "pyarrow" and "" in index_names: + mark = pytest.mark.xfail(reason="One case raises, others are wrong") + request.applymarker(mark) + # We don't want empty index names being replaced with "Unnamed: 0" data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]) result = parser.read_csv(StringIO(data), index_col=[0, 1]) @@ -180,7 +182,7 @@ def test_multi_index_naming(all_parsers, index_names): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def 
test_multi_index_naming_not_all_at_beginning(all_parsers): parser = all_parsers data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" @@ -195,7 +197,7 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def test_no_multi_index_level_names_empty(all_parsers): # GH 10984 parser = all_parsers @@ -211,7 +213,7 @@ def test_no_multi_index_level_names_empty(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_with_index_col(all_parsers): # GH 33476 parser = all_parsers @@ -257,7 +259,7 @@ def test_index_col_large_csv(all_parsers, monkeypatch): tm.assert_frame_equal(result, df.set_index("a")) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_index_col_multiindex_columns_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -274,7 +276,7 @@ def test_index_col_multiindex_columns_no_data(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_index_col_header_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -287,7 +289,7 @@ def test_index_col_header_no_data(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_multiindex_columns_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -298,7 +300,7 @@ def test_multiindex_columns_no_data(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_multiindex_columns_index_col_with_data(all_parsers): # GH#38292 parser = all_parsers @@ -315,7 +317,7 @@ def test_multiindex_columns_index_col_with_data(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Empty CSV file or block def test_infer_types_boolean_sum(all_parsers): # 
GH#44079 parser = all_parsers @@ -353,7 +355,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_multiindex_columns_not_leading_index_col(all_parsers): # GH#38549 parser = all_parsers diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 4acbb82a5f23f..7d148ae6c5a27 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -10,10 +10,15 @@ from pandas import DataFrame import pandas._testing as tm -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@skip_pyarrow +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +@xfail_pyarrow def test_basic(all_parsers): parser = all_parsers @@ -24,7 +29,7 @@ def test_basic(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow def test_basic_names(all_parsers): # See gh-7160 parser = all_parsers @@ -45,7 +50,7 @@ def test_basic_names_raise(all_parsers): parser.read_csv(StringIO(data), names=["a", "b", "a"]) -@skip_pyarrow +@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -74,7 +79,6 @@ def test_thorough_mangle_columns(all_parsers, data, expected): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "data,names,expected", [ @@ -114,7 +118,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) -@skip_pyarrow +@xfail_pyarrow def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 orig_key = "0" @@ -137,7 +141,7 @@ def test_mangled_unnamed_placeholders(all_parsers): tm.assert_frame_equal(df, expected) -@skip_pyarrow +@xfail_pyarrow def test_mangle_dupe_cols_already_exists(all_parsers): # GH#14704 parser = all_parsers @@ 
-151,7 +155,7 @@ def test_mangle_dupe_cols_already_exists(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers): # GH#14704 parser = all_parsers @@ -165,7 +169,6 @@ def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("usecol, engine", [([0, 1, 1], "python"), ([0, 1, 1], "c")]) def test_mangle_cols_names(all_parsers, usecol, engine): # GH 11823 diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 86c50fe103f2c..59dae1eaa7e6c 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -20,7 +20,6 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @@ -59,7 +58,7 @@ def test_detect_string_na(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize( "na_values", [ @@ -145,8 +144,8 @@ def f(i, v): tm.assert_frame_equal(result, expected) -# TODO: needs skiprows list support in pyarrow -@skip_pyarrow +# ValueError: skiprows argument must be an integer when using engine='pyarrow' +@xfail_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): parser = all_parsers @@ -183,8 +182,7 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) -# TODO: Needs pyarrow support for dictionary in na_values -@skip_pyarrow +@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_na_value_dict(all_parsers): data = """A,B,C foo,bar,NA @@ -236,8 +234,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) -# TODO: xfail 
components of this test, the first one passes -@skip_pyarrow +@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict @pytest.mark.parametrize( "kwargs,expected", [ @@ -325,8 +322,7 @@ def test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) -# TODO: Blocked on na_values dict support in pyarrow -@skip_pyarrow +@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" @@ -338,8 +334,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, expected) -# TODO: Blocked on na_values dict support in pyarrow -@skip_pyarrow +@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 # @@ -351,8 +346,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): tm.assert_frame_equal(df, expected) -# TODO: Blocked on na_values dict support in pyarrow -@skip_pyarrow +@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 @@ -382,8 +376,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) -# TODO: Empty null_values doesn't work properly on pyarrow -@skip_pyarrow +@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", [ @@ -405,8 +398,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): tm.assert_frame_equal(result, expected) -# TODO: Arrow parse error -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Expected 8 columns, got 5: def test_na_trailing_columns(all_parsers): parser = all_parsers data = 
"""Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax @@ -434,8 +426,7 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) -# TODO: xfail the na_values dict case -@skip_pyarrow +@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize( "na_values,row_data", [ @@ -454,7 +445,7 @@ def test_na_values_scalar(all_parsers, na_values, row_data): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_na_values_dict_aliasing(all_parsers): parser = all_parsers na_values = {"a": 2, "b": 1} @@ -470,7 +461,7 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) -@skip_pyarrow +@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = "a\nfoo\n1" @@ -482,7 +473,7 @@ def test_na_values_dict_col_index(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -512,18 +503,20 @@ def test_empty_na_values_no_default_with_index(all_parsers): tm.assert_frame_equal(result, expected) -# TODO: Missing support for na_filter kewyord -@skip_pyarrow @pytest.mark.parametrize( "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] ) -def test_no_na_filter_on_index(all_parsers, na_filter, index_data): +def test_no_na_filter_on_index(all_parsers, na_filter, index_data, request): # see gh-5239 # # Don't parse NA-values in index unless na_filter=True parser = all_parsers data = "a,b,c\n1,,3\n4,5,6" + if parser.engine == "pyarrow" and na_filter is False: + mark = pytest.mark.xfail(reason="mismatched index result") + request.applymarker(mark) + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b")) result = parser.read_csv(StringIO(data), index_col=[1], 
na_filter=na_filter) tm.assert_frame_equal(result, expected) @@ -542,7 +535,7 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@skip_pyarrow +@xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): # see gh-20377 @@ -558,7 +551,7 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # mismatched exception message @pytest.mark.parametrize( "data, na_values", [ @@ -587,7 +580,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): ) -@skip_pyarrow +@xfail_pyarrow # mismatched shapes def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers @@ -616,7 +609,7 @@ def test_str_nan_dropped(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_nan_multi_index(all_parsers): # GH 42446 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 9b911466b5034..554151841aa22 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -41,10 +41,6 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -# GH#43650: Some expected failures with the pyarrow engine can occasionally -# cause a deadlock instead, so we skip these instead of xfailing -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") - @xfail_pyarrow def test_read_csv_with_custom_date_parser(all_parsers): @@ -1009,11 +1005,11 @@ def test_parse_tz_aware(all_parsers): expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) - tm.assert_frame_equal(result, expected) if parser.engine == "pyarrow": expected_tz = pytz.utc else: expected_tz = timezone.utc + 
tm.assert_frame_equal(result, expected) assert result.index.tz is expected_tz @@ -1799,7 +1795,7 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], @@ -1815,7 +1811,6 @@ def test_invalid_parse_delimited_date(all_parsers, date_string): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1828,17 +1823,28 @@ def test_invalid_parse_delimited_date(all_parsers, date_string): ], ) def test_parse_delimited_date_swap_no_warning( - all_parsers, date_string, dayfirst, expected + all_parsers, date_string, dayfirst, expected, request ): parser = all_parsers expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + if parser.engine == "pyarrow": + if not dayfirst: + mark = pytest.mark.xfail(reason="CSV parse error: Empty CSV file or block") + request.applymarker(mark) + msg = "The 'dayfirst' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] + ) + return + result = parser.read_csv( StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] ) tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1889,7 +1895,6 @@ def _helper_hypothesis_delimited_date(call, date_string, **kwargs): return msg, result -@skip_pyarrow @given(DATETIME_NO_TZ) @pytest.mark.parametrize("delimiter", list(" -./")) @pytest.mark.parametrize("dayfirst", [True, False]) @@ -1924,7 +1929,7 @@ def test_hypothesis_delimited_date( assert result == expected -@skip_pyarrow +@xfail_pyarrow # KeyErrors @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", [ @@ -1957,13 +1962,17 
@@ def test_missing_parse_dates_column_raises( ) -@skip_pyarrow +@xfail_pyarrow # mismatched shape def test_date_parser_and_names(all_parsers): # GH#33699 parser = all_parsers data = StringIO("""x,y\n1,2""") + warn = UserWarning + if parser.engine == "pyarrow": + # DeprecationWarning for passing a Manager object + warn = (UserWarning, DeprecationWarning) result = parser.read_csv_check_warnings( - UserWarning, + warn, "Could not infer format", data, parse_dates=["B"], @@ -1973,7 +1982,7 @@ def test_date_parser_and_names(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_date_parser_multiindex_columns(all_parsers): parser = all_parsers data = """a,b @@ -1986,7 +1995,7 @@ def test_date_parser_multiindex_columns(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "parse_spec, col_name", [ @@ -2010,7 +2019,8 @@ def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, co tm.assert_frame_equal(result, expected) -@skip_pyarrow +# ValueError: The 'thousands' option is not supported with the 'pyarrow' engine +@xfail_pyarrow def test_date_parser_usecols_thousands(all_parsers): # GH#39365 data = """A,B,C @@ -2019,8 +2029,12 @@ def test_date_parser_usecols_thousands(all_parsers): """ parser = all_parsers + warn = UserWarning + if parser.engine == "pyarrow": + # DeprecationWarning for passing a Manager object + warn = (UserWarning, DeprecationWarning) result = parser.read_csv_check_warnings( - UserWarning, + warn, "Could not infer format", StringIO(data), parse_dates=[1], @@ -2031,7 +2045,7 @@ def test_date_parser_usecols_thousands(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # mismatched shape def test_parse_dates_and_keep_original_column(all_parsers): # GH#13378 parser = all_parsers @@ -2130,7 +2144,7 @@ def 
test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): tm.assert_index_equal(expected, res) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Expected 3 columns, got 4 def test_infer_first_column_as_index(all_parsers): # GH#11019 parser = all_parsers @@ -2143,7 +2157,7 @@ def test_infer_first_column_as_index(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # pyarrow engine doesn't support passing a dict for na_values @pytest.mark.parametrize( ("key", "value", "warn"), [ @@ -2183,7 +2197,7 @@ def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # string[python] instead of dt64[ns] def test_parse_dates_and_string_dtype(all_parsers): # GH#34066 parser = all_parsers @@ -2246,7 +2260,6 @@ def test_parse_dates_dict_format(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})] ) @@ -2257,7 +2270,11 @@ def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates): 31-,12-2019 31-,12-2020""" - with tm.assert_produces_warning(None): + warn = None + if parser.engine == "pyarrow": + warn = DeprecationWarning + msg = "Passing a BlockManager to DataFrame is deprecated" + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): result = parser.read_csv( StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates ) @@ -2269,7 +2286,7 @@ def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # object dtype index def test_parse_dates_dict_format_index(all_parsers): # GH#51240 parser = all_parsers @@ -2312,7 +2329,7 @@ def test_parse_dates_arrow_engine(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # object dtype index def test_from_csv_with_mixed_offsets(all_parsers): 
parser = all_parsers data = "a\n2020-01-01T00:00:00+01:00\n2020-01-01T00:00:00+00:00" diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index f223810772225..bcb1c6af80df6 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -13,10 +13,6 @@ ) import pandas._testing as tm -# TODO(1.4): Change these to xfails whenever parse_dates support(which was -# intentionally disable to keep small PR sizes) is added back -# pytestmark = pytest.mark.usefixtures("pyarrow_skip") - pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 07c94e301b37a..e1ae2e8f3655c 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -25,8 +25,11 @@ "Usecols do not match columns, columns expected but not found: {0}" ) -# TODO: Switch to xfails -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning" +) def test_raise_on_mixed_dtype_usecols(all_parsers): @@ -42,9 +45,8 @@ def test_raise_on_mixed_dtype_usecols(all_parsers): parser.read_csv(StringIO(data), usecols=usecols) -@skip_pyarrow @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) -def test_usecols(all_parsers, usecols): +def test_usecols(all_parsers, usecols, request): data = """\ a,b,c 1,2,3 @@ -52,13 +54,16 @@ def test_usecols(all_parsers, usecols): 7,8,9 10,11,12""" parser = all_parsers + if parser.engine == "pyarrow" and isinstance(usecols[0], int): + mark = pytest.mark.xfail(raises=TypeError, reason="expected bytes, int found") + request.applymarker(mark) result = 
parser.read_csv(StringIO(data), usecols=usecols) expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: expected bytes, int found def test_usecols_with_names(all_parsers): data = """\ a,b,c @@ -74,24 +79,28 @@ def test_usecols_with_names(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] ) -def test_usecols_relative_to_names(all_parsers, names, usecols): +def test_usecols_relative_to_names(all_parsers, names, usecols, request): data = """\ 1,2,3 4,5,6 7,8,9 10,11,12""" parser = all_parsers + if parser.engine == "pyarrow" and not isinstance(usecols[0], int): + mark = pytest.mark.xfail( + reason="ArrowKeyError: Column 'fb' in include_columns does not exist" + ) + request.applymarker(mark) + result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols) expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_usecols_relative_to_names2(all_parsers): # see gh-5766 data = """\ @@ -100,6 +109,7 @@ def test_usecols_relative_to_names2(all_parsers): 7,8,9 10,11,12""" parser = all_parsers + result = parser.read_csv( StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] ) @@ -108,7 +118,8 @@ def test_usecols_relative_to_names2(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow +# regex mismatch: "Length mismatch: Expected axis has 1 elements" +@xfail_pyarrow def test_usecols_name_length_conflict(all_parsers): data = """\ 1,2,3 @@ -133,7 +144,7 @@ def test_usecols_single_string(all_parsers): parser.read_csv(StringIO(data), usecols="foo") -@skip_pyarrow +@xfail_pyarrow # CSV parse error in one case, AttributeError in another @pytest.mark.parametrize( "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] ) @@ -147,12 
+158,14 @@ def test_usecols_index_col_false(all_parsers, data): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("index_col", ["b", 0]) @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) -def test_usecols_index_col_conflict(all_parsers, usecols, index_col): +def test_usecols_index_col_conflict(all_parsers, usecols, index_col, request): # see gh-4201: test that index_col as integer reflects usecols parser = all_parsers + if parser.engine == "pyarrow" and isinstance(usecols[0], int): + mark = pytest.mark.xfail(raises=TypeError, reason="expected bytes, int found") + request.applymarker(mark) data = "a,b,c,d\nA,a,1,one\nB,b,2,two" expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) @@ -168,18 +181,13 @@ def test_usecols_index_col_conflict2(all_parsers): expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) expected = expected.set_index(["b", "c"]) - msg = "Passing a BlockManager to DataFrame is deprecated" - warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - result = parser.read_csv( - StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] - ) + result = parser.read_csv( + StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] + ) tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Expected 3 columns, got 4 def test_usecols_implicit_index_col(all_parsers): # see gh-2654 parser = all_parsers @@ -196,12 +204,7 @@ def test_usecols_index_col_middle(all_parsers): data = """a,b,c,d 1,2,3,4 """ - msg = "Passing a BlockManager to DataFrame is deprecated" - warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="c") + result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], 
index_col="c") expected = DataFrame({"b": [2], "d": [4]}, index=Index([3], name="c")) tm.assert_frame_equal(result, expected) @@ -212,38 +215,43 @@ def test_usecols_index_col_end(all_parsers): data = """a,b,c,d 1,2,3,4 """ - msg = "Passing a BlockManager to DataFrame is deprecated" - warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="d") + result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="d") expected = DataFrame({"b": [2], "c": [3]}, index=Index([4], name="d")) tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_usecols_regex_sep(all_parsers): # see gh-2733 parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) + return + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + if parser.engine == "pyarrow": + msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) + return + result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "usecols,expected", [ @@ -256,17 +264,21 @@ def test_usecols_with_whitespace(all_parsers): ), ], ) 
-def test_usecols_with_integer_like_header(all_parsers, usecols, expected): +def test_usecols_with_integer_like_header(all_parsers, usecols, expected, request): parser = all_parsers data = """2,0,1 1000,2000,3000 4000,5000,6000""" + if parser.engine == "pyarrow" and isinstance(usecols[0], int): + mark = pytest.mark.xfail(raises=TypeError, reason="expected bytes, int found") + request.applymarker(mark) + result = parser.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # mismatched shape def test_empty_usecols(all_parsers): data = "a,b,c\n1,2,3\n4,5,6" expected = DataFrame(columns=Index([])) @@ -283,16 +295,11 @@ def test_np_array_usecols(all_parsers): usecols = np.array(["a", "b"]) expected = DataFrame([[1, 2]], columns=usecols) - msg = "Passing a BlockManager to DataFrame is deprecated" - warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - result = parser.read_csv(StringIO(data), usecols=usecols) + result = parser.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: 'function' object is not iterable @pytest.mark.parametrize( "usecols,expected", [ @@ -325,7 +332,8 @@ def test_callable_usecols(all_parsers, usecols, expected): tm.assert_frame_equal(result, expected) -@skip_pyarrow +# ArrowKeyError: Column 'fa' in include_columns does not exist in CSV file +@xfail_pyarrow @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) def test_incomplete_first_row(all_parsers, usecols): # see gh-6710 @@ -338,7 +346,7 @@ def test_incomplete_first_row(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # CSV parse error: Expected 3 columns, got 4 @pytest.mark.parametrize( "data,usecols,kwargs,expected", [ @@ -371,7 +379,6 @@ def test_uneven_length_cols(all_parsers, data, usecols, 
kwargs, expected): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "usecols,kwargs,expected,msg", [ @@ -415,11 +422,22 @@ def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): ), ], ) -def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg): +def test_raises_on_usecols_names_mismatch( + all_parsers, usecols, kwargs, expected, msg, request +): data = "a,b,c,d\n1,2,3,4\n5,6,7,8" kwargs.update(usecols=usecols) parser = all_parsers + if parser.engine == "pyarrow" and not ( + usecols is not None and expected is not None + ): + # everything but the first case + mark = pytest.mark.xfail( + reason="e.g. Column 'f' in include_columns does not exist in CSV file" + ) + request.applymarker(mark) + if expected is None: with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), **kwargs) @@ -428,7 +446,7 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): data = "a,b,c,d\n1,2,3,4\n5,6,7,8" @@ -440,7 +458,7 @@ def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize("names", [None, ["a", "b"]]) def test_usecols_indices_out_of_bounds(all_parsers, names): # GH#25623 & GH 41130; enforced in 2.0 @@ -453,21 +471,30 @@ def test_usecols_indices_out_of_bounds(all_parsers, names): parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0) -@skip_pyarrow def test_usecols_additional_columns(all_parsers): # GH#46997 parser = all_parsers usecols = lambda header: header.strip() in ["a", "b", "c"] + + if parser.engine == "pyarrow": + msg = "'function' object is 
not iterable" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols) + return result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols) expected = DataFrame({"a": ["x"], "b": "y"}) tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_usecols_additional_columns_integer_columns(all_parsers): # GH#46997 parser = all_parsers usecols = lambda header: header.strip() in ["0", "1"] + if parser.engine == "pyarrow": + msg = "'function' object is not iterable" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols) + return result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols) expected = DataFrame({"0": ["x"], "1": "y"}) tm.assert_frame_equal(result, expected) @@ -480,16 +507,11 @@ def test_usecols_dtype(all_parsers): a,1,x b,2,y """ - msg = "Passing a BlockManager to DataFrame is deprecated" - warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - result = parser.read_csv( - StringIO(data), - usecols=["col1", "col2"], - dtype={"col1": "string", "col2": "uint8", "col3": "string"}, - ) + result = parser.read_csv( + StringIO(data), + usecols=["col1", "col2"], + dtype={"col1": "string", "col2": "uint8", "col3": "string"}, + ) expected = DataFrame( {"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")} )