diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index c5bf935b0d54d..74fe5166df65f 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -117,6 +117,19 @@ def read_ext(engine_and_read_ext): return read_ext +def adjust_expected(expected: DataFrame, read_ext: str) -> None: + expected.index.name = None + + +def xfail_datetimes_with_pyxlsb(engine, request): + if engine == "pyxlsb": + request.applymarker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + + class TestReaders: @pytest.fixture(autouse=True) def cd_and_set_engine(self, engine, datapath, monkeypatch): @@ -196,14 +209,11 @@ def test_usecols_int(self, read_ext): ) def test_usecols_list(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref[["B", "C"]] + adjust_expected(expected, read_ext) - df_ref = df_ref.reindex(columns=["B", "C"]) df1 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=[0, 2, 3] ) @@ -216,18 +226,15 @@ def test_usecols_list(self, request, engine, read_ext, df_ref): ) # TODO add index to xls file) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) def test_usecols_str(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref[["A", "B", "C"]] + adjust_expected(expected, read_ext) - df1 = df_ref.reindex(columns=["A", "B", "C"]) df2 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A:D" ) @@ -240,10 +247,12 @@ def test_usecols_str(self, request, engine, read_ext, df_ref): ) # TODO add index to xls, read xls ignores index name ? - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df3, expected) + + expected = df_ref[["B", "C"]] + adjust_expected(expected, read_ext) - df1 = df_ref.reindex(columns=["B", "C"]) df2 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C,D" ) @@ -255,10 +264,9 @@ def test_usecols_str(self, request, engine, read_ext, df_ref): usecols="A,C,D", ) # TODO add index to xls file - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df3, expected) - df1 = df_ref.reindex(columns=["B", "C"]) df2 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C:D" ) @@ -269,8 +277,8 @@ def test_usecols_str(self, request, engine, read_ext, df_ref): index_col=0, usecols="A,C:D", ) - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df3, expected) @pytest.mark.parametrize( "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] @@ -278,18 +286,15 @@ def test_usecols_str(self, request, engine, read_ext, df_ref): def test_usecols_diff_positional_int_columns_order( self, request, engine, read_ext, usecols, df_ref ): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref[["A", "C"]] + adjust_expected(expected, read_ext) + result = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=usecols ) - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]]) def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref): @@ -297,33 +302,27 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r expected.index = range(len(expected)) result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols) - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref + adjust_expected(expected, read_ext) + result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref[["C", "D"]] + adjust_expected(expected, read_ext) + result = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,D:E" ) - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) def test_usecols_excel_range_str_invalid(self, read_ext): msg = "Invalid column name: E1" @@ -400,12 +399,7 @@ def test_excel_stop_iterator(self, read_ext): tm.assert_frame_equal(parsed, expected) def test_excel_cell_error_na(self, request, engine, read_ext): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) # https://github.com/tafia/calamine/issues/355 if engine == "calamine" and read_ext == ".ods": @@ -418,20 +412,18 @@ def test_excel_cell_error_na(self, request, engine, read_ext): tm.assert_frame_equal(parsed, expected) def test_excel_table(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref + adjust_expected(expected, read_ext) df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) df2 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet2", skiprows=[1], index_col=0 ) # TODO add index to file - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) df3 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, skipfooter=1 @@ -439,12 +431,7 @@ def test_excel_table(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_reader_special_dtypes(self, request, engine, read_ext): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = DataFrame.from_dict( { @@ -527,7 +514,7 @@ def test_reader_dtype(self, read_ext): "c": [1, 2, 3, 4], "d": [1.0, 2.0, np.nan, 4.0], } - ).reindex(columns=["a", "b", "c", "d"]) + ) tm.assert_frame_equal(actual, expected) @@ -777,12 +764,7 @@ def test_exception_message_includes_sheet_name(self, read_ext): @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl") def test_date_conversion_overflow(self, request, engine, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = DataFrame( [ @@ -808,22 +790,21 @@ def test_date_conversion_overflow(self, request, engine, read_ext): tm.assert_frame_equal(result, expected) def test_sheet_name(self, request, read_ext, engine, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) + filename = "test1" sheet_name = "Sheet1" + expected = df_ref + adjust_expected(expected, read_ext) + df1 = pd.read_excel( filename + read_ext, sheet_name=sheet_name, index_col=0 ) # doc df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) def test_excel_read_buffer(self, read_ext): pth = "test1" + read_ext @@ -974,12 +955,7 @@ def test_close_from_py_localpath(self, read_ext): f.read() def test_reader_seconds(self, request, engine, read_ext): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) # GH 55045 if engine == "calamine" and read_ext == ".ods": @@ -1016,12 +992,7 @@ def test_reader_seconds(self, request, engine, read_ext): def test_read_excel_multiindex(self, request, engine, read_ext): # see gh-4679 - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) # https://github.com/tafia/calamine/issues/354 if engine == "calamine" and read_ext == ".ods": @@ -1051,7 +1022,7 @@ def test_read_excel_multiindex(self, request, engine, read_ext): expected.columns = ["a", "b", "c", "d"] actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected, check_names=False) + tm.assert_frame_equal(actual, expected) # "both" sheet expected.columns = mi @@ -1059,7 +1030,7 @@ def test_read_excel_multiindex(self, request, engine, read_ext): actual = pd.read_excel( mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1] ) - tm.assert_frame_equal(actual, expected, check_names=False) + tm.assert_frame_equal(actual, expected) # "mi_index_name" sheet expected.columns = ["a", "b", "c", "d"] @@ -1115,12 +1086,7 @@ def test_read_excel_multiindex_blank_after_name( self, request, engine, read_ext, sheet_name, idx_lvl2 ): # GH34673 - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb (GH4679" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) mi_file = "testmultiindex" + read_ext mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) @@ -1227,7 +1193,7 @@ def test_excel_old_index_format(self, read_ext): expected.index = mi actual = pd.read_excel(filename, sheet_name="multi_no_names", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected, check_names=False) + tm.assert_frame_equal(actual, expected) def test_read_excel_bool_header_arg(self, read_ext): # GH 6114 @@ -1238,12 +1204,7 @@ def test_read_excel_bool_header_arg(self, read_ext): def test_read_excel_skiprows(self, request, engine, read_ext): # GH 4903 - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) actual = pd.read_excel( "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2] @@ -1293,12 +1254,7 @@ def test_read_excel_skiprows(self, request, engine, read_ext): def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext): # GH 4903 - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) actual = pd.read_excel( "testskiprows" + read_ext, @@ -1565,24 +1521,22 @@ def test_excel_passes_na_filter(self, read_ext, na_filter): tm.assert_frame_equal(parsed, expected) def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref + adjust_expected(expected, read_ext) with pd.ExcelFile("test1" + read_ext) as excel: df1 = pd.read_excel(excel, sheet_name=0, index_col=0) df2 = pd.read_excel(excel, sheet_name=1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) with pd.ExcelFile("test1" + read_ext) as excel: df1 = excel.parse(0, index_col=0) df2 = excel.parse(1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) with pd.ExcelFile("test1" + read_ext) as excel: df3 = pd.read_excel(excel, sheet_name=0, index_col=0, skipfooter=1) @@ -1594,12 +1548,10 @@ def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_sheet_name(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref + adjust_expected(expected, read_ext) filename = "test1" sheet_name = "Sheet1" @@ -1610,8 +1562,8 @@ def test_sheet_name(self, request, engine, read_ext, df_ref): with pd.ExcelFile(filename + read_ext) as excel: df2_parse = excel.parse(index_col=0, sheet_name=sheet_name) - tm.assert_frame_equal(df1_parse, df_ref, check_names=False) - tm.assert_frame_equal(df2_parse, df_ref, check_names=False) + tm.assert_frame_equal(df1_parse, expected) + tm.assert_frame_equal(df2_parse, expected) @pytest.mark.parametrize( "sheet_name", @@ -1686,12 +1638,7 @@ def test_header_with_index_col(self, filename): def test_read_datetime_multiindex(self, request, engine, read_ext): # GH 34748 - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) f = "test_datetime_mi" + read_ext with pd.ExcelFile(f) as excel: