diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 0acb82ffeca3e..c6fc3fc893379 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -567,6 +567,7 @@ I/O
 - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
+- Bug in :meth:`read_excel` failing to correctly read a multi-index header that contains empty cells ("holes") or is combined with ``skiprows`` (:issue:`58898`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index f83f9cb1c8d74..66cec9ba82df1 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -817,95 +817,122 @@ def _parse_sheet(
         dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
         **kwds,
     ):
-        is_list_header = False
-        is_len_one_list_header = False
-        if is_list_like(header):
-            assert isinstance(header, Sequence)
-            is_list_header = True
-            if len(header) == 1:
-                is_len_one_list_header = True
-
-        if is_len_one_list_header:
-            header = cast(Sequence[int], header)[0]
-
-        # forward fill and pull out names for MultiIndex column
-        header_names = None
-        if header is not None and is_list_like(header):
-            assert isinstance(header, Sequence)
-
-            header_names = []
-            control_row = [True] * len(data[0])
-
-            for row in header:
-                if is_integer(skiprows):
-                    assert isinstance(skiprows, int)
-                    row += skiprows
-
-                if row > len(data) - 1:
-                    raise ValueError(
-                        f"header index {row} exceeds maximum index "
-                        f"{len(data) - 1} of data.",
-                    )
-
-                data[row], control_row = fill_mi_header(data[row], control_row)
+        if callable(skiprows):
+            # To avoid calling the skiprows callable once per row later on,
+            # evaluate it here and keep the resulting list of row indexes
+            # for passing down to the parser.
+            skiprows = [ix for ix in range(len(data)) if skiprows(ix)]
+            if len(skiprows) == 0:
+                skiprows = None
 
-                if index_col is not None:
-                    header_name, _ = pop_header_name(data[row], index_col)
-                    header_names.append(header_name)
+        try:
+            # Header indexes refer to the rows that remain after skiprows
+            # is applied, so build a map from those positions back to the
+            # original row indexes.
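+            # Illustrative example (not part of the original patch): with
+            # len(data) == 5 and skiprows == [1], the map is [0, 2, 3, 4],
+            # so header row 1 refers to original row 2.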
+            ixmap: range | list[int]
+            if skiprows is None:
+                ixmap = range(len(data))
+            elif is_integer(skiprows):
+                ixmap = range(skiprows, len(data))
+            elif is_list_like(skiprows):
+                skiprows_set = set(cast(Sequence[int], skiprows))
+                ixmap = [ix for ix in range(len(data)) if ix not in skiprows_set]
+            else:
+                raise ValueError("skiprows must be an integer or a list of integers")
+            nixs = len(ixmap)
 
-        # If there is a MultiIndex header and an index then there is also
-        # a row containing just the index name(s)
-        has_index_names = False
-        if is_list_header and not is_len_one_list_header and index_col is not None:
+            index_col_has_names = False
             index_col_set: set[int]
-            if isinstance(index_col, int):
+            if index_col is None:
+                index_col_set = set()
+            elif isinstance(index_col, str):
+                index_col_set = set()
+                index_col_has_names = True
+            elif isinstance(index_col, int):
                 index_col_set = {index_col}
-            else:
-                assert isinstance(index_col, Sequence)
+            elif is_list_like(index_col):
                 index_col_set = set(index_col)
-
-            # We have to handle mi without names. If any of the entries in the data
-            # columns are not empty, this is a regular row
-            assert isinstance(header, Sequence)
-            if len(header) < len(data):
-                potential_index_names = data[len(header)]
-                has_index_names = all(
-                    x == "" or x is None
-                    for i, x in enumerate(potential_index_names)
-                    if not control_row[i] and i not in index_col_set
+            else:
+                raise ValueError(
+                    "index_col must be a string, an integer or a list of integers"
                 )
+            has_index = len(index_col_set) > 0
+            has_index_names = False
 
-        if is_list_like(index_col):
-            # Forward fill values for MultiIndex index.
+            header_list: Sequence[int]
             if header is None:
-                offset = 0
+                header_list = []
             elif isinstance(header, int):
-                offset = 1 + header
+                header_list = [header]
+            elif is_list_like(header):
+                header_list = header
             else:
-                offset = 1 + max(header)
+                raise ValueError("header must be an integer or a list of integers")
 
-            # GH34673: if MultiIndex names present and not defined in the header,
-            # offset needs to be incremented so that forward filling starts
-            # from the first MI value instead of the name
-            if has_index_names:
-                offset += 1
+            header_names = []
 
-            # Check if we have an empty dataset
-            # before trying to collect data.
-            if offset < len(data):
-                assert isinstance(index_col, Sequence)
+            if len(header_list) == 0:
+                offset = 0
+            else:
+                max_header = max(header_list)
+                offset = max_header + 1
 
-                for col in index_col:
-                    last = data[offset][col]
+                if max_header >= nixs:
+                    raise ValueError(
+                        f"header index {max_header} exceeds maximum index "
+                        f"{nixs - 1} of data.",
+                    )
+
+                if len(header_list) > 1:
+                    if index_col_has_names:
+                        raise ValueError(
+                            "named index_col can not be used together "
+                            "with multi-index header"
+                        )
+
+                    # Forward fill and pull out names for MultiIndex column
+                    control_row = [True] * len(data[0])
+                    for row in header_list:
+                        row1 = ixmap[row]
+                        data[row1], control_row = fill_mi_header(
+                            data[row1], control_row
+                        )
+
+                        if has_index:
+                            header_name, _ = pop_header_name(
+                                data[row1], sorted(index_col_set)
+                            )
+                            if header_name:
+                                header_names.append(header_name)
+
+                    # If there is a MultiIndex header and an index then
+                    # there may also be a row containing just the index
+                    # name(s)
+                    if has_index and offset < nixs:
+                        # We have to handle mi without names. If any
+                        # of the entries in the data columns are not
+                        # empty, this is a regular row.
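+                        # Illustrative note (not part of the original patch):
+                        # with header=[0, 1] and index_col=[0, 1], the row just
+                        # below the header counts as an index-name row only if
+                        # every data cell checked here is empty.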
+
+                        potential_index_names = data[ixmap[offset]]
+                        has_index_names = all(
+                            x == "" or x is None
+                            for i, x in enumerate(potential_index_names)
+                            if not control_row[i] and i not in index_col_set
+                        )
+                        if has_index_names:
+                            offset += 1
 
-                    for row in range(offset + 1, len(data)):
-                        if data[row][col] == "" or data[row][col] is None:
-                            data[row][col] = last
+            # Forward fill index columns:
+            # TODO: also forward fill when index columns are selected by name
+            if has_index and offset < nixs:
+                for col in index_col_set:
+                    last = data[ixmap[offset]][col]
+                    for row1 in ixmap[offset + 1 :]:
+                        if data[row1][col] == "" or data[row1][col] is None:
+                            data[row1][col] = last
                         else:
-                            last = data[row][col]
+                            last = data[row1][col]
 
-        # GH 12292 : error when read one empty column from excel file
-        try:
             parser = TextParser(
                 data,
                 names=names,
diff --git a/pandas/tests/io/data/excel/test_boolean_types.xlsx b/pandas/tests/io/data/excel/test_boolean_types.xlsx
index 234703c32f0ab..8deb2dc9f787d 100644
Binary files a/pandas/tests/io/data/excel/test_boolean_types.xlsx and b/pandas/tests/io/data/excel/test_boolean_types.xlsx differ
diff --git a/pandas/tests/io/data/excel/test_mi_holes.xlsx b/pandas/tests/io/data/excel/test_mi_holes.xlsx
new file mode 100644
index 0000000000000..d7c9fea62d836
Binary files /dev/null and b/pandas/tests/io/data/excel/test_mi_holes.xlsx differ
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 5ce78b1c90e76..301ca36b0757e 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -1759,3 +1759,53 @@ def test_corrupt_files_closed(self, engine, tmp_excel):
                 pd.ExcelFile(tmp_excel, engine=engine)
             except errors:
                 pass
+
+    def test_mi_header_skiprows1(self, engine, read_ext):
+        if engine is None and read_ext == ".xlsx":
+            with open("test_mi_holes.xlsx", "rb") as f:
+                expected = pd.read_excel(
+                    f, sheet_name="expected", header=[0, 1], index_col=[0, 1]
+                )
+
+            with open("test_mi_holes.xlsx", "rb") as f:
+                actual = pd.read_excel(
+                    f,
+                    sheet_name="skiprows1",
+                    skiprows=1,
+                    header=[0, 1],
+                    index_col=[0, 1],
+                )
+
+            tm.assert_frame_equal(expected, actual)
+
+    def test_mi_header_hole(self, engine, read_ext):
+        if engine is None and read_ext == ".xlsx":
+            with open("test_mi_holes.xlsx", "rb") as f:
+                expected = pd.read_excel(
+                    f, sheet_name="expected", header=[0, 1], index_col=[0, 1]
+                )
+
+            with open("test_mi_holes.xlsx", "rb") as f:
+                actual = pd.read_excel(
+                    f,
+                    sheet_name="header_hole",
+                    skiprows=[1],
+                    header=[0, 1],
+                    index_col=[0, 1],
+                )
+
+            tm.assert_frame_equal(expected, actual)
+
+    def test_mi_header_and_index_holes(self, engine, read_ext):
+        if engine is None and read_ext == ".xlsx":
+            with open("test_mi_holes.xlsx", "rb") as f:
+                expected = pd.read_excel(
+                    f, sheet_name="expected_index_hole", header=[0, 1], index_col=[0, 1]
+                )
+
+            with open("test_mi_holes.xlsx", "rb") as f:
+                actual = pd.read_excel(
+                    f, sheet_name="index_hole", header=[0, 2], index_col=[0, 1]
+                )
+
+            tm.assert_frame_equal(expected, actual)
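
Usage sketch (not part of the patch; the file path and sheet layout below are hypothetical):
this is the call pattern the change supports, a two-row MultiIndex header with merged/empty
("hole") cells sitting below a skipped row, where header positions are counted after
skiprows is applied.

    import pandas as pd

    # Hypothetical workbook: row 0 is a note row to skip, rows 1-2 hold a
    # two-level column header with merged ("hole") cells, and the first two
    # columns form the row index.
    df = pd.read_excel(
        "report.xlsx",      # placeholder path, not a fixture from this PR
        sheet_name="data",  # placeholder sheet name
        skiprows=[0],       # drop the note row above the header
        header=[0, 1],      # header rows, counted after skiprows is applied
        index_col=[0, 1],   # build a MultiIndex from the first two columns
    )
    print(df.columns)       # the two-level columns survive the holes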