Skip to content

[BUG] read_excel: fixes handling of multi index header and other corner cases #58899

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,7 @@ I/O
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_excel` incorrectly handling a multi-index header containing holes (:issue:`58898`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
Expand Down
173 changes: 100 additions & 73 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,95 +817,122 @@ def _parse_sheet(
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
**kwds,
):
is_list_header = False
is_len_one_list_header = False
if is_list_like(header):
assert isinstance(header, Sequence)
is_list_header = True
if len(header) == 1:
is_len_one_list_header = True

if is_len_one_list_header:
header = cast(Sequence[int], header)[0]

# forward fill and pull out names for MultiIndex column
header_names = None
if header is not None and is_list_like(header):
assert isinstance(header, Sequence)

header_names = []
control_row = [True] * len(data[0])

for row in header:
if is_integer(skiprows):
assert isinstance(skiprows, int)
row += skiprows

if row > len(data) - 1:
raise ValueError(
f"header index {row} exceeds maximum index "
f"{len(data) - 1} of data.",
)

data[row], control_row = fill_mi_header(data[row], control_row)
if callable(skiprows):
# In order to avoid calling skiprows multiple times on
# every row, we just do it here and keep the resulting
# list for passing it down to the parser.
skiprows = [ix for ix in range(len(data)) if skiprows(ix)]
if len(skiprows) == 0:
skiprows = None

if index_col is not None:
header_name, _ = pop_header_name(data[row], index_col)
header_names.append(header_name)
try:
# header indexes reference rows after removing skiprows, so we
# create an index map from the without-skiprows to the
# original indexes.
ixmap: range | list[int]
if skiprows is None:
ixmap = range(len(data))
elif is_integer(skiprows):
ixmap = range(skiprows, len(data))
elif is_list_like(skiprows):
skiprows_set = set(cast(Sequence[int], skiprows))
ixmap = [ix for ix in range(len(data)) if ix not in skiprows_set]
else:
raise ValueError("skiprows must be an integer or a list of integers")
nixs = len(ixmap)

# If there is a MultiIndex header and an index then there is also
# a row containing just the index name(s)
has_index_names = False
if is_list_header and not is_len_one_list_header and index_col is not None:
index_col_has_names = False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is only used one place below; instead can you replace it with isinstance(index_col, str) to simplify the code.

index_col_set: set[int]
if isinstance(index_col, int):
if index_col is None:
index_col_set = set()
elif isinstance(index_col, str):
index_col_set = set()
index_col_has_names = True
elif isinstance(index_col, int):
index_col_set = {index_col}
else:
assert isinstance(index_col, Sequence)
elif is_list_like(index_col):
index_col_set = set(index_col)

# We have to handle mi without names. If any of the entries in the data
# columns are not empty, this is a regular row
assert isinstance(header, Sequence)
if len(header) < len(data):
potential_index_names = data[len(header)]
has_index_names = all(
x == "" or x is None
for i, x in enumerate(potential_index_names)
if not control_row[i] and i not in index_col_set
else:
raise ValueError(
"index_col must be a string, an integer or a list of integers"
)
has_index = len(index_col_set) > 0
has_index_names = False

if is_list_like(index_col):
# Forward fill values for MultiIndex index.
header_list: Sequence[int]
if header is None:
offset = 0
header_list = []
elif isinstance(header, int):
offset = 1 + header
header_list = [header]
elif is_list_like(header):
header_list = header
else:
offset = 1 + max(header)
raise ValueError("header must be an integer or a list of integers")

# GH34673: if MultiIndex names present and not defined in the header,
# offset needs to be incremented so that forward filling starts
# from the first MI value instead of the name
if has_index_names:
offset += 1
header_names = []

# Check if we have an empty dataset
# before trying to collect data.
if offset < len(data):
assert isinstance(index_col, Sequence)
if len(header_list) == 0:
offset = 0
else:
max_header = max(header_list)
offset = max_header + 1

for col in index_col:
last = data[offset][col]
if max_header >= nixs:
raise ValueError(
f"header index {max_header} exceeds maximum index "
f"{nixs - 1} of data.",
)

if len(header_list) > 1:
if index_col_has_names:
raise ValueError(
"named index_col can not be used together "
"with multi-index header"
)
Comment on lines +888 to +891
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this restriction new?


# Forward fill and pull out names for MultiIndex column
control_row = [True] * len(data[0])
for row in header_list:
row1 = ixmap[row]
data[row1], control_row = fill_mi_header(
data[row1], control_row
)

if has_index:
header_name, _ = pop_header_name(
data[row1], sorted(index_col_set)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why sort?

)
if header_name:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the case header_name being None hit by tests?

header_names.append(header_name)

# If there is a MultiIndex header and an index then
# there may also be a row containing just the index
# name(s)
if has_index and offset < nixs:
# We have to handle mi without names. If any
# of the entries in the data columns are not
# empty, this is a regular row.

potential_index_names = data[ixmap[offset]]
has_index_names = all(
x == "" or x is None
for i, x in enumerate(potential_index_names)
if not control_row[i] and i not in index_col_set
)
if has_index_names:
offset += 1

for row in range(offset + 1, len(data)):
if data[row][col] == "" or data[row][col] is None:
data[row][col] = last
# Forward fill index columns:
# TODO: forward fill also when index columns are selected by name!!!
if has_index and offset < nixs:
for col in index_col_set:
last = data[ixmap[offset]][col]
for row1 in ixmap[offset + 1 :]:
if data[row1][col] == "" or data[row1][col] is None:
data[row1][col] = last
else:
last = data[row][col]
last = data[row1][col]

# GH 12292 : error when read one empty column from excel file
try:
parser = TextParser(
data,
names=names,
Expand Down
Binary file modified pandas/tests/io/data/excel/test_boolean_types.xlsx
Binary file not shown.
Binary file added pandas/tests/io/data/excel/test_mi_holes.xlsx
Binary file not shown.
50 changes: 50 additions & 0 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1759,3 +1759,53 @@ def test_corrupt_files_closed(self, engine, tmp_excel):
pd.ExcelFile(tmp_excel, engine=engine)
except errors:
pass

def test_mi_header_skiprows1(self, engine, read_ext):
if engine is None and read_ext == ".xlsx":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add references to the issue as a comment in the first line of each test:

# GH#58898

with open("test_mi_holes.xlsx", "rb") as f:
expected = pd.read_excel(
f, sheet_name="expected", header=[0, 1], index_col=[0, 1]
)
Comment on lines +1765 to +1768
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of adding files to the data/ directory, is it possible to generate these sheets on-the-fly? E.g.

df = pd.DataFrame(...)
df.to_excel(tmp_excel, sheet_name="test", index=False)

tmp_excel here is a pytest fixture.


with open("test_mi_holes.xlsx", "rb") as f:
actual = pd.read_excel(
f,
sheet_name="skiprows1",
skiprows=1,
header=[0, 1],
index_col=[0, 1],
)

tm.assert_frame_equal(expected, actual)

def test_mi_header_hole(self, engine, read_ext):
if engine is None and read_ext == ".xlsx":
with open("test_mi_holes.xlsx", "rb") as f:
expected = pd.read_excel(
f, sheet_name="expected", header=[0, 1], index_col=[0, 1]
)

with open("test_mi_holes.xlsx", "rb") as f:
actual = pd.read_excel(
f,
sheet_name="header_hole",
skiprows=[1],
header=[0, 1],
index_col=[0, 1],
)

tm.assert_frame_equal(expected, actual)

def test_mi_header_and_index_holes(self, engine, read_ext):
if engine is None and read_ext == ".xlsx":
with open("test_mi_holes.xlsx", "rb") as f:
expected = pd.read_excel(
f, sheet_name="expected_index_hole", header=[0, 1], index_col=[0, 1]
)

with open("test_mi_holes.xlsx", "rb") as f:
actual = pd.read_excel(
f, sheet_name="index_hole", header=[0, 2], index_col=[0, 1]
)

tm.assert_frame_equal(expected, actual)