Skip to content

Commit 173d27a

Browse files
committed
[BUG] excel _parse_sheet revamp
The logic in that method did not correctly handle all the possible combinations of the skiprows, header and index_col arguments. Specifically: - it could not correctly handle a multi-index header with holes (for instance, `header=[0,2]`). - a multi-index header combined with skiprows given as a list. - forward filling of index columns combined with skiprows given as a list. - inconsistencies when processing one-element list arguments (for instance, `header=1` and `header=[1]`, or `index_col=0` and `index_col=[0]`, were handled differently). The logic has been revamped, because it was not possible to fix all the errors with local changes. The major challenge was handling skiprows as a list, as it may remove rows at any place (before, between or after the header(s), index names and data). Also, header row indexes reference rows **after** removing skiprows. To handle that we use an intermediate mapping `ixmap`, which goes from the row indices with skiprows removed to the row indices in `data`. Finally, let me add that IMO, most of the functionality of _parse_sheet should be moved down into TextParser... but that's work for another day!
1 parent 2ed0db2 commit 173d27a

File tree

2 files changed

+99
-76
lines changed

2 files changed

+99
-76
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,7 @@ I/O
567567
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
568568
- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
569569
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
570+
- Bug in :meth:`read_excel` incorrectly handling a ``MultiIndex`` header with holes between the header rows, e.g. ``header=[0, 2]`` (:issue:`58898`)
570571

571572
Period
572573
^^^^^^

pandas/io/excel/_base.py

Lines changed: 98 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -817,95 +817,118 @@ def _parse_sheet(
817817
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
818818
**kwds,
819819
):
820-
is_list_header = False
821-
is_len_one_list_header = False
822-
if is_list_like(header):
823-
assert isinstance(header, Sequence)
824-
is_list_header = True
825-
if len(header) == 1:
826-
is_len_one_list_header = True
827-
828-
if is_len_one_list_header:
829-
header = cast(Sequence[int], header)[0]
830-
831-
# forward fill and pull out names for MultiIndex column
832-
header_names = None
833-
if header is not None and is_list_like(header):
834-
assert isinstance(header, Sequence)
835-
836-
header_names = []
837-
control_row = [True] * len(data[0])
838-
839-
for row in header:
840-
if is_integer(skiprows):
841-
assert isinstance(skiprows, int)
842-
row += skiprows
843-
844-
if row > len(data) - 1:
845-
raise ValueError(
846-
f"header index {row} exceeds maximum index "
847-
f"{len(data) - 1} of data.",
848-
)
849-
850-
data[row], control_row = fill_mi_header(data[row], control_row)
820+
try:
851821

852-
if index_col is not None:
853-
header_name, _ = pop_header_name(data[row], index_col)
854-
header_names.append(header_name)
822+
# header indexes reference rows after removing skiprows, so we
823+
# create an index map from the without-skiprows to the
824+
# original indexes.
825+
if skiprows is None:
826+
ixmap = list(range(len(data)))
827+
elif is_integer(skiprows):
828+
ixmap = list(range(skiprows, len(data)))
829+
elif is_list_like(skiprows):
830+
skiprows_set = set(cast(Sequence[int], skiprows))
831+
ixmap = [ix for ix, _ in enumerate(data) if ix not in skiprows_set]
832+
elif callable(skiprows):
833+
ixmap = [ix for ix, _ in enumerate(data) if not skiprows(ix)]
834+
else:
835+
raise ValueError(
836+
"skiprows must be an integer or a list of integers"
837+
)
838+
nixs = len(ixmap)
855839

856-
# If there is a MultiIndex header and an index then there is also
857-
# a row containing just the index name(s)
858-
has_index_names = False
859-
if is_list_header and not is_len_one_list_header and index_col is not None:
840+
index_col_has_names = False
860841
index_col_set: set[int]
861-
if isinstance(index_col, int):
842+
if index_col is None:
843+
index_col_set = set()
844+
elif isinstance(index_col, str):
845+
index_col_set = set()
846+
index_col_has_names = True
847+
elif isinstance(index_col, int):
862848
index_col_set = {index_col}
863-
else:
864-
assert isinstance(index_col, Sequence)
849+
elif is_list_like(index_col):
865850
index_col_set = set(index_col)
866-
867-
# We have to handle mi without names. If any of the entries in the data
868-
# columns are not empty, this is a regular row
869-
assert isinstance(header, Sequence)
870-
if len(header) < len(data):
871-
potential_index_names = data[len(header)]
872-
has_index_names = all(
873-
x == "" or x is None
874-
for i, x in enumerate(potential_index_names)
875-
if not control_row[i] and i not in index_col_set
851+
else:
852+
raise ValueError(
853+
"index_col must be a string, an integer or a list of integers"
876854
)
855+
has_index = len(index_col_set) > 0
856+
has_index_names = False
877857

878-
if is_list_like(index_col):
879-
# Forward fill values for MultiIndex index.
858+
header_list: Sequence[int]
880859
if header is None:
881-
offset = 0
860+
header_list = []
882861
elif isinstance(header, int):
883-
offset = 1 + header
862+
header_list = [header]
863+
elif is_list_like(header):
864+
header_list = header
884865
else:
885-
offset = 1 + max(header)
866+
raise ValueError(
867+
"header must be an integer or a list of integers"
868+
)
886869

887-
# GH34673: if MultiIndex names present and not defined in the header,
888-
# offset needs to be incremented so that forward filling starts
889-
# from the first MI value instead of the name
890-
if has_index_names:
891-
offset += 1
870+
header_names = []
871+
872+
if len(header_list) == 0:
873+
offset = 0
874+
else:
875+
max_header = max(header_list)
876+
offset = max_header + 1
892877

893-
# Check if we have an empty dataset
894-
# before trying to collect data.
895-
if offset < len(data):
896-
assert isinstance(index_col, Sequence)
878+
if max_header >= nixs:
879+
raise ValueError(
880+
f"header index {max_header} exceeds maximum index "
881+
f"{nixs - 1} of data.",
882+
)
897883

898-
for col in index_col:
899-
last = data[offset][col]
884+
if len(header_list) > 1:
885+
if index_col_has_names:
886+
raise ValueError(
887+
"named index_col can not be used together "
888+
"with multi-index header"
889+
)
890+
891+
# Forward fill and pull out names for MultiIndex column
892+
control_row = [True] * len(data[0])
893+
for row in header_list:
894+
row1 = ixmap[row]
895+
data[row1], control_row = fill_mi_header(data[row1],
896+
control_row)
897+
898+
if has_index:
899+
header_name, _ = pop_header_name(data[row1],
900+
sorted(index_col_set))
901+
if header_name:
902+
header_names.append(header_name)
903+
904+
# If there is a MultiIndex header and an index then
905+
# there may also be a row containing just the index
906+
# name(s)
907+
if has_index and offset < nixs:
908+
# We have to handle mi without names. If any
909+
# of the entries in the data columns are not
910+
# empty, this is a regular row.
911+
912+
potential_index_names = data[ixmap[offset]]
913+
has_index_names = all(
914+
x == "" or x is None
915+
for i, x in enumerate(potential_index_names)
916+
if not control_row[i] and i not in index_col_set
917+
)
918+
if has_index_names:
919+
offset += 1
900920

901-
for row in range(offset + 1, len(data)):
902-
if data[row][col] == "" or data[row][col] is None:
903-
data[row][col] = last
921+
# Forward fill index columns:
922+
# TODO: forward fill also when index columns are selected by name!!!
923+
if has_index and offset < nixs:
924+
for col in index_col_set:
925+
last = data[ixmap[offset]][col]
926+
for row1 in ixmap[offset+1:]:
927+
if data[row1][col] == "" or data[row1][col] is None:
928+
data[row1][col] = last
904929
else:
905-
last = data[row][col]
930+
last = data[row1][col]
906931

907-
# GH 12292 : error when read one empty column from excel file
908-
try:
909932
parser = TextParser(
910933
data,
911934
names=names,
@@ -933,9 +956,8 @@ def _parse_sheet(
933956
output[asheetname] = parser.read(nrows=nrows)
934957

935958
if header_names:
936-
output[asheetname].columns = output[asheetname].columns.set_names(
937-
header_names
938-
)
959+
output[asheetname].columns = \
960+
output[asheetname].columns.set_names(header_names)
939961

940962
except EmptyDataError:
941963
# No Data, return an empty DataFrame

0 commit comments

Comments
 (0)