Skip to content

Commit 272e69c

Browse files
committed
[BUG] excel _parse_sheet revamp
The logic in that method was not correctly handling all the possible combinations of the skiprows, header and index_col arguments. Specifically: - it was not able to correctly handle a multi-index header with holes (for instance, `header=[0,2]`). - multi-index header and skiprows given as lists. - forward filling index columns and skiprows given as lists. - inconsistencies processing one-element list arguments (for instance, `header=1` and `header=[1]`, or `index_col=0` and `index_col=[0]`, were handled differently). The logic has been revamped, because it was not possible to fix all the errors with local changes. The major challenge was handling skiprows as a list, as it may remove rows at any place (before, between or after header(s), index names and data). Also, header row indexes reference rows **after** removing skiprows. To handle that we use an intermediate mapping `ixmap` which goes from the row indices with skiprows removed to the row indices in `data`. Finally, let me add that IMO, most of the functionality of _parse_sheet should be moved down into TextParser... but that's work for another day!
1 parent ddc75ab commit 272e69c

File tree

1 file changed

+95
-76
lines changed

1 file changed

+95
-76
lines changed

pandas/io/excel/_base.py

Lines changed: 95 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -817,95 +817,115 @@ def _parse_sheet(
817817
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
818818
**kwds,
819819
):
820-
is_list_header = False
821-
is_len_one_list_header = False
822-
if is_list_like(header):
823-
assert isinstance(header, Sequence)
824-
is_list_header = True
825-
if len(header) == 1:
826-
is_len_one_list_header = True
827-
828-
if is_len_one_list_header:
829-
header = cast(Sequence[int], header)[0]
830-
831-
# forward fill and pull out names for MultiIndex column
832-
header_names = None
833-
if header is not None and is_list_like(header):
834-
assert isinstance(header, Sequence)
835-
836-
header_names = []
837-
control_row = [True] * len(data[0])
838-
839-
for row in header:
840-
if is_integer(skiprows):
841-
assert isinstance(skiprows, int)
842-
row += skiprows
843-
844-
if row > len(data) - 1:
845-
raise ValueError(
846-
f"header index {row} exceeds maximum index "
847-
f"{len(data) - 1} of data.",
848-
)
849-
850-
data[row], control_row = fill_mi_header(data[row], control_row)
820+
try:
851821

852-
if index_col is not None:
853-
header_name, _ = pop_header_name(data[row], index_col)
854-
header_names.append(header_name)
822+
# header indexes reference rows after removing skiprows, so we
823+
# create an index map from the without-skiprows to the
824+
# original indexes.
825+
if skiprows is None:
826+
ixmap = range(len(data))
827+
elif is_integer(skiprows):
828+
ixmap = range(skiprows, len(data))
829+
elif is_list_like(skiprows):
830+
skiprows = set(skiprows)
831+
ixmap = [ix for ix, _ in enumerate(data) if ix not in skiprows]
832+
elif callable(skiprows):
833+
ixmap = [ix for ix, _ in enumerate(data) if not skiprows(ix)]
834+
else:
835+
raise ValueError(
836+
"skiprows must be an integer or a list of integers"
837+
)
838+
nixs = len(ixmap)
855839

856-
# If there is a MultiIndex header and an index then there is also
857-
# a row containing just the index name(s)
858-
has_index_names = False
859-
if is_list_header and not is_len_one_list_header and index_col is not None:
840+
index_col_has_names = False
860841
index_col_set: set[int]
861-
if isinstance(index_col, int):
842+
if index_col is None:
843+
index_col_set = set([])
844+
elif isinstance(index_col, str):
845+
index_col_set = set([])
846+
index_col_has_names = True
847+
elif is_integer(index_col):
862848
index_col_set = {index_col}
863-
else:
864-
assert isinstance(index_col, Sequence)
849+
elif is_list_like(index_col):
865850
index_col_set = set(index_col)
866-
867-
# We have to handle mi without names. If any of the entries in the data
868-
# columns are not empty, this is a regular row
869-
assert isinstance(header, Sequence)
870-
if len(header) < len(data):
871-
potential_index_names = data[len(header)]
872-
has_index_names = all(
873-
x == "" or x is None
874-
for i, x in enumerate(potential_index_names)
875-
if not control_row[i] and i not in index_col_set
851+
else:
852+
raise ValueError(
853+
"index_col must be a string, an integer or a list of integers"
876854
)
855+
has_index = len(index_col_set) > 0
856+
has_index_names = False
877857

878-
if is_list_like(index_col):
879-
# Forward fill values for MultiIndex index.
880858
if header is None:
859+
header_list = []
860+
elif is_integer(header):
861+
header_list = [header]
862+
elif is_list_like(header):
863+
header_list = header
864+
else:
865+
raise ValueError(
866+
"header must be an integer or a list of integers"
867+
)
868+
869+
header_names = []
870+
871+
if len(header_list) == 0:
881872
offset = 0
882-
elif isinstance(header, int):
883-
offset = 1 + header
884873
else:
885-
offset = 1 + max(header)
874+
max_header = max(header_list)
875+
offset = max_header + 1
886876

887-
# GH34673: if MultiIndex names present and not defined in the header,
888-
# offset needs to be incremented so that forward filling starts
889-
# from the first MI value instead of the name
877+
if max_header >= nixs:
878+
raise ValueError(
879+
f"header index {max_header} exceeds maximum index "
880+
f"{nixs - 1} of data.",
881+
)
882+
883+
if len(header_list) > 1:
884+
if index_col_has_names:
885+
raise ValueError(
886+
"named index_col can not be used together with multi-index header"
887+
)
888+
889+
# Forward fill and pull out names for MultiIndex column
890+
control_row = [True] * len(data[0])
891+
for row in header:
892+
row1 = ixmap[row]
893+
data[row1], control_row = fill_mi_header(data[row1],
894+
control_row)
895+
896+
if has_index:
897+
header_name, _ = pop_header_name(data[row1],
898+
index_col)
899+
header_names.append(header_name)
900+
901+
# If there is a MultiIndex header and an index then
902+
# there may also be a row containing just the index
903+
# name(s)
904+
if has_index and offset < nixs:
905+
# We have to handle mi without names. If any
906+
# of the entries in the data columns are not
907+
# empty, this is a regular row.
908+
909+
potential_index_names = data[ixmap[offset]]
910+
has_index_names = all(
911+
x == "" or x is None
912+
for i, x in enumerate(potential_index_names)
913+
if not control_row[i] and i not in index_col_set
914+
)
890915
if has_index_names:
891916
offset += 1
892917

893-
# Check if we have an empty dataset
894-
# before trying to collect data.
895-
if offset < len(data):
896-
assert isinstance(index_col, Sequence)
897-
898-
for col in index_col:
899-
last = data[offset][col]
900-
901-
for row in range(offset + 1, len(data)):
902-
if data[row][col] == "" or data[row][col] is None:
903-
data[row][col] = last
918+
# Forward fill index columns:
919+
# TODO: forward fill also when index columns are selected by name!!!
920+
if has_index and offset < nixs:
921+
for col in index_col_set:
922+
last = data[ixmap[offset]][col]
923+
for row1 in ixmap[offset+1:]:
924+
if data[row1][col] == "" or data[row1][col] is None:
925+
data[row1][col] = last
904926
else:
905-
last = data[row][col]
927+
last = data[row1][col]
906928

907-
# GH 12292 : error when read one empty column from excel file
908-
try:
909929
parser = TextParser(
910930
data,
911931
names=names,
@@ -933,9 +953,8 @@ def _parse_sheet(
933953
output[asheetname] = parser.read(nrows=nrows)
934954

935955
if header_names:
936-
output[asheetname].columns = output[asheetname].columns.set_names(
937-
header_names
938-
)
956+
output[asheetname].columns = \
957+
output[asheetname].columns.set_names(header_names)
939958

940959
except EmptyDataError:
941960
# No Data, return an empty DataFrame

0 commit comments

Comments
 (0)