Skip to content

[BUG] read_excel: fixes handling of multi index header and other corner cases #58899

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,7 @@ I/O
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_excel` incorrectly handling a multi-index header containing holes (:issue:`58898`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
Expand Down
173 changes: 100 additions & 73 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,95 +817,122 @@ def _parse_sheet(
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
**kwds,
):
is_list_header = False
is_len_one_list_header = False
if is_list_like(header):
assert isinstance(header, Sequence)
is_list_header = True
if len(header) == 1:
is_len_one_list_header = True

if is_len_one_list_header:
header = cast(Sequence[int], header)[0]

# forward fill and pull out names for MultiIndex column
header_names = None
if header is not None and is_list_like(header):
assert isinstance(header, Sequence)

header_names = []
control_row = [True] * len(data[0])

for row in header:
if is_integer(skiprows):
assert isinstance(skiprows, int)
row += skiprows

if row > len(data) - 1:
raise ValueError(
f"header index {row} exceeds maximum index "
f"{len(data) - 1} of data.",
)

data[row], control_row = fill_mi_header(data[row], control_row)
if callable(skiprows):
# In order to avoid calling skiprows multiple times on
# every row, we just do it here and keep the resulting
# list for passing it down to the parser.
skiprows = [ix for ix in range(len(data)) if skiprows(ix)]
if len(skiprows) == 0:
skiprows = None

if index_col is not None:
header_name, _ = pop_header_name(data[row], index_col)
header_names.append(header_name)
try:
# header indexes reference rows after removing skiprows, so we
# create an index map from the without-skiprows to the
# original indexes.
ixmap: range | list[int]
if skiprows is None:
ixmap = range(len(data))
elif is_integer(skiprows):
ixmap = range(skiprows, len(data))
elif is_list_like(skiprows):
skiprows_set = set(cast(Sequence[int], skiprows))
ixmap = [ix for ix in range(len(data)) if ix not in skiprows_set]
else:
raise ValueError("skiprows must be an integer or a list of integers")
nixs = len(ixmap)

# If there is a MultiIndex header and an index then there is also
# a row containing just the index name(s)
has_index_names = False
if is_list_header and not is_len_one_list_header and index_col is not None:
index_col_has_names = False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is only used one place below; instead can you replace it with isinstance(index_col, str) to simplify the code.

index_col_set: set[int]
if isinstance(index_col, int):
if index_col is None:
index_col_set = set()
elif isinstance(index_col, str):
index_col_set = set()
index_col_has_names = True
elif isinstance(index_col, int):
index_col_set = {index_col}
else:
assert isinstance(index_col, Sequence)
elif is_list_like(index_col):
index_col_set = set(index_col)

# We have to handle mi without names. If any of the entries in the data
# columns are not empty, this is a regular row
assert isinstance(header, Sequence)
if len(header) < len(data):
potential_index_names = data[len(header)]
has_index_names = all(
x == "" or x is None
for i, x in enumerate(potential_index_names)
if not control_row[i] and i not in index_col_set
else:
raise ValueError(
"index_col must be a string, an integer or a list of integers"
)
has_index = len(index_col_set) > 0
has_index_names = False

if is_list_like(index_col):
# Forward fill values for MultiIndex index.
header_list: Sequence[int]
if header is None:
offset = 0
header_list = []
elif isinstance(header, int):
offset = 1 + header
header_list = [header]
elif is_list_like(header):
header_list = header
else:
offset = 1 + max(header)
raise ValueError("header must be an integer or a list of integers")

# GH34673: if MultiIndex names present and not defined in the header,
# offset needs to be incremented so that forward filling starts
# from the first MI value instead of the name
if has_index_names:
offset += 1
header_names = []

# Check if we have an empty dataset
# before trying to collect data.
if offset < len(data):
assert isinstance(index_col, Sequence)
if len(header_list) == 0:
offset = 0
else:
max_header = max(header_list)
offset = max_header + 1

for col in index_col:
last = data[offset][col]
if max_header >= nixs:
raise ValueError(
f"header index {max_header} exceeds maximum index "
f"{nixs - 1} of data.",
)

if len(header_list) > 1:
if index_col_has_names:
raise ValueError(
"named index_col can not be used together "
"with multi-index header"
)
Comment on lines +888 to +891
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this restriction new?


# Forward fill and pull out names for MultiIndex column
control_row = [True] * len(data[0])
for row in header_list:
row1 = ixmap[row]
data[row1], control_row = fill_mi_header(
data[row1], control_row
)

if has_index:
header_name, _ = pop_header_name(
data[row1], sorted(index_col_set)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why sort?

)
if header_name:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the case header_name being None hit by tests?

header_names.append(header_name)

# If there is a MultiIndex header and an index then
# there may also be a row containing just the index
# name(s)
if has_index and offset < nixs:
# We have to handle mi without names. If any
# of the entries in the data columns are not
# empty, this is a regular row.

potential_index_names = data[ixmap[offset]]
has_index_names = all(
x == "" or x is None
for i, x in enumerate(potential_index_names)
if not control_row[i] and i not in index_col_set
)
if has_index_names:
offset += 1

for row in range(offset + 1, len(data)):
if data[row][col] == "" or data[row][col] is None:
data[row][col] = last
# Forward fill index columns:
# TODO: forward fill also when index columns are selected by name!!!
if has_index and offset < nixs:
for col in index_col_set:
last = data[ixmap[offset]][col]
for row1 in ixmap[offset + 1 :]:
if data[row1][col] == "" or data[row1][col] is None:
data[row1][col] = last
else:
last = data[row][col]
last = data[row1][col]

# GH 12292 : error when read one empty column from excel file
try:
parser = TextParser(
data,
names=names,
Expand Down
Binary file modified pandas/tests/io/data/excel/test_boolean_types.xlsx
Binary file not shown.
Binary file added pandas/tests/io/data/excel/test_mi_holes.xlsx
Binary file not shown.
50 changes: 50 additions & 0 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1759,3 +1759,53 @@ def test_corrupt_files_closed(self, engine, tmp_excel):
pd.ExcelFile(tmp_excel, engine=engine)
except errors:
pass

def test_mi_header_skiprows1(self, engine, read_ext):
if engine is None and read_ext == ".xlsx":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add references to the issue as a comment in the first line of each test:

# GH#58898

with open("test_mi_holes.xlsx", "rb") as f:
expected = pd.read_excel(
f, sheet_name="expected", header=[0, 1], index_col=[0, 1]
)
Comment on lines +1765 to +1768
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of adding files to the data/ directory, is it possible to generate these sheets on-the-fly? E.g.

df = pd.DataFrame(...)
df.to_excel(tmp_excel, sheet_name="test", index=False)

tmp_excel here is a pytest fixture.


with open("test_mi_holes.xlsx", "rb") as f:
actual = pd.read_excel(
f,
sheet_name="skiprows1",
skiprows=1,
header=[0, 1],
index_col=[0, 1],
)

tm.assert_frame_equal(expected, actual)

def test_mi_header_hole(self, engine, read_ext):
if engine is None and read_ext == ".xlsx":
with open("test_mi_holes.xlsx", "rb") as f:
expected = pd.read_excel(
f, sheet_name="expected", header=[0, 1], index_col=[0, 1]
)

with open("test_mi_holes.xlsx", "rb") as f:
actual = pd.read_excel(
f,
sheet_name="header_hole",
skiprows=[1],
header=[0, 1],
index_col=[0, 1],
)

tm.assert_frame_equal(expected, actual)

def test_mi_header_and_index_holes(self, engine, read_ext):
if engine is None and read_ext == ".xlsx":
with open("test_mi_holes.xlsx", "rb") as f:
expected = pd.read_excel(
f, sheet_name="expected_index_hole", header=[0, 1], index_col=[0, 1]
)

with open("test_mi_holes.xlsx", "rb") as f:
actual = pd.read_excel(
f, sheet_name="index_hole", header=[0, 2], index_col=[0, 1]
)

tm.assert_frame_equal(expected, actual)