-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
[BUG] read_excel: fixes handling of multi index header and other corner cases #58899
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ba8e8a1
0e47b18
6ad5db5
a23e91f
33e0d06
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -817,95 +817,122 @@ def _parse_sheet( | |
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, | ||
**kwds, | ||
): | ||
is_list_header = False | ||
is_len_one_list_header = False | ||
if is_list_like(header): | ||
assert isinstance(header, Sequence) | ||
is_list_header = True | ||
if len(header) == 1: | ||
is_len_one_list_header = True | ||
|
||
if is_len_one_list_header: | ||
header = cast(Sequence[int], header)[0] | ||
|
||
# forward fill and pull out names for MultiIndex column | ||
header_names = None | ||
if header is not None and is_list_like(header): | ||
assert isinstance(header, Sequence) | ||
|
||
header_names = [] | ||
control_row = [True] * len(data[0]) | ||
|
||
for row in header: | ||
if is_integer(skiprows): | ||
assert isinstance(skiprows, int) | ||
row += skiprows | ||
|
||
if row > len(data) - 1: | ||
raise ValueError( | ||
f"header index {row} exceeds maximum index " | ||
f"{len(data) - 1} of data.", | ||
) | ||
|
||
data[row], control_row = fill_mi_header(data[row], control_row) | ||
if callable(skiprows): | ||
# In order to avoid calling skiprows multiple times on | ||
# every row, we just do it here and keep the resulting | ||
# list for passing it down to the parser. | ||
skiprows = [ix for ix in range(len(data)) if skiprows(ix)] | ||
if len(skiprows) == 0: | ||
skiprows = None | ||
|
||
if index_col is not None: | ||
header_name, _ = pop_header_name(data[row], index_col) | ||
header_names.append(header_name) | ||
try: | ||
# header indexes reference rows after removing skiprows, so we | ||
# create an index map from the without-skiprows to the | ||
# original indexes. | ||
ixmap: range | list[int] | ||
if skiprows is None: | ||
ixmap = range(len(data)) | ||
elif is_integer(skiprows): | ||
ixmap = range(skiprows, len(data)) | ||
elif is_list_like(skiprows): | ||
skiprows_set = set(cast(Sequence[int], skiprows)) | ||
ixmap = [ix for ix in range(len(data)) if ix not in skiprows_set] | ||
else: | ||
raise ValueError("skiprows must be an integer or a list of integers") | ||
nixs = len(ixmap) | ||
|
||
# If there is a MultiIndex header and an index then there is also | ||
# a row containing just the index name(s) | ||
has_index_names = False | ||
if is_list_header and not is_len_one_list_header and index_col is not None: | ||
index_col_has_names = False | ||
index_col_set: set[int] | ||
if isinstance(index_col, int): | ||
if index_col is None: | ||
index_col_set = set() | ||
elif isinstance(index_col, str): | ||
index_col_set = set() | ||
index_col_has_names = True | ||
elif isinstance(index_col, int): | ||
index_col_set = {index_col} | ||
else: | ||
assert isinstance(index_col, Sequence) | ||
elif is_list_like(index_col): | ||
index_col_set = set(index_col) | ||
|
||
# We have to handle mi without names. If any of the entries in the data | ||
# columns are not empty, this is a regular row | ||
assert isinstance(header, Sequence) | ||
if len(header) < len(data): | ||
potential_index_names = data[len(header)] | ||
has_index_names = all( | ||
x == "" or x is None | ||
for i, x in enumerate(potential_index_names) | ||
if not control_row[i] and i not in index_col_set | ||
else: | ||
raise ValueError( | ||
"index_col must be a string, an integer or a list of integers" | ||
) | ||
has_index = len(index_col_set) > 0 | ||
has_index_names = False | ||
|
||
if is_list_like(index_col): | ||
# Forward fill values for MultiIndex index. | ||
header_list: Sequence[int] | ||
if header is None: | ||
offset = 0 | ||
header_list = [] | ||
elif isinstance(header, int): | ||
offset = 1 + header | ||
header_list = [header] | ||
elif is_list_like(header): | ||
header_list = header | ||
else: | ||
offset = 1 + max(header) | ||
raise ValueError("header must be an integer or a list of integers") | ||
|
||
# GH34673: if MultiIndex names present and not defined in the header, | ||
# offset needs to be incremented so that forward filling starts | ||
# from the first MI value instead of the name | ||
if has_index_names: | ||
offset += 1 | ||
header_names = [] | ||
|
||
# Check if we have an empty dataset | ||
# before trying to collect data. | ||
if offset < len(data): | ||
assert isinstance(index_col, Sequence) | ||
if len(header_list) == 0: | ||
offset = 0 | ||
else: | ||
max_header = max(header_list) | ||
offset = max_header + 1 | ||
|
||
for col in index_col: | ||
last = data[offset][col] | ||
if max_header >= nixs: | ||
raise ValueError( | ||
f"header index {max_header} exceeds maximum index " | ||
f"{nixs - 1} of data.", | ||
) | ||
|
||
if len(header_list) > 1: | ||
if index_col_has_names: | ||
raise ValueError( | ||
"named index_col can not be used together " | ||
"with multi-index header" | ||
) | ||
Comment on lines
+888
to
+891
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this restriction new? |
||
|
||
# Forward fill and pull out names for MultiIndex column | ||
control_row = [True] * len(data[0]) | ||
for row in header_list: | ||
row1 = ixmap[row] | ||
data[row1], control_row = fill_mi_header( | ||
data[row1], control_row | ||
) | ||
|
||
if has_index: | ||
header_name, _ = pop_header_name( | ||
data[row1], sorted(index_col_set) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why sort? |
||
) | ||
if header_name: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is the case of `header_name` being None hit by tests? |
||
header_names.append(header_name) | ||
|
||
# If there is a MultiIndex header and an index then | ||
# there may also be a row containing just the index | ||
# name(s) | ||
if has_index and offset < nixs: | ||
# We have to handle mi without names. If any | ||
# of the entries in the data columns are not | ||
# empty, this is a regular row. | ||
|
||
potential_index_names = data[ixmap[offset]] | ||
has_index_names = all( | ||
x == "" or x is None | ||
for i, x in enumerate(potential_index_names) | ||
if not control_row[i] and i not in index_col_set | ||
) | ||
if has_index_names: | ||
offset += 1 | ||
|
||
for row in range(offset + 1, len(data)): | ||
if data[row][col] == "" or data[row][col] is None: | ||
data[row][col] = last | ||
# Forward fill index columns: | ||
# TODO: forward fill also when index columns are selected by name!!! | ||
if has_index and offset < nixs: | ||
for col in index_col_set: | ||
last = data[ixmap[offset]][col] | ||
for row1 in ixmap[offset + 1 :]: | ||
if data[row1][col] == "" or data[row1][col] is None: | ||
data[row1][col] = last | ||
else: | ||
last = data[row][col] | ||
last = data[row1][col] | ||
|
||
# GH 12292 : error when read one empty column from excel file | ||
try: | ||
parser = TextParser( | ||
data, | ||
names=names, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1759,3 +1759,53 @@ def test_corrupt_files_closed(self, engine, tmp_excel): | |
pd.ExcelFile(tmp_excel, engine=engine) | ||
except errors: | ||
pass | ||
|
||
def test_mi_header_skiprows1(self, engine, read_ext): | ||
if engine is None and read_ext == ".xlsx": | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a reference to the issue as a comment on the first line of each test, e.g. `# GH#<issue number>`?
|
||
with open("test_mi_holes.xlsx", "rb") as f: | ||
expected = pd.read_excel( | ||
f, sheet_name="expected", header=[0, 1], index_col=[0, 1] | ||
) | ||
Comment on lines
+1765
to
+1768
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Instead of adding files to the data/ directory, is it possible to generate these sheets on the fly? E.g.:
df = pd.DataFrame(...)
df.to_excel(tmp_excel, sheet_name="test", index=False)
|
||
|
||
with open("test_mi_holes.xlsx", "rb") as f: | ||
actual = pd.read_excel( | ||
f, | ||
sheet_name="skiprows1", | ||
skiprows=1, | ||
header=[0, 1], | ||
index_col=[0, 1], | ||
) | ||
|
||
tm.assert_frame_equal(expected, actual) | ||
|
||
def test_mi_header_hole(self, engine, read_ext): | ||
if engine is None and read_ext == ".xlsx": | ||
with open("test_mi_holes.xlsx", "rb") as f: | ||
expected = pd.read_excel( | ||
f, sheet_name="expected", header=[0, 1], index_col=[0, 1] | ||
) | ||
|
||
with open("test_mi_holes.xlsx", "rb") as f: | ||
actual = pd.read_excel( | ||
f, | ||
sheet_name="header_hole", | ||
skiprows=[1], | ||
header=[0, 1], | ||
index_col=[0, 1], | ||
) | ||
|
||
tm.assert_frame_equal(expected, actual) | ||
|
||
def test_mi_header_and_index_holes(self, engine, read_ext): | ||
if engine is None and read_ext == ".xlsx": | ||
with open("test_mi_holes.xlsx", "rb") as f: | ||
expected = pd.read_excel( | ||
f, sheet_name="expected_index_hole", header=[0, 1], index_col=[0, 1] | ||
) | ||
|
||
with open("test_mi_holes.xlsx", "rb") as f: | ||
actual = pd.read_excel( | ||
f, sheet_name="index_hole", header=[0, 2], index_col=[0, 1] | ||
) | ||
|
||
tm.assert_frame_equal(expected, actual) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is only used in one place below; can you instead replace it with
`isinstance(index_col, str)`
to simplify the code?