Skip to content

BUG: cells are missing in the excel file when exporting excel using xlsxwriter with option constant_memory set to True (#15392) #27624

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.25.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ MultiIndex
I/O
^^^

-
- Bug in :func:`DataFrame.to_excel` where cells were missing from the Excel file when exporting with the ``xlsxwriter`` engine and its ``constant_memory`` option set to ``True`` (:issue:`15392`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't need the parens after to_excel.

Mismatched parens in the :issue:.

-
-

Expand Down
84 changes: 48 additions & 36 deletions pandas/io/formats/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,7 @@ def __init__(
self.header = header
self.merge_cells = merge_cells
self.inf_rep = inf_rep
self._constant_memory = False

@property
def header_style(self):
Expand Down Expand Up @@ -466,14 +467,13 @@ def _format_header_mi(self):
coloffset = len(self.df.index[0]) - 1

if self.merge_cells:
# Format multi-index as merged cells.
for lnum in range(len(level_lengths)):
name = columns.names[lnum]
yield ExcelCell(lnum, coloffset, name, self.header_style)

for lnum, (spans, levels, level_codes) in enumerate(
zip(level_lengths, columns.levels, columns.codes)
):
# Format multi-index as merged cells.
name = columns.names[lnum]
yield ExcelCell(lnum, coloffset, name, self.header_style)

values = levels.take(level_codes)
for i in spans:
if spans[i] > 1:
Expand Down Expand Up @@ -578,23 +578,23 @@ def _format_regular_rows(self):
if isinstance(self.df.index, ABCPeriodIndex):
index_values = self.df.index.to_timestamp()

coloffset = 1
body = self._generate_body(coloffset)
_, ncol = self.df.shape
for idx, idxval in enumerate(index_values):
yield ExcelCell(self.rowcounter + idx, 0, idxval, self.header_style)

coloffset = 1
for _ in range(ncol):
yield next(body)
else:
coloffset = 0

for cell in self._generate_body(coloffset):
yield cell
for cell in self._generate_body(coloffset):
yield cell

def _format_hierarchical_rows(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if has_aliases or self.header:
self.rowcounter += 1

gcolidx = 0

if self.index:
index_labels = self.df.index.names
# check for aliases
Expand All @@ -616,13 +616,28 @@ def _format_hierarchical_rows(self):
for cidx, name in enumerate(index_labels):
yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style)

gen_non_merge_idx = (
ExcelCell(self.rowcounter + ridx, cidx, item, self.header_style)
for ridx, row in enumerate(self.df.index)
for cidx, item in enumerate(row)
)
gen_body = self._generate_body(self.df.index.nlevels)
nrow, ncol = self.df.shape
for _ in range(nrow):
if self._constant_memory or not self.merge_cells:
for _ in range(self.df.index.nlevels):
yield next(gen_non_merge_idx)
for _ in range(ncol):
yield next(gen_body)

if self.merge_cells:
# Format hierarchical rows as merged cells.
level_strs = self.df.index.format(
sparsify=True, adjoin=False, names=False
)
level_lengths = get_level_lengths(level_strs)

colidx = 0
for spans, levels, level_codes in zip(
level_lengths, self.df.index.levels, self.df.index.codes
):
Expand All @@ -635,35 +650,24 @@ def _format_hierarchical_rows(self):
if spans[i] > 1:
yield ExcelCell(
self.rowcounter + i,
gcolidx,
colidx,
values[i],
self.header_style,
self.rowcounter + i + spans[i] - 1,
gcolidx,
colidx,
)
else:
yield ExcelCell(
self.rowcounter + i,
gcolidx,
colidx,
values[i],
self.header_style,
)
gcolidx += 1

else:
# Format hierarchical rows with non-merged values.
for indexcolvals in zip(*self.df.index):
for idx, indexcolval in enumerate(indexcolvals):
yield ExcelCell(
self.rowcounter + idx,
gcolidx,
indexcolval,
self.header_style,
)
gcolidx += 1
colidx += 1

for cell in self._generate_body(gcolidx):
yield cell
else:
for cell in self._generate_body(0):
yield cell

def _generate_body(self, coloffset):
if self.styler is None:
Expand All @@ -674,13 +678,16 @@ def _generate_body(self, coloffset):
styles = None
xlstyle = None

# Write the body of the frame data series by series.
for colidx in range(len(self.columns)):
series = self.df.iloc[:, colidx]
for i, val in enumerate(series):
# Write the body of the frame data row by row.
nrow, ncol = self.df.shape
for rowidx in range(nrow):
row = self.df.iloc[rowidx, :]
for colidx, val in enumerate(row):
if styles is not None:
xlstyle = self.style_converter(";".join(styles[i, colidx]))
yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle)
xlstyle = self.style_converter(";".join(styles[rowidx, colidx]))
yield ExcelCell(
self.rowcounter + rowidx, colidx + coloffset, val, xlstyle
)

def get_formatted_cells(self):
for cell in itertools.chain(self._format_header(), self._format_body()):
Expand Down Expand Up @@ -730,6 +737,11 @@ def write(
writer = ExcelWriter(_stringify_path(writer), engine=engine)
need_save = True

from pandas.io.excel._xlsxwriter import _XlsxWriter

if isinstance(writer, _XlsxWriter) and writer.book.constant_memory:
self._constant_memory = True

formatted_cells = self.get_formatted_cells()
writer.write_cells(
formatted_cells,
Expand Down
61 changes: 59 additions & 2 deletions pandas/tests/io/excel/test_xlsxwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import pytest

from pandas import DataFrame
from pandas.util.testing import ensure_clean
from pandas import DataFrame, read_excel, MultiIndex
from pandas.util.testing import ensure_clean, assert_frame_equal

from pandas.io.excel import ExcelWriter

Expand Down Expand Up @@ -62,3 +62,60 @@ def test_write_append_mode_raises(ext):
with ensure_clean(ext) as f:
with pytest.raises(ValueError, match=msg):
ExcelWriter(f, engine="xlsxwriter", mode="a")


def test_constant_memory_regularindex(ext):
    """Round-trip a regular-index frame through xlsxwriter's constant_memory mode.

    With ``constant_memory=True`` xlsxwriter requires cells to be written
    row by row; this checks that a frame with a regular index survives
    the write/read cycle unchanged.

    Regression test for GH 15392; applicable to the xlsxwriter engine only.
    """
    expected = DataFrame({"A": [123456, 123456], "B": [123456, 123456]})
    writer_options = {"constant_memory": True}

    with ensure_clean(ext) as tmp_file:
        with ExcelWriter(
            tmp_file, engine="xlsxwriter", options=writer_options
        ) as xl_writer:
            expected.to_excel(xl_writer)

        # Read back only after the writer context has closed the workbook.
        result = read_excel(tmp_file, header=0, index_col=0)

        assert_frame_equal(expected, result)


def test_constant_memory_multiindex(ext):
    """Round-trip a MultiIndex-row frame through xlsxwriter's constant_memory mode.

    With ``constant_memory=True`` xlsxwriter requires cells to be written
    row by row; this checks that a frame whose index is a two-level
    MultiIndex survives the write/read cycle unchanged.

    Regression test for GH 15392; applicable to the xlsxwriter engine only.
    """
    expected = DataFrame({"A": [123456, 123456], "B": [123456, 123456]})
    expected.index = MultiIndex.from_arrays([["a", "a"], [1, 2]])
    writer_options = {"constant_memory": True}

    with ensure_clean(ext) as tmp_file:
        with ExcelWriter(
            tmp_file, engine="xlsxwriter", options=writer_options
        ) as xl_writer:
            expected.to_excel(xl_writer)

        # Both leading sheet columns hold index levels.
        result = read_excel(tmp_file, header=0, index_col=[0, 1])

        assert_frame_equal(expected, result)


def test_constant_memory_multiheader(ext):
    """Round-trip a MultiIndex-column frame through xlsxwriter's constant_memory mode.

    Checks that the cells of a MultiIndex header are written row by row,
    so the frame survives the write/read cycle unchanged.

    Regression test for GH 15392; applicable to the xlsxwriter engine only.
    """
    expected = DataFrame({"A": [123456, 123456], "B": [123456, 123456]})
    expected.columns = MultiIndex.from_arrays([["a", "a"], [1, 2]])
    writer_options = {"constant_memory": True}

    with ensure_clean(ext) as tmp_file:
        with ExcelWriter(
            tmp_file, engine="xlsxwriter", options=writer_options
        ) as xl_writer:
            expected.to_excel(xl_writer)

        # The first two sheet rows hold the column levels.
        result = read_excel(tmp_file, header=[0, 1], index_col=0)

        assert_frame_equal(expected, result)