Skip to content

ENH encoding parameter for to_latex #11914

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.18.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,9 @@ Other API Changes

- ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`)

- ``DataFrame.to_latex()`` now supports non-ascii encodings (e.g. utf-8) in Python 2 with the parameter ``encoding`` (:issue:`7061`)


Changes to eval
^^^^^^^^^^^^^^^

Expand Down
217 changes: 125 additions & 92 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,105 +619,20 @@ def _join_multiline(self, *strcols):
st = ed
return '\n\n'.join(str_lst)

def to_latex(self, column_format=None, longtable=False):
def to_latex(self, column_format=None, longtable=False, encoding=None):
"""
Render a DataFrame to a LaTeX tabular/longtable environment output.
"""
self.escape = self.kwds.get('escape', True)

def get_col_type(dtype):
if issubclass(dtype.type, np.number):
return 'r'
else:
return 'l'

frame = self.frame

if len(frame.columns) == 0 or len(frame.index) == 0:
info_line = (u('Empty %s\nColumns: %s\nIndex: %s')
% (type(self.frame).__name__,
frame.columns, frame.index))
strcols = [[info_line]]
else:
strcols = self._to_str_columns()

if self.index and isinstance(self.frame.index, MultiIndex):
clevels = self.frame.columns.nlevels
strcols.pop(0)
name = any(self.frame.index.names)
for i, lev in enumerate(self.frame.index.levels):
lev2 = lev.format()
blank = ' ' * len(lev2[0])
lev3 = [blank] * clevels
if name:
lev3.append(lev.name)
for level_idx, group in itertools.groupby(
self.frame.index.labels[i]):
count = len(list(group))
lev3.extend([lev2[level_idx]] + [blank] * (count - 1))
strcols.insert(i, lev3)

if column_format is None:
dtypes = self.frame.dtypes._values
column_format = ''.join(map(get_col_type, dtypes))
if self.index:
index_format = 'l' * self.frame.index.nlevels
column_format = index_format + column_format
elif not isinstance(column_format,
compat.string_types): # pragma: no cover
raise AssertionError('column_format must be str or unicode, not %s'
% type(column_format))

def write(buf, frame, column_format, strcols, longtable=False):
if not longtable:
buf.write('\\begin{tabular}{%s}\n' % column_format)
buf.write('\\toprule\n')
else:
buf.write('\\begin{longtable}{%s}\n' % column_format)
buf.write('\\toprule\n')

nlevels = frame.columns.nlevels
if any(frame.index.names):
nlevels += 1
for i, row in enumerate(zip(*strcols)):
if i == nlevels and self.header:
buf.write('\\midrule\n') # End of header
if longtable:
buf.write('\\endhead\n')
buf.write('\\midrule\n')
buf.write('\\multicolumn{3}{r}{{Continued on next '
'page}} \\\\\n')
buf.write('\midrule\n')
buf.write('\endfoot\n\n')
buf.write('\\bottomrule\n')
buf.write('\\endlastfoot\n')
if self.escape:
crow = [(x.replace('\\', '\\textbackslash') # escape backslashes first
.replace('_', '\\_')
.replace('%', '\\%')
.replace('$', '\\$')
.replace('#', '\\#')
.replace('{', '\\{')
.replace('}', '\\}')
.replace('~', '\\textasciitilde')
.replace('^', '\\textasciicircum')
.replace('&', '\\&') if x else '{}') for x in row]
else:
crow = [x if x else '{}' for x in row]
buf.write(' & '.join(crow))
buf.write(' \\\\\n')

if not longtable:
buf.write('\\bottomrule\n')
buf.write('\\end{tabular}\n')
else:
buf.write('\\end{longtable}\n')
latex_renderer = LatexFormatter(self, column_format=column_format,
longtable=longtable)

if hasattr(self.buf, 'write'):
write(self.buf, frame, column_format, strcols, longtable)
latex_renderer.write_result(self.buf)
elif isinstance(self.buf, compat.string_types):
with open(self.buf, 'w') as f:
write(f, frame, column_format, strcols, longtable)
import codecs
with codecs.open(self.buf, 'w', encoding=encoding) as f:
latex_renderer.write_result(f)
else:
raise TypeError('buf is not a file name and it has no write '
'method')
Expand Down Expand Up @@ -851,6 +766,124 @@ def _get_column_name_list(self):
return names


class LatexFormatter(TableFormatter):
    """ Used to render a DataFrame to a LaTeX tabular/longtable environment
    output.

    Parameters
    ----------
    formatter : `DataFrameFormatter`
    column_format : str, default None
        The columns format as specified in `LaTeX table format
        <https://en.wikibooks.org/wiki/LaTeX/Tables>`__
        e.g. 'rcl' for 3 columns
    longtable : boolean, default False
        Use a longtable environment instead of tabular.

    See also
    --------
    HTMLFormatter
    """

    def __init__(self, formatter, column_format=None, longtable=False):
        self.fmt = formatter
        self.frame = self.fmt.frame
        self.column_format = column_format
        self.longtable = longtable

    def write_result(self, buf):
        """
        Render a DataFrame to a LaTeX tabular/longtable environment output.

        Parameters
        ----------
        buf : object with a ``write`` method
            Receives the generated LaTeX source piece by piece.

        Raises
        ------
        AssertionError
            If ``column_format`` was supplied but is not a string.
        """
        frame = self.frame
        fmt = self.fmt

        # string representation of the columns
        if len(frame.columns) == 0 or len(frame.index) == 0:
            info_line = (u('Empty %s\nColumns: %s\nIndex: %s')
                         % (type(frame).__name__,
                            frame.columns, frame.index))
            strcols = [[info_line]]
        else:
            strcols = fmt._to_str_columns()

        if fmt.index and isinstance(frame.index, MultiIndex):
            # Replace the single pre-rendered index column with one column
            # per index level, blanking out repeated consecutive labels.
            clevels = frame.columns.nlevels
            strcols.pop(0)
            name = any(frame.index.names)
            for i, lev in enumerate(frame.index.levels):
                lev2 = lev.format()
                blank = ' ' * len(lev2[0])
                # one blank cell per header row of the columns
                lev3 = [blank] * clevels
                if name:
                    lev3.append(lev.name)
                for level_idx, group in itertools.groupby(
                        frame.index.labels[i]):
                    count = len(list(group))
                    lev3.extend([lev2[level_idx]] + [blank] * (count - 1))
                strcols.insert(i, lev3)

        column_format = self._get_column_format()

        if self.longtable:
            buf.write('\\begin{longtable}{%s}\n' % column_format)
        else:
            buf.write('\\begin{tabular}{%s}\n' % column_format)
        buf.write('\\toprule\n')

        # Number of leading rows that make up the header: one per column
        # level, plus one more when the index has names.
        nlevels = frame.columns.nlevels
        if any(frame.index.names):
            nlevels += 1

        # The option does not change between rows, so look it up once.
        escape = fmt.kwds.get('escape', True)

        for i, row in enumerate(zip(*strcols)):
            if i == nlevels and fmt.header:
                buf.write('\\midrule\n')  # End of header
                if self.longtable:
                    buf.write('\\endhead\n')
                    buf.write('\\midrule\n')
                    # Span the real table width; a fixed span of 3 is
                    # wrong for any table without exactly three columns.
                    buf.write('\\multicolumn{%d}{r}{{Continued on next '
                              'page}} \\\\\n' % len(row))
                    buf.write('\\midrule\n')
                    buf.write('\\endfoot\n\n')
                    buf.write('\\bottomrule\n')
                    buf.write('\\endlastfoot\n')
            # Empty cells are written as '{}' in both modes.
            if escape:
                crow = [self._escape_cell(x) if x else '{}' for x in row]
            else:
                crow = [x if x else '{}' for x in row]
            buf.write(' & '.join(crow))
            buf.write(' \\\\\n')

        if not self.longtable:
            buf.write('\\bottomrule\n')
            buf.write('\\end{tabular}\n')
        else:
            buf.write('\\end{longtable}\n')

    def _get_column_format(self):
        """
        Return the LaTeX column specification for the table.

        A user-supplied ``column_format`` is returned unchanged after a
        type check. Otherwise one letter is generated per data column
        ('r' for numeric dtypes, 'l' for everything else), prefixed with
        one 'l' per index level when the index is rendered.
        """
        column_format = self.column_format
        if column_format is None:
            def get_col_type(dtype):
                if issubclass(dtype.type, np.number):
                    return 'r'
                return 'l'

            dtypes = self.frame.dtypes._values
            column_format = ''.join(map(get_col_type, dtypes))
            if self.fmt.index:
                index_format = 'l' * self.frame.index.nlevels
                column_format = index_format + column_format
        elif not isinstance(column_format,
                            compat.string_types):  # pragma: no cover
            raise AssertionError('column_format must be str or unicode, '
                                 'not %s' % type(column_format))
        return column_format

    @staticmethod
    def _escape_cell(text):
        """
        Return ``text`` with LaTeX special characters escaped.
        """
        # Backslashes must be handled first, otherwise the backslashes
        # introduced by the later replacements would be escaped again.
        # NOTE(review): '\textbackslash', '\textasciitilde' and
        # '\textasciicircum' are emitted without a terminator, so a letter
        # directly after them merges into the control word -- confirm
        # whether existing output is relied upon before changing this.
        return (text.replace('\\', '\\textbackslash')
                .replace('_', '\\_')
                .replace('%', '\\%')
                .replace('$', '\\$')
                .replace('#', '\\#')
                .replace('{', '\\{')
                .replace('}', '\\}')
                .replace('~', '\\textasciitilde')
                .replace('^', '\\textasciicircum')
                .replace('&', '\\&'))


class HTMLFormatter(TableFormatter):

indent_delta = 2
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1547,7 +1547,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None,
header=True, index=True, na_rep='NaN', formatters=None,
float_format=None, sparsify=None, index_names=True,
bold_rows=True, column_format=None,
longtable=None, escape=None):
longtable=None, escape=None, encoding=None):
"""
Render a DataFrame to a tabular environment table. You can splice
this into a LaTeX document. Requires \\usepackage{booktabs}.
Expand All @@ -1567,7 +1567,8 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None,
default: True
When set to False prevents from escaping latex special
characters in column names.

encoding : str, default None
Default encoding is ascii in Python 2 and utf-8 in Python 3
"""

if colSpace is not None: # pragma: no cover
Expand All @@ -1589,7 +1590,8 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None,
sparsify=sparsify,
index_names=index_names,
escape=escape)
formatter.to_latex(column_format=column_format, longtable=longtable)
formatter.to_latex(column_format=column_format, longtable=longtable,
encoding=encoding)

if buf is None:
return formatter.buf.getvalue()
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from numpy.random import randn
import numpy as np

import codecs

div_style = ''
try:
import IPython
Expand Down Expand Up @@ -2554,6 +2556,24 @@ def test_to_latex_filename(self):
with open(path, 'r') as f:
self.assertEqual(self.frame.to_latex(), f.read())

# test with utf-8 and encoding option (GH 7061)
df = DataFrame([[u'au\xdfgangen']])
with tm.ensure_clean('test.tex') as path:
df.to_latex(path, encoding='utf-8')
with codecs.open(path, 'r', encoding='utf-8') as f:
self.assertEqual(df.to_latex(), f.read())

# test with utf-8 without encoding option
if compat.PY3: # python3 default encoding is utf-8
with tm.ensure_clean('test.tex') as path:
df.to_latex(path)
with codecs.open(path, 'r') as f:
self.assertEqual(df.to_latex(), f.read())
else:
# python2 default encoding is ascii, so an error should be raised
with tm.ensure_clean('test.tex') as path:
self.assertRaises(UnicodeEncodeError, df.to_latex, path)

def test_to_latex(self):
# it works!
self.frame.to_latex()
Expand Down