diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
index 4ce2ce5b69cb4..1a99c53988b43 100644
--- a/doc/source/whatsnew/v0.18.0.txt
+++ b/doc/source/whatsnew/v0.18.0.txt
@@ -303,6 +303,9 @@ Other API Changes
 
 - ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`)
 
+- ``DataFrame.to_latex()`` now supports non-ascii encodings (e.g. utf-8) in Python 2 with the parameter ``encoding`` (:issue:`7061`)
+
+
 Changes to eval
 ^^^^^^^^^^^^^^^
 
diff --git a/pandas/core/format.py b/pandas/core/format.py
index 86d39c139fb51..a50edd9462431 100644
--- a/pandas/core/format.py
+++ b/pandas/core/format.py
@@ -619,105 +619,24 @@ def _join_multiline(self, *strcols):
             st = ed
         return '\n\n'.join(str_lst)
 
-    def to_latex(self, column_format=None, longtable=False):
+    def to_latex(self, column_format=None, longtable=False, encoding=None):
         """
         Render a DataFrame to a LaTeX tabular/longtable environment output.
         """
-        self.escape = self.kwds.get('escape', True)
 
-        def get_col_type(dtype):
-            if issubclass(dtype.type, np.number):
-                return 'r'
-            else:
-                return 'l'
-
-        frame = self.frame
-
-        if len(frame.columns) == 0 or len(frame.index) == 0:
-            info_line = (u('Empty %s\nColumns: %s\nIndex: %s')
-                         % (type(self.frame).__name__,
-                            frame.columns, frame.index))
-            strcols = [[info_line]]
-        else:
-            strcols = self._to_str_columns()
-
-        if self.index and isinstance(self.frame.index, MultiIndex):
-            clevels = self.frame.columns.nlevels
-            strcols.pop(0)
-            name = any(self.frame.index.names)
-            for i, lev in enumerate(self.frame.index.levels):
-                lev2 = lev.format()
-                blank = ' ' * len(lev2[0])
-                lev3 = [blank] * clevels
-                if name:
-                    lev3.append(lev.name)
-                for level_idx, group in itertools.groupby(
-                        self.frame.index.labels[i]):
-                    count = len(list(group))
-                    lev3.extend([lev2[level_idx]] + [blank] * (count - 1))
-                strcols.insert(i, lev3)
-
-        if column_format is None:
-            dtypes = self.frame.dtypes._values
-            column_format = ''.join(map(get_col_type, dtypes))
-            if self.index:
-                index_format = 'l' * self.frame.index.nlevels
-                column_format = index_format + column_format
-        elif not isinstance(column_format,
-                            compat.string_types):  # pragma: no cover
-            raise AssertionError('column_format must be str or unicode, not %s'
-                                 % type(column_format))
-
-        def write(buf, frame, column_format, strcols, longtable=False):
-            if not longtable:
-                buf.write('\\begin{tabular}{%s}\n' % column_format)
-                buf.write('\\toprule\n')
-            else:
-                buf.write('\\begin{longtable}{%s}\n' % column_format)
-                buf.write('\\toprule\n')
-
-            nlevels = frame.columns.nlevels
-            if any(frame.index.names):
-                nlevels += 1
-            for i, row in enumerate(zip(*strcols)):
-                if i == nlevels and self.header:
-                    buf.write('\\midrule\n')  # End of header
-                    if longtable:
-                        buf.write('\\endhead\n')
-                        buf.write('\\midrule\n')
-                        buf.write('\\multicolumn{3}{r}{{Continued on next '
-                                  'page}} \\\\\n')
-                        buf.write('\midrule\n')
-                        buf.write('\endfoot\n\n')
-                        buf.write('\\bottomrule\n')
-                        buf.write('\\endlastfoot\n')
-                if self.escape:
-                    crow = [(x.replace('\\', '\\textbackslash')  # escape backslashes first
-                             .replace('_', '\\_')
-                             .replace('%', '\\%')
-                             .replace('$', '\\$')
-                             .replace('#', '\\#')
-                             .replace('{', '\\{')
-                             .replace('}', '\\}')
-                             .replace('~', '\\textasciitilde')
-                             .replace('^', '\\textasciicircum')
-                             .replace('&', '\\&') if x else '{}') for x in row]
-                else:
-                    crow = [x if x else '{}' for x in row]
-                buf.write(' & '.join(crow))
-                buf.write(' \\\\\n')
-
-            if not longtable:
-                buf.write('\\bottomrule\n')
-                buf.write('\\end{tabular}\n')
-            else:
-                buf.write('\\end{longtable}\n')
+        latex_renderer = LatexFormatter(self, column_format=column_format,
+                                        longtable=longtable)
+
+        if encoding is None and compat.PY3:
+            # honour the documented Python 3 default, not the locale's
+            encoding = 'utf-8'
 
         if hasattr(self.buf, 'write'):
-            write(self.buf, frame, column_format, strcols, longtable)
+            latex_renderer.write_result(self.buf)
         elif isinstance(self.buf, compat.string_types):
-            with open(self.buf, 'w') as f:
-                write(f, frame, column_format, strcols, longtable)
+            import codecs
+            with codecs.open(self.buf, 'w', encoding=encoding) as f:
+                latex_renderer.write_result(f)
         else:
             raise TypeError('buf is not a file name and it has no write '
                             'method')
@@ -851,6 +770,124 @@ def _get_column_name_list(self):
         return names
 
 
+class LatexFormatter(TableFormatter):
+    """ Used to render a DataFrame to a LaTeX tabular/longtable environment
+    output.
+
+    Parameters
+    ----------
+    formatter : `DataFrameFormatter`
+    column_format : str, default None
+        The columns format as specified in `LaTeX table format
+        <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g 'rcl' for 3 columns
+    longtable : boolean, default False
+        Use a longtable environment instead of tabular.
+
+    See also
+    --------
+    HTMLFormatter
+    """
+
+    def __init__(self, formatter, column_format=None, longtable=False):
+        self.fmt = formatter
+        self.frame = self.fmt.frame
+        self.column_format = column_format
+        self.longtable = longtable
+
+    def write_result(self, buf):
+        """
+        Render a DataFrame to a LaTeX tabular/longtable environment output.
+        """
+
+        # string representation of the columns
+        if len(self.frame.columns) == 0 or len(self.frame.index) == 0:
+            info_line = (u('Empty %s\nColumns: %s\nIndex: %s')
+                         % (type(self.frame).__name__,
+                            self.frame.columns, self.frame.index))
+            strcols = [[info_line]]
+        else:
+            strcols = self.fmt._to_str_columns()
+
+        def get_col_type(dtype):
+            if issubclass(dtype.type, np.number):
+                return 'r'
+            else:
+                return 'l'
+
+        if self.fmt.index and isinstance(self.frame.index, MultiIndex):
+            clevels = self.frame.columns.nlevels
+            strcols.pop(0)
+            name = any(self.frame.index.names)
+            for i, lev in enumerate(self.frame.index.levels):
+                lev2 = lev.format()
+                blank = ' ' * len(lev2[0])
+                lev3 = [blank] * clevels
+                if name:
+                    lev3.append(lev.name)
+                for level_idx, group in itertools.groupby(
+                        self.frame.index.labels[i]):
+                    count = len(list(group))
+                    lev3.extend([lev2[level_idx]] + [blank] * (count - 1))
+                strcols.insert(i, lev3)
+
+        column_format = self.column_format
+        if column_format is None:
+            dtypes = self.frame.dtypes._values
+            column_format = ''.join(map(get_col_type, dtypes))
+            if self.fmt.index:
+                index_format = 'l' * self.frame.index.nlevels
+                column_format = index_format + column_format
+        elif not isinstance(column_format,
+                            compat.string_types):  # pragma: no cover
+            raise AssertionError('column_format must be str or unicode, not %s'
+                                 % type(column_format))
+
+        if not self.longtable:
+            buf.write('\\begin{tabular}{%s}\n' % column_format)
+            buf.write('\\toprule\n')
+        else:
+            buf.write('\\begin{longtable}{%s}\n' % column_format)
+            buf.write('\\toprule\n')
+
+        nlevels = self.frame.columns.nlevels
+        if any(self.frame.index.names):
+            nlevels += 1
+        for i, row in enumerate(zip(*strcols)):
+            if i == nlevels and self.fmt.header:
+                buf.write('\\midrule\n')  # End of header
+                if self.longtable:
+                    buf.write('\\endhead\n')
+                    buf.write('\\midrule\n')
+                    buf.write('\\multicolumn{%d}{r}{{Continued on next '
+                              'page}} \\\\\n' % len(row))
+                    buf.write('\\midrule\n')
+                    buf.write('\\endfoot\n\n')
+                    buf.write('\\bottomrule\n')
+                    buf.write('\\endlastfoot\n')
+            if self.fmt.kwds.get('escape', True):
+                # escape backslashes first
+                crow = [(x.replace('\\', '\\textbackslash')
+                         .replace('_', '\\_')
+                         .replace('%', '\\%')
+                         .replace('$', '\\$')
+                         .replace('#', '\\#')
+                         .replace('{', '\\{')
+                         .replace('}', '\\}')
+                         .replace('~', '\\textasciitilde')
+                         .replace('^', '\\textasciicircum')
+                         .replace('&', '\\&') if x else '{}') for x in row]
+            else:
+                crow = [x if x else '{}' for x in row]
+            buf.write(' & '.join(crow))
+            buf.write(' \\\\\n')
+
+        if not self.longtable:
+            buf.write('\\bottomrule\n')
+            buf.write('\\end{tabular}\n')
+        else:
+            buf.write('\\end{longtable}\n')
+
+
 class HTMLFormatter(TableFormatter):
 
     indent_delta = 2
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 7220b25daf318..b27c4268796dd 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1547,7 +1547,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None,
                  header=True, index=True, na_rep='NaN', formatters=None,
                  float_format=None, sparsify=None, index_names=True,
                  bold_rows=True, column_format=None,
-                 longtable=None, escape=None):
+                 longtable=None, escape=None, encoding=None):
         """
         Render a DataFrame to a tabular environment table. You can splice
         this into a LaTeX document. Requires \\usepackage{booktabs}.
@@ -1567,7 +1567,8 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None,
             default: True
             When set to False prevents from escaping latex special
             characters in column names.
-
+        encoding : str, default None
+            Default encoding is ascii in Python 2 and utf-8 in Python 3
         """
 
         if colSpace is not None:  # pragma: no cover
@@ -1589,7 +1590,8 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None,
                                            sparsify=sparsify,
                                            index_names=index_names,
                                            escape=escape)
-        formatter.to_latex(column_format=column_format, longtable=longtable)
+        formatter.to_latex(column_format=column_format, longtable=longtable,
+                           encoding=encoding)
 
         if buf is None:
             return formatter.buf.getvalue()
diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py
index 4d17610d87bea..a73b459459321 100644
--- a/pandas/tests/test_format.py
+++ b/pandas/tests/test_format.py
@@ -15,6 +15,8 @@
 from numpy.random import randn
 import numpy as np
 
+import codecs
+
 div_style = ''
 try:
     import IPython
@@ -2554,6 +2556,24 @@ def test_to_latex_filename(self):
             with open(path, 'r') as f:
                 self.assertEqual(self.frame.to_latex(), f.read())
 
+        # test with utf-8 and encoding option (GH 7061)
+        df = DataFrame([[u'au\xdfgangen']])
+        with tm.ensure_clean('test.tex') as path:
+            df.to_latex(path, encoding='utf-8')
+            with codecs.open(path, 'r', encoding='utf-8') as f:
+                self.assertEqual(df.to_latex(), f.read())
+
+        # test with utf-8 without encoding option
+        if compat.PY3:  # python3 default encoding is utf-8
+            with tm.ensure_clean('test.tex') as path:
+                df.to_latex(path)
+                with codecs.open(path, 'r', encoding='utf-8') as f:
+                    self.assertEqual(df.to_latex(), f.read())
+        else:
+            # python2 default encoding is ascii, so an error should be raised
+            with tm.ensure_clean('test.tex') as path:
+                self.assertRaises(UnicodeEncodeError, df.to_latex, path)
+
     def test_to_latex(self):
         # it works!
         self.frame.to_latex()