ENH: Added multicolumn/multirow support for latex

sgsaenger · sgsaenger · commit ced00bc4728b · 2017-03-02T14:17:34.000+01:00
- [X] closes pandas-dev#13508 - [X] tests added / passed - [X] passes `git diff upstream/master | flake8 --diff` - [X] whatsnew entry. Print the names of MultiIndex columns. Added "multicolumn" and "multirow" flags to `to_latex`, which trigger the corresponding features. "multicolumn_format" is used to select the alignment. Multirow adds clines to visually separate sections.
diff --git a/doc/source/options.rst b/doc/source/options.rst
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -181,6 +181,7 @@ Other enhancements
 - ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. See the :ref:`Timedelta docs <timedeltas.isoformat>` (:issue:`15136`)
 - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
 - ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`)
+- The ``.to_latex()`` method will now accept ``multicolumn`` and ``multirow`` arguments to use the accompanying LaTeX enhancements (:issue:`13508`)
 - ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`)
 - ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`).
 - ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`).
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -239,14 +239,35 @@
 : bool
     This specifies if the to_latex method of a Dataframe uses escapes special
     characters.
-    method. Valid values: False,True
+    Valid values: False,True
 """
 
 pc_latex_longtable = """
 :bool
     This specifies if the to_latex method of a Dataframe uses the longtable
     format.
-    method. Valid values: False,True
+    Valid values: False,True
+"""
+
+pc_latex_multicolumn = """
+: bool
+    This specifies if the to_latex method of a Dataframe uses multicolumns
+    to pretty-print MultiIndex columns.
+    Valid values: False,True
+"""
+
+pc_latex_multicolumn_format = """
+: string
+    This specifies the format for multicolumn headers.
+    Can be surrounded with '|'.
+    Valid values: 'l', 'c', 'r', 'p{<width>}'
+"""
+
+pc_latex_multirow = """
+: bool
+    This specifies if the to_latex method of a Dataframe uses multirows
+    to pretty-print MultiIndex rows.
+    Valid values: False,True
 """
 
 style_backup = dict()
@@ -339,6 +360,12 @@ def mpl_style_cb(key):
                        validator=is_bool)
     cf.register_option('latex.longtable', False, pc_latex_longtable,
                        validator=is_bool)
+    cf.register_option('latex.multicolumn', True, pc_latex_multicolumn,
+                       validator=is_bool)
+    cf.register_option('latex.multicolumn_format', 'l',
+                       pc_latex_multicolumn_format, validator=is_text)
+    cf.register_option('latex.multirow', False, pc_latex_multirow,
+                       validator=is_bool)
 
 cf.deprecate_option('display.line_width',
                     msg=pc_line_width_deprecation_warning,
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1597,10 +1597,11 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True,
                  index=True, na_rep='NaN', formatters=None, float_format=None,
                  sparsify=None, index_names=True, bold_rows=True,
                  column_format=None, longtable=None, escape=None,
-                 encoding=None, decimal='.'):
-        """
+                 encoding=None, decimal='.', multicolumn=None,
+                 multicolumn_format=None, multirow=None):
+        r"""
         Render a DataFrame to a tabular environment table. You can splice
-        this into a LaTeX document. Requires \\usepackage{booktabs}.
+        this into a LaTeX document. Requires \usepackage{booktabs}.
 
         `to_latex`-specific options:
 
@@ -1611,27 +1612,54 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True,
             <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g 'rcl' for 3
             columns
         longtable : boolean, default will be read from the pandas config module
-            default: False
+            Default: False.
             Use a longtable environment instead of tabular. Requires adding
-            a \\usepackage{longtable} to your LaTeX preamble.
+            a \usepackage{longtable} to your LaTeX preamble.
         escape : boolean, default will be read from the pandas config module
-            default: True
+            Default: True.
             When set to False prevents from escaping latex special
             characters in column names.
         encoding : str, default None
             A string representing the encoding to use in the output file,
             defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
         decimal : string, default '.'
-            Character recognized as decimal separator, e.g. ',' in Europe
+            Character recognized as decimal separator, e.g. ',' in Europe.
 
             .. versionadded:: 0.18.0
 
+        multicolumn : boolean, default True
+            Use \multicolumn to enhance MultiIndex columns.
+            The default will be read from the config module.
+
+            .. versionadded:: 0.20.0
+
+        multicolumn_format : str, default 'l'
+            The alignment for multicolumns, similar to `column_format`.
+            The default will be read from the config module.
+
+            .. versionadded:: 0.20.0
+
+        multirow : boolean, default False
+            Use \multirow to enhance MultiIndex rows.
+            Requires adding a \usepackage{multirow} to your LaTeX preamble.
+            Will print centered labels (instead of top-aligned)
+            across the contained rows, separating groups via clines.
+            The default will be read from the pandas config module.
+
+            .. versionadded:: 0.20.0
+
         """
         # Get defaults from the pandas config
         if longtable is None:
             longtable = get_option("display.latex.longtable")
         if escape is None:
             escape = get_option("display.latex.escape")
+        if multicolumn is None:
+            multicolumn = get_option("display.latex.multicolumn")
+        if multicolumn_format is None:
+            multicolumn_format = get_option("display.latex.multicolumn_format")
+        if multirow is None:
+            multirow = get_option("display.latex.multirow")
 
         formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
                                            col_space=col_space, na_rep=na_rep,
@@ -1643,7 +1671,9 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True,
                                            index_names=index_names,
                                            escape=escape, decimal=decimal)
         formatter.to_latex(column_format=column_format, longtable=longtable,
-                           encoding=encoding)
+                           encoding=encoding, multicolumn=multicolumn,
+                           multicolumn_format=multicolumn_format,
+                           multirow=multirow)
 
         if buf is None:
             return formatter.buf.getvalue()
diff --git a/pandas/formats/format.py b/pandas/formats/format.py
@@ -650,13 +650,17 @@ def _join_multiline(self, *strcols):
             st = ed
         return '\n\n'.join(str_lst)
 
-    def to_latex(self, column_format=None, longtable=False, encoding=None):
+    def to_latex(self, column_format=None, longtable=False, encoding=None,
+                 multicolumn=False, multicolumn_format=None, multirow=False):
         """
         Render a DataFrame to a LaTeX tabular/longtable environment output.
         """
 
         latex_renderer = LatexFormatter(self, column_format=column_format,
-                                        longtable=longtable)
+                                        longtable=longtable,
+                                        multicolumn=multicolumn,
+                                        multicolumn_format=multicolumn_format,
+                                        multirow=multirow)
 
         if encoding is None:
             encoding = 'ascii' if compat.PY2 else 'utf-8'
@@ -824,11 +828,15 @@ class LatexFormatter(TableFormatter):
     HTMLFormatter
     """
 
-    def __init__(self, formatter, column_format=None, longtable=False):
+    def __init__(self, formatter, column_format=None, longtable=False,
+                 multicolumn=False, multicolumn_format=None, multirow=False):
         self.fmt = formatter
         self.frame = self.fmt.frame
         self.column_format = column_format
         self.longtable = longtable
+        self.multicolumn = multicolumn
+        self.multicolumn_format = multicolumn_format
+        self.multirow = multirow
 
     def write_result(self, buf):
         """
@@ -850,14 +858,21 @@ def get_col_type(dtype):
             else:
                 return 'l'
 
+        # reestablish the MultiIndex that has been joined by _to_str_column
         if self.fmt.index and isinstance(self.frame.index, MultiIndex):
             clevels = self.frame.columns.nlevels
             strcols.pop(0)
             name = any(self.frame.index.names)
+            cname = any(self.frame.columns.names)
+            lastcol = self.frame.index.nlevels - 1
             for i, lev in enumerate(self.frame.index.levels):
                 lev2 = lev.format()
                 blank = ' ' * len(lev2[0])
-                lev3 = [blank] * clevels
+                # display column names in last index-column
+                if cname and i == lastcol:
+                    lev3 = [x if x else '{}' for x in self.frame.columns.names]
+                else:
+                    lev3 = [blank] * clevels
                 if name:
                     lev3.append(lev.name)
                 for level_idx, group in itertools.groupby(
@@ -885,10 +900,15 @@ def get_col_type(dtype):
             buf.write('\\begin{longtable}{%s}\n' % column_format)
             buf.write('\\toprule\n')
 
-        nlevels = self.frame.columns.nlevels
+        ilevels = self.frame.index.nlevels
+        clevels = self.frame.columns.nlevels
+        nlevels = clevels
         if any(self.frame.index.names):
             nlevels += 1
-        for i, row in enumerate(zip(*strcols)):
+        strrows = list(zip(*strcols))
+        self.clinebuf = []
+
+        for i, row in enumerate(strrows):
             if i == nlevels and self.fmt.header:
                 buf.write('\\midrule\n')  # End of header
                 if self.longtable:
@@ -910,15 +930,94 @@ def get_col_type(dtype):
                          if x else '{}') for x in row]
             else:
                 crow = [x if x else '{}' for x in row]
+            if i < clevels and self.fmt.header and self.multicolumn:
+                # sum up columns to multicolumns
+                crow = self._format_multicolumn(crow, ilevels)
+            if i >= nlevels and self.fmt.index and self.multirow and\
+                    ilevels > 1:
+                # sum up rows to multirows
+                crow = self._format_multirow(crow, ilevels, i, strrows)
             buf.write(' & '.join(crow))
             buf.write(' \\\\\n')
+            if self.multirow and i < len(strrows) - 1:
+                self._print_cline(buf, i, len(strcols))
 
         if not self.longtable:
             buf.write('\\bottomrule\n')
             buf.write('\\end{tabular}\n')
         else:
             buf.write('\\end{longtable}\n')
 
+    def _format_multicolumn(self, row, ilevels):
+        r"""
+        Merge each header label with the blank cells that follow it into
+        one \multicolumn entry aligned according to self.multicolumn_format
+
+        e.g.:
+        a &  &  & b & c &
+        will become
+        \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c}
+        """
+        row2 = list(row[:ilevels])  # index cells are copied unchanged
+        ncol = 1
+        coltext = ''
+
+        def append_col():
+            # write multicolumn if needed
+            if ncol > 1:
+                row2.append('\\multicolumn{{{0:d}}}{{{1:s}}}{{{2:s}}}'
+                            .format(ncol, self.multicolumn_format,
+                                    coltext.strip()))
+            # don't modify where not needed
+            else:
+                row2.append(coltext)
+        for c in row[ilevels:]:
+            if c.strip():  # if next col has text, write the previous
+                if coltext:
+                    append_col()
+                coltext = c
+                ncol = 1
+            else:  # if not, add it to the previous multicolumn
+                ncol += 1
+        if coltext:  # write last column name
+            append_col()
+        return row2
+
+    def _format_multirow(self, row, ilevels, i, rows):
+        r"""
+        Turn index labels of `row` that span blank cells in the following
+        rows into \multirow entries and queue the closing \cline.
+
+        e.g.:     becomes:
+        a & 0 &   \multirow{2}{*}{a} & 0 &
+          & 1 &     & 1 &
+        b & 0 &   \cline{1-2}
+                  b & 0 &
+        """
+        for j in range(ilevels):
+            if row[j].strip():
+                nrow = 1
+                for r in rows[i + 1:]:
+                    if r[j].strip():
+                        break
+                    nrow += 1
+                if nrow > 1:
+                    # overwrite non-multirow entry
+                    row[j] = '\\multirow{{{0:d}}}{{*}}{{{1:s}}}'\
+                             .format(nrow, row[j].strip())
+                    # save when to end the current block with \cline
+                    self.clinebuf.append([i + nrow - 1, j + 1])
+        return row
+
+    def _print_cline(self, buf, i, l):
+        """
+        Print the clines queued for row `i` (ending at column `l`) to `buf`
+        """
+        for cl in self.clinebuf:
+            if cl[0] == i:
+                buf.write('\\cline{{{0:d}-{1:d}}}\n'.format(cl[1], l))
+        self.clinebuf = [x for x in self.clinebuf if x[0] != i]  # drop printed
+
 
 class HTMLFormatter(TableFormatter):
 
diff --git a/pandas/tests/formats/test_to_latex.py b/pandas/tests/formats/test_to_latex.py