diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 4a39dd73da7d0..95bf2918f8992 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -216,6 +216,7 @@ API changes as the ``left`` argument. (:issue:`7737`) - Histogram from ``DataFrame.plot`` with ``kind='hist'`` (:issue:`7809`), See :ref:`the docs`. +- Boxplot from ``DataFrame.plot`` with ``kind='box'`` (:issue:`7998`), See :ref:`the docs`. - Consistency when indexing with ``.loc`` and a list-like indexer when no values are found. .. ipython:: python diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index d517e08a34b2d..1cce55cd53e11 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -124,6 +124,7 @@ These include: * :ref:`'bar' ` or :ref:`'barh' ` for bar plots * :ref:`'hist' ` for histogram +* :ref:`'box' ` for boxplot * :ref:`'kde' ` or ``'density'`` for density plots * :ref:`'area' ` for area plots * :ref:`'scatter' ` for scatter plots @@ -244,7 +245,7 @@ See the :meth:`hist ` method and the `matplotlib hist documenation `__ for more. -The previous interface ``DataFrame.hist`` to plot histogram still can be used. +The existing interface ``DataFrame.hist`` can still be used to plot histograms. .. ipython:: python @@ -288,12 +289,65 @@ The ``by`` keyword can be specified to plot grouped histograms: Box Plots ~~~~~~~~~ -DataFrame has a :meth:`~DataFrame.boxplot` method that allows you to visualize the -distribution of values within each column. +Boxplots can be drawn by calling ``Series.plot`` and ``DataFrame.plot`` with ``kind='box'``, +or ``DataFrame.boxplot`` to visualize the distribution of values within each column. + +.. versionadded:: 0.15.0 + +The ``plot`` method now supports ``kind='box'`` to draw a boxplot. For instance, here is a boxplot representing five trials of 10 observations of a uniform random variable on [0,1). +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. 
ipython:: python + + df = DataFrame(rand(10, 5), columns=['A', 'B', 'C', 'D', 'E']) + + @savefig box_plot_new.png + df.plot(kind='box') + +Boxplots can be colorized by passing the ``color`` keyword. You can pass a ``dict`` +whose keys are ``boxes``, ``whiskers``, ``medians`` and ``caps``. +If some keys are missing in the ``dict``, default colors are used +for the corresponding artists. Also, boxplot has a ``sym`` keyword to specify the fliers' style. + +When you pass any other type of argument via the ``color`` keyword, it will be directly +passed to matplotlib for all the ``boxes``, ``whiskers``, ``medians`` and ``caps`` +colorization. + +The colors are applied to every box to be drawn. If you want +more complicated colorization, you can get each drawn artist by passing +:ref:`return_type <visualization.box.return>`. + +.. ipython:: python + + color = dict(boxes='DarkGreen', whiskers='DarkOrange', + medians='DarkBlue', caps='Gray') + + @savefig box_new_colorize.png + df.plot(kind='box', color=color, sym='r+') + +Also, you can pass other keywords supported by matplotlib ``boxplot``. +For example, horizontal and custom-positioned boxplots can be drawn with the +``vert=False`` and ``positions`` keywords. + +.. ipython:: python + + @savefig box_new_kwargs.png + df.plot(kind='box', vert=False, positions=[1, 4, 5, 6, 8]) + + +See the :meth:`boxplot ` method and the +`matplotlib boxplot documentation `__ for more. + + +The existing interface ``DataFrame.boxplot`` can still be used to plot boxplots. + .. ipython:: python :suppress: @@ -354,18 +408,23 @@ columns: .. _visualization.box.return: -The return type of ``boxplot`` depends on two keyword arguments: ``by`` and ``return_type``. -When ``by`` is ``None``: +Basically, plot functions return a :class:`matplotlib Axes ` as their return value. +In ``boxplot``, the return type depends on the ``return_type`` argument and on whether subplots are enabled (``subplots=True`` in ``plot`` or ``by`` is specified in ``boxplot``). 
+ +When ``subplots=False`` / ``by`` is ``None``: * if ``return_type`` is ``'dict'``, a dictionary containing the :class:`matplotlib Lines ` is returned. The keys are "boxes", "caps", "fliers", "medians", and "whiskers". - This is the default. + This is the default of ``boxplot`` for historical reasons. + Note that ``plot(kind='box')`` returns ``Axes`` by default, the same as other plots. * if ``return_type`` is ``'axes'``, a :class:`matplotlib Axes ` containing the boxplot is returned. * if ``return_type`` is ``'both'`` a namedtuple containging the :class:`matplotlib Axes ` and :class:`matplotlib Lines ` is returned -When ``by`` is some column of the DataFrame, a dict of ``return_type`` is returned, where -the keys are the columns of the DataFrame. The plot has a facet for each column of -the DataFrame, with a separate box for each value of ``by``. +When ``subplots=True`` / ``by`` is some column of the DataFrame: + +* A dict of ``return_type`` is returned, where the keys are the columns + of the DataFrame. The plot has a facet for each column of + the DataFrame, with a separate box for each value of ``by``. Finally, when calling boxplot on a :class:`Groupby` object, a dict of ``return_type`` is returned, where the keys are the same as the Groupby object. 
The plot has a diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 6435f8e741f96..2c99b9befd42b 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -365,7 +365,8 @@ def _check_has_errorbars(self, axes, xerr=0, yerr=0): self.assertEqual(xerr, xerr_count) self.assertEqual(yerr, yerr_count) - def _check_box_return_type(self, returned, return_type, expected_keys=None): + def _check_box_return_type(self, returned, return_type, expected_keys=None, + check_ax_title=True): """ Check box returned type is correct @@ -377,6 +378,10 @@ def _check_box_return_type(self, returned, return_type, expected_keys=None): expected_keys : list-like, optional group labels in subplot case. If not passed, the function checks assuming boxplot uses single ax + check_ax_title : bool + Whether to check the ax.title is the same as expected_key + Intended to be checked by calling from ``boxplot``. + Normal ``plot`` doesn't attach ``ax.title``, it must be disabled. 
""" from matplotlib.axes import Axes types = {'dict': dict, 'axes': Axes, 'both': tuple} @@ -402,14 +407,17 @@ def _check_box_return_type(self, returned, return_type, expected_keys=None): self.assertTrue(isinstance(value, types[return_type])) # check returned dict has correct mapping if return_type == 'axes': - self.assertEqual(value.get_title(), key) + if check_ax_title: + self.assertEqual(value.get_title(), key) elif return_type == 'both': - self.assertEqual(value.ax.get_title(), key) + if check_ax_title: + self.assertEqual(value.ax.get_title(), key) self.assertIsInstance(value.ax, Axes) self.assertIsInstance(value.lines, dict) elif return_type == 'dict': line = value['medians'][0] - self.assertEqual(line.get_axes().get_title(), key) + if check_ax_title: + self.assertEqual(line.get_axes().get_title(), key) else: raise AssertionError @@ -452,7 +460,7 @@ def test_plot(self): _check_plot_works(self.ts.plot, kind='area', stacked=False) _check_plot_works(self.iseries.plot) - for kind in ['line', 'bar', 'barh', 'kde', 'hist']: + for kind in ['line', 'bar', 'barh', 'kde', 'hist', 'box']: if not _ok_for_gaussian_kde(kind): continue _check_plot_works(self.series[:5].plot, kind=kind) @@ -767,6 +775,15 @@ def test_hist_kde_color(self): self.assertEqual(len(lines), 1) self._check_colors(lines, ['r']) + @slow + def test_boxplot_series(self): + ax = self.ts.plot(kind='box', logy=True) + self._check_ax_scales(ax, yaxis='log') + xlabels = ax.get_xticklabels() + self._check_text_labels(xlabels, [self.ts.name]) + ylabels = ax.get_yticklabels() + self._check_text_labels(ylabels, [''] * len(ylabels)) + @slow def test_autocorrelation_plot(self): from pandas.tools.plotting import autocorrelation_plot @@ -1650,6 +1667,99 @@ def test_bar_log_subplots(self): @slow def test_boxplot(self): + df = self.hist_df + series = df['height'] + numeric_cols = df._get_numeric_data().columns + labels = [com.pprint_thing(c) for c in numeric_cols] + + ax = _check_plot_works(df.plot, kind='box') + 
self._check_text_labels(ax.get_xticklabels(), labels) + assert_array_equal(ax.xaxis.get_ticklocs(), np.arange(1, len(numeric_cols) + 1)) + self.assertEqual(len(ax.lines), 8 * len(numeric_cols)) + + axes = _check_plot_works(df.plot, kind='box', subplots=True, logy=True) + self._check_axes_shape(axes, axes_num=3, layout=(1, 3)) + self._check_ax_scales(axes, yaxis='log') + for ax, label in zip(axes, labels): + self._check_text_labels(ax.get_xticklabels(), [label]) + self.assertEqual(len(ax.lines), 8) + + axes = series.plot(kind='box', rot=40) + self._check_ticks_props(axes, xrot=40, yrot=0) + tm.close() + + ax = _check_plot_works(series.plot, kind='box') + + positions = np.array([1, 6, 7]) + ax = df.plot(kind='box', positions=positions) + numeric_cols = df._get_numeric_data().columns + labels = [com.pprint_thing(c) for c in numeric_cols] + self._check_text_labels(ax.get_xticklabels(), labels) + assert_array_equal(ax.xaxis.get_ticklocs(), positions) + self.assertEqual(len(ax.lines), 8 * len(numeric_cols)) + + @slow + def test_boxplot_vertical(self): + df = self.hist_df + series = df['height'] + numeric_cols = df._get_numeric_data().columns + labels = [com.pprint_thing(c) for c in numeric_cols] + + # if horizontal, yticklabels are rotated + ax = df.plot(kind='box', rot=50, fontsize=8, vert=False) + self._check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8) + self._check_text_labels(ax.get_yticklabels(), labels) + self.assertEqual(len(ax.lines), 8 * len(numeric_cols)) + + axes = _check_plot_works(df.plot, kind='box', subplots=True, + vert=False, logx=True) + self._check_axes_shape(axes, axes_num=3, layout=(1, 3)) + self._check_ax_scales(axes, xaxis='log') + for ax, label in zip(axes, labels): + self._check_text_labels(ax.get_yticklabels(), [label]) + self.assertEqual(len(ax.lines), 8) + + positions = np.array([3, 2, 8]) + ax = df.plot(kind='box', positions=positions, vert=False) + self._check_text_labels(ax.get_yticklabels(), labels) + 
assert_array_equal(ax.yaxis.get_ticklocs(), positions) + self.assertEqual(len(ax.lines), 8 * len(numeric_cols)) + + @slow + def test_boxplot_return_type(self): + df = DataFrame(randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['one', 'two', 'three', 'four']) + with tm.assertRaises(ValueError): + df.plot(kind='box', return_type='NOTATYPE') + + result = df.plot(kind='box', return_type='dict') + self._check_box_return_type(result, 'dict') + + result = df.plot(kind='box', return_type='axes') + self._check_box_return_type(result, 'axes') + + result = df.plot(kind='box', return_type='both') + self._check_box_return_type(result, 'both') + + @slow + def test_boxplot_subplots_return_type(self): + df = self.hist_df + + # normal style: return_type=None + result = df.plot(kind='box', subplots=True) + self.assertIsInstance(result, np.ndarray) + self._check_box_return_type(result, None, + expected_keys=['height', 'weight', 'category']) + + for t in ['dict', 'axes', 'both']: + returned = df.plot(kind='box', return_type=t, subplots=True) + self._check_box_return_type(returned, t, + expected_keys=['height', 'weight', 'category'], + check_ax_title=False) + + @slow + def test_boxplot_legacy(self): df = DataFrame(randn(6, 4), index=list(string.ascii_letters[:6]), columns=['one', 'two', 'three', 'four']) @@ -1693,7 +1803,7 @@ def test_boxplot(self): self.assertEqual(len(ax.get_lines()), len(lines)) @slow - def test_boxplot_return_type(self): + def test_boxplot_return_type_legacy(self): # API change in https://github.com/pydata/pandas/pull/7096 import matplotlib as mpl @@ -2315,6 +2425,61 @@ def test_kde_colors(self): rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df))) self._check_colors(ax.get_lines(), linecolors=rgba_colors) + @slow + def test_boxplot_colors(self): + + def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c='k', fliers_c='b'): + self._check_colors(bp['boxes'], linecolors=[box_c] * len(bp['boxes'])) + self._check_colors(bp['whiskers'], 
linecolors=[whiskers_c] * len(bp['whiskers'])) + self._check_colors(bp['medians'], linecolors=[medians_c] * len(bp['medians'])) + self._check_colors(bp['fliers'], linecolors=[fliers_c] * len(bp['fliers'])) + self._check_colors(bp['caps'], linecolors=[caps_c] * len(bp['caps'])) + + default_colors = self.plt.rcParams.get('axes.color_cycle') + + df = DataFrame(randn(5, 5)) + bp = df.plot(kind='box', return_type='dict') + _check_colors(bp, default_colors[0], default_colors[0], default_colors[2]) + tm.close() + + dict_colors = dict(boxes='#572923', whiskers='#982042', + medians='#804823', caps='#123456') + bp = df.plot(kind='box', color=dict_colors, sym='r+', return_type='dict') + _check_colors(bp, dict_colors['boxes'], dict_colors['whiskers'], + dict_colors['medians'], dict_colors['caps'], 'r') + tm.close() + + # partial colors + dict_colors = dict(whiskers='c', medians='m') + bp = df.plot(kind='box', color=dict_colors, return_type='dict') + _check_colors(bp, default_colors[0], 'c', 'm') + tm.close() + + from matplotlib import cm + # Test str -> colormap functionality + bp = df.plot(kind='box', colormap='jet', return_type='dict') + jet_colors = lmap(cm.jet, np.linspace(0, 1, 3)) + _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) + tm.close() + + # Test colormap functionality + bp = df.plot(kind='box', colormap=cm.jet, return_type='dict') + _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) + tm.close() + + # string color is applied to all artists except fliers + bp = df.plot(kind='box', color='DodgerBlue', return_type='dict') + _check_colors(bp, 'DodgerBlue', 'DodgerBlue', 'DodgerBlue', + 'DodgerBlue') + + # tuple is also applied to all artists except fliers + bp = df.plot(kind='box', color=(0, 1, 0), sym='#123456', return_type='dict') + _check_colors(bp, (0, 1, 0), (0, 1, 0), (0, 1, 0), (0, 1, 0), '#123456') + + with tm.assertRaises(ValueError): + # Color contains invalid key results in ValueError + df.plot(kind='box', color=dict(boxes='red', 
xxxx='blue')) + def test_default_color_cycle(self): import matplotlib.pyplot as plt plt.rcParams['axes.color_cycle'] = list('rgbk') diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 5fa326a88b682..11f267d55fa09 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1256,10 +1256,7 @@ def _get_style(self, i, col_name): def _get_colors(self, num_colors=None, color_kwds='color'): from pandas.core.frame import DataFrame if num_colors is None: - if isinstance(self.data, DataFrame): - num_colors = len(self.data.columns) - else: - num_colors = 1 + num_colors = self.nseries return _get_standard_colors(num_colors=num_colors, colormap=self.colormap, @@ -1980,7 +1977,6 @@ def _post_plot_logic(self): class PiePlot(MPLPlot): - _layout_type = 'horizontal' def __init__(self, data, kind=None, **kwargs): @@ -2031,12 +2027,152 @@ def _make_plot(self): self._add_legend_handle(p, l) -class BoxPlot(MPLPlot): - pass +class BoxPlot(LinePlot): + _layout_type = 'horizontal' + + _valid_return_types = (None, 'axes', 'dict', 'both') + # namedtuple to hold results + BP = namedtuple("Boxplot", ['ax', 'lines']) + + def __init__(self, data, return_type=None, **kwargs): + # Do not call LinePlot.__init__ which may fill nan + if return_type not in self._valid_return_types: + raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}") + + self.return_type = return_type + MPLPlot.__init__(self, data, **kwargs) + + def _args_adjust(self): + if self.subplots: + # Disable label ax sharing. 
Otherwise, all subplots shows last column label + if self.orientation == 'vertical': + self.sharex = False + else: + self.sharey = False + + def _get_plot_function(self): + def plotf(ax, y, column_num=None, **kwds): + if y.ndim == 2: + y = [remove_na(v) for v in y] + else: + y = remove_na(y) + bp = ax.boxplot(y, **kwds) + + if self.return_type == 'dict': + return bp, bp + elif self.return_type == 'both': + return self.BP(ax=ax, lines=bp), bp + else: + return ax, bp + return plotf + + def _validate_color_args(self): + if 'color' in self.kwds: + if self.colormap is not None: + warnings.warn("'color' and 'colormap' cannot be used " + "simultaneously. Using 'color'") + self.color = self.kwds.pop('color') + + if isinstance(self.color, dict): + valid_keys = ['boxes', 'whiskers', 'medians', 'caps'] + for key, values in compat.iteritems(self.color): + if key not in valid_keys: + raise ValueError("color dict contains invalid key '{0}' " + "The key must be either {1}".format(key, valid_keys)) + else: + self.color = None + + # get standard colors for default + colors = _get_standard_colors(num_colors=3, + colormap=self.colormap, + color=None) + # use 2 colors by default, for box/whisker and median + # flier colors isn't needed here + # because it can be specified by ``sym`` kw + self._boxes_c = colors[0] + self._whiskers_c = colors[0] + self._medians_c = colors[2] + self._caps_c = 'k' # mpl default + + def _get_colors(self, num_colors=None, color_kwds='color'): + pass + + def maybe_color_bp(self, bp): + if isinstance(self.color, dict): + boxes = self.color.get('boxes', self._boxes_c) + whiskers = self.color.get('whiskers', self._whiskers_c) + medians = self.color.get('medians', self._medians_c) + caps = self.color.get('caps', self._caps_c) + else: + # Other types are forwarded to matplotlib + # If None, use default colors + boxes = self.color or self._boxes_c + whiskers = self.color or self._whiskers_c + medians = self.color or self._medians_c + caps = self.color or 
self._caps_c + + from matplotlib.artist import setp + setp(bp['boxes'], color=boxes, alpha=1) + setp(bp['whiskers'], color=whiskers, alpha=1) + setp(bp['medians'], color=medians, alpha=1) + setp(bp['caps'], color=caps, alpha=1) + + def _make_plot(self): + plotf = self._get_plot_function() + if self.subplots: + self._return_obj = compat.OrderedDict() + + for i, (label, y) in enumerate(self._iter_data()): + ax = self._get_ax(i) + kwds = self.kwds.copy() + + ret, bp = plotf(ax, y, column_num=i, **kwds) + self.maybe_color_bp(bp) + self._return_obj[label] = ret + + label = [com.pprint_thing(label)] + self._set_ticklabels(ax, label) + else: + y = self.data.values.T + ax = self._get_ax(0) + kwds = self.kwds.copy() + + ret, bp = plotf(ax, y, column_num=0, **kwds) + self.maybe_color_bp(bp) + self._return_obj = ret + + labels = [l for l, y in self._iter_data()] + labels = [com.pprint_thing(l) for l in labels] + if not self.use_index: + labels = [com.pprint_thing(key) for key in range(len(labels))] + self._set_ticklabels(ax, labels) + + def _set_ticklabels(self, ax, labels): + if self.orientation == 'vertical': + ax.set_xticklabels(labels) + else: + ax.set_yticklabels(labels) + + def _post_plot_logic(self): + pass + + @property + def orientation(self): + if self.kwds.get('vert', True): + return 'vertical' + else: + return 'horizontal' + + @property + def result(self): + if self.return_type is None: + return super(BoxPlot, self).result + else: + return self._return_obj # kinds supported by both dataframe and series -_common_kinds = ['line', 'bar', 'barh', 'kde', 'density', 'area', 'hist'] +_common_kinds = ['line', 'bar', 'barh', 'kde', 'density', 'area', 'hist', 'box'] # kinds supported by dataframe _dataframe_kinds = ['scatter', 'hexbin'] # kinds supported only by series or dataframe single column @@ -2044,7 +2180,7 @@ class BoxPlot(MPLPlot): _all_kinds = _common_kinds + _dataframe_kinds + _series_kinds _plot_klass = {'line': LinePlot, 'bar': BarPlot, 'barh': BarPlot, - 
'kde': KdePlot, 'hist': HistPlot, + 'kde': KdePlot, 'hist': HistPlot, 'box': BoxPlot, 'scatter': ScatterPlot, 'hexbin': HexBinPlot, 'area': AreaPlot, 'pie': PiePlot} @@ -2091,13 +2227,14 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True, ax : matplotlib axis object, default None style : list or dict matplotlib line style per column - kind : {'line', 'bar', 'barh', 'hist', 'kde', 'density', 'area', 'scatter', 'hexbin'} + kind : {'line', 'bar', 'barh', 'hist', 'kde', 'density', 'area', 'box', 'scatter', 'hexbin'} line : line plot bar : vertical bar plot barh : horizontal bar plot hist : histogram kde/density : Kernel Density Estimation plot area : area plot + box : box plot scatter : scatter plot hexbin : hexbin plot logx : boolean, default False @@ -2237,13 +2374,14 @@ def plot_series(series, label=None, kind='line', use_index=True, rot=None, Parameters ---------- label : label argument to provide to plot - kind : {'line', 'bar', 'barh', 'hist', 'kde', 'density', 'area'} + kind : {'line', 'bar', 'barh', 'hist', 'kde', 'density', 'area', 'box'} line : line plot bar : vertical bar plot barh : horizontal bar plot hist : histogram kde/density : Kernel Density Estimation plot area : area plot + box : box plot use_index : boolean, default True Plot index as axis tick labels rot : int, default None @@ -2373,8 +2511,8 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, # validate return_type: valid_types = (None, 'axes', 'dict', 'both') - if return_type not in valid_types: - raise ValueError("return_type") + if return_type not in BoxPlot._valid_return_types: + raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}") from pandas import Series, DataFrame if isinstance(data, Series): @@ -2391,8 +2529,6 @@ def maybe_color_bp(bp): setp(bp['whiskers'],color=colors[0],alpha=1) setp(bp['medians'],color=colors[2],alpha=1) - BP = namedtuple("Boxplot", ['ax', 'lines']) # namedtuple to hold results - def plot_group(keys, values, ax): 
keys = [com.pprint_thing(x) for x in keys] values = [remove_na(v) for v in values] @@ -2407,7 +2543,7 @@ def plot_group(keys, values, ax): if return_type == 'dict': return bp elif return_type == 'both': - return BP(ax=ax, lines=bp) + return BoxPlot.BP(ax=ax, lines=bp) else: return ax