pandas-dev · jreback · Jul 12, 2021 · Dec 3, 2018 · Jan 19, 2019 · Jul 30, 2019
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -233,6 +233,7 @@ Other enhancements
 - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
 - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
 - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`)
+- Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
@@ -1277,6 +1277,9 @@ def hist(self, by=None, bins=10, **kwargs):
         ----------
         by : str or sequence, optional
             Column in the DataFrame to group by.
+
+            .. versionadded:: 1.3.0
+
         bins : int, default 10
             Number of histogram bins to be used.
         **kwargs
@@ -1308,6 +1311,16 @@ def hist(self, by=None, bins=10, **kwargs):
             ...     columns = ['one'])
             >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
             >>> ax = df.plot.hist(bins=12, alpha=0.5)
+
+        A grouped histogram can be generated by providing the parameter `by` (which
+        can be a column name, or a list of column names):
+
+        .. plot::
+            :context: close-figs
+
+            >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85]
+            >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list})
+            >>> ax = df.plot.hist(column=["age"], by="gender", figsize=(10, 8))
         """
         return self(kind="hist", by=by, bins=bins, **kwargs)
 

diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py
@@ -18,6 +18,7 @@
     LinePlot,
     MPLPlot,
 )
+from pandas.plotting._matplotlib.groupby import create_iter_data_given_by
 from pandas.plotting._matplotlib.style import get_standard_colors
 from pandas.plotting._matplotlib.tools import (
     create_subplots,
@@ -135,17 +136,29 @@ def _make_plot(self):
         if self.subplots:
             self._return_obj = pd.Series(dtype=object)
 
-            for i, (label, y) in enumerate(self._iter_data()):
+            data = create_iter_data_given_by(self.data, self.by, self._kind)
+            for i, (label, y) in enumerate(self._iter_data(data=data)):
                 ax = self._get_ax(i)
                 kwds = self.kwds.copy()
 
+                # When `by` is applied, show each subplot's label as its title, just
+                # like df.boxplot, and transpose y so that _plot gets the right input
+                if self.by is not None:
+                    y = y.T
+                    ax.set_title(pprint_thing(label))
+
                 ret, bp = self._plot(
                     ax, y, column_num=i, return_type=self.return_type, **kwds
                 )
                 self.maybe_color_bp(bp)
                 self._return_obj[label] = ret
 
                 label = [pprint_thing(label)]
+
+                # When `by` is assigned, the ticklabels will become unique grouped
+                # values, instead of label which is used as subtitle in this case.
+                if self.by is not None:
+                    label = [pprint_thing(col) for col in self.data.columns.levels[0]]
                 self._set_ticklabels(ax, label)
         else:
             y = self.data.values.T

diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
@@ -9,6 +9,7 @@
 from matplotlib.artist import Artist
 import numpy as np
 
+from pandas._typing import IndexLabel
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly
 
@@ -42,6 +43,7 @@
 from pandas.io.formats.printing import pprint_thing
 from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0
 from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters
+from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by
 from pandas.plotting._matplotlib.style import get_standard_colors
 from pandas.plotting._matplotlib.timeseries import (
     decorate_axes,
@@ -99,7 +101,7 @@ def __init__(
         self,
         data,
         kind=None,
-        by=None,
+        by: IndexLabel | None = None,
         subplots=False,
         sharex=None,
         sharey=False,
@@ -124,13 +126,34 @@ def __init__(
         table=False,
         layout=None,
         include_bool=False,
+        column: IndexLabel | None = None,
         **kwds,
     ):
 
         import matplotlib.pyplot as plt
 
         self.data = data
-        self.by = by
+        self.by = com.maybe_make_list(by)
+
+        # Assign the rest of columns into self.columns if by is explicitly defined
+        # while column is not, so as to keep the same behaviour with current df.hist
+        # or df.boxplot.
+        if self.by and column is None:
+            self.columns = [
+                col
+                for col in data.columns
+                if col not in self.by and is_numeric_dtype(data[col])
+            ]
+        else:
+            self.columns = com.maybe_make_list(column)
+
+        # When `by` is explicitly assigned, grouped data size will be defined, and
+        # this will determine number of subplots to have, aka `self.nseries`
+        if self.by:
+            if self._kind == "hist":
+                self._grouped_data_size = len(data.groupby(self.by))
+            elif self._kind == "box":
+                self._grouped_data_size = len(self.columns)
 
         self.kind = kind
 
@@ -139,7 +162,9 @@ def __init__(
         self.subplots = subplots
 
         if sharex is None:
-            if ax is None:
+
+            # if by is defined, subplots are used and sharex should be False
+            if ax is None and by is None:
                 self.sharex = True
             else:
                 # if we get an axis, the users should do the visibility
@@ -275,8 +300,10 @@ def _iter_data(self, data=None, keep_index=False, fillna=None):
     def nseries(self) -> int:
         if self.data.ndim == 1:
             return 1
-        else:
+        elif self.by is None:
             return self.data.shape[1]
+        else:
+            return self._grouped_data_size
 
     def draw(self):
         self.plt.draw_if_interactive()
@@ -421,6 +448,11 @@ def _compute_plot_data(self):
                 label = "None"
             data = data.to_frame(name=label)
 
+        # GH15079 reconstruct data if by is defined
+        if self.by is not None:
+            self.subplots = True
+            data = reconstruct_data_with_by(self.data, by=self.by, cols=self.columns)
+
         # GH16953, _convert is needed as fallback, for ``Series``
         # with ``dtype == object``
         data = data._convert(datetime=True, timedelta=True)

diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py
@@ -0,0 +1,136 @@
+from typing import (
+    Dict,
+    Optional,
+    Union,
+)
+
+import numpy as np
+
+from pandas._typing import (
+    FrameOrSeriesUnion,
+    IndexLabel,
+)
+
+from pandas.core.dtypes.missing import remove_na_arraylike
+
+from pandas import (
+    DataFrame,
+    MultiIndex,
+    Series,
+    concat,
+)
+
+
+def create_iter_data_given_by(
+    data: DataFrame, by: Optional[IndexLabel] = None, kind: str = "hist"
+) -> Union[DataFrame, Dict[str, FrameOrSeriesUnion]]:
+    """
+    Create the data to iterate over for hist and box plots, depending on
+    whether `by` is assigned.
+
+    If `by` is assigned, return a dictionary of DataFrames keyed by the values
+    of one column level: level 0 for ``hist`` and level 1 for ``box``.
+    If `by` is not assigned, return the input as is, which preserves the
+    existing behaviour of iter_data.
+
+    Parameters
+    ----------
+    data: reformatted grouped data from `_compute_plot_data` method.
+    by: list or None, value assigned to `by`.
+    kind: str, plot kind; a ValueError is raised unless it is `hist` or `box`.
+
+    Returns
+    -------
+    iter_data: DataFrame or Dictionary of DataFrames
+
+    Examples
+    --------
+    If `by` is assigned:
+
+    >>> import numpy as np
+    >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')]
+    >>> mi = MultiIndex.from_tuples(tuples)
+    >>> value = [[1, 3, np.nan, np.nan],
+    ...          [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]]
+    >>> data = DataFrame(value, columns=mi)
+    >>> create_iter_data_given_by(data, by=["col1"])
+    {'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}),
+     'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})}
+    """
+    if kind == "hist":
+        level = 0
+    elif kind == "box":
+        level = 1
+    else:
+        raise ValueError("This function is only used for hist and box plot")
+
+    iter_data: Union[DataFrame, Dict[str, FrameOrSeriesUnion]]
+    if not by:
+        iter_data = data
+    else:
+        # Select the sub-columns whose value at the chosen MI level equals col
+        assert isinstance(data.columns, MultiIndex)
+        cols = data.columns.levels[level]
+        iter_data = {
+            col: data.loc[:, data.columns.get_level_values(level) == col]
+            for col in cols
+        }
+    return iter_data
+
+
+def reconstruct_data_with_by(
+    data: DataFrame, by: IndexLabel, cols: IndexLabel
+) -> DataFrame:
+    """
+    Internal function to group data and give each group's columns (group key,
+    column) MultiIndex labels, so the result can be used in _compute_plot_data.
+
+    Parameters
+    ----------
+    data: Original DataFrame to plot
+    by: label(s) passed to ``data.groupby``, i.e. the `by` selected by users
+    cols: columns to keep from each group (excluding columns used in `by`)
+
+    Returns
+    -------
+    Output is the reconstructed DataFrame with MultiIndex columns. The first level
+    of MI is unique values of groups, and second level of MI is the columns
+    selected by users.
+
+    Examples
+    --------
+    >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]}
+    >>> df = DataFrame(d)
+    >>> reconstruct_data_with_by(df, by='h', cols=['a', 'b'])
+       h1      h2
+       a   b   a   b
+    0  1   3   NaN NaN
+    1  3   4   NaN NaN
+    2  NaN NaN 5   6
+    """
+    grouped = data.groupby(by)
+    # Build one sub-frame per group, with columns relabelled as (key, col)
+    data_list = []
+    for key, group in grouped:
+        columns = MultiIndex.from_product([[key], cols])
+        sub_group = group[cols]
+        sub_group.columns = columns
+        data_list.append(sub_group)
+    # Place the per-group frames side by side (see the example output above)
+    data = concat(data_list, axis=1)
+    return data
+
+
+def reformat_hist_y_given_by(
+    y: Union[Series, np.ndarray], by: Optional[IndexLabel] = None
+) -> Union[Series, np.ndarray]:
+    """Internal function to drop NA values from y for hist plots.
+
+    If `by` is not None and y has more than one dimension, NA values are removed
+    from each column of y separately; otherwise they are removed from y directly.
+    """
+    if by is not None and len(y.shape) > 1:
+        y = np.array([remove_na_arraylike(col) for col in y.T]).T
+    else:
+        y = remove_na_arraylike(y)
+    return y