From 7e461a18d9f6928132afec6f48ce968b3e989ba6 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Mon, 3 Dec 2018 17:43:52 +0100 Subject: [PATCH 001/142] remove \n from docstring --- pandas/core/arrays/datetimes.py | 26 +++++++++++++------------- pandas/core/arrays/timedeltas.py | 16 ++++++++-------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index cfe3afcf3730a..b3df505d56d78 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -82,7 +82,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -1072,19 +1072,19 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") - year = _field_accessor('year', 'Y', "\n The year of the datetime\n") + year = _field_accessor('year', 'Y', "The year of the datetime") month = _field_accessor('month', 'M', - "\n The month as January=1, December=12 \n") - day = _field_accessor('day', 'D', "\nThe days of the datetime\n") - hour = _field_accessor('hour', 'h', "\nThe hours of the datetime\n") - minute = _field_accessor('minute', 'm', "\nThe minutes of the datetime\n") - second = _field_accessor('second', 's', "\nThe seconds of the datetime\n") + "The month as January=1, December=12") + day = _field_accessor('day', 'D', "The days of the datetime") + hour = _field_accessor('hour', 'h', "The hours of the datetime") + minute = _field_accessor('minute', 'm', "The minutes of the datetime") + second = _field_accessor('second', 's', "The seconds of the datetime") microsecond = _field_accessor('microsecond', 'us', - "\nThe microseconds of the datetime\n") + "The microseconds of the datetime") nanosecond = _field_accessor('nanosecond', 'ns', - "\nThe nanoseconds of the datetime\n") + "The nanoseconds of the datetime") weekofyear = _field_accessor('weekofyear', 'woy', - "\nThe week ordinal of the year\n") + "The week ordinal of the 
year") week = weekofyear _dayofweek_doc = """ The day of the week with Monday=0, Sunday=6. @@ -1129,12 +1129,12 @@ def date(self): "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0") dayofyear = _field_accessor('dayofyear', 'doy', - "\nThe ordinal day of the year\n") - quarter = _field_accessor('quarter', 'q', "\nThe quarter of the date\n") + "The ordinal day of the year") + quarter = _field_accessor('quarter', 'q', "The quarter of the date") days_in_month = _field_accessor( 'days_in_month', 'dim', - "\nThe number of days in the month\n") + "The number of days in the month") daysinmonth = days_in_month _is_month_doc = """ Indicates whether the date is the {first_or_last} day of the month. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 830283d31a929..4afc9f5483c2a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -59,7 +59,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -684,16 +684,16 @@ def to_pytimedelta(self): return tslibs.ints_to_pytimedelta(self.asi8) days = _field_accessor("days", "days", - "\nNumber of days for each element.\n") + "Number of days for each element.") seconds = _field_accessor("seconds", "seconds", - "\nNumber of seconds (>= 0 and less than 1 day) " - "for each element.\n") + "Number of seconds (>= 0 and less than 1 day) " + "for each element.") microseconds = _field_accessor("microseconds", "microseconds", - "\nNumber of microseconds (>= 0 and less " - "than 1 second) for each element.\n") + "Number of microseconds (>= 0 and less " + "than 1 second) for each element.") nanoseconds = _field_accessor("nanoseconds", "nanoseconds", - "\nNumber of nanoseconds (>= 0 and less " - "than 1 microsecond) for each element.\n") + "Number of nanoseconds (>= 0 and less " + "than 1 microsecond) for each element.") @property def components(self): From 
b2f45a61958c22d11e03de621a09c47169a07d03 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 18:38:58 +0200 Subject: [PATCH 002/142] fix by in hist --- pandas/plotting/_core.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 837b01974be93..82809f9d9ebef 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -679,6 +679,8 @@ def _get_call_args(backend_name, data, args, kwargs): ("xerr", None), ("secondary_y", False), ("sort_columns", False), + ("by", None), + ("column", None), ] else: raise TypeError( @@ -790,6 +792,12 @@ def __call__(self, *args, **kwargs): ) label_name = label_kw or data.columns data.columns = label_name + if kwargs.get("by") is not None: + grouped = data.groupby(kwargs.get("by")) + if kwargs.get("column") is not None: + grouped = grouped[kwargs.get("column")] + + data = grouped return plot_backend.plot(data, kind=kind, **kwargs) From 8b6e00a59268b2e3977d0106f3815fd4b08612e5 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 20:28:07 +0200 Subject: [PATCH 003/142] make plot work --- pandas/plotting/_core.py | 9 +++-- pandas/plotting/_matplotlib/hist.py | 63 +++++++++++++++++++---------- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 82809f9d9ebef..e45c3e511f25d 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -793,12 +793,15 @@ def __call__(self, *args, **kwargs): label_name = label_kw or data.columns data.columns = label_name if kwargs.get("by") is not None: + import pandas as pd grouped = data.groupby(kwargs.get("by")) if kwargs.get("column") is not None: grouped = grouped[kwargs.get("column")] - - data = grouped - + d = {} + for key, group in grouped: + d[key] = group + data = pd.DataFrame(d) + kwargs.pop("column") return plot_backend.plot(data, kind=kind, **kwargs) def line(self, x=None, y=None, **kwargs): diff --git 
a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 5213e09f14067..1defa6116bbcc 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -20,22 +20,24 @@ class HistPlot(LinePlot): def __init__(self, data, bins=10, bottom=0, **kwargs): self.bins = bins # use mpl default self.bottom = bottom + self.by = kwargs["by"] # Do not call LinePlot.__init__ which may fill nan MPLPlot.__init__(self, data, **kwargs) def _args_adjust(self): - if is_integer(self.bins): - # create common bin edge - values = self.data._convert(datetime=True)._get_numeric_data() - values = np.ravel(values) - values = values[~isna(values)] - - hist, self.bins = np.histogram( - values, - bins=self.bins, - range=self.kwds.get("range", None), - weights=self.kwds.get("weights", None), - ) + if self.by is None: + if is_integer(self.bins): + # create common bin edge + values = self.data._convert(datetime=True)._get_numeric_data() + values = np.ravel(values) + values = values[~isna(values)] + + hist, self.bins = np.histogram( + values, + bins=self.bins, + range=self.kwds.get("range", None), + weights=self.kwds.get("weights", None), + ) if is_list_like(self.bottom): self.bottom = np.array(self.bottom) @@ -67,21 +69,38 @@ def _make_plot(self): colors = self._get_colors() stacking_id = self._get_stacking_id() - for i, (label, y) in enumerate(self._iter_data()): - ax = self._get_ax(i) + if self.by is None: + for i, (label, y) in enumerate(self._iter_data()): + ax = self._get_ax(i) + + kwds = self.kwds.copy() + label = pprint_thing(label) + kwds["label"] = label + + style, kwds = self._apply_style_colors(colors, kwds, i, label) + if style is not None: + kwds["style"] = style + kwds = self._make_plot_keywords(kwds, y) + artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) + self._add_legend_handle(artists[0], label, index=i) + + else: kwds = self.kwds.copy() + kwds = self._make_plot_keywords(kwds, None) + naxes = 
len(list(self._iter_data())) - label = pprint_thing(label) - kwds["label"] = label + fig, axes = _subplots(naxes=naxes) + _axes = _flatten(axes) + for i, (label, y) in enumerate(self._iter_data()): + ax = _axes[i] - style, kwds = self._apply_style_colors(colors, kwds, i, label) - if style is not None: - kwds["style"] = style + ax.hist(y, **kwds) + ax.set_title(pprint_thing(label)) - kwds = self._make_plot_keywords(kwds, y) - artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) - self._add_legend_handle(artists[0], label, index=i) + fig.subplots_adjust( + bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3 + ) def _make_plot_keywords(self, kwds, y): """merge BoxPlot/KdePlot properties to passed kwds""" From dc0c2ec9efe31a9963deb81b6588aef036e1224c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 21:05:23 +0200 Subject: [PATCH 004/142] add _group_plot function --- pandas/plotting/_matplotlib/hist.py | 38 +++++++++++++++++++---------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 1defa6116bbcc..18a9398c6d365 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -65,6 +65,27 @@ def _plot( cls._update_stacker(ax, stacking_id, n) return patches + @classmethod + def _group_plot(cls, ax, data, naxes, rot=90, xrot=None, **kwds): + converter._WARN = False # no warning for pandas plots + xrot = xrot or rot + fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False) + _axes = _flatten(axes) + + for i, (label, y) in enumerate(data): + ax = _axes[i] + + ax.hist(y, **kwds) + ax.set_title(pprint_thing(label)) + + _set_ticks_props( + axes, xrot=xrot + ) + + fig.subplots_adjust( + bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3 + ) + def _make_plot(self): colors = self._get_colors() stacking_id = self._get_stacking_id() @@ -86,21 +107,12 @@ def _make_plot(self): 
self._add_legend_handle(artists[0], label, index=i) else: + naxes = len(list(self._iter_data())) + data = self._iter_data() kwds = self.kwds.copy() kwds = self._make_plot_keywords(kwds, None) - naxes = len(list(self._iter_data())) - - fig, axes = _subplots(naxes=naxes) - _axes = _flatten(axes) - for i, (label, y) in enumerate(self._iter_data()): - ax = _axes[i] - - ax.hist(y, **kwds) - ax.set_title(pprint_thing(label)) - - fig.subplots_adjust( - bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3 - ) + ax = self._get_ax(0) + self._group_plot(ax, data, naxes, **kwds) def _make_plot_keywords(self, kwds, y): """merge BoxPlot/KdePlot properties to passed kwds""" From d8039389eeb21423b0731e79a665dd78fd3f690c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 21:10:46 +0200 Subject: [PATCH 005/142] check function --- pandas/plotting/_matplotlib/hist.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 18a9398c6d365..2d08d765efbd1 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -78,9 +78,7 @@ def _group_plot(cls, ax, data, naxes, rot=90, xrot=None, **kwds): ax.hist(y, **kwds) ax.set_title(pprint_thing(label)) - _set_ticks_props( - axes, xrot=xrot - ) + _set_ticks_props(axes, xrot=xrot) fig.subplots_adjust( bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3 From 33dd762f0c4b49ae3c3999c630141a105e567a9e Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 21:18:21 +0200 Subject: [PATCH 006/142] reformat --- pandas/plotting/_core.py | 1 + pandas/plotting/_matplotlib/hist.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index e45c3e511f25d..1a3999e278a12 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -794,6 +794,7 @@ def __call__(self, *args, **kwargs): data.columns = label_name if 
kwargs.get("by") is not None: import pandas as pd + grouped = data.groupby(kwargs.get("by")) if kwargs.get("column") is not None: grouped = grouped[kwargs.get("column")] diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 2d08d765efbd1..0f790bce663fe 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -101,7 +101,9 @@ def _make_plot(self): kwds["style"] = style kwds = self._make_plot_keywords(kwds, y) - artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) + artists = self._plot( + ax, y, column_num=i, stacking_id=stacking_id, **kwds + ) self._add_legend_handle(artists[0], label, index=i) else: From d59d64284036cb0e6f41fa2a73550875180f5fb4 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 21:19:45 +0200 Subject: [PATCH 007/142] put import up --- pandas/plotting/_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 1a3999e278a12..849d049336235 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -6,6 +6,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import Appender +from pandas import DataFrame from pandas.core.dtypes.common import is_integer, is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -793,7 +794,6 @@ def __call__(self, *args, **kwargs): label_name = label_kw or data.columns data.columns = label_name if kwargs.get("by") is not None: - import pandas as pd grouped = data.groupby(kwargs.get("by")) if kwargs.get("column") is not None: @@ -801,7 +801,7 @@ def __call__(self, *args, **kwargs): d = {} for key, group in grouped: d[key] = group - data = pd.DataFrame(d) + data = DataFrame(d) kwargs.pop("column") return plot_backend.plot(data, kind=kind, **kwargs) From 66eb06c487c5fd7dae85876cc3bbd4732d62b9e1 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 21:21:57 
+0200 Subject: [PATCH 008/142] add comments --- pandas/plotting/_core.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 849d049336235..9cb9f24fdf82b 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -793,15 +793,19 @@ def __call__(self, *args, **kwargs): ) label_name = label_kw or data.columns data.columns = label_name + + # process groupby if by argument is defined if kwargs.get("by") is not None: grouped = data.groupby(kwargs.get("by")) if kwargs.get("column") is not None: grouped = grouped[kwargs.get("column")] - d = {} + + # recreate data according to groupby object + data_dict = {} for key, group in grouped: - d[key] = group - data = DataFrame(d) + data_dict[key] = group + data = DataFrame(data_dict) kwargs.pop("column") return plot_backend.plot(data, kind=kind, **kwargs) From ea267adbbef68507f99bbca60fc68bd19cb5949f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 21:32:22 +0200 Subject: [PATCH 009/142] Mimic group plot --- pandas/plotting/_matplotlib/hist.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 0f790bce663fe..b48b3da80908b 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -66,10 +66,21 @@ def _plot( return patches @classmethod - def _group_plot(cls, ax, data, naxes, rot=90, xrot=None, **kwds): + def _group_plot( + cls, ax, data, naxes, rot=90, xrot=None, sharex=False, sharey=False, **kwds + ): + if "figure" in kwds: + raise ValueError( + "Cannot pass 'figure' when using the " + "'by' argument, since a new 'Figure' instance " + "will be created" + ) + converter._WARN = False # no warning for pandas plots xrot = xrot or rot - fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False) + fig, axes = _subplots( + naxes=naxes, ax=ax, squeeze=False, sharex=sharex, sharey=sharey + ) 
_axes = _flatten(axes) for i, (label, y) in enumerate(data): From 809522447681ab29048184109d36e439d45fd4c5 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 21:50:41 +0200 Subject: [PATCH 010/142] fix import failure --- pandas/plotting/_core.py | 4 ++-- pandas/plotting/_matplotlib/hist.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 9cb9f24fdf82b..b26254eda9a26 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -6,7 +6,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import Appender -from pandas import DataFrame +import pandas as pd from pandas.core.dtypes.common import is_integer, is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -805,7 +805,7 @@ def __call__(self, *args, **kwargs): data_dict = {} for key, group in grouped: data_dict[key] = group - data = DataFrame(data_dict) + data = pd.DataFrame(data_dict) kwargs.pop("column") return plot_backend.plot(data, kind=kind, **kwargs) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index b48b3da80908b..91fd09e4a19cd 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -12,7 +12,7 @@ from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots - +import matplotlib.pyplot as plt class HistPlot(LinePlot): _kind = "hist" @@ -67,7 +67,7 @@ def _plot( @classmethod def _group_plot( - cls, ax, data, naxes, rot=90, xrot=None, sharex=False, sharey=False, **kwds + cls, ax, data, naxes, rot=90, xrot=None, sharex=False, sharey=False, layout=None, **kwds ): if "figure" in kwds: raise ValueError( @@ -79,7 +79,7 @@ def _group_plot( converter._WARN = False # no warning for pandas plots xrot = xrot or rot fig, axes = _subplots( - 
naxes=naxes, ax=ax, squeeze=False, sharex=sharex, sharey=sharey + naxes=naxes, ax=ax, squeeze=False, sharex=sharex, sharey=sharey, layout=layout ) _axes = _flatten(axes) From 31decc1056e66d7150cc9e9c2fc3cdf7c745b399 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 21:55:31 +0200 Subject: [PATCH 011/142] reformat --- pandas/plotting/_matplotlib/hist.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 91fd09e4a19cd..1d123315e59ea 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -12,7 +12,7 @@ from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots -import matplotlib.pyplot as plt + class HistPlot(LinePlot): _kind = "hist" @@ -67,7 +67,16 @@ def _plot( @classmethod def _group_plot( - cls, ax, data, naxes, rot=90, xrot=None, sharex=False, sharey=False, layout=None, **kwds + cls, + ax, + data, + naxes, + rot=90, + xrot=None, + sharex=False, + sharey=False, + layout=None, + **kwds ): if "figure" in kwds: raise ValueError( @@ -79,7 +88,12 @@ def _group_plot( converter._WARN = False # no warning for pandas plots xrot = xrot or rot fig, axes = _subplots( - naxes=naxes, ax=ax, squeeze=False, sharex=sharex, sharey=sharey, layout=layout + naxes=naxes, + ax=ax, + squeeze=False, + sharex=sharex, + sharey=sharey, + layout=layout, ) _axes = _flatten(axes) From e4bdbd0df564817887cf041c811ece5c9a3f4109 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 22:27:14 +0200 Subject: [PATCH 012/142] fix test --- pandas/plotting/_core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index b26254eda9a26..07b93f442047c 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -680,8 +680,6 @@ def 
_get_call_args(backend_name, data, args, kwargs): ("xerr", None), ("secondary_y", False), ("sort_columns", False), - ("by", None), - ("column", None), ] else: raise TypeError( From 4033159d67a1fbc309c6b6903ed0dee767049dc2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 22:45:57 +0200 Subject: [PATCH 013/142] hacky fix --- pandas/plotting/_matplotlib/hist.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 1d123315e59ea..b890a6aea2ed9 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -1,5 +1,6 @@ import warnings +import matplotlib.pyplot as plt import numpy as np from pandas.core.dtypes.common import is_integer, is_list_like @@ -76,6 +77,9 @@ def _group_plot( sharex=False, sharey=False, layout=None, + xlabelsize=None, + ylabelsize=None, + yrot=None, **kwds ): if "figure" in kwds: @@ -103,11 +107,14 @@ def _group_plot( ax.hist(y, **kwds) ax.set_title(pprint_thing(label)) - _set_ticks_props(axes, xrot=xrot) + _set_ticks_props( + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + ) fig.subplots_adjust( bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3 ) + plt.show() def _make_plot(self): colors = self._get_colors() From 57a3bdf23ecd845c61c2a179e69362bfa7c41751 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 10 Sep 2019 22:46:48 +0200 Subject: [PATCH 014/142] fix isrot --- pandas/plotting/_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 07b93f442047c..0c1e25a8cfe82 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -6,10 +6,10 @@ from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import Appender -import pandas as pd from pandas.core.dtypes.common import is_integer, is_list_like from pandas.core.dtypes.generic import ABCDataFrame, 
ABCSeries +import pandas as pd from pandas.core.base import PandasObject # Trigger matplotlib import, which implicitly registers our From 80602233d6a5a42bfe63991ff78865a4d09fa5f2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 11 Sep 2019 11:26:34 +0200 Subject: [PATCH 015/142] fix tests --- pandas/plotting/_core.py | 4 ++-- pandas/plotting/_matplotlib/hist.py | 10 ++++------ pandas/tests/plotting/test_frame.py | 7 +++++++ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 0c1e25a8cfe82..343c44a424089 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -9,7 +9,7 @@ from pandas.core.dtypes.common import is_integer, is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -import pandas as pd +from pandas import DataFrame from pandas.core.base import PandasObject # Trigger matplotlib import, which implicitly registers our @@ -803,7 +803,7 @@ def __call__(self, *args, **kwargs): data_dict = {} for key, group in grouped: data_dict[key] = group - data = pd.DataFrame(data_dict) + data = DataFrame(data_dict) kwargs.pop("column") return plot_backend.plot(data, kind=kind, **kwargs) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index b890a6aea2ed9..c6103c2e891cb 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -21,7 +21,7 @@ class HistPlot(LinePlot): def __init__(self, data, bins=10, bottom=0, **kwargs): self.bins = bins # use mpl default self.bottom = bottom - self.by = kwargs["by"] + self.by = kwargs.get("by") # Do not call LinePlot.__init__ which may fill nan MPLPlot.__init__(self, data, **kwargs) @@ -99,11 +99,10 @@ def _group_plot( sharey=sharey, layout=layout, ) - _axes = _flatten(axes) + _axes = _flatten(axes) for i, (label, y) in enumerate(data): ax = _axes[i] - ax.hist(y, **kwds) ax.set_title(pprint_thing(label)) @@ -114,7 +113,7 @@ def _group_plot( fig.subplots_adjust( 
bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3 ) - plt.show() + return axes def _make_plot(self): colors = self._get_colors() @@ -143,8 +142,7 @@ def _make_plot(self): data = self._iter_data() kwds = self.kwds.copy() kwds = self._make_plot_keywords(kwds, None) - ax = self._get_ax(0) - self._group_plot(ax, data, naxes, **kwds) + self._group_plot(self._get_ax(0), data, naxes, **kwds) def _make_plot_keywords(self, kwds, y): """merge BoxPlot/KdePlot properties to passed kwds""" diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index f672cd3a6aa58..7390a4497ee2a 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3229,6 +3229,13 @@ def test_subplots_sharex_false(self): tm.assert_numpy_array_equal(axs[0].get_xticks(), expected_ax1) tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2) + def test_hist_plot_by_argument(self): + # GH 15079 + df = DataFrame(np.random.randn(30, 2), columns=['A', 'B']) + df["C"] = np.random.choice(["a", "b", "c"], 30) + + _check_plot_works(df.plot.hist, column='A', by='C') + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt From d66633494638536c39156fc4460d32afc7a5f976 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 11 Sep 2019 11:42:43 +0200 Subject: [PATCH 016/142] fix import failure --- pandas/plotting/_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 343c44a424089..5d8f28842845f 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -9,7 +9,7 @@ from pandas.core.dtypes.common import is_integer, is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas import DataFrame +from pandas.core.frame import DataFrame from pandas.core.base import PandasObject # Trigger matplotlib import, which implicitly registers our From 3216d5984a5bf42ab5490dc3ce23d7ad627b5a30 Mon Sep 17 00:00:00 2001 From: 
Kaiqi Date: Wed, 11 Sep 2019 13:15:17 +0200 Subject: [PATCH 017/142] fix import error --- pandas/plotting/_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 5d8f28842845f..a35f23ef7f7e0 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1,6 +1,7 @@ import importlib import warnings +import pandas as pd from pandas._config import get_option from pandas.compat._optional import import_optional_dependency @@ -9,7 +10,6 @@ from pandas.core.dtypes.common import is_integer, is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.frame import DataFrame from pandas.core.base import PandasObject # Trigger matplotlib import, which implicitly registers our @@ -803,7 +803,7 @@ def __call__(self, *args, **kwargs): data_dict = {} for key, group in grouped: data_dict[key] = group - data = DataFrame(data_dict) + data = pd.DataFrame(data_dict) kwargs.pop("column") return plot_backend.plot(data, kind=kind, **kwargs) From 45f4b7fa2cc9b937a7f1548b04ead5b360e43006 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 11 Sep 2019 14:50:06 +0200 Subject: [PATCH 018/142] Update imports --- pandas/plotting/_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index a35f23ef7f7e0..b45899e22f712 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1,7 +1,6 @@ import importlib import warnings -import pandas as pd from pandas._config import get_option from pandas.compat._optional import import_optional_dependency @@ -11,6 +10,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.base import PandasObject +from pandas.core.frame import DataFrame # Trigger matplotlib import, which implicitly registers our # converts. 
Implicit registration is deprecated, and when enforced @@ -803,7 +803,7 @@ def __call__(self, *args, **kwargs): data_dict = {} for key, group in grouped: data_dict[key] = group - data = pd.DataFrame(data_dict) + data = DataFrame(data_dict) kwargs.pop("column") return plot_backend.plot(data, kind=kind, **kwargs) From 2b0785b3282072c891c932a1637fd069ba4e1e39 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 11 Sep 2019 16:09:35 +0200 Subject: [PATCH 019/142] test imports --- pandas/plotting/_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index b45899e22f712..da9894a5eb5d1 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -10,7 +10,6 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.base import PandasObject -from pandas.core.frame import DataFrame # Trigger matplotlib import, which implicitly registers our # converts. Implicit registration is deprecated, and when enforced @@ -794,6 +793,7 @@ def __call__(self, *args, **kwargs): # process groupby if by argument is defined if kwargs.get("by") is not None: + from pandas.core.frame import DataFrame grouped = data.groupby(kwargs.get("by")) if kwargs.get("column") is not None: From d79dba3e170badf8cc4d8da57be33a81db18dfe3 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 12 Sep 2019 20:10:08 +0200 Subject: [PATCH 020/142] new change --- pandas/plotting/_core.py | 15 --------------- pandas/plotting/_matplotlib/core.py | 16 ++++++++++++++++ pandas/plotting/_matplotlib/hist.py | 4 +++- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index da9894a5eb5d1..7ec7a2b596abb 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -790,21 +790,6 @@ def __call__(self, *args, **kwargs): ) label_name = label_kw or data.columns data.columns = label_name - - # process groupby if by argument is defined - if kwargs.get("by") is not 
None: - from pandas.core.frame import DataFrame - - grouped = data.groupby(kwargs.get("by")) - if kwargs.get("column") is not None: - grouped = grouped[kwargs.get("column")] - - # recreate data according to groupby object - data_dict = {} - for key, group in grouped: - data_dict[key] = group - data = DataFrame(data_dict) - kwargs.pop("column") return plot_backend.plot(data, kind=kind, **kwargs) def line(self, x=None, y=None, **kwargs): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 346949cb82c4d..02f6069bd63d6 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -25,6 +25,7 @@ ) from pandas.core.dtypes.missing import isna, notna +from pandas.core.frame import DataFrame import pandas.core.common as com from pandas.io.formats.printing import pprint_thing @@ -107,6 +108,7 @@ def __init__( table=False, layout=None, include_bool=False, + column=None, **kwds ): @@ -115,6 +117,7 @@ def __init__( converter._WARN = False # no warning for pandas plots self.data = data self.by = by + self.column = column self.kind = kind @@ -399,6 +402,19 @@ def _compute_plot_data(self): label = "None" data = data.to_frame(name=label) + # GH15079 restructure data if by is defined + if self.by is not None: + grouped = data.groupby(self.by) + + if self.column is not None: + grouped = grouped[self.column] + + # recreate data according to groupby object + data_dict = {} + for key, group in grouped: + data_dict[key] = group + data = DataFrame(data_dict) + # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` data = data._convert(datetime=True, timedelta=True) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index c6103c2e891cb..82246065bd3c3 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -1,6 +1,5 @@ import warnings -import matplotlib.pyplot as plt import numpy as np from pandas.core.dtypes.common 
import is_integer, is_list_like @@ -13,6 +12,7 @@ from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots +from pandas.core.frame import DataFrame class HistPlot(LinePlot): @@ -22,6 +22,8 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): self.bins = bins # use mpl default self.bottom = bottom self.by = kwargs.get("by") + self.column = kwargs.get("column") + # Do not call LinePlot.__init__ which may fill nan MPLPlot.__init__(self, data, **kwargs) From 321fbd24a7566a4a29af5b46d0255c0d8898a883 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 12 Sep 2019 20:12:21 +0200 Subject: [PATCH 021/142] restore removed line --- pandas/plotting/_core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 7ec7a2b596abb..837b01974be93 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -790,6 +790,7 @@ def __call__(self, *args, **kwargs): ) label_name = label_kw or data.columns data.columns = label_name + return plot_backend.plot(data, kind=kind, **kwargs) def line(self, x=None, y=None, **kwargs): From a7b9ae556cdedb6ce506b19ccba4421e5578bbbc Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 12 Sep 2019 20:14:13 +0200 Subject: [PATCH 022/142] Remove unused line --- pandas/plotting/_matplotlib/hist.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 82246065bd3c3..a016e3f9efe71 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -22,7 +22,6 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): self.bins = bins # use mpl default self.bottom = bottom self.by = kwargs.get("by") - self.column = kwargs.get("column") # Do not call LinePlot.__init__ which may fill nan MPLPlot.__init__(self, data, **kwargs) From 
d2d13fd8fb25dd80b527a7ad80babbd552512273 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 12 Sep 2019 23:04:29 +0200 Subject: [PATCH 023/142] Disruptive change --- pandas/plotting/_matplotlib/core.py | 51 +++++++++++++++++++++-------- pandas/plotting/_matplotlib/hist.py | 28 +++++----------- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index c697cbe7ddaae..d3d5f307e7b01 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -25,8 +25,10 @@ ) from pandas.core.dtypes.missing import isna, notna -from pandas.core.frame import DataFrame +from pandas import concat import pandas.core.common as com +from pandas.core.frame import DataFrame +from pandas.core.index import MultiIndex from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib import converter @@ -126,7 +128,7 @@ def __init__( self.subplots = subplots if sharex is None: - if ax is None: + if ax is None and by is None: self.sharex = True else: # if we get an axis, the users should do the visibility @@ -263,18 +265,30 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): # else: # columns = data.columns - for col, values in data.items(): - if keep_index is True: - yield col, values - else: - yield col, values.values + if not isinstance(data.columns, ABCMultiIndex): + for col, values in data.items(): + if keep_index is True: + yield col, values + else: + yield col, values.values + else: + cols = data.columns.get_level_values(0).unique() + + for col in cols: + if keep_index is True: + yield col, data[col] + else: + yield col, data[col].values @property def nseries(self): if self.data.ndim == 1: return 1 else: - return self.data.shape[1] + if not isinstance(self.data.columns, ABCMultiIndex): + return self.data.shape[1] + else: + return len(set(self.data.columns.get_level_values(0))) def draw(self): self.plt.draw_if_interactive() @@ -404,17 +418,28 @@ 
def _compute_plot_data(self): # GH15079 restructure data if by is defined if self.by is not None: + self.subplots = True grouped = data.groupby(self.by) if self.column is not None: grouped = grouped[self.column] - # recreate data according to groupby object - data_dict = {} - for key, group in grouped: - data_dict[key] = group - data = DataFrame(data_dict) + if len(self.column) == 1: + # recreate data according to groupby object + data_dict = {} + for key, group in grouped: + data_dict[key] = group + data = DataFrame(data_dict) + else: + l = [] + for key, group in grouped: + columns = MultiIndex.from_product([[key], self.column]) + group = group[self.column] + group.columns = columns + l.append(group) + + data = concat(l, axis=1) # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` data = data._convert(datetime=True, timedelta=True) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index a016e3f9efe71..ecc8dfc64f284 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -12,7 +12,6 @@ from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots -from pandas.core.frame import DataFrame class HistPlot(LinePlot): @@ -70,14 +69,12 @@ def _plot( @classmethod def _group_plot( cls, - ax, + axes, data, - naxes, + fig, + labels, rot=90, xrot=None, - sharex=False, - sharey=False, - layout=None, xlabelsize=None, ylabelsize=None, yrot=None, @@ -92,19 +89,11 @@ def _group_plot( converter._WARN = False # no warning for pandas plots xrot = xrot or rot - fig, axes = _subplots( - naxes=naxes, - ax=ax, - squeeze=False, - sharex=sharex, - sharey=sharey, - layout=layout, - ) - _axes = _flatten(axes) for i, (label, y) in enumerate(data): - ax = _axes[i] - ax.hist(y, **kwds) + ax = axes[i] + # TODO: now df.hist also has no value for this + 
ax.hist(y, label=labels, **kwds) ax.set_title(pprint_thing(label)) _set_ticks_props( @@ -112,7 +101,7 @@ def _group_plot( ) fig.subplots_adjust( - bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3 + bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.8, wspace=0.3 ) return axes @@ -139,11 +128,10 @@ def _make_plot(self): self._add_legend_handle(artists[0], label, index=i) else: - naxes = len(list(self._iter_data())) data = self._iter_data() kwds = self.kwds.copy() kwds = self._make_plot_keywords(kwds, None) - self._group_plot(self._get_ax(0), data, naxes, **kwds) + self._group_plot(self.axes, data, self.fig, self.column, **kwds) def _make_plot_keywords(self, kwds, y): """merge BoxPlot/KdePlot properties to passed kwds""" From 5abedb6ba7d87cd40bf67b99d20b4658c0bfe6c2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 13 Sep 2019 16:59:56 +0200 Subject: [PATCH 024/142] should work this time --- pandas/plotting/_matplotlib/core.py | 6 ++-- pandas/plotting/_matplotlib/hist.py | 45 ++++++++++++++++++----------- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index d3d5f307e7b01..df9143061765b 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -432,14 +432,14 @@ def _compute_plot_data(self): data = DataFrame(data_dict) else: - l = [] + data_list = [] for key, group in grouped: columns = MultiIndex.from_product([[key], self.column]) group = group[self.column] group.columns = columns - l.append(group) + data_list.append(group) - data = concat(l, axis=1) + data = concat(data_list, axis=1) # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` data = data._convert(datetime=True, timedelta=True) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index ecc8dfc64f284..0c46a1f703f84 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ 
b/pandas/plotting/_matplotlib/hist.py @@ -26,23 +26,35 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): MPLPlot.__init__(self, data, **kwargs) def _args_adjust(self): - if self.by is None: - if is_integer(self.bins): - # create common bin edge - values = self.data._convert(datetime=True)._get_numeric_data() - values = np.ravel(values) - values = values[~isna(values)] - - hist, self.bins = np.histogram( - values, - bins=self.bins, - range=self.kwds.get("range", None), - weights=self.kwds.get("weights", None), - ) + if is_integer(self.bins): + if self.by is None: + self.bins = self._caculcate_bins(self.data) + + else: + grouped = self.data.groupby(self.by)[self.column] + bins_list = [] + for key, group in grouped: + print(key) + print(group) + bins_list.append(self._caculcate_bins(group)) + self.bins = bins_list if is_list_like(self.bottom): self.bottom = np.array(self.bottom) + def _caculcate_bins(self, data): + values = data._convert(datetime=True)._get_numeric_data() + values = np.ravel(values) + values = values[~isna(values)] + + hist, bins = np.histogram( + values, + bins=self.bins, + range=self.kwds.get("range", None), + weights=self.kwds.get("weights", None), + ) + return bins + @classmethod def _plot( cls, @@ -73,6 +85,7 @@ def _group_plot( data, fig, labels, + bins=None, rot=90, xrot=None, xlabelsize=None, @@ -92,8 +105,7 @@ def _group_plot( for i, (label, y) in enumerate(data): ax = axes[i] - # TODO: now df.hist also has no value for this - ax.hist(y, label=labels, **kwds) + ax.hist(y, bins[i], label=labels, **kwds) ax.set_title(pprint_thing(label)) _set_ticks_props( @@ -108,7 +120,6 @@ def _group_plot( def _make_plot(self): colors = self._get_colors() stacking_id = self._get_stacking_id() - if self.by is None: for i, (label, y) in enumerate(self._iter_data()): ax = self._get_ax(i) @@ -128,9 +139,9 @@ def _make_plot(self): self._add_legend_handle(artists[0], label, index=i) else: - data = self._iter_data() kwds = self.kwds.copy() kwds = 
self._make_plot_keywords(kwds, None) + data = self._iter_data() self._group_plot(self.axes, data, self.fig, self.column, **kwds) def _make_plot_keywords(self, kwds, y): From d73115a3ff683ab22a6b09bfc7c2010d6655980e Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 13 Sep 2019 17:05:36 +0200 Subject: [PATCH 025/142] Add in-code comments --- pandas/plotting/_matplotlib/core.py | 6 ++++++ pandas/plotting/_matplotlib/hist.py | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index df9143061765b..74687f1eff26d 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -128,6 +128,8 @@ def __init__( self.subplots = subplots if sharex is None: + + # if by is defined, subplots are used and sharex should be False if ax is None and by is None: self.sharex = True else: @@ -285,6 +287,9 @@ def nseries(self): if self.data.ndim == 1: return 1 else: + + # If MultiIndex column, only return the first level which + # corresponds to by argument if not isinstance(self.data.columns, ABCMultiIndex): return self.data.shape[1] else: @@ -440,6 +445,7 @@ def _compute_plot_data(self): data_list.append(group) data = concat(data_list, axis=1) + # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` data = data._convert(datetime=True, timedelta=True) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 0c46a1f703f84..49350e1bac2c8 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -26,6 +26,9 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): MPLPlot.__init__(self, data, **kwargs) def _args_adjust(self): + + # calculate bin number separately in different subplots + # where subplots are created based on by argument if is_integer(self.bins): if self.by is None: self.bins = self._caculcate_bins(self.data) @@ -43,6 +46,8 @@ def _args_adjust(self): self.bottom = 
np.array(self.bottom) def _caculcate_bins(self, data): + """Calculate bins given data""" + values = data._convert(datetime=True)._get_numeric_data() values = np.ravel(values) values = values[~isna(values)] From d7998bb515d267f8ba8ce7825bed358a43b1f8d9 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 13 Sep 2019 17:07:35 +0200 Subject: [PATCH 026/142] remove print --- pandas/plotting/_matplotlib/hist.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 49350e1bac2c8..0e39c9e61467a 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -37,8 +37,6 @@ def _args_adjust(self): grouped = self.data.groupby(self.by)[self.column] bins_list = [] for key, group in grouped: - print(key) - print(group) bins_list.append(self._caculcate_bins(group)) self.bins = bins_list From 1bbf7ea8bbbf656e0dc1f3c42f29521f20429a81 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 13 Sep 2019 19:41:13 +0200 Subject: [PATCH 027/142] reformat --- pandas/tests/plotting/test_frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index cdb24ab1b2987..f623aad310319 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3231,10 +3231,10 @@ def test_subplots_sharex_false(self): def test_hist_plot_by_argument(self): # GH 15079 - df = DataFrame(np.random.randn(30, 2), columns=['A', 'B']) + df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) df["C"] = np.random.choice(["a", "b", "c"], 30) - _check_plot_works(df.plot.hist, column='A', by='C') + _check_plot_works(df.plot.hist, column="A", by="C") def test_plot_no_rows(self): # GH 27758 From a279f45fa680155febd2aeecc4446fef61470659 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 13 Sep 2019 19:43:44 +0200 Subject: [PATCH 028/142] Dropna --- pandas/plotting/_matplotlib/hist.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 0e39c9e61467a..9474366eae4aa 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -108,6 +108,7 @@ def _group_plot( for i, (label, y) in enumerate(data): ax = axes[i] + y = y[~isna(y)] ax.hist(y, bins[i], label=labels, **kwds) ax.set_title(pprint_thing(label)) From 2b793eaf0571e7012aff5fe0df123dbdcac30999 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 14 Sep 2019 18:56:57 +0200 Subject: [PATCH 029/142] Add isna for multi column --- pandas/plotting/_matplotlib/hist.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 9474366eae4aa..fc9dc718e0929 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -20,7 +20,6 @@ class HistPlot(LinePlot): def __init__(self, data, bins=10, bottom=0, **kwargs): self.bins = bins # use mpl default self.bottom = bottom - self.by = kwargs.get("by") # Do not call LinePlot.__init__ which may fill nan MPLPlot.__init__(self, data, **kwargs) @@ -108,8 +107,11 @@ def _group_plot( for i, (label, y) in enumerate(data): ax = axes[i] - y = y[~isna(y)] - ax.hist(y, bins[i], label=labels, **kwds) + if len(y.shape) > 1: + y_notna = np.array(col[~isna(col)] for col in y.T).T + else: + y_notna = y[~isna(y)] + ax.hist(y_notna, bins[i], label=labels, **kwds) ax.set_title(pprint_thing(label)) _set_ticks_props( From 04de066e99565e8ae657227fde1026e3cacd1b2c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Sep 2019 22:20:13 +0200 Subject: [PATCH 030/142] try to remove warning --- pandas/plotting/_matplotlib/hist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index fc9dc718e0929..62d27a5d7509f 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ 
b/pandas/plotting/_matplotlib/hist.py @@ -102,13 +102,13 @@ def _group_plot( "will be created" ) - converter._WARN = False # no warning for pandas plots xrot = xrot or rot for i, (label, y) in enumerate(data): ax = axes[i] if len(y.shape) > 1: - y_notna = np.array(col[~isna(col)] for col in y.T).T + notna = [col[~isna(col)] for col in y.T] + y_notna = np.array(np.array(notna).T) else: y_notna = y[~isna(y)] ax.hist(y_notna, bins[i], label=labels, **kwds) From 4adc3240daf0d0e588a18a7aab68de11cee407e1 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 16 Sep 2019 09:24:20 +0200 Subject: [PATCH 031/142] test if removing pd works --- pandas/tests/plotting/test_frame.py | 40 ++++++++++++++--------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index f623aad310319..3194859e428f1 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -582,7 +582,7 @@ def test_subplots_timeseries_y_axis_not_supported(self): pd.to_datetime("2017-08-02 00:00:00"), ], } - testdata = pd.DataFrame(data) + testdata = DataFrame(data) ax_period = testdata.plot(x="numeric", y="period") assert ( ax_period.get_lines()[0].get_data()[1] == testdata["period"].values @@ -952,7 +952,7 @@ def test_bar_colors(self): tm.close() def test_bar_user_colors(self): - df = pd.DataFrame( + df = DataFrame( {"A": range(4), "B": range(1, 5), "color": ["red", "blue", "blue", "red"]} ) # This should *only* work when `y` is specified, else @@ -1114,13 +1114,13 @@ def test_bar_nan(self): @pytest.mark.slow def test_bar_categorical(self): # GH 13019 - df1 = pd.DataFrame( + df1 = DataFrame( np.random.randn(6, 5), index=pd.Index(list("ABCDEF")), columns=pd.Index(list("abcde")), ) # categorical index must behave the same - df2 = pd.DataFrame( + df2 = DataFrame( np.random.randn(6, 5), index=pd.CategoricalIndex(list("ABCDEF")), columns=pd.CategoricalIndex(list("abcde")), @@ -1167,7 +1167,7 @@ def 
test_if_scatterplot_colorbar_affects_xaxis_visibility(self): # interfere with x-axis label and ticklabels with # ipython inline backend. random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) + df = DataFrame(random_array, columns=["A label", "B label", "C label"]) ax1 = df.plot.scatter(x="A label", y="B label") ax2 = df.plot.scatter(x="A label", y="B label", c="C label") @@ -1190,7 +1190,7 @@ def test_if_hexbin_xaxis_label_is_visible(self): # interfere with x-axis label and ticklabels with # ipython inline backend. random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) + df = DataFrame(random_array, columns=["A label", "B label", "C label"]) ax = df.plot.hexbin("A label", "B label", gridsize=12) assert all(vis.get_visible() for vis in ax.xaxis.get_minorticklabels()) @@ -1202,7 +1202,7 @@ def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): import matplotlib.pyplot as plt random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) + df = DataFrame(random_array, columns=["A label", "B label", "C label"]) fig, axes = plt.subplots(1, 2) df.plot.scatter("A label", "B label", c="C label", ax=axes[0]) @@ -1218,7 +1218,7 @@ def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): @pytest.mark.slow def test_plot_scatter_with_categorical_data(self): # GH 16199 - df = pd.DataFrame( + df = DataFrame( {"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])} ) @@ -1883,7 +1883,7 @@ def test_df_legend_labels(self): def test_missing_marker_multi_plots_on_same_ax(self): # GH 18222 - df = pd.DataFrame( + df = DataFrame( data=[[1, 1, 1, 1], [2, 2, 4, 8]], columns=["x", "r", "g", "b"] ) fig, ax = self.plt.subplots(nrows=1, ncols=3) @@ -2023,7 +2023,7 @@ def test_line_colors(self): @pytest.mark.slow def test_dont_modify_colors(self): colors = ["r", "g", "b"] - 
pd.DataFrame(np.random.rand(10, 2)).plot(color=colors) + DataFrame(np.random.rand(10, 2)).plot(color=colors) assert len(colors) == 3 @pytest.mark.slow @@ -3114,7 +3114,7 @@ def test_passed_bar_colors(self): color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] colormap = mpl.colors.ListedColormap(color_tuples) - barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap) + barplot = DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap) assert color_tuples == [c.get_facecolor() for c in barplot.patches] def test_rcParams_bar_colors(self): @@ -3122,14 +3122,14 @@ def test_rcParams_bar_colors(self): color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] with mpl.rc_context(rc={"axes.prop_cycle": mpl.cycler("color", color_tuples)}): - barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar") + barplot = DataFrame([[1, 2, 3]]).plot(kind="bar") assert color_tuples == [c.get_facecolor() for c in barplot.patches] @pytest.mark.parametrize("method", ["line", "barh", "bar"]) def test_secondary_axis_font_size(self, method): # GH: 12565 df = ( - pd.DataFrame(np.random.randn(15, 2), columns=list("AB")) + DataFrame(np.random.randn(15, 2), columns=list("AB")) .assign(C=lambda df: df.B.cumsum()) .assign(D=lambda df: df.C * 1.1) ) @@ -3145,7 +3145,7 @@ def test_secondary_axis_font_size(self, method): def test_x_string_values_ticks(self): # Test if string plot index have a fixed xtick position # GH: 7612, GH: 22334 - df = pd.DataFrame( + df = DataFrame( { "sales": [3, 2, 3], "visits": [20, 42, 28], @@ -3166,7 +3166,7 @@ def test_x_multiindex_values_ticks(self): # Test if multiindex plot index have a fixed xtick position # GH: 15912 index = pd.MultiIndex.from_product([[2012, 2013], [1, 2]]) - df = pd.DataFrame(np.random.randn(4, 2), columns=["A", "B"], index=index) + df = DataFrame(np.random.randn(4, 2), columns=["A", "B"], index=index) ax = df.plot() ax.set_xlim(-1, 4) xticklabels = [t.get_text() for t in ax.get_xticklabels()] @@ -3181,7 +3181,7 @@ def 
test_x_multiindex_values_ticks(self): def test_xlim_plot_line(self, kind): # test if xlim is set correctly in plot.line and plot.area # GH 27686 - df = pd.DataFrame([2, 4], index=[1, 2]) + df = DataFrame([2, 4], index=[1, 2]) ax = df.plot(kind=kind) xlims = ax.get_xlim() assert xlims[0] < 1 @@ -3193,7 +3193,7 @@ def test_xlim_plot_line_correctly_in_mixed_plot_type(self): fig, ax = self.plt.subplots() indexes = ["k1", "k2", "k3", "k4"] - df = pd.DataFrame( + df = DataFrame( { "s1": [1000, 2000, 1500, 2000], "s2": [900, 1400, 2000, 3000], @@ -3216,7 +3216,7 @@ def test_xlim_plot_line_correctly_in_mixed_plot_type(self): def test_subplots_sharex_false(self): # test when sharex is set to False, two plots should have different # labels, GH 25160 - df = pd.DataFrame(np.random.rand(10, 2)) + df = DataFrame(np.random.rand(10, 2)) df.iloc[5:, 1] = np.nan df.iloc[:5, 0] = np.nan @@ -3238,7 +3238,7 @@ def test_hist_plot_by_argument(self): def test_plot_no_rows(self): # GH 27758 - df = pd.DataFrame(columns=["foo"], dtype=int) + df = DataFrame(columns=["foo"], dtype=int) assert df.empty ax = df.plot() assert len(ax.get_lines()) == 1 @@ -3247,7 +3247,7 @@ def test_plot_no_rows(self): assert len(line.get_ydata()) == 0 def test_plot_no_numeric_data(self): - df = pd.DataFrame(["a", "b", "c"]) + df = DataFrame(["a", "b", "c"]) with pytest.raises(TypeError): df.plot() From d0103a4b5fc227e325b9d952b8a3ff8f1fb5e4e3 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 16 Sep 2019 09:54:15 +0200 Subject: [PATCH 032/142] revert changes --- pandas/tests/plotting/test_frame.py | 40 ++++++++++++++--------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 3194859e428f1..f623aad310319 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -582,7 +582,7 @@ def test_subplots_timeseries_y_axis_not_supported(self): pd.to_datetime("2017-08-02 00:00:00"), ], } - 
testdata = DataFrame(data) + testdata = pd.DataFrame(data) ax_period = testdata.plot(x="numeric", y="period") assert ( ax_period.get_lines()[0].get_data()[1] == testdata["period"].values @@ -952,7 +952,7 @@ def test_bar_colors(self): tm.close() def test_bar_user_colors(self): - df = DataFrame( + df = pd.DataFrame( {"A": range(4), "B": range(1, 5), "color": ["red", "blue", "blue", "red"]} ) # This should *only* work when `y` is specified, else @@ -1114,13 +1114,13 @@ def test_bar_nan(self): @pytest.mark.slow def test_bar_categorical(self): # GH 13019 - df1 = DataFrame( + df1 = pd.DataFrame( np.random.randn(6, 5), index=pd.Index(list("ABCDEF")), columns=pd.Index(list("abcde")), ) # categorical index must behave the same - df2 = DataFrame( + df2 = pd.DataFrame( np.random.randn(6, 5), index=pd.CategoricalIndex(list("ABCDEF")), columns=pd.CategoricalIndex(list("abcde")), @@ -1167,7 +1167,7 @@ def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): # interfere with x-axis label and ticklabels with # ipython inline backend. random_array = np.random.random((1000, 3)) - df = DataFrame(random_array, columns=["A label", "B label", "C label"]) + df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) ax1 = df.plot.scatter(x="A label", y="B label") ax2 = df.plot.scatter(x="A label", y="B label", c="C label") @@ -1190,7 +1190,7 @@ def test_if_hexbin_xaxis_label_is_visible(self): # interfere with x-axis label and ticklabels with # ipython inline backend. 
random_array = np.random.random((1000, 3)) - df = DataFrame(random_array, columns=["A label", "B label", "C label"]) + df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) ax = df.plot.hexbin("A label", "B label", gridsize=12) assert all(vis.get_visible() for vis in ax.xaxis.get_minorticklabels()) @@ -1202,7 +1202,7 @@ def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): import matplotlib.pyplot as plt random_array = np.random.random((1000, 3)) - df = DataFrame(random_array, columns=["A label", "B label", "C label"]) + df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) fig, axes = plt.subplots(1, 2) df.plot.scatter("A label", "B label", c="C label", ax=axes[0]) @@ -1218,7 +1218,7 @@ def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): @pytest.mark.slow def test_plot_scatter_with_categorical_data(self): # GH 16199 - df = DataFrame( + df = pd.DataFrame( {"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])} ) @@ -1883,7 +1883,7 @@ def test_df_legend_labels(self): def test_missing_marker_multi_plots_on_same_ax(self): # GH 18222 - df = DataFrame( + df = pd.DataFrame( data=[[1, 1, 1, 1], [2, 2, 4, 8]], columns=["x", "r", "g", "b"] ) fig, ax = self.plt.subplots(nrows=1, ncols=3) @@ -2023,7 +2023,7 @@ def test_line_colors(self): @pytest.mark.slow def test_dont_modify_colors(self): colors = ["r", "g", "b"] - DataFrame(np.random.rand(10, 2)).plot(color=colors) + pd.DataFrame(np.random.rand(10, 2)).plot(color=colors) assert len(colors) == 3 @pytest.mark.slow @@ -3114,7 +3114,7 @@ def test_passed_bar_colors(self): color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] colormap = mpl.colors.ListedColormap(color_tuples) - barplot = DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap) + barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap) assert color_tuples == [c.get_facecolor() for c in barplot.patches] def test_rcParams_bar_colors(self): @@ -3122,14 +3122,14 @@ def 
test_rcParams_bar_colors(self): color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] with mpl.rc_context(rc={"axes.prop_cycle": mpl.cycler("color", color_tuples)}): - barplot = DataFrame([[1, 2, 3]]).plot(kind="bar") + barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar") assert color_tuples == [c.get_facecolor() for c in barplot.patches] @pytest.mark.parametrize("method", ["line", "barh", "bar"]) def test_secondary_axis_font_size(self, method): # GH: 12565 df = ( - DataFrame(np.random.randn(15, 2), columns=list("AB")) + pd.DataFrame(np.random.randn(15, 2), columns=list("AB")) .assign(C=lambda df: df.B.cumsum()) .assign(D=lambda df: df.C * 1.1) ) @@ -3145,7 +3145,7 @@ def test_secondary_axis_font_size(self, method): def test_x_string_values_ticks(self): # Test if string plot index have a fixed xtick position # GH: 7612, GH: 22334 - df = DataFrame( + df = pd.DataFrame( { "sales": [3, 2, 3], "visits": [20, 42, 28], @@ -3166,7 +3166,7 @@ def test_x_multiindex_values_ticks(self): # Test if multiindex plot index have a fixed xtick position # GH: 15912 index = pd.MultiIndex.from_product([[2012, 2013], [1, 2]]) - df = DataFrame(np.random.randn(4, 2), columns=["A", "B"], index=index) + df = pd.DataFrame(np.random.randn(4, 2), columns=["A", "B"], index=index) ax = df.plot() ax.set_xlim(-1, 4) xticklabels = [t.get_text() for t in ax.get_xticklabels()] @@ -3181,7 +3181,7 @@ def test_x_multiindex_values_ticks(self): def test_xlim_plot_line(self, kind): # test if xlim is set correctly in plot.line and plot.area # GH 27686 - df = DataFrame([2, 4], index=[1, 2]) + df = pd.DataFrame([2, 4], index=[1, 2]) ax = df.plot(kind=kind) xlims = ax.get_xlim() assert xlims[0] < 1 @@ -3193,7 +3193,7 @@ def test_xlim_plot_line_correctly_in_mixed_plot_type(self): fig, ax = self.plt.subplots() indexes = ["k1", "k2", "k3", "k4"] - df = DataFrame( + df = pd.DataFrame( { "s1": [1000, 2000, 1500, 2000], "s2": [900, 1400, 2000, 3000], @@ -3216,7 +3216,7 @@ def 
test_xlim_plot_line_correctly_in_mixed_plot_type(self): def test_subplots_sharex_false(self): # test when sharex is set to False, two plots should have different # labels, GH 25160 - df = DataFrame(np.random.rand(10, 2)) + df = pd.DataFrame(np.random.rand(10, 2)) df.iloc[5:, 1] = np.nan df.iloc[:5, 0] = np.nan @@ -3238,7 +3238,7 @@ def test_hist_plot_by_argument(self): def test_plot_no_rows(self): # GH 27758 - df = DataFrame(columns=["foo"], dtype=int) + df = pd.DataFrame(columns=["foo"], dtype=int) assert df.empty ax = df.plot() assert len(ax.get_lines()) == 1 @@ -3247,7 +3247,7 @@ def test_plot_no_rows(self): assert len(line.get_ydata()) == 0 def test_plot_no_numeric_data(self): - df = DataFrame(["a", "b", "c"]) + df = pd.DataFrame(["a", "b", "c"]) with pytest.raises(TypeError): df.plot() From f94dbb45e119824b025536074353daa20758161f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 16 Sep 2019 18:59:26 +0200 Subject: [PATCH 033/142] try if warning gone --- pandas/tests/plotting/test_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index f623aad310319..5631addf4235f 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3231,7 +3231,7 @@ def test_subplots_sharex_false(self): def test_hist_plot_by_argument(self): # GH 15079 - df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) + df = pd.DataFrame(np.random.randn(30, 2), columns=["A", "B"]) df["C"] = np.random.choice(["a", "b", "c"], 30) _check_plot_works(df.plot.hist, column="A", by="C") From 0415cb0a0662494aee61bd94979a96c18fac1c5a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 17 Sep 2019 10:26:33 +0200 Subject: [PATCH 034/142] try again --- pandas/plotting/_matplotlib/core.py | 2 +- pandas/tests/plotting/test_frame.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 
74687f1eff26d..3ad33ecd9e179 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -25,10 +25,10 @@ ) from pandas.core.dtypes.missing import isna, notna -from pandas import concat import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.index import MultiIndex +from pandas.core.reshape.concat import concat from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib import converter diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 5631addf4235f..f623aad310319 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3231,7 +3231,7 @@ def test_subplots_sharex_false(self): def test_hist_plot_by_argument(self): # GH 15079 - df = pd.DataFrame(np.random.randn(30, 2), columns=["A", "B"]) + df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) df["C"] = np.random.choice(["a", "b", "c"], 30) _check_plot_works(df.plot.hist, column="A", by="C") From c00588001be47262ed9bcb22d9307d9b28f8e25e Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 12:49:28 +0100 Subject: [PATCH 035/142] fix conflict and merge master --- pandas/plotting/_matplotlib/hist.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 370b362f77fb5..0e60ca97758b9 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -36,18 +36,6 @@ def _args_adjust(self): bins_list.append(self._caculcate_bins(group)) self.bins = bins_list - # create common bin edge - values = self.data._convert(datetime=True)._get_numeric_data() - values = np.ravel(values) - values = values[~isna(values)] - - _, self.bins = np.histogram( - values, - bins=self.bins, - range=self.kwds.get("range", None), - weights=self.kwds.get("weights", None), - ) - if is_list_like(self.bottom): self.bottom = np.array(self.bottom) From 
a1fabc513d6c03fcb57b5033c08b144794557514 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 13:39:53 +0100 Subject: [PATCH 036/142] Fix linting error --- pandas/plotting/_matplotlib/core.py | 2 +- pandas/plotting/_matplotlib/hist.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 06c2c69bd3f0c..97ff2f4a2b1d7 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -106,7 +106,7 @@ def __init__( layout=None, include_bool=False, column=None, - **kwds + **kwds, ): import matplotlib.pyplot as plt diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 0e60ca97758b9..d0d98e897c91a 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -90,7 +90,7 @@ def _group_plot( xlabelsize=None, ylabelsize=None, yrot=None, - **kwds + **kwds, ): if "figure" in kwds: raise ValueError( From 70453f13b72c9170f5c6a19fcccb78f42f8025a9 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 14:31:54 +0100 Subject: [PATCH 037/142] Add test --- pandas/tests/plotting/test_frame.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index d2f7aeca6cd73..ebf259b563261 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3256,12 +3256,13 @@ def test_subplots_sharex_false(self): tm.assert_numpy_array_equal(axs[0].get_xticks(), expected_ax1) tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2) - def test_hist_plot_by_argument(self): + @pytest.mark.parametrize("column", ["A", ["A", "B"]]) + def test_hist_plot_by_argument(self, column): # GH 15079 df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) df["C"] = np.random.choice(["a", "b", "c"], 30) - _check_plot_works(df.plot.hist, column="A", by="C") + 
_check_plot_works(df.plot.hist, column=column, by="C") def test_plot_no_rows(self): # GH 27758 From b6579a56c21b2e4b8ca0c903813f9dd15c0eb126 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 14:41:13 +0100 Subject: [PATCH 038/142] remove unused code --- pandas/plotting/_matplotlib/core.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 97ff2f4a2b1d7..51b3a735ac9fa 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -248,12 +248,6 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): if fillna is not None: data = data.fillna(fillna) - # TODO: unused? - # if self.sort_columns: - # columns = com.try_sort(data.columns) - # else: - # columns = data.columns - if not isinstance(data.columns, ABCMultiIndex): for col, values in data.items(): if keep_index is True: From e99f3dc685df32f7a610b909f3adc03b6255012d Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 16:53:35 +0100 Subject: [PATCH 039/142] add test and make code more robust --- pandas/plotting/_matplotlib/core.py | 9 ++++++--- pandas/tests/plotting/test_frame.py | 6 ++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 51b3a735ac9fa..c7131c0868a98 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -248,7 +248,7 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): if fillna is not None: data = data.fillna(fillna) - if not isinstance(data.columns, ABCMultiIndex): + if self.by is None: for col, values in data.items(): if keep_index is True: yield col, values @@ -258,10 +258,11 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): cols = data.columns.get_level_values(0).unique() for col in cols: + mask = data.columns.get_level_values(0) == col if keep_index is True: - yield col, data[col] + yield 
col, data.loc[:, mask] else: - yield col, data[col].values + yield col, data.loc[:, mask].values @property def nseries(self): @@ -420,6 +421,8 @@ def _compute_plot_data(self): data_list = [] for key, group in grouped: columns = MultiIndex.from_product([[key], self.column]) + # columns = MultiIndex([tuple([c for c in col]) for col in columns]) + group = group[self.column] group.columns = columns data_list.append(group) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index ebf259b563261..8df4a16f73646 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3256,13 +3256,15 @@ def test_subplots_sharex_false(self): tm.assert_numpy_array_equal(axs[0].get_xticks(), expected_ax1) tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2) + @pytest.mark.parametrize("by", ["C", ["C", "D"]]) @pytest.mark.parametrize("column", ["A", ["A", "B"]]) - def test_hist_plot_by_argument(self, column): + def test_hist_plot_by_argument(self, by, column): # GH 15079 df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) df["C"] = np.random.choice(["a", "b", "c"], 30) + df["D"] = np.random.choice(["a", "b", "c"], 30) - _check_plot_works(df.plot.hist, column=column, by="C") + _check_plot_works(df.plot.hist, column=column, by=by) def test_plot_no_rows(self): # GH 27758 From 99d6d67316d0432cdaedcaaecc683c638ca95ee4 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 16:54:56 +0100 Subject: [PATCH 040/142] remove comment --- pandas/plotting/_matplotlib/core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index c7131c0868a98..de2d5e20216fc 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -421,8 +421,6 @@ def _compute_plot_data(self): data_list = [] for key, group in grouped: columns = MultiIndex.from_product([[key], self.column]) - # columns = MultiIndex([tuple([c for c in col]) 
for col in columns]) - group = group[self.column] group.columns = columns data_list.append(group) From 8e2fcf62daf0efda8c81ae2bcc8150228a7a3bd5 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 16:59:53 +0100 Subject: [PATCH 041/142] clean the code --- pandas/plotting/_matplotlib/core.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index de2d5e20216fc..d5ed818ba5757 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -268,14 +268,10 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): def nseries(self): if self.data.ndim == 1: return 1 + elif self.by is None: + return self.data.shape[1] else: - - # If MultiIndex column, only return the first level which - # corresponds to by argument - if not isinstance(self.data.columns, ABCMultiIndex): - return self.data.shape[1] - else: - return len(set(self.data.columns.get_level_values(0))) + return len(set(self.data.columns.get_level_values(0))) def draw(self): self.plt.draw_if_interactive() From d02f4ac35887b2af4482ffd3d6a0f72842f5197f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 17:17:51 +0100 Subject: [PATCH 042/142] simplify code --- pandas/plotting/_matplotlib/core.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index d5ed818ba5757..c6b70bbfcf971 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -113,6 +113,8 @@ def __init__( self.data = data self.by = by + if isinstance(column, str): + column = [column] self.column = column self.kind = kind @@ -403,16 +405,12 @@ def _compute_plot_data(self): self.subplots = True grouped = data.groupby(self.by) - if self.column is not None: - grouped = grouped[self.column] - if len(self.column) == 1: # recreate data according to groupby object data_dict = {} for 
key, group in grouped: - data_dict[key] = group + data_dict[key] = group[self.column[0]] data = DataFrame(data_dict) - else: data_list = [] for key, group in grouped: From 947189c5b8a3903641f3598a0b9176aea42900b4 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 17:29:29 +0100 Subject: [PATCH 043/142] simplify code --- pandas/plotting/_matplotlib/core.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index c6b70bbfcf971..b468fb5bf1071 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -405,21 +405,14 @@ def _compute_plot_data(self): self.subplots = True grouped = data.groupby(self.by) - if len(self.column) == 1: - # recreate data according to groupby object - data_dict = {} - for key, group in grouped: - data_dict[key] = group[self.column[0]] - data = DataFrame(data_dict) - else: - data_list = [] - for key, group in grouped: - columns = MultiIndex.from_product([[key], self.column]) - group = group[self.column] - group.columns = columns - data_list.append(group) - - data = concat(data_list, axis=1) + data_list = [] + for key, group in grouped: + columns = MultiIndex.from_product([[key], self.column]) + group = group[self.column] + group.columns = columns + data_list.append(group) + + data = concat(data_list, axis=1) # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` From 6b5203d40cb4f0bad503c2aad7c8b3a148b8c33c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 18:05:05 +0100 Subject: [PATCH 044/142] fix linting --- pandas/plotting/_matplotlib/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index b468fb5bf1071..cb5e544ee7218 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -24,7 +24,6 @@ from pandas.core.dtypes.missing import isna, 
notna import pandas.core.common as com -from pandas.core.frame import DataFrame from pandas.core.index import MultiIndex from pandas.core.reshape.concat import concat From 27d0d214e1fe9e84a24cbae403186ce82d12ad30 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 19:32:07 +0100 Subject: [PATCH 045/142] Add doc for hist --- pandas/plotting/_core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index dd907457f7c32..bb8de96830147 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1131,7 +1131,7 @@ def box(self, by=None, **kwargs): """ return self(kind="box", by=by, **kwargs) - def hist(self, by=None, bins=10, **kwargs): + def hist(self, column=None, by=None, bins=10, **kwargs): """ Draw one histogram of the DataFrame's columns. @@ -1142,6 +1142,8 @@ def hist(self, by=None, bins=10, **kwargs): Parameters ---------- + column: str or sequence, optional + If passed, will be used to limit data to a subset of columns. by : str or sequence, optional Column in the DataFrame to group by. 
bins : int, default 10 @@ -1176,7 +1178,7 @@ def hist(self, by=None, bins=10, **kwargs): >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) """ - return self(kind="hist", by=by, bins=bins, **kwargs) + return self(kind="hist", column=column, by=by, bins=bins, **kwargs) def kde(self, bw_method=None, ind=None, **kwargs): """ From 48ff52120ee833d991423b2fe17e579f90ccb0c4 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 19:59:18 +0100 Subject: [PATCH 046/142] revert change --- pandas/plotting/_core.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index bb8de96830147..dd907457f7c32 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1131,7 +1131,7 @@ def box(self, by=None, **kwargs): """ return self(kind="box", by=by, **kwargs) - def hist(self, column=None, by=None, bins=10, **kwargs): + def hist(self, by=None, bins=10, **kwargs): """ Draw one histogram of the DataFrame's columns. @@ -1142,8 +1142,6 @@ def hist(self, column=None, by=None, bins=10, **kwargs): Parameters ---------- - column: str or sequence, optional - If passed, will be used to limit data to a subset of columns. by : str or sequence, optional Column in the DataFrame to group by. 
bins : int, default 10 @@ -1178,7 +1176,7 @@ def hist(self, column=None, by=None, bins=10, **kwargs): >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) """ - return self(kind="hist", column=column, by=by, bins=bins, **kwargs) + return self(kind="hist", by=by, bins=bins, **kwargs) def kde(self, bw_method=None, ind=None, **kwargs): """ From f39d948d0736f3679d87e1bce7148b1fbd0ea2ed Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 20:01:53 +0100 Subject: [PATCH 047/142] fix warning --- pandas/plotting/_matplotlib/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index cb5e544ee7218..bade4809d5594 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -24,7 +24,7 @@ from pandas.core.dtypes.missing import isna, notna import pandas.core.common as com -from pandas.core.index import MultiIndex +from pandas import MultiIndex from pandas.core.reshape.concat import concat from pandas.io.formats.printing import pprint_thing From 5d1705c6d9515749dc9b0fc53874d1d9cf4889f2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Jan 2020 20:02:36 +0100 Subject: [PATCH 048/142] isort --- pandas/plotting/_matplotlib/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index bade4809d5594..bbb38ec320c0e 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -23,8 +23,8 @@ ) from pandas.core.dtypes.missing import isna, notna -import pandas.core.common as com from pandas import MultiIndex +import pandas.core.common as com from pandas.core.reshape.concat import concat from pandas.io.formats.printing import pprint_thing From 46a803162b4e6fddbbf16c8554f280a9d3cf883b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Jan 2020 14:01:17 +0100 Subject: [PATCH 049/142] 
simplify code --- pandas/plotting/_matplotlib/hist.py | 91 ++++++++++------------------- 1 file changed, 30 insertions(+), 61 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index d0d98e897c91a..3ddda5362a798 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -68,7 +68,6 @@ def _plot( ): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(bins) - 1) - y = y[~isna(y)] base = np.zeros(len(bins) - 1) bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"]) @@ -77,75 +76,45 @@ def _plot( cls._update_stacker(ax, stacking_id, n) return patches - @classmethod - def _group_plot( - cls, - axes, - data, - fig, - labels, - bins=None, - rot=90, - xrot=None, - xlabelsize=None, - ylabelsize=None, - yrot=None, - **kwds, - ): - if "figure" in kwds: - raise ValueError( - "Cannot pass 'figure' when using the " - "'by' argument, since a new 'Figure' instance " - "will be created" - ) - - xrot = xrot or rot + def _make_plot(self): + colors = self._get_colors() + stacking_id = self._get_stacking_id() + for i, (label, y) in enumerate(self._iter_data()): + ax = self._get_ax(i) - for i, (label, y) in enumerate(data): - ax = axes[i] - if len(y.shape) > 1: - notna = [col[~isna(col)] for col in y.T] - y_notna = np.array(np.array(notna).T) - else: - y_notna = y[~isna(y)] - ax.hist(y_notna, bins[i], label=labels, **kwds) - ax.set_title(pprint_thing(label)) + kwds = self.kwds.copy() + label = pprint_thing(label) + kwds["label"] = label - _set_ticks_props( - axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot - ) + style, kwds = self._apply_style_colors(colors, kwds, i, label) + if style is not None: + kwds["style"] = style - fig.subplots_adjust( - bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.8, wspace=0.3 - ) - return axes + kwds = self._make_plot_keywords(kwds, y) - def _make_plot(self): - colors = self._get_colors() - stacking_id = 
self._get_stacking_id() - if self.by is None: - for i, (label, y) in enumerate(self._iter_data()): - ax = self._get_ax(i) + if self.by is not None: + kwds["bins"] = kwds["bins"][i] + kwds["label"] = self.column + kwds.pop("color") - kwds = self.kwds.copy() - label = pprint_thing(label) - kwds["label"] = label + y = self._reformat_y(y) + artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) - style, kwds = self._apply_style_colors(colors, kwds, i, label) - if style is not None: - kwds["style"] = style + # when by is applied, show title for subplots to know which group it is + if self.by is not None: + ax.set_title(pprint_thing(label)) - kwds = self._make_plot_keywords(kwds, y) - artists = self._plot( - ax, y, column_num=i, stacking_id=stacking_id, **kwds - ) - self._add_legend_handle(artists[0], label, index=i) + self._add_legend_handle(artists[0], label, index=i) + def _reformat_y(self, y): + """Internal function to reformat y given `by` is applied or not. + """ + if self.by is not None and len(y.shape) > 1: + notna = [col[~isna(col)] for col in y.T] + y = np.array(np.array(notna).T) else: - kwds = self.kwds.copy() - kwds = self._make_plot_keywords(kwds, None) - data = self._iter_data() - self._group_plot(self.axes, data, self.fig, self.column, **kwds) + y = y[~isna(y)] + return y def _make_plot_keywords(self, kwds, y): """merge BoxPlot/KdePlot properties to passed kwds""" From 57a96e6d6e1f4281e865487f2e5946a40217add2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Jan 2020 14:03:54 +0100 Subject: [PATCH 050/142] simpler python --- pandas/plotting/_matplotlib/core.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 43488ed2a4c52..c42f27b3712fa 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -112,9 +112,7 @@ def __init__( self.data = data self.by = by - if isinstance(column, str): - column = 
[column] - self.column = column + self.column = [column] if isinstance(column, str) else column self.kind = kind From 29127f08870537a1d3d542317129f5fb8f2c959f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Jan 2020 14:04:55 +0100 Subject: [PATCH 051/142] remove unused --- pandas/plotting/_matplotlib/hist.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 3ddda5362a798..8032d33db16f1 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -17,7 +17,6 @@ class HistPlot(LinePlot): def __init__(self, data, bins=10, bottom=0, **kwargs): self.bins = bins # use mpl default self.bottom = bottom - # Do not call LinePlot.__init__ which may fill nan MPLPlot.__init__(self, data, **kwargs) From 61bb97f45bbf4ae39ec82064db5b1f7869b2b887 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Jan 2020 14:06:08 +0100 Subject: [PATCH 052/142] restore blank lines --- pandas/plotting/_matplotlib/hist.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 8032d33db16f1..5fbfe7ea879b3 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -78,10 +78,12 @@ def _plot( def _make_plot(self): colors = self._get_colors() stacking_id = self._get_stacking_id() + for i, (label, y) in enumerate(self._iter_data()): ax = self._get_ax(i) kwds = self.kwds.copy() + label = pprint_thing(label) kwds["label"] = label From 62fb9e660fa0f377adb77ac6aac0e09c472ae254 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Jan 2020 15:54:04 +0100 Subject: [PATCH 053/142] Add extensive tests --- pandas/plotting/_matplotlib/hist.py | 2 + pandas/tests/plotting/test_frame.py | 98 +++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 5fbfe7ea879b3..a014af9c846d9 100644 --- 
a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -93,6 +93,8 @@ def _make_plot(self): kwds = self._make_plot_keywords(kwds, y) + # the bins is multi-dimension array now and each plot need only 1-d and + # when by is applied, label should be columns that are grouped if self.by is not None: kwds["bins"] = kwds["bins"][i] kwds["label"] = self.column diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 08e200e6bba2d..d0c3d9d23ef3e 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3266,6 +3266,104 @@ def test_hist_plot_by_argument(self, by, column): _check_plot_works(df.plot.hist, column=column, by=by) + @pytest.mark.slow + @pytest.mark.parametrize( + "by, column, layout, axes_num", + [ + (["C"], "A", (2, 2), 3), + ("C", "A", (2, 2), 3), + (["C"], ["A"], (1, 3), 3), + ("C", ["A", "B"], (3, 1), 3), + (["C", "D"], "A", (9, 1), 9), + (["C", "D"], "A", (3, 3), 9), + (["C", "D"], ["A"], (5, 2), 9), + (["C", "D"], ["A", "B"], (9, 1), 9), + (["C", "D"], ["A", "B"], (5, 2), 9), + ], + ) + def test_hist_plot_layout_with_by(self, by, column, layout, axes_num): + # GH 15079 + np.random.randn(2020) + df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) + df["C"] = np.random.choice(["a", "b", "c"], 30) + df["D"] = np.random.choice(["a", "b", "c"], 30) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.hist, column=column, by=by, layout=layout) + self._check_axes_shape(axes, axes_num=axes_num, layout=layout) + + def test_hist_plot_invalid_layout_with_by(self): + # GH 15079, test if error is raised when invalid layout is given + np.random.randn(2020) + df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) + df["C"] = np.random.choice(["a", "b", "c"], 30) + df["D"] = np.random.choice(["a", "b", "c"], 30) + + # layout too small for all 3 plots + with pytest.raises(ValueError): + df.plot.hist(column=["A", "B"], by="C", 
layout=(1, 1)) + + # invalid format for layout + with pytest.raises(ValueError): + df.plot.hist(column=["A", "B"], by="C", layout=(1,)) + with pytest.raises(ValueError): + df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1)) + + @pytest.mark.slow + def test_axis_share_x_with_by(self): + # GH 15079 + np.random.randn(2020) + df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) + df["C"] = np.random.choice(["a", "b", "c"], 30) + df["D"] = np.random.choice(["a", "b", "c"], 30) + + ax1, ax2, ax3 = df.plot.hist(column="A", by="C", sharex=True) + + # share x + assert ax1._shared_x_axes.joined(ax1, ax2) + assert ax2._shared_x_axes.joined(ax1, ax2) + assert ax3._shared_x_axes.joined(ax1, ax3) + assert ax3._shared_x_axes.joined(ax2, ax3) + + # don't share y + assert not ax1._shared_y_axes.joined(ax1, ax2) + assert not ax2._shared_y_axes.joined(ax1, ax2) + assert not ax3._shared_y_axes.joined(ax1, ax3) + assert not ax3._shared_y_axes.joined(ax2, ax3) + + @pytest.mark.slow + def test_axis_share_y_with_by(self): + # GH 15079 + np.random.randn(2020) + df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) + df["C"] = np.random.choice(["a", "b", "c"], 30) + df["D"] = np.random.choice(["a", "b", "c"], 30) + + ax1, ax2, ax3 = df.plot.hist(column="A", by="C", sharey=True) + + # share y + assert ax1._shared_y_axes.joined(ax1, ax2) + assert ax2._shared_y_axes.joined(ax1, ax2) + assert ax3._shared_y_axes.joined(ax1, ax3) + assert ax3._shared_y_axes.joined(ax2, ax3) + + # don't share x + assert not ax1._shared_x_axes.joined(ax1, ax2) + assert not ax2._shared_x_axes.joined(ax1, ax2) + assert not ax3._shared_x_axes.joined(ax1, ax3) + assert not ax3._shared_x_axes.joined(ax2, ax3) + + @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) + def test_figure_shape_hist_with_by(self, figsize): + # GH 15079 + np.random.randn(2020) + df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) + df["C"] = np.random.choice(["a", "b", "c"], 30) + df["D"] = np.random.choice(["a", 
"b", "c"], 30) + + axes = df.plot.hist(column="A", by="C", figsize=figsize) + self._check_axes_shape(axes, axes_num=3, figsize=figsize) + def test_plot_no_rows(self): # GH 27758 df = pd.DataFrame(columns=["foo"], dtype=int) From 638174bece1ac74fa498e0436c32076a3a68883c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Jan 2020 16:18:21 +0100 Subject: [PATCH 054/142] fix seed --- pandas/tests/plotting/test_frame.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index d0c3d9d23ef3e..bebd915fec61d 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3283,7 +3283,7 @@ def test_hist_plot_by_argument(self, by, column): ) def test_hist_plot_layout_with_by(self, by, column, layout, axes_num): # GH 15079 - np.random.randn(2020) + np.random.seed(0) df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) df["C"] = np.random.choice(["a", "b", "c"], 30) df["D"] = np.random.choice(["a", "b", "c"], 30) @@ -3294,7 +3294,7 @@ def test_hist_plot_layout_with_by(self, by, column, layout, axes_num): def test_hist_plot_invalid_layout_with_by(self): # GH 15079, test if error is raised when invalid layout is given - np.random.randn(2020) + np.random.seed(0) df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) df["C"] = np.random.choice(["a", "b", "c"], 30) df["D"] = np.random.choice(["a", "b", "c"], 30) @@ -3312,7 +3312,7 @@ def test_hist_plot_invalid_layout_with_by(self): @pytest.mark.slow def test_axis_share_x_with_by(self): # GH 15079 - np.random.randn(2020) + np.random.seed(0) df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) df["C"] = np.random.choice(["a", "b", "c"], 30) df["D"] = np.random.choice(["a", "b", "c"], 30) @@ -3334,7 +3334,7 @@ def test_axis_share_x_with_by(self): @pytest.mark.slow def test_axis_share_y_with_by(self): # GH 15079 - np.random.randn(2020) + np.random.seed(0) df = DataFrame(np.random.randn(30, 2), 
columns=["A", "B"]) df["C"] = np.random.choice(["a", "b", "c"], 30) df["D"] = np.random.choice(["a", "b", "c"], 30) @@ -3356,7 +3356,6 @@ def test_axis_share_y_with_by(self): @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) def test_figure_shape_hist_with_by(self, figsize): # GH 15079 - np.random.randn(2020) df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) df["C"] = np.random.choice(["a", "b", "c"], 30) df["D"] = np.random.choice(["a", "b", "c"], 30) From 5adb25dccf30630d2caab9e0d5f878ac05a72a06 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 15 Jan 2020 09:22:28 +0100 Subject: [PATCH 055/142] code change based on reviews --- doc/source/whatsnew/v1.1.0.rst | 3 ++- pandas/plotting/_core.py | 10 ++++++++++ pandas/plotting/_matplotlib/core.py | 14 +++++++------- pandas/plotting/_matplotlib/hist.py | 3 +-- pandas/tests/plotting/test_frame.py | 11 ++++++++--- 5 files changed, 28 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 721bcb0758992..327787cbebc97 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -127,7 +127,8 @@ Plotting ^^^^^^^^ - -- +- Implement ``by`` argument for :meth:`DataFrame.plot.hist` (:issue:`15079`) + Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index dd907457f7c32..a65980221837b 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1175,6 +1175,16 @@ def hist(self, by=None, bins=10, **kwargs): ... columns = ['one']) >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) + + .. 
plot:: + :context: close-figs + + >>> np.random.seed(159753) + >>> df = pd.DataFrame(np.random.randn(30, 2), columns=['A', 'B']) + >>> df['C'] = np.random.choice(['a', 'b', 'c'], 30) + >>> df['D'] = np.random.choice(['a', 'b', 'c'], 30) + >>> ax = df.plot.hist(column=['A', 'B'], by=['C'], figsize=(8, 10)) + """ return self(kind="hist", by=by, bins=bins, **kwargs) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index c42f27b3712fa..b0b0369f95ce6 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -248,7 +248,7 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): if self.by is None: for col, values in data.items(): - if keep_index is True: + if keep_index: yield col, values else: yield col, values.values @@ -256,11 +256,11 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): cols = data.columns.get_level_values(0).unique() for col in cols: - mask = data.columns.get_level_values(0) == col + data_value = data.loc[:, data.columns.get_level_values(0) == col] if keep_index is True: - yield col, data.loc[:, mask] + yield col, data_value else: - yield col, data.loc[:, mask].values + yield col, data_value.values @property def nseries(self): @@ -404,9 +404,9 @@ def _compute_plot_data(self): data_list = [] for key, group in grouped: columns = MultiIndex.from_product([[key], self.column]) - group = group[self.column] - group.columns = columns - data_list.append(group) + sub_group = group[self.column] + sub_group.columns = columns + data_list.append(sub_group) data = concat(data_list, axis=1) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 85e103970d40b..dc182536a3696 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -110,8 +110,7 @@ def _make_plot(self): self._add_legend_handle(artists[0], label, index=i) def _reformat_y(self, y): - """Internal function to reformat y given `by` is 
applied or not. - """ + """Internal function to reformat y given `by` is applied or not.""" if self.by is not None and len(y.shape) > 1: notna = [col[~isna(col)] for col in y.T] y = np.array(np.array(notna).T) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index bebd915fec61d..4292295e08793 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -4,6 +4,7 @@ from datetime import date, datetime import itertools +import re import string import warnings @@ -3300,13 +3301,17 @@ def test_hist_plot_invalid_layout_with_by(self): df["D"] = np.random.choice(["a", "b", "c"], 30) # layout too small for all 3 plots - with pytest.raises(ValueError): + msg = "larger than required size" + with pytest.raises(ValueError, match=msg): df.plot.hist(column=["A", "B"], by="C", layout=(1, 1)) # invalid format for layout - with pytest.raises(ValueError): + msg = re.escape("Layout must be a tuple of (rows, columns)") + with pytest.raises(ValueError, match=msg): df.plot.hist(column=["A", "B"], by="C", layout=(1,)) - with pytest.raises(ValueError): + + msg = "At least one dimension of layout must be positive" + with pytest.raises(ValueError, match=msg): df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1)) @pytest.mark.slow From 7051432e30b3ced05a052ad7c55c8e3d7cfad9c2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 15 Jan 2020 09:46:50 +0100 Subject: [PATCH 056/142] fix linting --- pandas/plotting/_core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index a65980221837b..c4861eafb60b7 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1182,9 +1182,7 @@ def hist(self, by=None, bins=10, **kwargs): >>> np.random.seed(159753) >>> df = pd.DataFrame(np.random.randn(30, 2), columns=['A', 'B']) >>> df['C'] = np.random.choice(['a', 'b', 'c'], 30) - >>> df['D'] = np.random.choice(['a', 'b', 'c'], 30) >>> ax = df.plot.hist(column=['A', 'B'], 
by=['C'], figsize=(8, 10)) - """ return self(kind="hist", by=by, bins=bins, **kwargs) From adbde9f77398e18a8e6bf3a44803e71eefa0d733 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 15 Jan 2020 09:50:02 +0100 Subject: [PATCH 057/142] update doc --- pandas/plotting/_core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index c4861eafb60b7..9b4f69d380eea 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1176,6 +1176,8 @@ def hist(self, by=None, bins=10, **kwargs): >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) + If `by` is defined, a grouped hist plot is generated: + .. plot:: :context: close-figs From abd10f317eb78a5508ee3c66356b1d3283e3fc5d Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 10 Feb 2020 18:07:31 +0100 Subject: [PATCH 058/142] code change based on reviews --- pandas/plotting/_matplotlib/core.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 881d6b171b25a..26e1eb35b8879 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -112,7 +112,7 @@ def __init__( self.data = data self.by = by - self.column = [column] if isinstance(column, str) else column + self.column = [column] if not isinstance(column, list) else column self.kind = kind @@ -247,20 +247,21 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): data = data.fillna(fillna) if self.by is None: - for col, values in data.items(): - if keep_index: - yield col, values - else: - yield col, values.values + cols = data.columns else: cols = data.columns.get_level_values(0).unique() - for col in cols: - data_value = data.loc[:, data.columns.get_level_values(0) == col] - if keep_index is True: - yield col, data_value - else: - yield col, data_value.values + for col in cols: + if self.by is None: + values 
= data.loc[:, col] + else: + # if `by` is defined, select columns which are grouped by + values = data.loc[:, data.columns.get_level_values(0) == col] + + if keep_index: + yield col, values + else: + yield col, values.values @property def nseries(self): From c20d81a75d8242a71573b6e0153b723388351c79 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 10 Feb 2020 20:08:57 +0100 Subject: [PATCH 059/142] fixup --- pandas/plotting/_matplotlib/core.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 26e1eb35b8879..cfa501103abf8 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -247,21 +247,21 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): data = data.fillna(fillna) if self.by is None: - cols = data.columns + for col, values in data.items(): + self._yield_values(keep_index, col, values) else: cols = data.columns.get_level_values(0).unique() - for col in cols: - if self.by is None: - values = data.loc[:, col] - else: - # if `by` is defined, select columns which are grouped by + for col in cols: values = data.loc[:, data.columns.get_level_values(0) == col] + self._yield_values(keep_index, col, values) - if keep_index: - yield col, values - else: - yield col, values.values + def _yield_values(self, keep_index, col, values): + """Yield col and values based on keep_index value.""" + if keep_index is True: + yield col, values + else: + yield col, values.values @property def nseries(self): From 07112c00b661eae885e69b068ae28e0b52304656 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 10 Feb 2020 20:10:53 +0100 Subject: [PATCH 060/142] fixup --- pandas/plotting/_matplotlib/core.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index cfa501103abf8..d6302ce91cf4d 100644 --- 
a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -248,20 +248,19 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): if self.by is None: for col, values in data.items(): - self._yield_values(keep_index, col, values) + if keep_index is True: + yield col, values + else: + yield col, values.values else: cols = data.columns.get_level_values(0).unique() for col in cols: - values = data.loc[:, data.columns.get_level_values(0) == col] - self._yield_values(keep_index, col, values) - - def _yield_values(self, keep_index, col, values): - """Yield col and values based on keep_index value.""" - if keep_index is True: - yield col, values - else: - yield col, values.values + data_values = data.loc[:, data.columns.get_level_values(0) == col] + if keep_index is True: + yield col, data_values + else: + yield col, data_values.values @property def nseries(self): From fb0b87cc8702f461b4d4bdca037b9e92f4255e8a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 11 Feb 2020 19:55:00 +0100 Subject: [PATCH 061/142] code change on reviews --- pandas/plotting/_matplotlib/hist.py | 6 ++- pandas/tests/plotting/test_frame.py | 67 +++++++++++------------------ 2 files changed, 30 insertions(+), 43 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index dc182536a3696..ad8c6a60de161 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -1,3 +1,4 @@ +from typing import Union import numpy as np from pandas.core.dtypes.common import is_integer, is_list_like @@ -9,6 +10,7 @@ from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots +from pandas.core.series import Series class HistPlot(LinePlot): @@ -38,7 +40,7 @@ def _args_adjust(self): if is_list_like(self.bottom): self.bottom = np.array(self.bottom) - def _caculcate_bins(self, 
data): + def _caculcate_bins(self, data: ABCDataFrame) -> np.array: """Calculate bins given data""" values = data._convert(datetime=True)._get_numeric_data() @@ -109,7 +111,7 @@ def _make_plot(self): self._add_legend_handle(artists[0], label, index=i) - def _reformat_y(self, y): + def _reformat_y(self, y: Union[Series, np.array]) -> Union[Series, np.array]: """Internal function to reformat y given `by` is applied or not.""" if self.by is not None and len(y.shape) > 1: notna = [col[~isna(col)] for col in y.T] diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 4292295e08793..0eb96d3b21715 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -26,6 +26,15 @@ import pandas.plotting as plotting +@pytest.fixture(scope="module") +def test_hist_df(): + np.random.seed(0) + df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) + df["C"] = np.random.choice(["a", "b", "c"], 30) + df["D"] = np.random.choice(["a", "b", "c"], 30) + return df + + @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): def setup_method(self, method): @@ -3259,13 +3268,9 @@ def test_subplots_sharex_false(self): @pytest.mark.parametrize("by", ["C", ["C", "D"]]) @pytest.mark.parametrize("column", ["A", ["A", "B"]]) - def test_hist_plot_by_argument(self, by, column): + def test_hist_plot_by_argument(self, by, column, test_hist_df): # GH 15079 - df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) - df["C"] = np.random.choice(["a", "b", "c"], 30) - df["D"] = np.random.choice(["a", "b", "c"], 30) - - _check_plot_works(df.plot.hist, column=column, by=by) + _check_plot_works(test_hist_df.plot.hist, column=column, by=by) @pytest.mark.slow @pytest.mark.parametrize( @@ -3282,47 +3287,36 @@ def test_hist_plot_by_argument(self, by, column): (["C", "D"], ["A", "B"], (5, 2), 9), ], ) - def test_hist_plot_layout_with_by(self, by, column, layout, axes_num): + def test_hist_plot_layout_with_by(self, by, column, layout, 
axes_num, test_hist_df): # GH 15079 - np.random.seed(0) - df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) - df["C"] = np.random.choice(["a", "b", "c"], 30) - df["D"] = np.random.choice(["a", "b", "c"], 30) - + # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot.hist, column=column, by=by, layout=layout) + axes = _check_plot_works( + test_hist_df.plot.hist, column=column, by=by, layout=layout + ) self._check_axes_shape(axes, axes_num=axes_num, layout=layout) - def test_hist_plot_invalid_layout_with_by(self): + def test_hist_plot_invalid_layout_with_by_raises(self, test_hist_df): # GH 15079, test if error is raised when invalid layout is given - np.random.seed(0) - df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) - df["C"] = np.random.choice(["a", "b", "c"], 30) - df["D"] = np.random.choice(["a", "b", "c"], 30) # layout too small for all 3 plots msg = "larger than required size" with pytest.raises(ValueError, match=msg): - df.plot.hist(column=["A", "B"], by="C", layout=(1, 1)) + test_hist_df.plot.hist(column=["A", "B"], by="C", layout=(1, 1)) # invalid format for layout msg = re.escape("Layout must be a tuple of (rows, columns)") with pytest.raises(ValueError, match=msg): - df.plot.hist(column=["A", "B"], by="C", layout=(1,)) + test_hist_df.plot.hist(column=["A", "B"], by="C", layout=(1,)) msg = "At least one dimension of layout must be positive" with pytest.raises(ValueError, match=msg): - df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1)) + test_hist_df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1)) @pytest.mark.slow - def test_axis_share_x_with_by(self): + def test_axis_share_x_with_by(self, test_hist_df): # GH 15079 - np.random.seed(0) - df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) - df["C"] = np.random.choice(["a", "b", "c"], 30) - df["D"] = np.random.choice(["a", "b", "c"], 30) - - ax1, ax2, ax3 = df.plot.hist(column="A", 
by="C", sharex=True) + ax1, ax2, ax3 = test_hist_df.plot.hist(column="A", by="C", sharex=True) # share x assert ax1._shared_x_axes.joined(ax1, ax2) @@ -3337,14 +3331,9 @@ def test_axis_share_x_with_by(self): assert not ax3._shared_y_axes.joined(ax2, ax3) @pytest.mark.slow - def test_axis_share_y_with_by(self): + def test_axis_share_y_with_by(self, test_hist_df): # GH 15079 - np.random.seed(0) - df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) - df["C"] = np.random.choice(["a", "b", "c"], 30) - df["D"] = np.random.choice(["a", "b", "c"], 30) - - ax1, ax2, ax3 = df.plot.hist(column="A", by="C", sharey=True) + ax1, ax2, ax3 = test_hist_df.plot.hist(column="A", by="C", sharey=True) # share y assert ax1._shared_y_axes.joined(ax1, ax2) @@ -3359,13 +3348,9 @@ def test_axis_share_y_with_by(self): assert not ax3._shared_x_axes.joined(ax2, ax3) @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) - def test_figure_shape_hist_with_by(self, figsize): + def test_figure_shape_hist_with_by(self, figsize, test_hist_df): # GH 15079 - df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) - df["C"] = np.random.choice(["a", "b", "c"], 30) - df["D"] = np.random.choice(["a", "b", "c"], 30) - - axes = df.plot.hist(column="A", by="C", figsize=figsize) + axes = test_hist_df.plot.hist(column="A", by="C", figsize=figsize) self._check_axes_shape(axes, axes_num=3, figsize=figsize) def test_plot_no_rows(self): From a6a8e579aa2cd89252f32435ea2d0507dfc5aefc Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 11 Feb 2020 20:25:56 +0100 Subject: [PATCH 062/142] fix isort --- pandas/plotting/_matplotlib/hist.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index ad8c6a60de161..338cde64d9082 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -1,4 +1,5 @@ from typing import Union + import numpy as np from pandas.core.dtypes.common import is_integer, 
is_list_like @@ -6,11 +7,11 @@ from pandas.core.dtypes.missing import isna, remove_na_arraylike import pandas.core.common as com +from pandas.core.series import Series from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots -from pandas.core.series import Series class HistPlot(LinePlot): From 7f77f485d3076bb937aaf32a1b7c06e0cf32aba6 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 12 Feb 2020 08:36:20 +0100 Subject: [PATCH 063/142] short code --- pandas/plotting/_matplotlib/core.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index d6302ce91cf4d..68f81edd889e5 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -247,20 +247,18 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): data = data.fillna(fillna) if self.by is None: - for col, values in data.items(): - if keep_index is True: - yield col, values - else: - yield col, values.values + for col, val in data.items(): + if not keep_index: + val = val.values + yield col, val else: cols = data.columns.get_level_values(0).unique() for col in cols: - data_values = data.loc[:, data.columns.get_level_values(0) == col] - if keep_index is True: - yield col, data_values - else: - yield col, data_values.values + val = data.loc[:, data.columns.get_level_values(0) == col] + if not keep_index: + val = val.values + yield col, val @property def nseries(self): From a120d27ff5bb90e991dfcec50f008aabe15a1583 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 23 Feb 2020 09:53:21 +0100 Subject: [PATCH 064/142] simpler python --- pandas/plotting/_matplotlib/core.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py 
index 79afb3abd6e24..c8d167dddd90b 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -246,19 +246,18 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): if fillna is not None: data = data.fillna(fillna) - if self.by is None: - for col, val in data.items(): - if not keep_index: - val = val.values - yield col, val - else: - cols = data.columns.get_level_values(0).unique() - - for col in cols: - val = data.loc[:, data.columns.get_level_values(0) == col] - if not keep_index: - val = val.values - yield col, val + iter_data = data + if self.by is not None: + cols = data.columns.levels[0] + iter_data = { + col: data.loc[:, data.columns.get_level_values(0) == col] + for col in cols + } + + for col, val in iter_data.items(): + if not keep_index: + val = val.values + yield col, val @property def nseries(self): From f87afee50716c5b24dc2f2cad074d5638ae5172e Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 23 Feb 2020 09:55:05 +0100 Subject: [PATCH 065/142] add inline comment --- pandas/plotting/_matplotlib/core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index c8d167dddd90b..6bb651f7300eb 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -248,6 +248,7 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): iter_data = data if self.by is not None: + # select sub-columns based on the value of first level of MI cols = data.columns.levels[0] iter_data = { col: data.loc[:, data.columns.get_level_values(0) == col] From 82711ee082d08e2d862338ae4403cbfaa4a35d0f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 23 Feb 2020 09:57:34 +0100 Subject: [PATCH 066/142] simplier pandas --- pandas/plotting/_matplotlib/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 6bb651f7300eb..8bf39398f889a 100644 
--- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -267,7 +267,7 @@ def nseries(self): elif self.by is None: return self.data.shape[1] else: - return len(set(self.data.columns.get_level_values(0))) + return len(self.data.columns.levels[0]) def draw(self): self.plt.draw_if_interactive() From 60f729811232019f25bf6a4bbf2bb56f043f0532 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 23 Feb 2020 16:58:46 +0100 Subject: [PATCH 067/142] code change on JR review --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/plotting/_core.py | 11 ++++++++++- pandas/plotting/_matplotlib/core.py | 3 ++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0ce622fdff98a..8f61342c05cd8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -43,6 +43,7 @@ Other enhancements - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) +- Implement ``by`` argument for :meth:`DataFrame.plot.hist` (:issue:`15079`) - - @@ -203,7 +204,6 @@ Plotting ^^^^^^^^ - -- Implement ``by`` argument for :meth:`DataFrame.plot.hist` (:issue:`15079`) - :func:`.plot` for line/bar now accepts color by dictonary (:issue:`8193`). - - Bug in :meth:`DataFrame.boxplot` and :meth:`DataFrame.plot.boxplot` lost color attributes of ``medianprops``, ``whiskerprops``, ``capprops`` and ``medianprops`` (:issue:`30346`) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index f914ae6bc0649..4b87a1e624583 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1177,6 +1177,9 @@ def hist(self, by=None, bins=10, **kwargs): ---------- by : str or sequence, optional Column in the DataFrame to group by. + + .. versionadded:: 1.1.0 + bins : int, default 10 Number of histogram bins to be used. 
**kwargs @@ -1209,7 +1212,8 @@ def hist(self, by=None, bins=10, **kwargs): >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) - If `by` is defined, a grouped hist plot is generated: + If `by` can be assigned by the DataFrame column names, or a list of column + names for which to group, and a grouped hist plot is generated: .. plot:: :context: close-figs @@ -1218,6 +1222,11 @@ def hist(self, by=None, bins=10, **kwargs): >>> df = pd.DataFrame(np.random.randn(30, 2), columns=['A', 'B']) >>> df['C'] = np.random.choice(['a', 'b', 'c'], 30) >>> ax = df.plot.hist(column=['A', 'B'], by=['C'], figsize=(8, 10)) + + .. plot:: + :context: close-figs + + >>> ax = df.plot.hist(column=['A', 'B'], by='C', figsize=(8, 10)) """ return self(kind="hist", by=by, bins=bins, **kwargs) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 8bf39398f889a..77921f9906636 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -267,7 +267,7 @@ def nseries(self): elif self.by is None: return self.data.shape[1] else: - return len(self.data.columns.levels[0]) + return self._grouped_data_size def draw(self): self.plt.draw_if_interactive() @@ -398,6 +398,7 @@ def _compute_plot_data(self): if self.by is not None: self.subplots = True grouped = data.groupby(self.by) + self._grouped_data_size = len(grouped) data_list = [] for key, group in grouped: From 071488b5b2245e923dc17b2b17e8defeeda212d7 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 23 Feb 2020 17:17:50 +0100 Subject: [PATCH 068/142] fix linting --- pandas/plotting/_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 4b87a1e624583..93b3c7ec8473d 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1178,7 +1178,7 @@ def hist(self, by=None, bins=10, **kwargs): by : str or sequence, optional Column in the DataFrame to group 
by. - .. versionadded:: 1.1.0 + .. versionadded:: 1.1.0 bins : int, default 10 Number of histogram bins to be used. From 867094a72735d1b9040f740c115935e51095cc97 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 8 Mar 2020 17:24:07 +0100 Subject: [PATCH 069/142] code change on reviews --- pandas/plotting/_matplotlib/core.py | 40 +++++++++++++++++++---------- pandas/plotting/_matplotlib/hist.py | 4 +-- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 77921f9906636..5b25e4e9d1aff 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -112,7 +112,7 @@ def __init__( self.data = data self.by = by - self.column = [column] if not isinstance(column, list) else column + self.columns = [column] if not isinstance(column, list) else column self.kind = kind @@ -385,6 +385,28 @@ def result(self): else: return self.axes[0] + def _transform_grouped_data(self, data: ABCDataFrame) -> ABCDataFrame: + """ + Internal function to transform grouped DataFrame object to a normal + DataFrame to facilitate further manipulation. + + The input is the original DataFrame to plot, and output is the reconstructed + DataFrame with MultiIndex columns. The first level of MI is unique values of + groups, and second level of MI is the columns selected by users. 
+ """ + grouped = data.groupby(self.by) + self._grouped_data_size = len(grouped) + + data_list = [] + for key, group in grouped: + columns = MultiIndex.from_product([[key], self.columns]) + sub_group = group[self.columns] + sub_group.columns = columns + data_list.append(sub_group) + + data = concat(data_list, axis=1) + return data + def _compute_plot_data(self): data = self.data @@ -394,20 +416,12 @@ def _compute_plot_data(self): label = "None" data = data.to_frame(name=label) - # GH15079 restructure data if by is defined + # GH15079 reconstruct data if by is defined if self.by is not None: - self.subplots = True - grouped = data.groupby(self.by) - self._grouped_data_size = len(grouped) - data_list = [] - for key, group in grouped: - columns = MultiIndex.from_product([[key], self.column]) - sub_group = group[self.column] - sub_group.columns = columns - data_list.append(sub_group) - - data = concat(data_list, axis=1) + # Set subplots to True if self.by is defined + self.subplots = True + data = self._transform_grouped_data(data) # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 338cde64d9082..8960edeab58b3 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -32,7 +32,7 @@ def _args_adjust(self): self.bins = self._caculcate_bins(self.data) else: - grouped = self.data.groupby(self.by)[self.column] + grouped = self.data.groupby(self.by)[self.columns] bins_list = [] for key, group in grouped: bins_list.append(self._caculcate_bins(group)) @@ -100,7 +100,7 @@ def _make_plot(self): # when by is applied, label should be columns that are grouped if self.by is not None: kwds["bins"] = kwds["bins"][i] - kwds["label"] = self.column + kwds["label"] = self.columns kwds.pop("color") y = self._reformat_y(y) From b0f06b2ac0b499e3050b94568724682a17268f0e Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 8 Mar 2020 
17:34:35 +0100 Subject: [PATCH 070/142] Add docstring --- pandas/plotting/_matplotlib/core.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 5b25e4e9d1aff..539a264e74dd3 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -241,6 +241,29 @@ def _validate_color_args(self): ) def _iter_data(self, data=None, keep_index=False, fillna=None): + """ + Iterate data to yield inputs for plotting methods. + + When self.by is not defined, iter_data is served as a DataFrame, and column + name and Series or values of Series are yielded. + + When self.by is defined, since values of multiple columns might have to be + yielded at the same time to visualize multiple plots, `df.items()` cannot + achieve it, so here to convert iter_data to dictionaries to provide input + for plot methods, and column name and DataFrame or values of DataFrame are + yielded. + + Parameters + ---------- + data: DataFrame + keep_index: bool, if to keep original index or not + fillna: values used to fill NAs, default is None + + Returns + ------- + If self.by is None, return column name and Series/values of Series; If self.by + is not None, return column name and DataFrame/values of DataFrame. 
+ """ if data is None: data = self.data if fillna is not None: From 111e89c008c97bf3de8e2dee5d820f6aa23b8311 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Mar 2020 16:54:15 +0100 Subject: [PATCH 071/142] fix typo --- pandas/plotting/_matplotlib/hist.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 8960edeab58b3..f7ff19ae783a5 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -29,19 +29,19 @@ def _args_adjust(self): # where subplots are created based on by argument if is_integer(self.bins): if self.by is None: - self.bins = self._caculcate_bins(self.data) + self.bins = self._calculate_bins(self.data) else: grouped = self.data.groupby(self.by)[self.columns] bins_list = [] for key, group in grouped: - bins_list.append(self._caculcate_bins(group)) + bins_list.append(self._calculate_bins(group)) self.bins = bins_list if is_list_like(self.bottom): self.bottom = np.array(self.bottom) - def _caculcate_bins(self, data: ABCDataFrame) -> np.array: + def _calculate_bins(self, data: ABCDataFrame) -> np.array: """Calculate bins given data""" values = data._convert(datetime=True)._get_numeric_data() From 83ec86809de41a489533795c416eb7b5ac5af40c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Mar 2020 16:58:22 +0100 Subject: [PATCH 072/142] remove blank --- doc/source/whatsnew/v1.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 3b01676e6ec53..1c107ecd422dc 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -346,7 +346,6 @@ I/O Plotting ^^^^^^^^ -- - :func:`.plot` for line/bar now accepts color by dictonary (:issue:`8193`). 
- - Bug in :meth:`DataFrame.boxplot` and :meth:`DataFrame.plot.boxplot` lost color attributes of ``medianprops``, ``whiskerprops``, ``capprops`` and ``medianprops`` (:issue:`30346`) From d6c8566c1d1a7ce9b8c711decb99a25f2942e011 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Mar 2020 17:15:32 +0100 Subject: [PATCH 073/142] use more meaningful example --- pandas/plotting/_core.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 93b3c7ec8473d..261d093ae2026 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1212,21 +1212,15 @@ def hist(self, by=None, bins=10, **kwargs): >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) - If `by` can be assigned by the DataFrame column names, or a list of column - names for which to group, and a grouped hist plot is generated: + A grouped histogram can be generated by providing the parameter `by` (which + can be a column name, or a list of column names): .. plot:: :context: close-figs - >>> np.random.seed(159753) - >>> df = pd.DataFrame(np.random.randn(30, 2), columns=['A', 'B']) - >>> df['C'] = np.random.choice(['a', 'b', 'c'], 30) - >>> ax = df.plot.hist(column=['A', 'B'], by=['C'], figsize=(8, 10)) - - .. 
plot:: - :context: close-figs - - >>> ax = df.plot.hist(column=['A', 'B'], by='C', figsize=(8, 10)) + >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85] + >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list}) + >>> ax = df.plot.hist(column=["age"], by="gender") """ return self(kind="hist", by=by, bins=bins, **kwargs) From 6a0ac8dd588038a8f346020ed091e6df28b509e4 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Mar 2020 17:17:44 +0100 Subject: [PATCH 074/142] keep as is --- pandas/plotting/_matplotlib/core.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 539a264e74dd3..9920db7538ae3 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -278,10 +278,11 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): for col in cols } - for col, val in iter_data.items(): - if not keep_index: - val = val.values - yield col, val + for col, values in iter_data.items(): + if keep_index is True: + yield col, values + else: + yield col, values.values @property def nseries(self): From 49d0791adc6be7cdb61c956b35dfbd0e29934006 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Mar 2020 17:19:14 +0100 Subject: [PATCH 075/142] remove less useful comment --- pandas/plotting/_matplotlib/core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 9920db7538ae3..6fdaa691abe80 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -442,8 +442,6 @@ def _compute_plot_data(self): # GH15079 reconstruct data if by is defined if self.by is not None: - - # Set subplots to True if self.by is defined self.subplots = True data = self._transform_grouped_data(data) From 2bfbe78149008cd509703cf4f0fa565a6f9559f3 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Mar 2020 17:23:57 +0100 
Subject: [PATCH 076/142] change figsize --- pandas/plotting/_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 261d093ae2026..29935b6dcda34 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1220,7 +1220,7 @@ def hist(self, by=None, bins=10, **kwargs): >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85] >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list}) - >>> ax = df.plot.hist(column=["age"], by="gender") + >>> ax = df.plot.hist(column=["age"], by="gender", figsize=(10, 8)) """ return self(kind="hist", by=by, bins=bins, **kwargs) From c5d75189945f1c4efa974987d45ccbf148668435 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Apr 2020 11:53:08 +0200 Subject: [PATCH 077/142] clean iter_data --- pandas/plotting/_matplotlib/core.py | 11 +---------- pandas/plotting/_matplotlib/hist.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 6fdaa691abe80..173fdbec42029 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -269,16 +269,7 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): if fillna is not None: data = data.fillna(fillna) - iter_data = data - if self.by is not None: - # select sub-columns based on the value of first level of MI - cols = data.columns.levels[0] - iter_data = { - col: data.loc[:, data.columns.get_level_values(0) == col] - for col in cols - } - - for col, values in iter_data.items(): + for col, values in data.items(): if keep_index is True: yield col, values else: diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index f7ff19ae783a5..e35f54d65f570 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -78,11 +78,24 @@ def _plot( cls._update_stacker(ax, stacking_id, 
n) return patches + def _create_iter_data(self): + """Create data for iteration if `by` is assigned""" + data = self.data + if self.by is not None: + # select sub-columns based on the value of first level of MI + cols = data.columns.levels[0] + iter_data = { + col: data.loc[:, data.columns.get_level_values(0) == col] + for col in cols + } + return iter_data + def _make_plot(self): colors = self._get_colors() stacking_id = self._get_stacking_id() + data = self._create_iter_data() - for i, (label, y) in enumerate(self._iter_data()): + for i, (label, y) in enumerate(self._iter_data(data=data)): ax = self._get_ax(i) kwds = self.kwds.copy() From 03356cea0796e18953239e6ac796cb91a332930d Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Apr 2020 11:55:14 +0200 Subject: [PATCH 078/142] remove unused docs --- pandas/plotting/_matplotlib/core.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 173fdbec42029..6866770dbcd84 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -241,29 +241,6 @@ def _validate_color_args(self): ) def _iter_data(self, data=None, keep_index=False, fillna=None): - """ - Iterate data to yield inputs for plotting methods. - - When self.by is not defined, iter_data is served as a DataFrame, and column - name and Series or values of Series are yielded. - - When self.by is defined, since values of multiple columns might have to be - yielded at the same time to visualize multiple plots, `df.items()` cannot - achieve it, so here to convert iter_data to dictionaries to provide input - for plot methods, and column name and DataFrame or values of DataFrame are - yielded. 
- - Parameters - ---------- - data: DataFrame - keep_index: bool, if to keep original index or not - fillna: values used to fill NAs, default is None - - Returns - ------- - If self.by is None, return column name and Series/values of Series; If self.by - is not None, return column name and DataFrame/values of DataFrame. - """ if data is None: data = self.data if fillna is not None: From 7abc47df909a7d68f717a7de6d4bdb8bae15b9d2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Apr 2020 12:01:46 +0200 Subject: [PATCH 079/142] cleaner pandas --- pandas/plotting/_matplotlib/core.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 6866770dbcd84..eaac0c812edb9 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -25,6 +25,7 @@ from pandas import MultiIndex import pandas.core.common as com +from pandas.core.groupby.generic import DataFrameGroupBy from pandas.core.reshape.concat import concat from pandas.io.formats.printing import pprint_thing @@ -377,7 +378,7 @@ def result(self): else: return self.axes[0] - def _transform_grouped_data(self, data: ABCDataFrame) -> ABCDataFrame: + def _reformat_grouped_data(self, grouped: DataFrameGroupBy) -> ABCDataFrame: """ Internal function to transform grouped DataFrame object to a normal DataFrame to facilitate further manipulation. @@ -386,7 +387,6 @@ def _transform_grouped_data(self, data: ABCDataFrame) -> ABCDataFrame: DataFrame with MultiIndex columns. The first level of MI is unique values of groups, and second level of MI is the columns selected by users. 
""" - grouped = data.groupby(self.by) self._grouped_data_size = len(grouped) data_list = [] @@ -411,7 +411,8 @@ def _compute_plot_data(self): # GH15079 reconstruct data if by is defined if self.by is not None: self.subplots = True - data = self._transform_grouped_data(data) + grouped_data = data.groupby(self.by) + data = self._transform_grouped_data(grouped_data) # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` From db832b413dbc9e4e4c165df75876813930f5a0cb Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Apr 2020 12:04:13 +0200 Subject: [PATCH 080/142] cleaner --- pandas/plotting/_matplotlib/core.py | 15 +++++++++++++++ pandas/plotting/_matplotlib/hist.py | 12 ------------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index eaac0c812edb9..ec53fbb709354 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -241,6 +241,21 @@ def _validate_color_args(self): "pass 'style' without a color symbol" ) + def _create_iter_data(self): + """ + Create data for iteration if `by` is assigned, and it is used in both + hist and boxplot. 
+ """ + data = self.data + if self.by is not None: + # select sub-columns based on the value of first level of MI + cols = data.columns.levels[0] + iter_data = { + col: data.loc[:, data.columns.get_level_values(0) == col] + for col in cols + } + return iter_data + def _iter_data(self, data=None, keep_index=False, fillna=None): if data is None: data = self.data diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index e35f54d65f570..c5e6324fcc462 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -78,18 +78,6 @@ def _plot( cls._update_stacker(ax, stacking_id, n) return patches - def _create_iter_data(self): - """Create data for iteration if `by` is assigned""" - data = self.data - if self.by is not None: - # select sub-columns based on the value of first level of MI - cols = data.columns.levels[0] - iter_data = { - col: data.loc[:, data.columns.get_level_values(0) == col] - for col in cols - } - return iter_data - def _make_plot(self): colors = self._get_colors() stacking_id = self._get_stacking_id() From 9ae59871f4366ab2ea5f3c79202ad24ac274d387 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Apr 2020 12:19:47 +0200 Subject: [PATCH 081/142] fixup --- pandas/plotting/_matplotlib/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index e81bcddcf6488..8862276161b5d 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -246,12 +246,12 @@ def _create_iter_data(self): Create data for iteration if `by` is assigned, and it is used in both hist and boxplot. 
""" - data = self.data + iter_data = self.data if self.by is not None: # select sub-columns based on the value of first level of MI - cols = data.columns.levels[0] + cols = self.data.columns.levels[0] iter_data = { - col: data.loc[:, data.columns.get_level_values(0) == col] + col: self.data.loc[:, self.data.columns.get_level_values(0) == col] for col in cols } return iter_data From 10c2ad11b239ce312530435a210aaacdfd87c03b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 4 Apr 2020 12:34:23 +0200 Subject: [PATCH 082/142] rename --- pandas/plotting/_matplotlib/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 8862276161b5d..89c265ff2e844 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -427,7 +427,7 @@ def _compute_plot_data(self): if self.by is not None: self.subplots = True grouped_data = data.groupby(self.by) - data = self._transform_grouped_data(grouped_data) + data = self._reformat_grouped_data(grouped_data) # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` From ce8cfd4551b0ed4ac3b150681401933bf9fd7f93 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 7 Apr 2020 22:45:16 +0200 Subject: [PATCH 083/142] code change on reviews --- pandas/plotting/_matplotlib/core.py | 101 +++++++++++++++++++++------- pandas/plotting/_matplotlib/hist.py | 2 +- pandas/tests/plotting/test_frame.py | 4 +- 3 files changed, 82 insertions(+), 25 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 89c265ff2e844..1726473bef34e 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1,5 +1,5 @@ import re -from typing import Optional +from typing import Dict, Optional, Union import warnings import numpy as np @@ -23,9 +23,8 @@ ) from pandas.core.dtypes.missing import isna, notna -from pandas import MultiIndex +from pandas 
import DataFrame, MultiIndex import pandas.core.common as com -from pandas.core.groupby.generic import DataFrameGroupBy from pandas.core.reshape.concat import concat from pandas.io.formats.printing import pprint_thing @@ -112,8 +111,17 @@ def __init__( import matplotlib.pyplot as plt self.data = data - self.by = by - self.columns = [column] if not isinstance(column, list) else column + self.by = [by] if not isinstance(by, list) or by is None else by + + if self.by: + self._grouped_data_size = len(data.groupby(self.by)) + + # Assign the rest of columns into self.columns if by is explicitly defined + # while column is not, so as to keep the same behaviour with current df.hist + if self.by and column is None: + self.columns = [col for col in data.columns if col not in self.by] + else: + self.columns = [column] if not isinstance(column, list) else column self.kind = kind @@ -241,17 +249,48 @@ def _validate_color_args(self): "pass 'style' without a color symbol" ) - def _create_iter_data(self): + @staticmethod + def _create_iter_data_given_by( + data: ABCDataFrame, by: Optional[list] + ) -> Union[ABCDataFrame, Dict[str, Union[ABCDataFrame, ABCSeries]]]: """ - Create data for iteration if `by` is assigned, and it is used in both - hist and boxplot. + Create data for iteration given `by` is assigned or not, and it is only + used in both hist and boxplot. + + If `by` is assigned, return a dictionary of DataFrames in which the key of + dictionary is the values in groups. + If `by` is not assigned, return input as is, and this preserves current + status of iter_data. + + Parameters + ---------- + data: reformatted grouped data from `_compute_plot_data` method + by: list or None, value assigned to `by`. + + Returns + ------- + iter_data: DataFrame or Dictionary of DataFrames + + Examples + -------- + If `by` is assigned: + + >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')] + >>> mi = MultiIndex.from_tuples(tuples) + >>> value = [[1, 3, np.nan, np.nan], + ... 
[3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] + >>> data = DataFrame(value, columns=mi) + >>> _create_iter_data_given_by(data, by=["col1"]) # doctest: +SKIP + {'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}), + 'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})} """ - iter_data = self.data - if self.by is not None: - # select sub-columns based on the value of first level of MI - cols = self.data.columns.levels[0] + if not by: + iter_data = data + else: + # Select sub-columns based on the value of first level of MI + cols = data.columns.levels[0] iter_data = { - col: self.data.loc[:, self.data.columns.get_level_values(0) == col] + col: data.loc[:, data.columns.get_level_values(0) == col] for col in cols } return iter_data @@ -393,16 +432,33 @@ def result(self): else: return self.axes[0] - def _reformat_grouped_data(self, grouped: DataFrameGroupBy) -> ABCDataFrame: + def _reconstruct_data_with_by(self, data: ABCDataFrame) -> ABCDataFrame: """ - Internal function to transform grouped DataFrame object to a normal - DataFrame to facilitate further manipulation. - - The input is the original DataFrame to plot, and output is the reconstructed - DataFrame with MultiIndex columns. The first level of MI is unique values of - groups, and second level of MI is the columns selected by users. + Internal function to group data, and reassign multiindex column names onto the + result in order to let grouped data be used in _compute_plot_data method. + + Parameters + ---------- + data: Original DataFrame to plot + + Returns + ------- + Output is the reconstructed DataFrame with MultiIndex columns. The first level + of MI is unique values of groups, and second level of MI is the columns + selected by users. 
+ + Examples + -------- + >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]} + >>> df = DataFrame(d) + >>> _reconstruct_data_with_by(df) # doctest: +SKIP + h1 h2 + a b a b + 0 1 3 NaN NaN + 1 3 4 NaN NaN + 2 NaN NaN 5 6 """ - self._grouped_data_size = len(grouped) + grouped = data.groupby(self.by) data_list = [] for key, group in grouped: @@ -426,8 +482,7 @@ def _compute_plot_data(self): # GH15079 reconstruct data if by is defined if self.by is not None: self.subplots = True - grouped_data = data.groupby(self.by) - data = self._reformat_grouped_data(grouped_data) + data = self._reconstruct_data_with_by(self.data) # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index c5e6324fcc462..4259ca19014a8 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -81,7 +81,7 @@ def _plot( def _make_plot(self): colors = self._get_colors() stacking_id = self._get_stacking_id() - data = self._create_iter_data() + data = self._create_iter_data_given_by(self.data, self.by) for i, (label, y) in enumerate(self._iter_data(data=data)): ax = self._get_ax(i) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 64ffd4cce0500..85ce3c5b63818 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3275,7 +3275,7 @@ def test_subplots_sharex_false(self): tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2) @pytest.mark.parametrize("by", ["C", ["C", "D"]]) - @pytest.mark.parametrize("column", ["A", ["A", "B"]]) + @pytest.mark.parametrize("column", ["A", ["A", "B"], None]) def test_hist_plot_by_argument(self, by, column, test_hist_df): # GH 15079 _check_plot_works(test_hist_df.plot.hist, column=column, by=by) @@ -3287,11 +3287,13 @@ def test_hist_plot_by_argument(self, by, column, test_hist_df): (["C"], "A", (2, 2), 3), ("C", "A", (2, 
2), 3), (["C"], ["A"], (1, 3), 3), + ("C", None, (3, 1), 3), ("C", ["A", "B"], (3, 1), 3), (["C", "D"], "A", (9, 1), 9), (["C", "D"], "A", (3, 3), 9), (["C", "D"], ["A"], (5, 2), 9), (["C", "D"], ["A", "B"], (9, 1), 9), + (["C", "D"], None, (9, 1), 9), (["C", "D"], ["A", "B"], (5, 2), 9), ], ) From 627cc02e1770d10d270d21a4e39f08fab51c2ce2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 7 Apr 2020 23:19:27 +0200 Subject: [PATCH 084/142] fixup --- pandas/plotting/_matplotlib/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 1726473bef34e..923b3150764ce 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -23,7 +23,7 @@ ) from pandas.core.dtypes.missing import isna, notna -from pandas import DataFrame, MultiIndex +from pandas import MultiIndex import pandas.core.common as com from pandas.core.reshape.concat import concat @@ -111,7 +111,7 @@ def __init__( import matplotlib.pyplot as plt self.data = data - self.by = [by] if not isinstance(by, list) or by is None else by + self.by = [by] if not isinstance(by, list) and by is not None else by if self.by: self._grouped_data_size = len(data.groupby(self.by)) @@ -279,7 +279,7 @@ def _create_iter_data_given_by( >>> mi = MultiIndex.from_tuples(tuples) >>> value = [[1, 3, np.nan, np.nan], ... 
[3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] - >>> data = DataFrame(value, columns=mi) + >>> data = DataFrame(value, columns=mi) # doctest: +SKIP >>> _create_iter_data_given_by(data, by=["col1"]) # doctest: +SKIP {'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}), 'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})} From ee8972d0cb753640877fb8e192364f3795631c72 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 10 Apr 2020 20:02:33 +0200 Subject: [PATCH 085/142] linting --- pandas/plotting/_matplotlib/hist.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 56104f64a8c8b..829b69ad99bdb 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -49,9 +49,7 @@ def _calculate_bins(self, data: ABCDataFrame) -> np.array: values = values[~isna(values)] hist, bins = np.histogram( - values, - bins=self.bins, - range=self.kwds.get("range", None) + values, bins=self.bins, range=self.kwds.get("range", None) ) return bins From 0839be219f81cc5537ca18f4bb7be0f9960c483c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 1 May 2020 21:12:42 +0200 Subject: [PATCH 086/142] annotation --- pandas/plotting/_matplotlib/core.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 78fd598ee8e1d..d088e746d6cfc 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -4,6 +4,7 @@ import numpy as np +from pandas._typing import Label from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -23,7 +24,7 @@ ) from pandas.core.dtypes.missing import isna, notna -from pandas import MultiIndex +from pandas import DataFrame, MultiIndex, Series import pandas.core.common as com from pandas.core.reshape.concat import concat @@ -104,7 +105,7 @@ def __init__( 
table=False, layout=None, include_bool=False, - column=None, + column: Optional[Label] = None, **kwds, ): @@ -251,8 +252,8 @@ def _validate_color_args(self): @staticmethod def _create_iter_data_given_by( - data: ABCDataFrame, by: Optional[list] - ) -> Union[ABCDataFrame, Dict[str, Union[ABCDataFrame, ABCSeries]]]: + data: DataFrame, by: Optional[list] + ) -> Union[DataFrame, Dict[str, Union[DataFrame, Series]]]: """ Create data for iteration given `by` is assigned or not, and it is only used in both hist and boxplot. @@ -432,7 +433,7 @@ def result(self): else: return self.axes[0] - def _reconstruct_data_with_by(self, data: ABCDataFrame) -> ABCDataFrame: + def _reconstruct_data_with_by(self, data: DataFrame) -> DataFrame: """ Internal function to group data, and reassign multiindex column names onto the result in order to let grouped data be used in _compute_plot_data method. From 142ee532f8cad03d9206d61ce6e58f6830d13a7d Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 1 May 2020 21:34:49 +0200 Subject: [PATCH 087/142] annotation --- pandas/plotting/_matplotlib/hist.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 829b69ad99bdb..1970bb570fced 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -8,6 +8,7 @@ import pandas.core.common as com from pandas.core.series import Series +from pandas.core.frame import DataFrame from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import LinePlot, MPLPlot @@ -41,7 +42,7 @@ def _args_adjust(self): if is_list_like(self.bottom): self.bottom = np.array(self.bottom) - def _calculate_bins(self, data: ABCDataFrame) -> np.array: + def _calculate_bins(self, data: DataFrame) -> np.array: """Calculate bins given data""" values = data._convert(datetime=True)._get_numeric_data() From 2710cf20594fc7e9f5cef79582761465ea36a6ee Mon Sep 17 00:00:00 2001 From: Kaiqi 
Date: Tue, 5 May 2020 16:11:07 +0200 Subject: [PATCH 088/142] fixup --- pandas/plotting/_matplotlib/core.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 84d9ec206c5e2..60879ae9a3f76 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1,5 +1,5 @@ import re -from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Union import warnings import numpy as np @@ -168,8 +168,8 @@ def __init__( self.grid = grid self.legend = legend - self.legend_handles = [] - self.legend_labels = [] + self.legend_handles: List = [] + self.legend_labels: List = [] for attr in self._pop_attributes: value = kwds.pop(attr, self._attr_defaults.get(attr, None)) @@ -251,7 +251,7 @@ def _validate_color_args(self): @staticmethod def _create_iter_data_given_by( - data: DataFrame, by: Optional[list] + data: DataFrame, by: Optional[List] ) -> Union[DataFrame, Dict[str, Union[DataFrame, Series]]]: """ Create data for iteration given `by` is assigned or not, and it is only @@ -279,16 +279,19 @@ def _create_iter_data_given_by( >>> mi = MultiIndex.from_tuples(tuples) >>> value = [[1, 3, np.nan, np.nan], ... [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] - >>> data = DataFrame(value, columns=mi) # doctest: +SKIP + >>> data = DataFrame(value, columns=mi) >>> _create_iter_data_given_by(data, by=["col1"]) # doctest: +SKIP {'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}), 'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})} """ + iter_data: Union[DataFrame, Dict[str, Union[DataFrame, Series]]] if not by: iter_data = data else: # Select sub-columns based on the value of first level of MI - cols = data.columns.levels[0] + # TODO: mypy complains because Index does not have levels, only MI has. 
+ cols = data.columns.levels[0] # type: ignore + print(data.columns.get_level_values(0)) iter_data = { col: data.loc[:, data.columns.get_level_values(0) == col] for col in cols From f76d2cbfbfab61fcd04a973fc6d22cab5109bad4 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 5 May 2020 16:12:01 +0200 Subject: [PATCH 089/142] remove --- pandas/plotting/_matplotlib/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 60879ae9a3f76..d5460e9e3c493 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -291,7 +291,6 @@ def _create_iter_data_given_by( # Select sub-columns based on the value of first level of MI # TODO: mypy complains because Index does not have levels, only MI has. cols = data.columns.levels[0] # type: ignore - print(data.columns.get_level_values(0)) iter_data = { col: data.loc[:, data.columns.get_level_values(0) == col] for col in cols From a5ecbd70116d96594a92f0d82d7f3397e4ed0e2a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 11 May 2020 19:01:56 +0200 Subject: [PATCH 090/142] add missing annotation --- pandas/plotting/_matplotlib/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index d5460e9e3c493..7205680461260 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -4,6 +4,7 @@ import numpy as np +from pandas._typing import Label from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -104,7 +105,7 @@ def __init__( table=False, layout=None, include_bool=False, - column=None, + column: Optional[Label] = None, **kwds, ): From 7425dff980de752c943b47f2161d4c2701a16d3d Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 21 May 2020 20:08:34 +0200 Subject: [PATCH 091/142] code change on WA review --- pandas/plotting/_matplotlib/core.py | 7 ++++--- 
pandas/plotting/_matplotlib/hist.py | 6 +++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 0d73eacbcfc76..9106b0cb0789c 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -2,6 +2,7 @@ from typing import Dict, List, Optional, Union import warnings +from matplotlib.artist import Artist import numpy as np from pandas._typing import Label @@ -112,7 +113,7 @@ def __init__( import matplotlib.pyplot as plt self.data = data - self.by = [by] if not isinstance(by, list) and by is not None else by + self.by = com.maybe_make_list(by) if self.by: self._grouped_data_size = len(data.groupby(self.by)) @@ -169,8 +170,8 @@ def __init__( self.grid = grid self.legend = legend - self.legend_handles: List = [] - self.legend_labels: List = [] + self.legend_handles: List[Artist] = [] + self.legend_labels: List[Label] = [] for attr in self._pop_attributes: value = kwds.pop(attr, self._attr_defaults.get(attr, None)) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 61cbbc991ebe5..954e4ddb1d814 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -119,7 +119,11 @@ def _make_plot(self): self._add_legend_handle(artists[0], label, index=i) def _reformat_y(self, y: Union[Series, np.array]) -> Union[Series, np.array]: - """Internal function to reformat y given `by` is applied or not.""" + """Internal function to reformat y given `by` is applied or not. + + If by is None, input y is 1-d array; and if by is not None, groupby will take + place and input y is multi-dimensional array. 
+ """ if self.by is not None and len(y.shape) > 1: notna = [col[~isna(col)] for col in y.T] y = np.array(np.array(notna).T) From b06e454c48c727c1660c7628d915626f0eeb6917 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 19 Jun 2020 20:26:31 +0200 Subject: [PATCH 092/142] solve mypy --- pandas/plotting/_matplotlib/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 4b8c8b8f70576..9cce2a8e87f78 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -284,7 +284,7 @@ def _create_iter_data_given_by( >>> value = [[1, 3, np.nan, np.nan], ... [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] >>> data = DataFrame(value, columns=mi) - >>> _create_iter_data_given_by(data, by=["col1"]) # doctest: +SKIP + >>> _create_iter_data_given_by(data, by=["col1"]) {'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}), 'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})} """ @@ -293,8 +293,8 @@ def _create_iter_data_given_by( iter_data = data else: # Select sub-columns based on the value of first level of MI - # TODO: mypy complains because Index does not have levels, only MI has. 
- cols = data.columns.levels[0] # type: ignore + assert isinstance(data, MultiIndex) + cols = data.columns.levels[0] iter_data = { col: data.loc[:, data.columns.get_level_values(0) == col] for col in cols From 79294eddb89cbe4ca884bc2f48a501902b2642b9 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 19 Jun 2020 21:37:41 +0200 Subject: [PATCH 093/142] fix typo --- pandas/plotting/_matplotlib/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 9cce2a8e87f78..129f47d0d4718 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -293,7 +293,7 @@ def _create_iter_data_given_by( iter_data = data else: # Select sub-columns based on the value of first level of MI - assert isinstance(data, MultiIndex) + assert isinstance(data.columns, MultiIndex) cols = data.columns.levels[0] iter_data = { col: data.loc[:, data.columns.get_level_values(0) == col] @@ -457,7 +457,7 @@ def _reconstruct_data_with_by(self, data: DataFrame) -> DataFrame: -------- >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]} >>> df = DataFrame(d) - >>> _reconstruct_data_with_by(df) # doctest: +SKIP + >>> _reconstruct_data_with_by(df) h1 h2 a b a b 0 1 3 NaN NaN From add406f08120532105bebe1fb9465565fae85fbf Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 23 Jun 2020 20:25:25 +0200 Subject: [PATCH 094/142] code change on reviews --- pandas/plotting/_matplotlib/core.py | 94 ++----------------------- pandas/plotting/_matplotlib/grouped.py | 97 ++++++++++++++++++++++++++ pandas/plotting/_matplotlib/hist.py | 9 +-- 3 files changed, 104 insertions(+), 96 deletions(-) create mode 100644 pandas/plotting/_matplotlib/grouped.py diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 129f47d0d4718..09c647ceddb3f 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -27,13 +27,12 @@ ) from 
pandas.core.dtypes.missing import isna, notna -from pandas import DataFrame, MultiIndex, Series import pandas.core.common as com -from pandas.core.reshape.concat import concat from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters +from pandas.plotting._matplotlib.grouped import reconstruct_data_with_by from pandas.plotting._matplotlib.style import _get_standard_colors from pandas.plotting._matplotlib.tools import ( _flatten, @@ -125,7 +124,8 @@ def __init__( if self.by and column is None: self.columns = [col for col in data.columns if col not in self.by] else: - self.columns = [column] if not isinstance(column, list) else column + self.columns = com.convert_to_list_like(column) + # self.columns = [column] if not isinstance(column, list) else column self.kind = kind @@ -253,54 +253,6 @@ def _validate_color_args(self): "pass 'style' without a color symbol" ) - @staticmethod - def _create_iter_data_given_by( - data: DataFrame, by: Optional[List] - ) -> Union[DataFrame, Dict[str, Union[DataFrame, Series]]]: - """ - Create data for iteration given `by` is assigned or not, and it is only - used in both hist and boxplot. - - If `by` is assigned, return a dictionary of DataFrames in which the key of - dictionary is the values in groups. - If `by` is not assigned, return input as is, and this preserves current - status of iter_data. - - Parameters - ---------- - data: reformatted grouped data from `_compute_plot_data` method - by: list or None, value assigned to `by`. - - Returns - ------- - iter_data: DataFrame or Dictionary of DataFrames - - Examples - -------- - If `by` is assigned: - - >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')] - >>> mi = MultiIndex.from_tuples(tuples) - >>> value = [[1, 3, np.nan, np.nan], - ... 
[3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] - >>> data = DataFrame(value, columns=mi) - >>> _create_iter_data_given_by(data, by=["col1"]) - {'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}), - 'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})} - """ - iter_data: Union[DataFrame, Dict[str, Union[DataFrame, Series]]] - if not by: - iter_data = data - else: - # Select sub-columns based on the value of first level of MI - assert isinstance(data.columns, MultiIndex) - cols = data.columns.levels[0] - iter_data = { - col: data.loc[:, data.columns.get_level_values(0) == col] - for col in cols - } - return iter_data - def _iter_data(self, data=None, keep_index=False, fillna=None): if data is None: data = self.data @@ -438,44 +390,6 @@ def result(self): else: return self.axes[0] - def _reconstruct_data_with_by(self, data: DataFrame) -> DataFrame: - """ - Internal function to group data, and reassign multiindex column names onto the - result in order to let grouped data be used in _compute_plot_data method. - - Parameters - ---------- - data: Original DataFrame to plot - - Returns - ------- - Output is the reconstructed DataFrame with MultiIndex columns. The first level - of MI is unique values of groups, and second level of MI is the columns - selected by users. 
- - Examples - -------- - >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]} - >>> df = DataFrame(d) - >>> _reconstruct_data_with_by(df) - h1 h2 - a b a b - 0 1 3 NaN NaN - 1 3 4 NaN NaN - 2 NaN NaN 5 6 - """ - grouped = data.groupby(self.by) - - data_list = [] - for key, group in grouped: - columns = MultiIndex.from_product([[key], self.columns]) - sub_group = group[self.columns] - sub_group.columns = columns - data_list.append(sub_group) - - data = concat(data_list, axis=1) - return data - def _compute_plot_data(self): data = self.data @@ -488,7 +402,7 @@ def _compute_plot_data(self): # GH15079 reconstruct data if by is defined if self.by is not None: self.subplots = True - data = self._reconstruct_data_with_by(self.data) + data = reconstruct_data_with_by(self.data, by=self.by, cols=self.columns) # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` diff --git a/pandas/plotting/_matplotlib/grouped.py b/pandas/plotting/_matplotlib/grouped.py new file mode 100644 index 0000000000000..cac6189d7c91b --- /dev/null +++ b/pandas/plotting/_matplotlib/grouped.py @@ -0,0 +1,97 @@ +from typing import Dict, List, Optional, Union + +import numpy as np + +from pandas._typing import Label + +from pandas import DataFrame, MultiIndex, Series, concat + + +def create_iter_data_given_by( + data: DataFrame, by: Optional[List] +) -> Union[DataFrame, Dict[str, Union[DataFrame, Series]]]: + """ + Create data for iteration given `by` is assigned or not, and it is only + used in both hist and boxplot. + + If `by` is assigned, return a dictionary of DataFrames in which the key of + dictionary is the values in groups. + If `by` is not assigned, return input as is, and this preserves current + status of iter_data. + + Parameters + ---------- + data: reformatted grouped data from `_compute_plot_data` method + by: list or None, value assigned to `by`. 
+ + Returns + ------- + iter_data: DataFrame or Dictionary of DataFrames + + Examples + -------- + If `by` is assigned: + + >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')] + >>> mi = MultiIndex.from_tuples(tuples) + >>> value = [[1, 3, np.nan, np.nan], + ... [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] + >>> data = DataFrame(value, columns=mi) + >>> create_iter_data_given_by(data, by=["col1"]) + {'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}), + 'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})} + """ + iter_data: Union[DataFrame, Dict[str, Union[DataFrame, Series]]] + if not by: + iter_data = data + else: + # Select sub-columns based on the value of first level of MI + assert isinstance(data.columns, MultiIndex) + cols = data.columns.levels[0] + iter_data = { + col: data.loc[:, data.columns.get_level_values(0) == col] for col in cols + } + return iter_data + + +def reconstruct_data_with_by( + data: DataFrame, by: Union[Label, List[Label]], cols: List[Label] +) -> DataFrame: + """ + Internal function to group data, and reassign multiindex column names onto the + result in order to let grouped data be used in _compute_plot_data method. + + Parameters + ---------- + data: Original DataFrame to plot + by: grouped `by` parameter selected by users + cols: columns of data set (excluding columns used in `by`) + + Returns + ------- + Output is the reconstructed DataFrame with MultiIndex columns. The first level + of MI is unique values of groups, and second level of MI is the columns + selected by users. 
+ + Examples + -------- + >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]} + >>> df = DataFrame(d) + >>> reconstruct_data_with_by(df, by='h', cols=['a', 'b']) + h1 h2 + a b a b + 0 1 3 NaN NaN + 1 3 4 NaN NaN + 2 NaN NaN 5 6 + """ + grouped = data.groupby(by) + + data_list = [] + for key, group in grouped: + columns = MultiIndex.from_product([[key], cols]) + sub_group = group[cols] + sub_group.columns = columns + data_list.append(sub_group) + + data = concat(data_list, axis=1) + return data diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 954e4ddb1d814..363d78ccaa5f3 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -11,6 +11,7 @@ from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import LinePlot, MPLPlot +from pandas.plotting._matplotlib.grouped import create_iter_data_given_by from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots @@ -33,17 +34,13 @@ def _args_adjust(self): else: grouped = self.data.groupby(self.by)[self.columns] - bins_list = [] - for key, group in grouped: - bins_list.append(self._calculate_bins(group)) - self.bins = bins_list + self.bins = [self._calculate_bins(group) for key, group in grouped] if is_list_like(self.bottom): self.bottom = np.array(self.bottom) def _calculate_bins(self, data: DataFrame) -> np.array: """Calculate bins given data""" - values = data._convert(datetime=True)._get_numeric_data() values = np.ravel(values) values = values[~isna(values)] @@ -78,7 +75,7 @@ def _plot( def _make_plot(self): colors = self._get_colors() stacking_id = self._get_stacking_id() - data = self._create_iter_data_given_by(self.data, self.by) + data = create_iter_data_given_by(self.data, self.by) for i, (label, y) in enumerate(self._iter_data(data=data)): ax = self._get_ax(i) From bb22c533e144b90f26298a238d7228b295a2a168 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 23 Jun 2020 
21:00:31 +0200 Subject: [PATCH 095/142] fix linting --- pandas/plotting/_matplotlib/core.py | 3 +-- pandas/plotting/_matplotlib/grouped.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 09c647ceddb3f..286d2783ccb2b 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1,5 +1,5 @@ import re -from typing import Dict, List, Optional, Union +from typing import List, Optional import warnings from matplotlib.artist import Artist @@ -125,7 +125,6 @@ def __init__( self.columns = [col for col in data.columns if col not in self.by] else: self.columns = com.convert_to_list_like(column) - # self.columns = [column] if not isinstance(column, list) else column self.kind = kind diff --git a/pandas/plotting/_matplotlib/grouped.py b/pandas/plotting/_matplotlib/grouped.py index cac6189d7c91b..3373c1d247449 100644 --- a/pandas/plotting/_matplotlib/grouped.py +++ b/pandas/plotting/_matplotlib/grouped.py @@ -1,7 +1,5 @@ from typing import Dict, List, Optional, Union -import numpy as np - from pandas._typing import Label from pandas import DataFrame, MultiIndex, Series, concat @@ -32,6 +30,7 @@ def create_iter_data_given_by( -------- If `by` is assigned: + >>> import numpy as np >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')] >>> mi = MultiIndex.from_tuples(tuples) >>> value = [[1, 3, np.nan, np.nan], From 25214e6ce159500368113c3a59ada21c94928349 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 27 Jun 2020 21:41:25 +0200 Subject: [PATCH 096/142] rename --- pandas/plotting/_matplotlib/core.py | 2 +- pandas/plotting/_matplotlib/{grouped.py => groupby.py} | 0 pandas/plotting/_matplotlib/hist.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename pandas/plotting/_matplotlib/{grouped.py => groupby.py} (100%) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 
286d2783ccb2b..1a3cf9b906dc6 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -32,7 +32,7 @@ from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters -from pandas.plotting._matplotlib.grouped import reconstruct_data_with_by +from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by from pandas.plotting._matplotlib.style import _get_standard_colors from pandas.plotting._matplotlib.tools import ( _flatten, diff --git a/pandas/plotting/_matplotlib/grouped.py b/pandas/plotting/_matplotlib/groupby.py similarity index 100% rename from pandas/plotting/_matplotlib/grouped.py rename to pandas/plotting/_matplotlib/groupby.py diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 363d78ccaa5f3..417995eb18451 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -11,7 +11,7 @@ from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import LinePlot, MPLPlot -from pandas.plotting._matplotlib.grouped import create_iter_data_given_by +from pandas.plotting._matplotlib.groupby import create_iter_data_given_by from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots From 77e46f4cf5b9f13e79226d444a54024a5b159b9c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 27 Jun 2020 21:48:38 +0200 Subject: [PATCH 097/142] modulize reformat_y for hist --- pandas/plotting/_matplotlib/groupby.py | 20 ++++++++++++++++++++ pandas/plotting/_matplotlib/hist.py | 21 +++++---------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 3373c1d247449..33165709c2af5 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -1,7 
+1,11 @@ from typing import Dict, List, Optional, Union +import numpy as np + from pandas._typing import Label +from pandas.core.dtypes.missing import isna + from pandas import DataFrame, MultiIndex, Series, concat @@ -94,3 +98,19 @@ def reconstruct_data_with_by( data = concat(data_list, axis=1) return data + + +def reformat_hist_y_given_by( + y: Union[Series, np.array], by: Optional[Union[Label, List[Label]]] +) -> Union[Series, np.array]: + """Internal function to reformat y given `by` is applied or not for hist plot. + + If by is None, input y is 1-d array; and if by is not None, groupby will take + place and input y is multi-dimensional array. + """ + if by is not None and len(y.shape) > 1: + notna = [col[~isna(col)] for col in y.T] + y = np.array(np.array(notna).T) + else: + y = y[~isna(y)] + return y diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 8556725d15df9..e80e0519b725e 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -7,11 +7,13 @@ from pandas.core.dtypes.missing import isna, remove_na_arraylike from pandas.core.frame import DataFrame -from pandas.core.series import Series from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import LinePlot, MPLPlot -from pandas.plotting._matplotlib.groupby import create_iter_data_given_by +from pandas.plotting._matplotlib.groupby import ( + create_iter_data_given_by, + reformat_hist_y_given_by, +) from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots @@ -98,7 +100,7 @@ def _make_plot(self): kwds["label"] = self.columns kwds.pop("color") - y = self._reformat_y(y) + y = reformat_hist_y_given_by(y, self.by) # We allow weights to be a multi-dimensional array, e.g. a (10, 2) array, # and each sub-array (10,) will be called in each iteration. 
If users only @@ -115,19 +117,6 @@ def _make_plot(self): self._add_legend_handle(artists[0], label, index=i) - def _reformat_y(self, y: Union[Series, np.array]) -> Union[Series, np.array]: - """Internal function to reformat y given `by` is applied or not. - - If by is None, input y is 1-d array; and if by is not None, groupby will take - place and input y is multi-dimensional array. - """ - if self.by is not None and len(y.shape) > 1: - notna = [col[~isna(col)] for col in y.T] - y = np.array(np.array(notna).T) - else: - y = y[~isna(y)] - return y - def _make_plot_keywords(self, kwds, y): """merge BoxPlot/KdePlot properties to passed kwds""" # y is required for KdePlot From 9de9c617d652f21075ac7eec17850d8d7793abe9 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 27 Jun 2020 21:52:25 +0200 Subject: [PATCH 098/142] better annotation --- pandas/plotting/_matplotlib/groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 33165709c2af5..c4795e35e0004 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -2,7 +2,7 @@ import numpy as np -from pandas._typing import Label +from pandas._typing import FrameOrSeriesUnion, Label from pandas.core.dtypes.missing import isna @@ -11,7 +11,7 @@ def create_iter_data_given_by( data: DataFrame, by: Optional[List] -) -> Union[DataFrame, Dict[str, Union[DataFrame, Series]]]: +) -> Union[DataFrame, Dict[str, FrameOrSeriesUnion]]: """ Create data for iteration given `by` is assigned or not, and it is only used in both hist and boxplot. 
@@ -44,7 +44,7 @@ def create_iter_data_given_by( {'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}), 'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})} """ - iter_data: Union[DataFrame, Dict[str, Union[DataFrame, Series]]] + iter_data: Union[DataFrame, Dict[str, FrameOrSeriesUnion]] if not by: iter_data = data else: From af68d2ea647d6fb03d63b6eaaf8d495e66bce18b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 27 Jun 2020 21:52:57 +0200 Subject: [PATCH 099/142] improve annotation --- pandas/plotting/_matplotlib/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index c4795e35e0004..047b10ee78fc5 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -10,7 +10,7 @@ def create_iter_data_given_by( - data: DataFrame, by: Optional[List] + data: DataFrame, by: Optional[List[Label]] ) -> Union[DataFrame, Dict[str, FrameOrSeriesUnion]]: """ Create data for iteration given `by` is assigned or not, and it is only From b75015ac9b8d93761b7e21e82e10c5e892c5b074 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 27 Jun 2020 21:53:53 +0200 Subject: [PATCH 100/142] fix linting --- pandas/plotting/_matplotlib/hist.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index e80e0519b725e..c3f4ceff9f904 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -1,5 +1,3 @@ -from typing import Union - import numpy as np from pandas.core.dtypes.common import is_integer, is_list_like From b90303d930be5f389f395f3bfbd259e0eb4bcf69 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 27 Jun 2020 22:10:10 +0200 Subject: [PATCH 101/142] improve docstring --- pandas/plotting/_matplotlib/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/plotting/_matplotlib/groupby.py 
b/pandas/plotting/_matplotlib/groupby.py index 047b10ee78fc5..061f95aacec90 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -105,8 +105,8 @@ def reformat_hist_y_given_by( ) -> Union[Series, np.array]: """Internal function to reformat y given `by` is applied or not for hist plot. - If by is None, input y is 1-d array; and if by is not None, groupby will take - place and input y is multi-dimensional array. + If by is None, input y is 1-d with NaN removed; and if by is not None, groupby + will take place and input y is multi-dimensional array. """ if by is not None and len(y.shape) > 1: notna = [col[~isna(col)] for col in y.T] From 2ac32f5b4cf838e9b8ba653bd9bdd91bebd0975b Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Fri, 21 May 2021 20:23:48 +0200 Subject: [PATCH 102/142] remove added test file --- pandas/tests/plotting/test_frame.py | 3533 ------------------------- pandas/tests/plotting/test_hist_by.py | 112 + 2 files changed, 112 insertions(+), 3533 deletions(-) delete mode 100644 pandas/tests/plotting/test_frame.py create mode 100644 pandas/tests/plotting/test_hist_by.py diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py deleted file mode 100644 index 4392ef6d04d34..0000000000000 --- a/pandas/tests/plotting/test_frame.py +++ /dev/null @@ -1,3533 +0,0 @@ -""" Test cases for DataFrame.plot """ - -from datetime import date, datetime -import itertools -import re -import string -import warnings - -import numpy as np -from numpy.random import rand, randn -import pytest - -import pandas.util._test_decorators as td - -from pandas.core.dtypes.api import is_list_like - -import pandas as pd -from pandas import DataFrame, MultiIndex, PeriodIndex, Series, bdate_range, date_range -import pandas._testing as tm -from pandas.core.arrays import integer_array -from pandas.tests.plotting.common import TestPlotBase, _check_plot_works - -from pandas.io.formats.printing import pprint_thing -import 
pandas.plotting as plotting - - -@pytest.fixture(scope="module") -def test_hist_with_by_df(): - np.random.seed(0) - df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) - df["C"] = np.random.choice(["a", "b", "c"], 30) - df["D"] = np.random.choice(["a", "b", "c"], 30) - return df - - -@td.skip_if_no_mpl -class TestDataFramePlots(TestPlotBase): - def setup_method(self, method): - TestPlotBase.setup_method(self, method) - import matplotlib as mpl - - mpl.rcdefaults() - - self.tdf = tm.makeTimeDataFrame() - self.hexbin_df = DataFrame( - { - "A": np.random.uniform(size=20), - "B": np.random.uniform(size=20), - "C": np.arange(20) + np.random.uniform(size=20), - } - ) - - def _assert_ytickslabels_visibility(self, axes, expected): - for ax, exp in zip(axes, expected): - self._check_visible(ax.get_yticklabels(), visible=exp) - - def _assert_xtickslabels_visibility(self, axes, expected): - for ax, exp in zip(axes, expected): - self._check_visible(ax.get_xticklabels(), visible=exp) - - @pytest.mark.xfail(reason="Waiting for PR 34334", strict=True) - @pytest.mark.slow - def test_plot(self): - from pandas.plotting._matplotlib.compat import _mpl_ge_3_1_0 - - df = self.tdf - _check_plot_works(df.plot, grid=False) - # _check_plot_works adds an ax so catch warning. 
see GH #13188 - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot, subplots=True) - self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot, subplots=True, layout=(-1, 2)) - self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot, subplots=True, use_index=False) - self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - - df = DataFrame({"x": [1, 2], "y": [3, 4]}) - if _mpl_ge_3_1_0(): - msg = "'Line2D' object has no property 'blarg'" - else: - msg = "Unknown property blarg" - with pytest.raises(AttributeError, match=msg): - df.plot.line(blarg=True) - - df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - - _check_plot_works(df.plot, use_index=True) - _check_plot_works(df.plot, sort_columns=False) - _check_plot_works(df.plot, yticks=[1, 5, 10]) - _check_plot_works(df.plot, xticks=[1, 5, 10]) - _check_plot_works(df.plot, ylim=(-100, 100), xlim=(-100, 100)) - - with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.plot, subplots=True, title="blah") - - # We have to redo it here because _check_plot_works does two plots, - # once without an ax kwarg and once with an ax kwarg and the new sharex - # behaviour does not remove the visibility of the latter axis (as ax is - # present). 
see: https://github.com/pandas-dev/pandas/issues/9737 - - axes = df.plot(subplots=True, title="blah") - self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) - # axes[0].figure.savefig("test.png") - for ax in axes[:2]: - self._check_visible(ax.xaxis) # xaxis must be visible for grid - self._check_visible(ax.get_xticklabels(), visible=False) - self._check_visible(ax.get_xticklabels(minor=True), visible=False) - self._check_visible([ax.xaxis.get_label()], visible=False) - for ax in [axes[2]]: - self._check_visible(ax.xaxis) - self._check_visible(ax.get_xticklabels()) - self._check_visible([ax.xaxis.get_label()]) - self._check_ticks_props(ax, xrot=0) - - _check_plot_works(df.plot, title="blah") - - tuples = zip(string.ascii_letters[:10], range(10)) - df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples)) - _check_plot_works(df.plot, use_index=True) - - # unicode - index = MultiIndex.from_tuples( - [ - ("\u03b1", 0), - ("\u03b1", 1), - ("\u03b2", 2), - ("\u03b2", 3), - ("\u03b3", 4), - ("\u03b3", 5), - ("\u03b4", 6), - ("\u03b4", 7), - ], - names=["i0", "i1"], - ) - columns = MultiIndex.from_tuples( - [("bar", "\u0394"), ("bar", "\u0395")], names=["c0", "c1"] - ) - df = DataFrame(np.random.randint(0, 10, (8, 2)), columns=columns, index=index) - _check_plot_works(df.plot, title="\u03A3") - - # GH 6951 - # Test with single column - df = DataFrame({"x": np.random.rand(10)}) - axes = _check_plot_works(df.plot.bar, subplots=True) - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - - axes = _check_plot_works(df.plot.bar, subplots=True, layout=(-1, 1)) - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - # When ax is supplied and required number of axes is 1, - # passed ax should be used: - fig, ax = self.plt.subplots() - axes = df.plot.bar(subplots=True, ax=ax) - assert len(axes) == 1 - result = ax.axes - assert result is axes[0] - - def test_integer_array_plot(self): - # GH 25587 - arr = integer_array([1, 2, 3, 4], dtype="UInt32") - - s 
= Series(arr) - _check_plot_works(s.plot.line) - _check_plot_works(s.plot.bar) - _check_plot_works(s.plot.hist) - _check_plot_works(s.plot.pie) - - df = DataFrame({"x": arr, "y": arr}) - _check_plot_works(df.plot.line) - _check_plot_works(df.plot.bar) - _check_plot_works(df.plot.hist) - _check_plot_works(df.plot.pie, y="y") - _check_plot_works(df.plot.scatter, x="x", y="y") - _check_plot_works(df.plot.hexbin, x="x", y="y") - - def test_mpl2_color_cycle_str(self): - # GH 15516 - colors = ["C" + str(x) for x in range(10)] - df = DataFrame(randn(10, 3), columns=["a", "b", "c"]) - for c in colors: - _check_plot_works(df.plot, color=c) - - def test_color_single_series_list(self): - # GH 3486 - df = DataFrame({"A": [1, 2, 3]}) - _check_plot_works(df.plot, color=["red"]) - - def test_rgb_tuple_color(self): - # GH 16695 - df = DataFrame({"x": [1, 2], "y": [3, 4]}) - _check_plot_works(df.plot, x="x", y="y", color=(1, 0, 0)) - _check_plot_works(df.plot, x="x", y="y", color=(1, 0, 0, 0.5)) - - def test_color_empty_string(self): - df = DataFrame(randn(10, 2)) - with pytest.raises(ValueError): - df.plot(color="") - - def test_color_and_style_arguments(self): - df = DataFrame({"x": [1, 2], "y": [3, 4]}) - # passing both 'color' and 'style' arguments should be allowed - # if there is no color symbol in the style strings: - ax = df.plot(color=["red", "black"], style=["-", "--"]) - # check that the linestyles are correctly set: - linestyle = [line.get_linestyle() for line in ax.lines] - assert linestyle == ["-", "--"] - # check that the colors are correctly set: - color = [line.get_color() for line in ax.lines] - assert color == ["red", "black"] - # passing both 'color' and 'style' arguments should not be allowed - # if there is a color symbol in the style strings: - with pytest.raises(ValueError): - df.plot(color=["red", "black"], style=["k-", "r--"]) - - def test_nonnumeric_exclude(self): - df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}) - ax = df.plot() - assert 
len(ax.get_lines()) == 1 # B was plotted - - @pytest.mark.slow - def test_implicit_label(self): - df = DataFrame(randn(10, 3), columns=["a", "b", "c"]) - ax = df.plot(x="a", y="b") - self._check_text_labels(ax.xaxis.get_label(), "a") - - @pytest.mark.slow - def test_donot_overwrite_index_name(self): - # GH 8494 - df = DataFrame(randn(2, 2), columns=["a", "b"]) - df.index.name = "NAME" - df.plot(y="b", label="LABEL") - assert df.index.name == "NAME" - - @pytest.mark.slow - def test_plot_xy(self): - # columns.inferred_type == 'string' - df = self.tdf - self._check_data(df.plot(x=0, y=1), df.set_index("A")["B"].plot()) - self._check_data(df.plot(x=0), df.set_index("A").plot()) - self._check_data(df.plot(y=0), df.B.plot()) - self._check_data(df.plot(x="A", y="B"), df.set_index("A").B.plot()) - self._check_data(df.plot(x="A"), df.set_index("A").plot()) - self._check_data(df.plot(y="B"), df.B.plot()) - - # columns.inferred_type == 'integer' - df.columns = np.arange(1, len(df.columns) + 1) - self._check_data(df.plot(x=1, y=2), df.set_index(1)[2].plot()) - self._check_data(df.plot(x=1), df.set_index(1).plot()) - self._check_data(df.plot(y=1), df[1].plot()) - - # figsize and title - ax = df.plot(x=1, y=2, title="Test", figsize=(16, 8)) - self._check_text_labels(ax.title, "Test") - self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16.0, 8.0)) - - # columns.inferred_type == 'mixed' - # TODO add MultiIndex test - - @pytest.mark.slow - @pytest.mark.parametrize( - "input_log, expected_log", [(True, "log"), ("sym", "symlog")] - ) - def test_logscales(self, input_log, expected_log): - df = DataFrame({"a": np.arange(100)}, index=np.arange(100)) - - ax = df.plot(logy=input_log) - self._check_ax_scales(ax, yaxis=expected_log) - assert ax.get_yscale() == expected_log - - ax = df.plot(logx=input_log) - self._check_ax_scales(ax, xaxis=expected_log) - assert ax.get_xscale() == expected_log - - ax = df.plot(loglog=input_log) - self._check_ax_scales(ax, xaxis=expected_log, 
yaxis=expected_log) - assert ax.get_xscale() == expected_log - assert ax.get_yscale() == expected_log - - @pytest.mark.parametrize("input_param", ["logx", "logy", "loglog"]) - def test_invalid_logscale(self, input_param): - # GH: 24867 - df = DataFrame({"a": np.arange(100)}, index=np.arange(100)) - - msg = "Boolean, None and 'sym' are valid options, 'sm' is given." - with pytest.raises(ValueError, match=msg): - df.plot(**{input_param: "sm"}) - - @pytest.mark.slow - def test_xcompat(self): - import pandas as pd - - df = self.tdf - ax = df.plot(x_compat=True) - lines = ax.get_lines() - assert not isinstance(lines[0].get_xdata(), PeriodIndex) - - tm.close() - pd.plotting.plot_params["xaxis.compat"] = True - ax = df.plot() - lines = ax.get_lines() - assert not isinstance(lines[0].get_xdata(), PeriodIndex) - - tm.close() - pd.plotting.plot_params["x_compat"] = False - - ax = df.plot() - lines = ax.get_lines() - assert not isinstance(lines[0].get_xdata(), PeriodIndex) - assert isinstance(PeriodIndex(lines[0].get_xdata()), PeriodIndex) - - tm.close() - # useful if you're plotting a bunch together - with pd.plotting.plot_params.use("x_compat", True): - ax = df.plot() - lines = ax.get_lines() - assert not isinstance(lines[0].get_xdata(), PeriodIndex) - - tm.close() - ax = df.plot() - lines = ax.get_lines() - assert not isinstance(lines[0].get_xdata(), PeriodIndex) - assert isinstance(PeriodIndex(lines[0].get_xdata()), PeriodIndex) - - def test_period_compat(self): - # GH 9012 - # period-array conversions - df = DataFrame( - np.random.rand(21, 2), - index=bdate_range(datetime(2000, 1, 1), datetime(2000, 1, 31)), - columns=["a", "b"], - ) - - df.plot() - self.plt.axhline(y=0) - tm.close() - - def test_unsorted_index(self): - df = DataFrame( - {"y": np.arange(100)}, index=np.arange(99, -1, -1), dtype=np.int64 - ) - ax = df.plot() - lines = ax.get_lines()[0] - rs = lines.get_xydata() - rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name="y") - tm.assert_series_equal(rs, df.y, 
check_index_type=False) - tm.close() - - df.index = pd.Index(np.arange(99, -1, -1), dtype=np.float64) - ax = df.plot() - lines = ax.get_lines()[0] - rs = lines.get_xydata() - rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name="y") - tm.assert_series_equal(rs, df.y) - - def test_unsorted_index_lims(self): - df = DataFrame({"y": [0.0, 1.0, 2.0, 3.0]}, index=[1.0, 0.0, 3.0, 2.0]) - ax = df.plot() - xmin, xmax = ax.get_xlim() - lines = ax.get_lines() - assert xmin <= np.nanmin(lines[0].get_data()[0]) - assert xmax >= np.nanmax(lines[0].get_data()[0]) - - df = DataFrame( - {"y": [0.0, 1.0, np.nan, 3.0, 4.0, 5.0, 6.0]}, - index=[1.0, 0.0, 3.0, 2.0, np.nan, 3.0, 2.0], - ) - ax = df.plot() - xmin, xmax = ax.get_xlim() - lines = ax.get_lines() - assert xmin <= np.nanmin(lines[0].get_data()[0]) - assert xmax >= np.nanmax(lines[0].get_data()[0]) - - df = DataFrame({"y": [0.0, 1.0, 2.0, 3.0], "z": [91.0, 90.0, 93.0, 92.0]}) - ax = df.plot(x="z", y="y") - xmin, xmax = ax.get_xlim() - lines = ax.get_lines() - assert xmin <= np.nanmin(lines[0].get_data()[0]) - assert xmax >= np.nanmax(lines[0].get_data()[0]) - - @pytest.mark.slow - def test_subplots(self): - df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - - for kind in ["bar", "barh", "line", "area"]: - axes = df.plot(kind=kind, subplots=True, sharex=True, legend=True) - self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) - assert axes.shape == (3,) - - for ax, column in zip(axes, df.columns): - self._check_legend_labels(ax, labels=[pprint_thing(column)]) - - for ax in axes[:-2]: - self._check_visible(ax.xaxis) # xaxis must be visible for grid - self._check_visible(ax.get_xticklabels(), visible=False) - if not (kind == "bar" and self.mpl_ge_3_1_0): - # change https://github.com/pandas-dev/pandas/issues/26714 - self._check_visible(ax.get_xticklabels(minor=True), visible=False) - self._check_visible(ax.xaxis.get_label(), visible=False) - self._check_visible(ax.get_yticklabels()) - - 
self._check_visible(axes[-1].xaxis) - self._check_visible(axes[-1].get_xticklabels()) - self._check_visible(axes[-1].get_xticklabels(minor=True)) - self._check_visible(axes[-1].xaxis.get_label()) - self._check_visible(axes[-1].get_yticklabels()) - - axes = df.plot(kind=kind, subplots=True, sharex=False) - for ax in axes: - self._check_visible(ax.xaxis) - self._check_visible(ax.get_xticklabels()) - self._check_visible(ax.get_xticklabels(minor=True)) - self._check_visible(ax.xaxis.get_label()) - self._check_visible(ax.get_yticklabels()) - - axes = df.plot(kind=kind, subplots=True, legend=False) - for ax in axes: - assert ax.get_legend() is None - - def test_groupby_boxplot_sharey(self): - # https://github.com/pandas-dev/pandas/issues/20968 - # sharey can now be switched check whether the right - # pair of axes is turned on or off - - df = DataFrame( - { - "a": [-1.43, -0.15, -3.70, -1.43, -0.14], - "b": [0.56, 0.84, 0.29, 0.56, 0.85], - "c": [0, 1, 2, 3, 1], - }, - index=[0, 1, 2, 3, 4], - ) - - # behavior without keyword - axes = df.groupby("c").boxplot() - expected = [True, False, True, False] - self._assert_ytickslabels_visibility(axes, expected) - - # set sharey=True should be identical - axes = df.groupby("c").boxplot(sharey=True) - expected = [True, False, True, False] - self._assert_ytickslabels_visibility(axes, expected) - - # sharey=False, all yticklabels should be visible - axes = df.groupby("c").boxplot(sharey=False) - expected = [True, True, True, True] - self._assert_ytickslabels_visibility(axes, expected) - - def test_groupby_boxplot_sharex(self): - # https://github.com/pandas-dev/pandas/issues/20968 - # sharex can now be switched check whether the right - # pair of axes is turned on or off - - df = DataFrame( - { - "a": [-1.43, -0.15, -3.70, -1.43, -0.14], - "b": [0.56, 0.84, 0.29, 0.56, 0.85], - "c": [0, 1, 2, 3, 1], - }, - index=[0, 1, 2, 3, 4], - ) - - # behavior without keyword - axes = df.groupby("c").boxplot() - expected = [True, True, True, 
True] - self._assert_xtickslabels_visibility(axes, expected) - - # set sharex=False should be identical - axes = df.groupby("c").boxplot(sharex=False) - expected = [True, True, True, True] - self._assert_xtickslabels_visibility(axes, expected) - - # sharex=True, yticklabels should be visible - # only for bottom plots - axes = df.groupby("c").boxplot(sharex=True) - expected = [False, False, True, True] - self._assert_xtickslabels_visibility(axes, expected) - - @pytest.mark.xfail(reason="Waiting for PR 34334", strict=True) - @pytest.mark.slow - def test_subplots_timeseries(self): - idx = date_range(start="2014-07-01", freq="M", periods=10) - df = DataFrame(np.random.rand(10, 3), index=idx) - - for kind in ["line", "area"]: - axes = df.plot(kind=kind, subplots=True, sharex=True) - self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) - - for ax in axes[:-2]: - # GH 7801 - self._check_visible(ax.xaxis) # xaxis must be visible for grid - self._check_visible(ax.get_xticklabels(), visible=False) - self._check_visible(ax.get_xticklabels(minor=True), visible=False) - self._check_visible(ax.xaxis.get_label(), visible=False) - self._check_visible(ax.get_yticklabels()) - - self._check_visible(axes[-1].xaxis) - self._check_visible(axes[-1].get_xticklabels()) - self._check_visible(axes[-1].get_xticklabels(minor=True)) - self._check_visible(axes[-1].xaxis.get_label()) - self._check_visible(axes[-1].get_yticklabels()) - self._check_ticks_props(axes, xrot=0) - - axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45, fontsize=7) - for ax in axes: - self._check_visible(ax.xaxis) - self._check_visible(ax.get_xticklabels()) - self._check_visible(ax.get_xticklabels(minor=True)) - self._check_visible(ax.xaxis.get_label()) - self._check_visible(ax.get_yticklabels()) - self._check_ticks_props(ax, xlabelsize=7, xrot=45, ylabelsize=7) - - def test_subplots_timeseries_y_axis(self): - # GH16953 - data = { - "numeric": np.array([1, 2, 5]), - "timedelta": [ - pd.Timedelta(-10, 
unit="s"), - pd.Timedelta(10, unit="m"), - pd.Timedelta(10, unit="h"), - ], - "datetime_no_tz": [ - pd.to_datetime("2017-08-01 00:00:00"), - pd.to_datetime("2017-08-01 02:00:00"), - pd.to_datetime("2017-08-02 00:00:00"), - ], - "datetime_all_tz": [ - pd.to_datetime("2017-08-01 00:00:00", utc=True), - pd.to_datetime("2017-08-01 02:00:00", utc=True), - pd.to_datetime("2017-08-02 00:00:00", utc=True), - ], - "text": ["This", "should", "fail"], - } - testdata = DataFrame(data) - - ax_numeric = testdata.plot(y="numeric") - assert ( - ax_numeric.get_lines()[0].get_data()[1] == testdata["numeric"].values - ).all() - ax_timedelta = testdata.plot(y="timedelta") - assert ( - ax_timedelta.get_lines()[0].get_data()[1] == testdata["timedelta"].values - ).all() - ax_datetime_no_tz = testdata.plot(y="datetime_no_tz") - assert ( - ax_datetime_no_tz.get_lines()[0].get_data()[1] - == testdata["datetime_no_tz"].values - ).all() - ax_datetime_all_tz = testdata.plot(y="datetime_all_tz") - assert ( - ax_datetime_all_tz.get_lines()[0].get_data()[1] - == testdata["datetime_all_tz"].values - ).all() - - msg = "no numeric data to plot" - with pytest.raises(TypeError, match=msg): - testdata.plot(y="text") - - @pytest.mark.xfail(reason="not support for period, categorical, datetime_mixed_tz") - def test_subplots_timeseries_y_axis_not_supported(self): - """ - This test will fail for: - period: - since period isn't yet implemented in ``select_dtypes`` - and because it will need a custom value converter + - tick formatter (as was done for x-axis plots) - - categorical: - because it will need a custom value converter + - tick formatter (also doesn't work for x-axis, as of now) - - datetime_mixed_tz: - because of the way how pandas handles ``Series`` of - ``datetime`` objects with different timezone, - generally converting ``datetime`` objects in a tz-aware - form could help with this problem - """ - data = { - "numeric": np.array([1, 2, 5]), - "period": [ - pd.Period("2017-08-01 00:00:00", 
freq="H"), - pd.Period("2017-08-01 02:00", freq="H"), - pd.Period("2017-08-02 00:00:00", freq="H"), - ], - "categorical": pd.Categorical( - ["c", "b", "a"], categories=["a", "b", "c"], ordered=False - ), - "datetime_mixed_tz": [ - pd.to_datetime("2017-08-01 00:00:00", utc=True), - pd.to_datetime("2017-08-01 02:00:00"), - pd.to_datetime("2017-08-02 00:00:00"), - ], - } - testdata = pd.DataFrame(data) - ax_period = testdata.plot(x="numeric", y="period") - assert ( - ax_period.get_lines()[0].get_data()[1] == testdata["period"].values - ).all() - ax_categorical = testdata.plot(x="numeric", y="categorical") - assert ( - ax_categorical.get_lines()[0].get_data()[1] - == testdata["categorical"].values - ).all() - ax_datetime_mixed_tz = testdata.plot(x="numeric", y="datetime_mixed_tz") - assert ( - ax_datetime_mixed_tz.get_lines()[0].get_data()[1] - == testdata["datetime_mixed_tz"].values - ).all() - - @pytest.mark.slow - def test_subplots_layout(self): - # GH 6667 - df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - - axes = df.plot(subplots=True, layout=(2, 2)) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - assert axes.shape == (2, 2) - - axes = df.plot(subplots=True, layout=(-1, 2)) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - assert axes.shape == (2, 2) - - axes = df.plot(subplots=True, layout=(2, -1)) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - assert axes.shape == (2, 2) - - axes = df.plot(subplots=True, layout=(1, 4)) - self._check_axes_shape(axes, axes_num=3, layout=(1, 4)) - assert axes.shape == (1, 4) - - axes = df.plot(subplots=True, layout=(-1, 4)) - self._check_axes_shape(axes, axes_num=3, layout=(1, 4)) - assert axes.shape == (1, 4) - - axes = df.plot(subplots=True, layout=(4, -1)) - self._check_axes_shape(axes, axes_num=3, layout=(4, 1)) - assert axes.shape == (4, 1) - - with pytest.raises(ValueError): - df.plot(subplots=True, layout=(1, 1)) - with pytest.raises(ValueError): - 
df.plot(subplots=True, layout=(-1, -1)) - - # single column - df = DataFrame(np.random.rand(10, 1), index=list(string.ascii_letters[:10])) - axes = df.plot(subplots=True) - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - assert axes.shape == (1,) - - axes = df.plot(subplots=True, layout=(3, 3)) - self._check_axes_shape(axes, axes_num=1, layout=(3, 3)) - assert axes.shape == (3, 3) - - @pytest.mark.slow - def test_subplots_warnings(self): - # GH 9464 - with tm.assert_produces_warning(None): - df = DataFrame(np.random.randn(100, 4)) - df.plot(subplots=True, layout=(3, 2)) - - df = DataFrame( - np.random.randn(100, 4), index=date_range("1/1/2000", periods=100) - ) - df.plot(subplots=True, layout=(3, 2)) - - @pytest.mark.slow - def test_subplots_multiple_axes(self): - # GH 5353, 6970, GH 7069 - fig, axes = self.plt.subplots(2, 3) - df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - - returned = df.plot(subplots=True, ax=axes[0], sharex=False, sharey=False) - self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - assert returned.shape == (3,) - assert returned[0].figure is fig - # draw on second row - returned = df.plot(subplots=True, ax=axes[1], sharex=False, sharey=False) - self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - assert returned.shape == (3,) - assert returned[0].figure is fig - self._check_axes_shape(axes, axes_num=6, layout=(2, 3)) - tm.close() - - with pytest.raises(ValueError): - fig, axes = self.plt.subplots(2, 3) - # pass different number of axes from required - df.plot(subplots=True, ax=axes) - - # pass 2-dim axes and invalid layout - # invalid lauout should not affect to input and return value - # (show warning is tested in - # TestDataFrameGroupByPlots.test_grouped_box_multiple_axes - fig, axes = self.plt.subplots(2, 2) - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - df = DataFrame(np.random.rand(10, 4), index=list(string.ascii_letters[:10])) - - returned = 
df.plot( - subplots=True, ax=axes, layout=(2, 1), sharex=False, sharey=False - ) - self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) - assert returned.shape == (4,) - - returned = df.plot( - subplots=True, ax=axes, layout=(2, -1), sharex=False, sharey=False - ) - self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) - assert returned.shape == (4,) - - returned = df.plot( - subplots=True, ax=axes, layout=(-1, 2), sharex=False, sharey=False - ) - self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) - assert returned.shape == (4,) - - # single column - fig, axes = self.plt.subplots(1, 1) - df = DataFrame(np.random.rand(10, 1), index=list(string.ascii_letters[:10])) - - axes = df.plot(subplots=True, ax=[axes], sharex=False, sharey=False) - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - assert axes.shape == (1,) - - def test_subplots_ts_share_axes(self): - # GH 3964 - fig, axes = self.plt.subplots(3, 3, sharex=True, sharey=True) - self.plt.subplots_adjust(left=0.05, right=0.95, hspace=0.3, wspace=0.3) - df = DataFrame( - np.random.randn(10, 9), - index=date_range(start="2014-07-01", freq="M", periods=10), - ) - for i, ax in enumerate(axes.ravel()): - df[i].plot(ax=ax, fontsize=5) - - # Rows other than bottom should not be visible - for ax in axes[0:-1].ravel(): - self._check_visible(ax.get_xticklabels(), visible=False) - - # Bottom row should be visible - for ax in axes[-1].ravel(): - self._check_visible(ax.get_xticklabels(), visible=True) - - # First column should be visible - for ax in axes[[0, 1, 2], [0]].ravel(): - self._check_visible(ax.get_yticklabels(), visible=True) - - # Other columns should not be visible - for ax in axes[[0, 1, 2], [1]].ravel(): - self._check_visible(ax.get_yticklabels(), visible=False) - for ax in axes[[0, 1, 2], [2]].ravel(): - self._check_visible(ax.get_yticklabels(), visible=False) - - def test_subplots_sharex_axes_existing_axes(self): - # GH 9158 - d = {"A": [1.0, 2.0, 3.0, 4.0], "B": [4.0, 3.0, 2.0, 
1.0], "C": [5, 1, 3, 4]} - df = DataFrame(d, index=date_range("2014 10 11", "2014 10 14")) - - axes = df[["A", "B"]].plot(subplots=True) - df["C"].plot(ax=axes[0], secondary_y=True) - - self._check_visible(axes[0].get_xticklabels(), visible=False) - self._check_visible(axes[1].get_xticklabels(), visible=True) - for ax in axes.ravel(): - self._check_visible(ax.get_yticklabels(), visible=True) - - @pytest.mark.slow - def test_subplots_dup_columns(self): - # GH 10962 - df = DataFrame(np.random.rand(5, 5), columns=list("aaaaa")) - axes = df.plot(subplots=True) - for ax in axes: - self._check_legend_labels(ax, labels=["a"]) - assert len(ax.lines) == 1 - tm.close() - - axes = df.plot(subplots=True, secondary_y="a") - for ax in axes: - # (right) is only attached when subplots=False - self._check_legend_labels(ax, labels=["a"]) - assert len(ax.lines) == 1 - tm.close() - - ax = df.plot(secondary_y="a") - self._check_legend_labels(ax, labels=["a (right)"] * 5) - assert len(ax.lines) == 0 - assert len(ax.right_ax.lines) == 5 - - def test_negative_log(self): - df = -DataFrame( - rand(6, 4), - index=list(string.ascii_letters[:6]), - columns=["x", "y", "z", "four"], - ) - - with pytest.raises(ValueError): - df.plot.area(logy=True) - with pytest.raises(ValueError): - df.plot.area(loglog=True) - - def _compare_stacked_y_cood(self, normal_lines, stacked_lines): - base = np.zeros(len(normal_lines[0].get_data()[1])) - for nl, sl in zip(normal_lines, stacked_lines): - base += nl.get_data()[1] # get y coordinates - sy = sl.get_data()[1] - tm.assert_numpy_array_equal(base, sy) - - def test_line_area_stacked(self): - with tm.RNGContext(42): - df = DataFrame(rand(6, 4), columns=["w", "x", "y", "z"]) - neg_df = -df - # each column has either positive or negative value - sep_df = DataFrame( - {"w": rand(6), "x": rand(6), "y": -rand(6), "z": -rand(6)} - ) - # each column has positive-negative mixed value - mixed_df = DataFrame( - randn(6, 4), - index=list(string.ascii_letters[:6]), - 
columns=["w", "x", "y", "z"], - ) - - for kind in ["line", "area"]: - ax1 = _check_plot_works(df.plot, kind=kind, stacked=False) - ax2 = _check_plot_works(df.plot, kind=kind, stacked=True) - self._compare_stacked_y_cood(ax1.lines, ax2.lines) - - ax1 = _check_plot_works(neg_df.plot, kind=kind, stacked=False) - ax2 = _check_plot_works(neg_df.plot, kind=kind, stacked=True) - self._compare_stacked_y_cood(ax1.lines, ax2.lines) - - ax1 = _check_plot_works(sep_df.plot, kind=kind, stacked=False) - ax2 = _check_plot_works(sep_df.plot, kind=kind, stacked=True) - self._compare_stacked_y_cood(ax1.lines[:2], ax2.lines[:2]) - self._compare_stacked_y_cood(ax1.lines[2:], ax2.lines[2:]) - - _check_plot_works(mixed_df.plot, stacked=False) - with pytest.raises(ValueError): - mixed_df.plot(stacked=True) - - # Use an index with strictly positive values, preventing - # matplotlib from warning about ignoring xlim - df2 = df.set_index(df.index + 1) - _check_plot_works(df2.plot, kind=kind, logx=True, stacked=True) - - def test_line_area_nan_df(self): - values1 = [1, 2, np.nan, 3] - values2 = [3, np.nan, 2, 1] - df = DataFrame({"a": values1, "b": values2}) - tdf = DataFrame({"a": values1, "b": values2}, index=tm.makeDateIndex(k=4)) - - for d in [df, tdf]: - ax = _check_plot_works(d.plot) - masked1 = ax.lines[0].get_ydata() - masked2 = ax.lines[1].get_ydata() - # remove nan for comparison purpose - - exp = np.array([1, 2, 3], dtype=np.float64) - tm.assert_numpy_array_equal(np.delete(masked1.data, 2), exp) - - exp = np.array([3, 2, 1], dtype=np.float64) - tm.assert_numpy_array_equal(np.delete(masked2.data, 1), exp) - tm.assert_numpy_array_equal( - masked1.mask, np.array([False, False, True, False]) - ) - tm.assert_numpy_array_equal( - masked2.mask, np.array([False, True, False, False]) - ) - - expected1 = np.array([1, 2, 0, 3], dtype=np.float64) - expected2 = np.array([3, 0, 2, 1], dtype=np.float64) - - ax = _check_plot_works(d.plot, stacked=True) - 
tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) - tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) - - ax = _check_plot_works(d.plot.area) - tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) - tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) - - ax = _check_plot_works(d.plot.area, stacked=False) - tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) - tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected2) - - def test_line_lim(self): - df = DataFrame(rand(6, 3), columns=["x", "y", "z"]) - ax = df.plot() - xmin, xmax = ax.get_xlim() - lines = ax.get_lines() - assert xmin <= lines[0].get_data()[0][0] - assert xmax >= lines[0].get_data()[0][-1] - - ax = df.plot(secondary_y=True) - xmin, xmax = ax.get_xlim() - lines = ax.get_lines() - assert xmin <= lines[0].get_data()[0][0] - assert xmax >= lines[0].get_data()[0][-1] - - axes = df.plot(secondary_y=True, subplots=True) - self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) - for ax in axes: - assert hasattr(ax, "left_ax") - assert not hasattr(ax, "right_ax") - xmin, xmax = ax.get_xlim() - lines = ax.get_lines() - assert xmin <= lines[0].get_data()[0][0] - assert xmax >= lines[0].get_data()[0][-1] - - def test_area_lim(self): - df = DataFrame(rand(6, 4), columns=["x", "y", "z", "four"]) - - neg_df = -df - for stacked in [True, False]: - ax = _check_plot_works(df.plot.area, stacked=stacked) - xmin, xmax = ax.get_xlim() - ymin, ymax = ax.get_ylim() - lines = ax.get_lines() - assert xmin <= lines[0].get_data()[0][0] - assert xmax >= lines[0].get_data()[0][-1] - assert ymin == 0 - - ax = _check_plot_works(neg_df.plot.area, stacked=stacked) - ymin, ymax = ax.get_ylim() - assert ymax == 0 - - @pytest.mark.slow - def test_bar_colors(self): - import matplotlib.pyplot as plt - - default_colors = self._unpack_cycler(plt.rcParams) - - df = DataFrame(randn(5, 5)) - ax = df.plot.bar() - self._check_colors(ax.patches[::5], 
facecolors=default_colors[:5]) - tm.close() - - custom_colors = "rgcby" - ax = df.plot.bar(color=custom_colors) - self._check_colors(ax.patches[::5], facecolors=custom_colors) - tm.close() - - from matplotlib import cm - - # Test str -> colormap functionality - ax = df.plot.bar(colormap="jet") - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] - self._check_colors(ax.patches[::5], facecolors=rgba_colors) - tm.close() - - # Test colormap functionality - ax = df.plot.bar(colormap=cm.jet) - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] - self._check_colors(ax.patches[::5], facecolors=rgba_colors) - tm.close() - - ax = df.loc[:, [0]].plot.bar(color="DodgerBlue") - self._check_colors([ax.patches[0]], facecolors=["DodgerBlue"]) - tm.close() - - ax = df.plot(kind="bar", color="green") - self._check_colors(ax.patches[::5], facecolors=["green"] * 5) - tm.close() - - def test_bar_user_colors(self): - df = pd.DataFrame( - {"A": range(4), "B": range(1, 5), "color": ["red", "blue", "blue", "red"]} - ) - # This should *only* work when `y` is specified, else - # we use one color per column - ax = df.plot.bar(y="A", color=df["color"]) - result = [p.get_facecolor() for p in ax.patches] - expected = [ - (1.0, 0.0, 0.0, 1.0), - (0.0, 0.0, 1.0, 1.0), - (0.0, 0.0, 1.0, 1.0), - (1.0, 0.0, 0.0, 1.0), - ] - assert result == expected - - @pytest.mark.slow - def test_bar_linewidth(self): - df = DataFrame(randn(5, 5)) - - # regular - ax = df.plot.bar(linewidth=2) - for r in ax.patches: - assert r.get_linewidth() == 2 - - # stacked - ax = df.plot.bar(stacked=True, linewidth=2) - for r in ax.patches: - assert r.get_linewidth() == 2 - - # subplots - axes = df.plot.bar(linewidth=2, subplots=True) - self._check_axes_shape(axes, axes_num=5, layout=(5, 1)) - for ax in axes: - for r in ax.patches: - assert r.get_linewidth() == 2 - - @pytest.mark.slow - def test_bar_barwidth(self): - df = DataFrame(randn(5, 5)) - - width = 0.9 - - # regular - ax = df.plot.bar(width=width) - for r in 
ax.patches: - assert r.get_width() == width / len(df.columns) - - # stacked - ax = df.plot.bar(stacked=True, width=width) - for r in ax.patches: - assert r.get_width() == width - - # horizontal regular - ax = df.plot.barh(width=width) - for r in ax.patches: - assert r.get_height() == width / len(df.columns) - - # horizontal stacked - ax = df.plot.barh(stacked=True, width=width) - for r in ax.patches: - assert r.get_height() == width - - # subplots - axes = df.plot.bar(width=width, subplots=True) - for ax in axes: - for r in ax.patches: - assert r.get_width() == width - - # horizontal subplots - axes = df.plot.barh(width=width, subplots=True) - for ax in axes: - for r in ax.patches: - assert r.get_height() == width - - @pytest.mark.slow - def test_bar_barwidth_position(self): - df = DataFrame(randn(5, 5)) - self._check_bar_alignment( - df, kind="bar", stacked=False, width=0.9, position=0.2 - ) - self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9, position=0.2) - self._check_bar_alignment( - df, kind="barh", stacked=False, width=0.9, position=0.2 - ) - self._check_bar_alignment( - df, kind="barh", stacked=True, width=0.9, position=0.2 - ) - self._check_bar_alignment( - df, kind="bar", subplots=True, width=0.9, position=0.2 - ) - self._check_bar_alignment( - df, kind="barh", subplots=True, width=0.9, position=0.2 - ) - - @pytest.mark.slow - def test_bar_barwidth_position_int(self): - # GH 12979 - df = DataFrame(randn(5, 5)) - - for w in [1, 1.0]: - ax = df.plot.bar(stacked=True, width=w) - ticks = ax.xaxis.get_ticklocs() - tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4])) - assert ax.get_xlim() == (-0.75, 4.75) - # check left-edge of bars - assert ax.patches[0].get_x() == -0.5 - assert ax.patches[-1].get_x() == 3.5 - - self._check_bar_alignment(df, kind="bar", stacked=True, width=1) - self._check_bar_alignment(df, kind="barh", stacked=False, width=1) - self._check_bar_alignment(df, kind="barh", stacked=True, width=1) - 
self._check_bar_alignment(df, kind="bar", subplots=True, width=1) - self._check_bar_alignment(df, kind="barh", subplots=True, width=1) - - @pytest.mark.slow - def test_bar_bottom_left(self): - df = DataFrame(rand(5, 5)) - ax = df.plot.bar(stacked=False, bottom=1) - result = [p.get_y() for p in ax.patches] - assert result == [1] * 25 - - ax = df.plot.bar(stacked=True, bottom=[-1, -2, -3, -4, -5]) - result = [p.get_y() for p in ax.patches[:5]] - assert result == [-1, -2, -3, -4, -5] - - ax = df.plot.barh(stacked=False, left=np.array([1, 1, 1, 1, 1])) - result = [p.get_x() for p in ax.patches] - assert result == [1] * 25 - - ax = df.plot.barh(stacked=True, left=[1, 2, 3, 4, 5]) - result = [p.get_x() for p in ax.patches[:5]] - assert result == [1, 2, 3, 4, 5] - - axes = df.plot.bar(subplots=True, bottom=-1) - for ax in axes: - result = [p.get_y() for p in ax.patches] - assert result == [-1] * 5 - - axes = df.plot.barh(subplots=True, left=np.array([1, 1, 1, 1, 1])) - for ax in axes: - result = [p.get_x() for p in ax.patches] - assert result == [1] * 5 - - @pytest.mark.slow - def test_bar_nan(self): - df = DataFrame({"A": [10, np.nan, 20], "B": [5, 10, 20], "C": [1, 2, 3]}) - ax = df.plot.bar() - expected = [10, 0, 20, 5, 10, 20, 1, 2, 3] - result = [p.get_height() for p in ax.patches] - assert result == expected - - ax = df.plot.bar(stacked=True) - result = [p.get_height() for p in ax.patches] - assert result == expected - - result = [p.get_y() for p in ax.patches] - expected = [0.0, 0.0, 0.0, 10.0, 0.0, 20.0, 15.0, 10.0, 40.0] - assert result == expected - - @pytest.mark.slow - def test_bar_categorical(self): - # GH 13019 - df1 = pd.DataFrame( - np.random.randn(6, 5), - index=pd.Index(list("ABCDEF")), - columns=pd.Index(list("abcde")), - ) - # categorical index must behave the same - df2 = pd.DataFrame( - np.random.randn(6, 5), - index=pd.CategoricalIndex(list("ABCDEF")), - columns=pd.CategoricalIndex(list("abcde")), - ) - - for df in [df1, df2]: - ax = df.plot.bar() - 
ticks = ax.xaxis.get_ticklocs() - tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4, 5])) - assert ax.get_xlim() == (-0.5, 5.5) - # check left-edge of bars - assert ax.patches[0].get_x() == -0.25 - assert ax.patches[-1].get_x() == 5.15 - - ax = df.plot.bar(stacked=True) - tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4, 5])) - assert ax.get_xlim() == (-0.5, 5.5) - assert ax.patches[0].get_x() == -0.25 - assert ax.patches[-1].get_x() == 4.75 - - @pytest.mark.slow - def test_plot_scatter(self): - df = DataFrame( - randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=["x", "y", "z", "four"], - ) - - _check_plot_works(df.plot.scatter, x="x", y="y") - _check_plot_works(df.plot.scatter, x=1, y=2) - - with pytest.raises(TypeError): - df.plot.scatter(x="x") - with pytest.raises(TypeError): - df.plot.scatter(y="y") - - # GH 6951 - axes = df.plot(x="x", y="y", kind="scatter", subplots=True) - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - - def test_raise_error_on_datetime_time_data(self): - # GH 8113, datetime.time type is not supported by matplotlib in scatter - df = pd.DataFrame(np.random.randn(10), columns=["a"]) - df["dtime"] = pd.date_range(start="2014-01-01", freq="h", periods=10).time - msg = "must be a string or a number, not 'datetime.time'" - - with pytest.raises(TypeError, match=msg): - df.plot(kind="scatter", x="dtime", y="a") - - def test_scatterplot_datetime_data(self): - # GH 30391 - dates = pd.date_range(start=date(2019, 1, 1), periods=12, freq="W") - vals = np.random.normal(0, 1, len(dates)) - df = pd.DataFrame({"dates": dates, "vals": vals}) - - _check_plot_works(df.plot.scatter, x="dates", y="vals") - _check_plot_works(df.plot.scatter, x=0, y=1) - - def test_scatterplot_object_data(self): - # GH 18755 - df = pd.DataFrame(dict(a=["A", "B", "C"], b=[2, 3, 4])) - - _check_plot_works(df.plot.scatter, x="a", y="b") - _check_plot_works(df.plot.scatter, x=0, y=1) - - df = pd.DataFrame(dict(a=["A", "B", "C"], b=["a", "b", 
"c"])) - - _check_plot_works(df.plot.scatter, x="a", y="b") - _check_plot_works(df.plot.scatter, x=0, y=1) - - @pytest.mark.slow - def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): - # addressing issue #10611, to ensure colobar does not - # interfere with x-axis label and ticklabels with - # ipython inline backend. - random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) - - ax1 = df.plot.scatter(x="A label", y="B label") - ax2 = df.plot.scatter(x="A label", y="B label", c="C label") - - vis1 = [vis.get_visible() for vis in ax1.xaxis.get_minorticklabels()] - vis2 = [vis.get_visible() for vis in ax2.xaxis.get_minorticklabels()] - assert vis1 == vis2 - - vis1 = [vis.get_visible() for vis in ax1.xaxis.get_majorticklabels()] - vis2 = [vis.get_visible() for vis in ax2.xaxis.get_majorticklabels()] - assert vis1 == vis2 - - assert ( - ax1.xaxis.get_label().get_visible() == ax2.xaxis.get_label().get_visible() - ) - - @pytest.mark.slow - def test_if_hexbin_xaxis_label_is_visible(self): - # addressing issue #10678, to ensure colobar does not - # interfere with x-axis label and ticklabels with - # ipython inline backend. 
- random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) - - ax = df.plot.hexbin("A label", "B label", gridsize=12) - assert all(vis.get_visible() for vis in ax.xaxis.get_minorticklabels()) - assert all(vis.get_visible() for vis in ax.xaxis.get_majorticklabels()) - assert ax.xaxis.get_label().get_visible() - - @pytest.mark.slow - def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): - import matplotlib.pyplot as plt - - random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) - - fig, axes = plt.subplots(1, 2) - df.plot.scatter("A label", "B label", c="C label", ax=axes[0]) - df.plot.scatter("A label", "B label", c="C label", ax=axes[1]) - plt.tight_layout() - - points = np.array([ax.get_position().get_points() for ax in fig.axes]) - axes_x_coords = points[:, :, 0] - parent_distance = axes_x_coords[1, :] - axes_x_coords[0, :] - colorbar_distance = axes_x_coords[3, :] - axes_x_coords[2, :] - assert np.isclose(parent_distance, colorbar_distance, atol=1e-7).all() - - @pytest.mark.parametrize("x, y", [("x", "y"), ("y", "x"), ("y", "y")]) - @pytest.mark.slow - def test_plot_scatter_with_categorical_data(self, x, y): - # after fixing GH 18755, should be able to plot categorical data - df = pd.DataFrame( - {"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])} - ) - - _check_plot_works(df.plot.scatter, x=x, y=y) - - @pytest.mark.slow - def test_plot_scatter_with_c(self): - df = DataFrame( - randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=["x", "y", "z", "four"], - ) - - axes = [df.plot.scatter(x="x", y="y", c="z"), df.plot.scatter(x=0, y=1, c=2)] - for ax in axes: - # default to Greys - assert ax.collections[0].cmap.name == "Greys" - - # n.b. 
there appears to be no public method - # to get the colorbar label - assert ax.collections[0].colorbar._label == "z" - - cm = "cubehelix" - ax = df.plot.scatter(x="x", y="y", c="z", colormap=cm) - assert ax.collections[0].cmap.name == cm - - # verify turning off colorbar works - ax = df.plot.scatter(x="x", y="y", c="z", colorbar=False) - assert ax.collections[0].colorbar is None - - # verify that we can still plot a solid color - ax = df.plot.scatter(x=0, y=1, c="red") - assert ax.collections[0].colorbar is None - self._check_colors(ax.collections, facecolors=["r"]) - - # Ensure that we can pass an np.array straight through to matplotlib, - # this functionality was accidentally removed previously. - # See https://github.com/pandas-dev/pandas/issues/8852 for bug report - # - # Exercise colormap path and non-colormap path as they are independent - # - df = DataFrame({"A": [1, 2], "B": [3, 4]}) - red_rgba = [1.0, 0.0, 0.0, 1.0] - green_rgba = [0.0, 1.0, 0.0, 1.0] - rgba_array = np.array([red_rgba, green_rgba]) - ax = df.plot.scatter(x="A", y="B", c=rgba_array) - # expect the face colors of the points in the non-colormap path to be - # identical to the values we supplied, normally we'd be on shaky ground - # comparing floats for equality but here we expect them to be - # identical. - tm.assert_numpy_array_equal(ax.collections[0].get_facecolor(), rgba_array) - # we don't test the colors of the faces in this next plot because they - # are dependent on the spring colormap, which may change its colors - # later. 
- float_array = np.array([0.0, 1.0]) - df.plot.scatter(x="A", y="B", c=float_array, cmap="spring") - - @pytest.mark.parametrize("cmap", [None, "Greys"]) - def test_scatter_with_c_column_name_with_colors(self, cmap): - # https://github.com/pandas-dev/pandas/issues/34316 - df = pd.DataFrame( - [[5.1, 3.5], [4.9, 3.0], [7.0, 3.2], [6.4, 3.2], [5.9, 3.0]], - columns=["length", "width"], - ) - df["species"] = ["r", "r", "g", "g", "b"] - ax = df.plot.scatter(x=0, y=1, c="species", cmap=cmap) - assert ax.collections[0].colorbar is None - - def test_plot_scatter_with_s(self): - # this refers to GH 32904 - df = DataFrame(np.random.random((10, 3)) * 100, columns=["a", "b", "c"],) - - ax = df.plot.scatter(x="a", y="b", s="c") - tm.assert_numpy_array_equal(df["c"].values, right=ax.collections[0].get_sizes()) - - def test_scatter_colors(self): - df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}) - with pytest.raises(TypeError): - df.plot.scatter(x="a", y="b", c="c", color="green") - - default_colors = self._unpack_cycler(self.plt.rcParams) - - ax = df.plot.scatter(x="a", y="b", c="c") - tm.assert_numpy_array_equal( - ax.collections[0].get_facecolor()[0], - np.array(self.colorconverter.to_rgba(default_colors[0])), - ) - - ax = df.plot.scatter(x="a", y="b", color="white") - tm.assert_numpy_array_equal( - ax.collections[0].get_facecolor()[0], - np.array([1, 1, 1, 1], dtype=np.float64), - ) - - def test_scatter_colorbar_different_cmap(self): - # GH 33389 - import matplotlib.pyplot as plt - - df = pd.DataFrame({"x": [1, 2, 3], "y": [1, 3, 2], "c": [1, 2, 3]}) - df["x2"] = df["x"] + 1 - - fig, ax = plt.subplots() - df.plot("x", "y", c="c", kind="scatter", cmap="cividis", ax=ax) - df.plot("x2", "y", c="c", kind="scatter", cmap="magma", ax=ax) - - assert ax.collections[0].cmap.name == "cividis" - assert ax.collections[1].cmap.name == "magma" - - @pytest.mark.slow - def test_plot_bar(self): - df = DataFrame( - randn(6, 4), - index=list(string.ascii_letters[:6]), - 
columns=["one", "two", "three", "four"], - ) - - _check_plot_works(df.plot.bar) - _check_plot_works(df.plot.bar, legend=False) - # _check_plot_works adds an ax so catch warning. see GH #13188 - with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.plot.bar, subplots=True) - _check_plot_works(df.plot.bar, stacked=True) - - df = DataFrame( - randn(10, 15), index=list(string.ascii_letters[:10]), columns=range(15) - ) - _check_plot_works(df.plot.bar) - - df = DataFrame({"a": [0, 1], "b": [1, 0]}) - ax = _check_plot_works(df.plot.bar) - self._check_ticks_props(ax, xrot=90) - - ax = df.plot.bar(rot=35, fontsize=10) - self._check_ticks_props(ax, xrot=35, xlabelsize=10, ylabelsize=10) - - ax = _check_plot_works(df.plot.barh) - self._check_ticks_props(ax, yrot=0) - - ax = df.plot.barh(rot=55, fontsize=11) - self._check_ticks_props(ax, yrot=55, ylabelsize=11, xlabelsize=11) - - def _check_bar_alignment( - self, - df, - kind="bar", - stacked=False, - subplots=False, - align="center", - width=0.5, - position=0.5, - ): - - axes = df.plot( - kind=kind, - stacked=stacked, - subplots=subplots, - align=align, - width=width, - position=position, - grid=True, - ) - - axes = self._flatten_visible(axes) - - for ax in axes: - if kind == "bar": - axis = ax.xaxis - ax_min, ax_max = ax.get_xlim() - min_edge = min(p.get_x() for p in ax.patches) - max_edge = max(p.get_x() + p.get_width() for p in ax.patches) - elif kind == "barh": - axis = ax.yaxis - ax_min, ax_max = ax.get_ylim() - min_edge = min(p.get_y() for p in ax.patches) - max_edge = max(p.get_y() + p.get_height() for p in ax.patches) - else: - raise ValueError - - # GH 7498 - # compare margins between lim and bar edges - tm.assert_almost_equal(ax_min, min_edge - 0.25) - tm.assert_almost_equal(ax_max, max_edge + 0.25) - - p = ax.patches[0] - if kind == "bar" and (stacked is True or subplots is True): - edge = p.get_x() - center = edge + p.get_width() * position - elif kind == "bar" and stacked is False: - center = 
p.get_x() + p.get_width() * len(df.columns) * position - edge = p.get_x() - elif kind == "barh" and (stacked is True or subplots is True): - center = p.get_y() + p.get_height() * position - edge = p.get_y() - elif kind == "barh" and stacked is False: - center = p.get_y() + p.get_height() * len(df.columns) * position - edge = p.get_y() - else: - raise ValueError - - # Check the ticks locates on integer - assert (axis.get_ticklocs() == np.arange(len(df))).all() - - if align == "center": - # Check whether the bar locates on center - tm.assert_almost_equal(axis.get_ticklocs()[0], center) - elif align == "edge": - # Check whether the bar's edge starts from the tick - tm.assert_almost_equal(axis.get_ticklocs()[0], edge) - else: - raise ValueError - - return axes - - @pytest.mark.slow - def test_bar_stacked_center(self): - # GH2157 - df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) - self._check_bar_alignment(df, kind="bar", stacked=True) - self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9) - self._check_bar_alignment(df, kind="barh", stacked=True) - self._check_bar_alignment(df, kind="barh", stacked=True, width=0.9) - - @pytest.mark.slow - def test_bar_center(self): - df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) - self._check_bar_alignment(df, kind="bar", stacked=False) - self._check_bar_alignment(df, kind="bar", stacked=False, width=0.9) - self._check_bar_alignment(df, kind="barh", stacked=False) - self._check_bar_alignment(df, kind="barh", stacked=False, width=0.9) - - @pytest.mark.slow - def test_bar_subplots_center(self): - df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) - self._check_bar_alignment(df, kind="bar", subplots=True) - self._check_bar_alignment(df, kind="bar", subplots=True, width=0.9) - self._check_bar_alignment(df, kind="barh", subplots=True) - self._check_bar_alignment(df, kind="barh", subplots=True, width=0.9) - - @pytest.mark.slow - def test_bar_align_single_column(self): - 
df = DataFrame(randn(5)) - self._check_bar_alignment(df, kind="bar", stacked=False) - self._check_bar_alignment(df, kind="bar", stacked=True) - self._check_bar_alignment(df, kind="barh", stacked=False) - self._check_bar_alignment(df, kind="barh", stacked=True) - self._check_bar_alignment(df, kind="bar", subplots=True) - self._check_bar_alignment(df, kind="barh", subplots=True) - - @pytest.mark.slow - def test_bar_edge(self): - df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) - - self._check_bar_alignment(df, kind="bar", stacked=True, align="edge") - self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9, align="edge") - self._check_bar_alignment(df, kind="barh", stacked=True, align="edge") - self._check_bar_alignment( - df, kind="barh", stacked=True, width=0.9, align="edge" - ) - - self._check_bar_alignment(df, kind="bar", stacked=False, align="edge") - self._check_bar_alignment( - df, kind="bar", stacked=False, width=0.9, align="edge" - ) - self._check_bar_alignment(df, kind="barh", stacked=False, align="edge") - self._check_bar_alignment( - df, kind="barh", stacked=False, width=0.9, align="edge" - ) - - self._check_bar_alignment(df, kind="bar", subplots=True, align="edge") - self._check_bar_alignment( - df, kind="bar", subplots=True, width=0.9, align="edge" - ) - self._check_bar_alignment(df, kind="barh", subplots=True, align="edge") - self._check_bar_alignment( - df, kind="barh", subplots=True, width=0.9, align="edge" - ) - - @pytest.mark.slow - def test_bar_log_no_subplots(self): - # GH3254, GH3298 matplotlib/matplotlib#1882, #1892 - # regressions in 1.2.1 - expected = np.array([0.1, 1.0, 10.0, 100]) - - # no subplots - df = DataFrame({"A": [3] * 5, "B": list(range(1, 6))}, index=range(5)) - ax = df.plot.bar(grid=True, log=True) - tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) - - @pytest.mark.slow - def test_bar_log_subplots(self): - expected = np.array([0.1, 1.0, 10.0, 100.0, 1000.0, 1e4]) - - ax = 
DataFrame([Series([200, 300]), Series([300, 500])]).plot.bar( - log=True, subplots=True - ) - - tm.assert_numpy_array_equal(ax[0].yaxis.get_ticklocs(), expected) - tm.assert_numpy_array_equal(ax[1].yaxis.get_ticklocs(), expected) - - @pytest.mark.slow - def test_boxplot(self): - df = self.hist_df - series = df["height"] - numeric_cols = df._get_numeric_data().columns - labels = [pprint_thing(c) for c in numeric_cols] - - ax = _check_plot_works(df.plot.box) - self._check_text_labels(ax.get_xticklabels(), labels) - tm.assert_numpy_array_equal( - ax.xaxis.get_ticklocs(), np.arange(1, len(numeric_cols) + 1) - ) - assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) - - axes = series.plot.box(rot=40) - self._check_ticks_props(axes, xrot=40, yrot=0) - tm.close() - - ax = _check_plot_works(series.plot.box) - - positions = np.array([1, 6, 7]) - ax = df.plot.box(positions=positions) - numeric_cols = df._get_numeric_data().columns - labels = [pprint_thing(c) for c in numeric_cols] - self._check_text_labels(ax.get_xticklabels(), labels) - tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions) - assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) - - @pytest.mark.slow - def test_boxplot_vertical(self): - df = self.hist_df - numeric_cols = df._get_numeric_data().columns - labels = [pprint_thing(c) for c in numeric_cols] - - # if horizontal, yticklabels are rotated - ax = df.plot.box(rot=50, fontsize=8, vert=False) - self._check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8) - self._check_text_labels(ax.get_yticklabels(), labels) - assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) - - # _check_plot_works adds an ax so catch warning. 
see GH #13188 - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot.box, subplots=True, vert=False, logx=True) - self._check_axes_shape(axes, axes_num=3, layout=(1, 3)) - self._check_ax_scales(axes, xaxis="log") - for ax, label in zip(axes, labels): - self._check_text_labels(ax.get_yticklabels(), [label]) - assert len(ax.lines) == self.bp_n_objects - - positions = np.array([3, 2, 8]) - ax = df.plot.box(positions=positions, vert=False) - self._check_text_labels(ax.get_yticklabels(), labels) - tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions) - assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) - - @pytest.mark.slow - def test_boxplot_return_type(self): - df = DataFrame( - randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=["one", "two", "three", "four"], - ) - with pytest.raises(ValueError): - df.plot.box(return_type="NOTATYPE") - - result = df.plot.box(return_type="dict") - self._check_box_return_type(result, "dict") - - result = df.plot.box(return_type="axes") - self._check_box_return_type(result, "axes") - - result = df.plot.box() # default axes - self._check_box_return_type(result, "axes") - - result = df.plot.box(return_type="both") - self._check_box_return_type(result, "both") - - @pytest.mark.slow - def test_boxplot_subplots_return_type(self): - df = self.hist_df - - # normal style: return_type=None - result = df.plot.box(subplots=True) - assert isinstance(result, Series) - self._check_box_return_type( - result, None, expected_keys=["height", "weight", "category"] - ) - - for t in ["dict", "axes", "both"]: - returned = df.plot.box(return_type=t, subplots=True) - self._check_box_return_type( - returned, - t, - expected_keys=["height", "weight", "category"], - check_ax_title=False, - ) - - @pytest.mark.slow - @td.skip_if_no_scipy - def test_kde_df(self): - df = DataFrame(randn(100, 4)) - ax = _check_plot_works(df.plot, kind="kde") - expected = [pprint_thing(c) for c in df.columns] - 
self._check_legend_labels(ax, labels=expected) - self._check_ticks_props(ax, xrot=0) - - ax = df.plot(kind="kde", rot=20, fontsize=5) - self._check_ticks_props(ax, xrot=20, xlabelsize=5, ylabelsize=5) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot, kind="kde", subplots=True) - self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - - axes = df.plot(kind="kde", logy=True, subplots=True) - self._check_ax_scales(axes, yaxis="log") - - @pytest.mark.slow - @td.skip_if_no_scipy - def test_kde_missing_vals(self): - df = DataFrame(np.random.uniform(size=(100, 4))) - df.loc[0, 0] = np.nan - _check_plot_works(df.plot, kind="kde") - - @pytest.mark.slow - def test_hist_df(self): - from matplotlib.patches import Rectangle - - df = DataFrame(randn(100, 4)) - series = df[0] - - ax = _check_plot_works(df.plot.hist) - expected = [pprint_thing(c) for c in df.columns] - self._check_legend_labels(ax, labels=expected) - - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot.hist, subplots=True, logy=True) - self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - self._check_ax_scales(axes, yaxis="log") - - axes = series.plot.hist(rot=40) - self._check_ticks_props(axes, xrot=40, yrot=0) - tm.close() - - ax = series.plot.hist(cumulative=True, bins=4, density=True) - # height of last bin (index 5) must be 1.0 - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] - tm.assert_almost_equal(rects[-1].get_height(), 1.0) - tm.close() - - ax = series.plot.hist(cumulative=True, bins=4) - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] - - tm.assert_almost_equal(rects[-2].get_height(), 100.0) - tm.close() - - # if horizontal, yticklabels are rotated - axes = df.plot.hist(rot=50, fontsize=8, orientation="horizontal") - self._check_ticks_props(axes, xrot=0, yrot=50, ylabelsize=8) - - @pytest.mark.parametrize( - "weights", [0.1 * np.ones(shape=(100,)), 0.1 * np.ones(shape=(100, 2))] - ) - def 
test_hist_weights(self, weights): - # GH 33173 - np.random.seed(0) - df = pd.DataFrame(dict(zip(["A", "B"], np.random.randn(2, 100,)))) - - ax1 = _check_plot_works(df.plot, kind="hist", weights=weights) - ax2 = _check_plot_works(df.plot, kind="hist") - - patch_height_with_weights = [patch.get_height() for patch in ax1.patches] - - # original heights with no weights, and we manually multiply with example - # weights, so after multiplication, they should be almost same - expected_patch_height = [0.1 * patch.get_height() for patch in ax2.patches] - - tm.assert_almost_equal(patch_height_with_weights, expected_patch_height) - - def _check_box_coord( - self, - patches, - expected_y=None, - expected_h=None, - expected_x=None, - expected_w=None, - ): - result_y = np.array([p.get_y() for p in patches]) - result_height = np.array([p.get_height() for p in patches]) - result_x = np.array([p.get_x() for p in patches]) - result_width = np.array([p.get_width() for p in patches]) - # dtype is depending on above values, no need to check - - if expected_y is not None: - tm.assert_numpy_array_equal(result_y, expected_y, check_dtype=False) - if expected_h is not None: - tm.assert_numpy_array_equal(result_height, expected_h, check_dtype=False) - if expected_x is not None: - tm.assert_numpy_array_equal(result_x, expected_x, check_dtype=False) - if expected_w is not None: - tm.assert_numpy_array_equal(result_width, expected_w, check_dtype=False) - - @pytest.mark.slow - def test_hist_df_coord(self): - normal_df = DataFrame( - { - "A": np.repeat(np.array([1, 2, 3, 4, 5]), np.array([10, 9, 8, 7, 6])), - "B": np.repeat(np.array([1, 2, 3, 4, 5]), np.array([8, 8, 8, 8, 8])), - "C": np.repeat(np.array([1, 2, 3, 4, 5]), np.array([6, 7, 8, 9, 10])), - }, - columns=["A", "B", "C"], - ) - - nan_df = DataFrame( - { - "A": np.repeat( - np.array([np.nan, 1, 2, 3, 4, 5]), np.array([3, 10, 9, 8, 7, 6]) - ), - "B": np.repeat( - np.array([1, np.nan, 2, 3, 4, 5]), np.array([8, 3, 8, 8, 8, 8]) - ), - "C": 
np.repeat( - np.array([1, 2, 3, np.nan, 4, 5]), np.array([6, 7, 8, 3, 9, 10]) - ), - }, - columns=["A", "B", "C"], - ) - - for df in [normal_df, nan_df]: - ax = df.plot.hist(bins=5) - self._check_box_coord( - ax.patches[:5], - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([10, 9, 8, 7, 6]), - ) - self._check_box_coord( - ax.patches[5:10], - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([8, 8, 8, 8, 8]), - ) - self._check_box_coord( - ax.patches[10:], - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([6, 7, 8, 9, 10]), - ) - - ax = df.plot.hist(bins=5, stacked=True) - self._check_box_coord( - ax.patches[:5], - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([10, 9, 8, 7, 6]), - ) - self._check_box_coord( - ax.patches[5:10], - expected_y=np.array([10, 9, 8, 7, 6]), - expected_h=np.array([8, 8, 8, 8, 8]), - ) - self._check_box_coord( - ax.patches[10:], - expected_y=np.array([18, 17, 16, 15, 14]), - expected_h=np.array([6, 7, 8, 9, 10]), - ) - - axes = df.plot.hist(bins=5, stacked=True, subplots=True) - self._check_box_coord( - axes[0].patches, - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([10, 9, 8, 7, 6]), - ) - self._check_box_coord( - axes[1].patches, - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([8, 8, 8, 8, 8]), - ) - self._check_box_coord( - axes[2].patches, - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([6, 7, 8, 9, 10]), - ) - - # horizontal - ax = df.plot.hist(bins=5, orientation="horizontal") - self._check_box_coord( - ax.patches[:5], - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([10, 9, 8, 7, 6]), - ) - self._check_box_coord( - ax.patches[5:10], - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([8, 8, 8, 8, 8]), - ) - self._check_box_coord( - ax.patches[10:], - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([6, 7, 8, 9, 10]), - ) - - ax = df.plot.hist(bins=5, stacked=True, orientation="horizontal") - 
self._check_box_coord( - ax.patches[:5], - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([10, 9, 8, 7, 6]), - ) - self._check_box_coord( - ax.patches[5:10], - expected_x=np.array([10, 9, 8, 7, 6]), - expected_w=np.array([8, 8, 8, 8, 8]), - ) - self._check_box_coord( - ax.patches[10:], - expected_x=np.array([18, 17, 16, 15, 14]), - expected_w=np.array([6, 7, 8, 9, 10]), - ) - - axes = df.plot.hist( - bins=5, stacked=True, subplots=True, orientation="horizontal" - ) - self._check_box_coord( - axes[0].patches, - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([10, 9, 8, 7, 6]), - ) - self._check_box_coord( - axes[1].patches, - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([8, 8, 8, 8, 8]), - ) - self._check_box_coord( - axes[2].patches, - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([6, 7, 8, 9, 10]), - ) - - @pytest.mark.slow - def test_plot_int_columns(self): - df = DataFrame(randn(100, 4)).cumsum() - _check_plot_works(df.plot, legend=True) - - @pytest.mark.slow - def test_df_legend_labels(self): - kinds = ["line", "bar", "barh", "kde", "area", "hist"] - df = DataFrame(rand(3, 3), columns=["a", "b", "c"]) - df2 = DataFrame(rand(3, 3), columns=["d", "e", "f"]) - df3 = DataFrame(rand(3, 3), columns=["g", "h", "i"]) - df4 = DataFrame(rand(3, 3), columns=["j", "k", "l"]) - - for kind in kinds: - - ax = df.plot(kind=kind, legend=True) - self._check_legend_labels(ax, labels=df.columns) - - ax = df2.plot(kind=kind, legend=False, ax=ax) - self._check_legend_labels(ax, labels=df.columns) - - ax = df3.plot(kind=kind, legend=True, ax=ax) - self._check_legend_labels(ax, labels=df.columns.union(df3.columns)) - - ax = df4.plot(kind=kind, legend="reverse", ax=ax) - expected = list(df.columns.union(df3.columns)) + list(reversed(df4.columns)) - self._check_legend_labels(ax, labels=expected) - - # Secondary Y - ax = df.plot(legend=True, secondary_y="b") - self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) - ax = 
df2.plot(legend=False, ax=ax) - self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) - ax = df3.plot(kind="bar", legend=True, secondary_y="h", ax=ax) - self._check_legend_labels( - ax, labels=["a", "b (right)", "c", "g", "h (right)", "i"] - ) - - # Time Series - ind = date_range("1/1/2014", periods=3) - df = DataFrame(randn(3, 3), columns=["a", "b", "c"], index=ind) - df2 = DataFrame(randn(3, 3), columns=["d", "e", "f"], index=ind) - df3 = DataFrame(randn(3, 3), columns=["g", "h", "i"], index=ind) - ax = df.plot(legend=True, secondary_y="b") - self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) - ax = df2.plot(legend=False, ax=ax) - self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) - ax = df3.plot(legend=True, ax=ax) - self._check_legend_labels(ax, labels=["a", "b (right)", "c", "g", "h", "i"]) - - # scatter - ax = df.plot.scatter(x="a", y="b", label="data1") - self._check_legend_labels(ax, labels=["data1"]) - ax = df2.plot.scatter(x="d", y="e", legend=False, label="data2", ax=ax) - self._check_legend_labels(ax, labels=["data1"]) - ax = df3.plot.scatter(x="g", y="h", label="data3", ax=ax) - self._check_legend_labels(ax, labels=["data1", "data3"]) - - # ensure label args pass through and - # index name does not mutate - # column names don't mutate - df5 = df.set_index("a") - ax = df5.plot(y="b") - self._check_legend_labels(ax, labels=["b"]) - ax = df5.plot(y="b", label="LABEL_b") - self._check_legend_labels(ax, labels=["LABEL_b"]) - self._check_text_labels(ax.xaxis.get_label(), "a") - ax = df5.plot(y="c", label="LABEL_c", ax=ax) - self._check_legend_labels(ax, labels=["LABEL_b", "LABEL_c"]) - assert df5.columns.tolist() == ["b", "c"] - - def test_missing_marker_multi_plots_on_same_ax(self): - # GH 18222 - df = pd.DataFrame( - data=[[1, 1, 1, 1], [2, 2, 4, 8]], columns=["x", "r", "g", "b"] - ) - fig, ax = self.plt.subplots(nrows=1, ncols=3) - # Left plot - df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[0]) - 
df.plot(x="x", y="g", linewidth=1, marker="x", color="g", ax=ax[0]) - df.plot(x="x", y="b", linewidth=1, marker="o", color="b", ax=ax[0]) - self._check_legend_labels(ax[0], labels=["r", "g", "b"]) - self._check_legend_marker(ax[0], expected_markers=["o", "x", "o"]) - # Center plot - df.plot(x="x", y="b", linewidth=1, marker="o", color="b", ax=ax[1]) - df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[1]) - df.plot(x="x", y="g", linewidth=1, marker="x", color="g", ax=ax[1]) - self._check_legend_labels(ax[1], labels=["b", "r", "g"]) - self._check_legend_marker(ax[1], expected_markers=["o", "o", "x"]) - # Right plot - df.plot(x="x", y="g", linewidth=1, marker="x", color="g", ax=ax[2]) - df.plot(x="x", y="b", linewidth=1, marker="o", color="b", ax=ax[2]) - df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[2]) - self._check_legend_labels(ax[2], labels=["g", "b", "r"]) - self._check_legend_marker(ax[2], expected_markers=["x", "o", "o"]) - - def test_legend_name(self): - multi = DataFrame( - randn(4, 4), - columns=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])], - ) - multi.columns.names = ["group", "individual"] - - ax = multi.plot() - leg_title = ax.legend_.get_title() - self._check_text_labels(leg_title, "group,individual") - - df = DataFrame(randn(5, 5)) - ax = df.plot(legend=True, ax=ax) - leg_title = ax.legend_.get_title() - self._check_text_labels(leg_title, "group,individual") - - df.columns.name = "new" - ax = df.plot(legend=False, ax=ax) - leg_title = ax.legend_.get_title() - self._check_text_labels(leg_title, "group,individual") - - ax = df.plot(legend=True, ax=ax) - leg_title = ax.legend_.get_title() - self._check_text_labels(leg_title, "new") - - @pytest.mark.slow - def test_no_legend(self): - kinds = ["line", "bar", "barh", "kde", "area", "hist"] - df = DataFrame(rand(3, 3), columns=["a", "b", "c"]) - - for kind in kinds: - - ax = df.plot(kind=kind, legend=False) - self._check_legend_labels(ax, visible=False) - - 
@pytest.mark.slow - def test_style_by_column(self): - import matplotlib.pyplot as plt - - fig = plt.gcf() - - df = DataFrame(randn(100, 3)) - for markers in [ - {0: "^", 1: "+", 2: "o"}, - {0: "^", 1: "+"}, - ["^", "+", "o"], - ["^", "+"], - ]: - fig.clf() - fig.add_subplot(111) - ax = df.plot(style=markers) - for i, l in enumerate(ax.get_lines()[: len(markers)]): - assert l.get_marker() == markers[i] - - @pytest.mark.slow - def test_line_label_none(self): - s = Series([1, 2]) - ax = s.plot() - assert ax.get_legend() is None - - ax = s.plot(legend=True) - assert ax.get_legend().get_texts()[0].get_text() == "None" - - @pytest.mark.slow - def test_line_colors(self): - from matplotlib import cm - - custom_colors = "rgcby" - df = DataFrame(randn(5, 5)) - - ax = df.plot(color=custom_colors) - self._check_colors(ax.get_lines(), linecolors=custom_colors) - - tm.close() - - ax2 = df.plot(color=custom_colors) - lines2 = ax2.get_lines() - - for l1, l2 in zip(ax.get_lines(), lines2): - assert l1.get_color() == l2.get_color() - - tm.close() - - ax = df.plot(colormap="jet") - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - self._check_colors(ax.get_lines(), linecolors=rgba_colors) - tm.close() - - ax = df.plot(colormap=cm.jet) - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - self._check_colors(ax.get_lines(), linecolors=rgba_colors) - tm.close() - - # make color a list if plotting one column frame - # handles cases like df.plot(color='DodgerBlue') - ax = df.loc[:, [0]].plot(color="DodgerBlue") - self._check_colors(ax.lines, linecolors=["DodgerBlue"]) - - ax = df.plot(color="red") - self._check_colors(ax.get_lines(), linecolors=["red"] * 5) - tm.close() - - # GH 10299 - custom_colors = ["#FF0000", "#0000FF", "#FFFF00", "#000000", "#FFFFFF"] - ax = df.plot(color=custom_colors) - self._check_colors(ax.get_lines(), linecolors=custom_colors) - tm.close() - - @pytest.mark.slow - def test_dont_modify_colors(self): - colors = ["r", "g", "b"] - 
pd.DataFrame(np.random.rand(10, 2)).plot(color=colors) - assert len(colors) == 3 - - @pytest.mark.slow - def test_line_colors_and_styles_subplots(self): - # GH 9894 - from matplotlib import cm - - default_colors = self._unpack_cycler(self.plt.rcParams) - - df = DataFrame(randn(5, 5)) - - axes = df.plot(subplots=True) - for ax, c in zip(axes, list(default_colors)): - c = [c] - self._check_colors(ax.get_lines(), linecolors=c) - tm.close() - - # single color char - axes = df.plot(subplots=True, color="k") - for ax in axes: - self._check_colors(ax.get_lines(), linecolors=["k"]) - tm.close() - - # single color str - axes = df.plot(subplots=True, color="green") - for ax in axes: - self._check_colors(ax.get_lines(), linecolors=["green"]) - tm.close() - - custom_colors = "rgcby" - axes = df.plot(color=custom_colors, subplots=True) - for ax, c in zip(axes, list(custom_colors)): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - axes = df.plot(color=list(custom_colors), subplots=True) - for ax, c in zip(axes, list(custom_colors)): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - # GH 10299 - custom_colors = ["#FF0000", "#0000FF", "#FFFF00", "#000000", "#FFFFFF"] - axes = df.plot(color=custom_colors, subplots=True) - for ax, c in zip(axes, list(custom_colors)): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - for cmap in ["jet", cm.jet]: - axes = df.plot(colormap=cmap, subplots=True) - for ax, c in zip(axes, rgba_colors): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - # make color a list if plotting one column frame - # handles cases like df.plot(color='DodgerBlue') - axes = df.loc[:, [0]].plot(color="DodgerBlue", subplots=True) - self._check_colors(axes[0].lines, linecolors=["DodgerBlue"]) - - # single character style - axes = df.plot(style="r", subplots=True) - for ax in axes: - self._check_colors(ax.get_lines(), 
linecolors=["r"]) - tm.close() - - # list of styles - styles = list("rgcby") - axes = df.plot(style=styles, subplots=True) - for ax, c in zip(axes, styles): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - @pytest.mark.slow - def test_area_colors(self): - from matplotlib import cm - from matplotlib.collections import PolyCollection - - custom_colors = "rgcby" - df = DataFrame(rand(5, 5)) - - ax = df.plot.area(color=custom_colors) - self._check_colors(ax.get_lines(), linecolors=custom_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] - self._check_colors(poly, facecolors=custom_colors) - - handles, labels = ax.get_legend_handles_labels() - self._check_colors(handles, facecolors=custom_colors) - - for h in handles: - assert h.get_alpha() is None - tm.close() - - ax = df.plot.area(colormap="jet") - jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - self._check_colors(ax.get_lines(), linecolors=jet_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] - self._check_colors(poly, facecolors=jet_colors) - - handles, labels = ax.get_legend_handles_labels() - self._check_colors(handles, facecolors=jet_colors) - for h in handles: - assert h.get_alpha() is None - tm.close() - - # When stacked=False, alpha is set to 0.5 - ax = df.plot.area(colormap=cm.jet, stacked=False) - self._check_colors(ax.get_lines(), linecolors=jet_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] - jet_with_alpha = [(c[0], c[1], c[2], 0.5) for c in jet_colors] - self._check_colors(poly, facecolors=jet_with_alpha) - - handles, labels = ax.get_legend_handles_labels() - linecolors = jet_with_alpha - self._check_colors(handles[: len(jet_colors)], linecolors=linecolors) - for h in handles: - assert h.get_alpha() == 0.5 - - @pytest.mark.slow - def test_hist_colors(self): - default_colors = self._unpack_cycler(self.plt.rcParams) - - df = DataFrame(randn(5, 5)) - ax = df.plot.hist() - 
self._check_colors(ax.patches[::10], facecolors=default_colors[:5]) - tm.close() - - custom_colors = "rgcby" - ax = df.plot.hist(color=custom_colors) - self._check_colors(ax.patches[::10], facecolors=custom_colors) - tm.close() - - from matplotlib import cm - - # Test str -> colormap functionality - ax = df.plot.hist(colormap="jet") - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] - self._check_colors(ax.patches[::10], facecolors=rgba_colors) - tm.close() - - # Test colormap functionality - ax = df.plot.hist(colormap=cm.jet) - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] - self._check_colors(ax.patches[::10], facecolors=rgba_colors) - tm.close() - - ax = df.loc[:, [0]].plot.hist(color="DodgerBlue") - self._check_colors([ax.patches[0]], facecolors=["DodgerBlue"]) - - ax = df.plot(kind="hist", color="green") - self._check_colors(ax.patches[::10], facecolors=["green"] * 5) - tm.close() - - @pytest.mark.slow - @td.skip_if_no_scipy - def test_kde_colors(self): - from matplotlib import cm - - custom_colors = "rgcby" - df = DataFrame(rand(5, 5)) - - ax = df.plot.kde(color=custom_colors) - self._check_colors(ax.get_lines(), linecolors=custom_colors) - tm.close() - - ax = df.plot.kde(colormap="jet") - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - self._check_colors(ax.get_lines(), linecolors=rgba_colors) - tm.close() - - ax = df.plot.kde(colormap=cm.jet) - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - self._check_colors(ax.get_lines(), linecolors=rgba_colors) - - @pytest.mark.slow - @td.skip_if_no_scipy - def test_kde_colors_and_styles_subplots(self): - from matplotlib import cm - - default_colors = self._unpack_cycler(self.plt.rcParams) - - df = DataFrame(randn(5, 5)) - - axes = df.plot(kind="kde", subplots=True) - for ax, c in zip(axes, list(default_colors)): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - # single color char - axes = df.plot(kind="kde", color="k", subplots=True) - for ax in 
axes: - self._check_colors(ax.get_lines(), linecolors=["k"]) - tm.close() - - # single color str - axes = df.plot(kind="kde", color="red", subplots=True) - for ax in axes: - self._check_colors(ax.get_lines(), linecolors=["red"]) - tm.close() - - custom_colors = "rgcby" - axes = df.plot(kind="kde", color=custom_colors, subplots=True) - for ax, c in zip(axes, list(custom_colors)): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - for cmap in ["jet", cm.jet]: - axes = df.plot(kind="kde", colormap=cmap, subplots=True) - for ax, c in zip(axes, rgba_colors): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - # make color a list if plotting one column frame - # handles cases like df.plot(color='DodgerBlue') - axes = df.loc[:, [0]].plot(kind="kde", color="DodgerBlue", subplots=True) - self._check_colors(axes[0].lines, linecolors=["DodgerBlue"]) - - # single character style - axes = df.plot(kind="kde", style="r", subplots=True) - for ax in axes: - self._check_colors(ax.get_lines(), linecolors=["r"]) - tm.close() - - # list of styles - styles = list("rgcby") - axes = df.plot(kind="kde", style=styles, subplots=True) - for ax, c in zip(axes, styles): - self._check_colors(ax.get_lines(), linecolors=[c]) - tm.close() - - @pytest.mark.slow - def test_boxplot_colors(self): - def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): - # TODO: outside this func? 
- if fliers_c is None: - fliers_c = "k" - self._check_colors(bp["boxes"], linecolors=[box_c] * len(bp["boxes"])) - self._check_colors( - bp["whiskers"], linecolors=[whiskers_c] * len(bp["whiskers"]) - ) - self._check_colors( - bp["medians"], linecolors=[medians_c] * len(bp["medians"]) - ) - self._check_colors(bp["fliers"], linecolors=[fliers_c] * len(bp["fliers"])) - self._check_colors(bp["caps"], linecolors=[caps_c] * len(bp["caps"])) - - default_colors = self._unpack_cycler(self.plt.rcParams) - - df = DataFrame(randn(5, 5)) - bp = df.plot.box(return_type="dict") - _check_colors(bp, default_colors[0], default_colors[0], default_colors[2]) - tm.close() - - dict_colors = dict( - boxes="#572923", whiskers="#982042", medians="#804823", caps="#123456" - ) - bp = df.plot.box(color=dict_colors, sym="r+", return_type="dict") - _check_colors( - bp, - dict_colors["boxes"], - dict_colors["whiskers"], - dict_colors["medians"], - dict_colors["caps"], - "r", - ) - tm.close() - - # partial colors - dict_colors = dict(whiskers="c", medians="m") - bp = df.plot.box(color=dict_colors, return_type="dict") - _check_colors(bp, default_colors[0], "c", "m") - tm.close() - - from matplotlib import cm - - # Test str -> colormap functionality - bp = df.plot.box(colormap="jet", return_type="dict") - jet_colors = [cm.jet(n) for n in np.linspace(0, 1, 3)] - _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) - tm.close() - - # Test colormap functionality - bp = df.plot.box(colormap=cm.jet, return_type="dict") - _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) - tm.close() - - # string color is applied to all artists except fliers - bp = df.plot.box(color="DodgerBlue", return_type="dict") - _check_colors(bp, "DodgerBlue", "DodgerBlue", "DodgerBlue", "DodgerBlue") - - # tuple is also applied to all artists except fliers - bp = df.plot.box(color=(0, 1, 0), sym="#123456", return_type="dict") - _check_colors(bp, (0, 1, 0), (0, 1, 0), (0, 1, 0), (0, 1, 0), "#123456") - - 
with pytest.raises(ValueError): - # Color contains invalid key results in ValueError - df.plot.box(color=dict(boxes="red", xxxx="blue")) - - @pytest.mark.parametrize( - "props, expected", - [ - ("boxprops", "boxes"), - ("whiskerprops", "whiskers"), - ("capprops", "caps"), - ("medianprops", "medians"), - ], - ) - def test_specified_props_kwd_plot_box(self, props, expected): - # GH 30346 - df = DataFrame({k: np.random.random(100) for k in "ABC"}) - kwd = {props: dict(color="C1")} - result = df.plot.box(return_type="dict", **kwd) - - assert result[expected][0].get_color() == "C1" - - def test_default_color_cycle(self): - import matplotlib.pyplot as plt - import cycler - - colors = list("rgbk") - plt.rcParams["axes.prop_cycle"] = cycler.cycler("color", colors) - - df = DataFrame(randn(5, 3)) - ax = df.plot() - - expected = self._unpack_cycler(plt.rcParams)[:3] - self._check_colors(ax.get_lines(), linecolors=expected) - - def test_unordered_ts(self): - df = DataFrame( - np.array([3.0, 2.0, 1.0]), - index=[date(2012, 10, 1), date(2012, 9, 1), date(2012, 8, 1)], - columns=["test"], - ) - ax = df.plot() - xticks = ax.lines[0].get_xdata() - assert xticks[0] < xticks[1] - ydata = ax.lines[0].get_ydata() - tm.assert_numpy_array_equal(ydata, np.array([1.0, 2.0, 3.0])) - - @td.skip_if_no_scipy - def test_kind_both_ways(self): - df = DataFrame({"x": [1, 2, 3]}) - for kind in plotting.PlotAccessor._common_kinds: - - df.plot(kind=kind) - getattr(df.plot, kind)() - for kind in ["scatter", "hexbin"]: - df.plot("x", "x", kind=kind) - getattr(df.plot, kind)("x", "x") - - def test_all_invalid_plot_data(self): - df = DataFrame(list("abcd")) - for kind in plotting.PlotAccessor._common_kinds: - - msg = "no numeric data to plot" - with pytest.raises(TypeError, match=msg): - df.plot(kind=kind) - - @pytest.mark.slow - def test_partially_invalid_plot_data(self): - with tm.RNGContext(42): - df = DataFrame(randn(10, 2), dtype=object) - df[np.random.rand(df.shape[0]) > 0.5] = "a" - for kind in 
plotting.PlotAccessor._common_kinds: - - msg = "no numeric data to plot" - with pytest.raises(TypeError, match=msg): - df.plot(kind=kind) - - with tm.RNGContext(42): - # area plot doesn't support positive/negative mixed data - kinds = ["area"] - df = DataFrame(rand(10, 2), dtype=object) - df[np.random.rand(df.shape[0]) > 0.5] = "a" - for kind in kinds: - with pytest.raises(TypeError): - df.plot(kind=kind) - - def test_invalid_kind(self): - df = DataFrame(randn(10, 2)) - with pytest.raises(ValueError): - df.plot(kind="aasdf") - - @pytest.mark.parametrize( - "x,y,lbl", - [ - (["B", "C"], "A", "a"), - (["A"], ["B", "C"], ["b", "c"]), - ("A", ["B", "C"], "badlabel"), - ], - ) - def test_invalid_xy_args(self, x, y, lbl): - # GH 18671, 19699 allows y to be list-like but not x - df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) - with pytest.raises(ValueError): - df.plot(x=x, y=y, label=lbl) - - @pytest.mark.parametrize("x,y", [("A", "B"), (["A"], "B")]) - def test_invalid_xy_args_dup_cols(self, x, y): - # GH 18671, 19699 allows y to be list-like but not x - df = DataFrame([[1, 3, 5], [2, 4, 6]], columns=list("AAB")) - with pytest.raises(ValueError): - df.plot(x=x, y=y) - - @pytest.mark.parametrize( - "x,y,lbl,colors", - [ - ("A", ["B"], ["b"], ["red"]), - ("A", ["B", "C"], ["b", "c"], ["red", "blue"]), - (0, [1, 2], ["bokeh", "cython"], ["green", "yellow"]), - ], - ) - def test_y_listlike(self, x, y, lbl, colors): - # GH 19699: tests list-like y and verifies lbls & colors - df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) - _check_plot_works(df.plot, x="A", y=y, label=lbl) - - ax = df.plot(x=x, y=y, label=lbl, color=colors) - assert len(ax.lines) == len(y) - self._check_colors(ax.get_lines(), linecolors=colors) - - @pytest.mark.parametrize("x,y,colnames", [(0, 1, ["A", "B"]), (1, 0, [0, 1])]) - def test_xy_args_integer(self, x, y, colnames): - # GH 20056: tests integer args for xy and checks col names - df = DataFrame({"A": [1, 2], "B": [3, 4]}) - df.columns 
= colnames - _check_plot_works(df.plot, x=x, y=y) - - @pytest.mark.slow - def test_hexbin_basic(self): - df = self.hexbin_df - - ax = df.plot.hexbin(x="A", y="B", gridsize=10) - # TODO: need better way to test. This just does existence. - assert len(ax.collections) == 1 - - # GH 6951 - axes = df.plot.hexbin(x="A", y="B", subplots=True) - # hexbin should have 2 axes in the figure, 1 for plotting and another - # is colorbar - assert len(axes[0].figure.axes) == 2 - # return value is single axes - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - - @pytest.mark.slow - def test_hexbin_with_c(self): - df = self.hexbin_df - - ax = df.plot.hexbin(x="A", y="B", C="C") - assert len(ax.collections) == 1 - - ax = df.plot.hexbin(x="A", y="B", C="C", reduce_C_function=np.std) - assert len(ax.collections) == 1 - - @pytest.mark.slow - def test_hexbin_cmap(self): - df = self.hexbin_df - - # Default to BuGn - ax = df.plot.hexbin(x="A", y="B") - assert ax.collections[0].cmap.name == "BuGn" - - cm = "cubehelix" - ax = df.plot.hexbin(x="A", y="B", colormap=cm) - assert ax.collections[0].cmap.name == cm - - @pytest.mark.slow - def test_no_color_bar(self): - df = self.hexbin_df - - ax = df.plot.hexbin(x="A", y="B", colorbar=None) - assert ax.collections[0].colorbar is None - - @pytest.mark.slow - def test_allow_cmap(self): - df = self.hexbin_df - - ax = df.plot.hexbin(x="A", y="B", cmap="YlGn") - assert ax.collections[0].cmap.name == "YlGn" - - with pytest.raises(TypeError): - df.plot.hexbin(x="A", y="B", cmap="YlGn", colormap="BuGn") - - @pytest.mark.slow - def test_pie_df(self): - df = DataFrame( - np.random.rand(5, 3), - columns=["X", "Y", "Z"], - index=["a", "b", "c", "d", "e"], - ) - with pytest.raises(ValueError): - df.plot.pie() - - ax = _check_plot_works(df.plot.pie, y="Y") - self._check_text_labels(ax.texts, df.index) - - ax = _check_plot_works(df.plot.pie, y=2) - self._check_text_labels(ax.texts, df.index) - - # _check_plot_works adds an ax so catch warning. 
see GH #13188 - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot.pie, subplots=True) - assert len(axes) == len(df.columns) - for ax in axes: - self._check_text_labels(ax.texts, df.index) - for ax, ylabel in zip(axes, df.columns): - assert ax.get_ylabel() == ylabel - - labels = ["A", "B", "C", "D", "E"] - color_args = ["r", "g", "b", "c", "m"] - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.plot.pie, subplots=True, labels=labels, colors=color_args - ) - assert len(axes) == len(df.columns) - - for ax in axes: - self._check_text_labels(ax.texts, labels) - self._check_colors(ax.patches, facecolors=color_args) - - def test_pie_df_nan(self): - df = DataFrame(np.random.rand(4, 4)) - for i in range(4): - df.iloc[i, i] = np.nan - fig, axes = self.plt.subplots(ncols=4) - df.plot.pie(subplots=True, ax=axes, legend=True) - - base_expected = ["0", "1", "2", "3"] - for i, ax in enumerate(axes): - expected = list(base_expected) # force copy - expected[i] = "" - result = [x.get_text() for x in ax.texts] - assert result == expected - # legend labels - # NaN's not included in legend with subplots - # see https://github.com/pandas-dev/pandas/issues/8390 - assert [x.get_text() for x in ax.get_legend().get_texts()] == base_expected[ - :i - ] + base_expected[i + 1 :] - - @pytest.mark.slow - def test_errorbar_plot(self): - with warnings.catch_warnings(): - d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} - df = DataFrame(d) - d_err = {"x": np.ones(12) * 0.2, "y": np.ones(12) * 0.4} - df_err = DataFrame(d_err) - - # check line plots - ax = _check_plot_works(df.plot, yerr=df_err, logy=True) - self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(df.plot, yerr=df_err, logx=True, logy=True) - self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(df.plot, yerr=df_err, loglog=True) - self._check_has_errorbars(ax, xerr=0, yerr=2) - - kinds = ["line", "bar", "barh"] - for kind in kinds: - ax = 
_check_plot_works(df.plot, yerr=df_err["x"], kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(df.plot, yerr=d_err, kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(df.plot, yerr=df_err, xerr=df_err, kind=kind) - self._check_has_errorbars(ax, xerr=2, yerr=2) - ax = _check_plot_works( - df.plot, yerr=df_err["x"], xerr=df_err["x"], kind=kind - ) - self._check_has_errorbars(ax, xerr=2, yerr=2) - ax = _check_plot_works(df.plot, xerr=0.2, yerr=0.2, kind=kind) - self._check_has_errorbars(ax, xerr=2, yerr=2) - - # _check_plot_works adds an ax so catch warning. see GH #13188 - axes = _check_plot_works( - df.plot, yerr=df_err, xerr=df_err, subplots=True, kind=kind - ) - self._check_has_errorbars(axes, xerr=1, yerr=1) - - ax = _check_plot_works( - (df + 1).plot, yerr=df_err, xerr=df_err, kind="bar", log=True - ) - self._check_has_errorbars(ax, xerr=2, yerr=2) - - # yerr is raw error values - ax = _check_plot_works(df["y"].plot, yerr=np.ones(12) * 0.4) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(df.plot, yerr=np.ones((2, 12)) * 0.4) - self._check_has_errorbars(ax, xerr=0, yerr=2) - - # yerr is column name - for yerr in ["yerr", "誤差"]: - s_df = df.copy() - s_df[yerr] = np.ones(12) * 0.2 - ax = _check_plot_works(s_df.plot, yerr=yerr) - self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(s_df.plot, y="y", x="x", yerr=yerr) - self._check_has_errorbars(ax, xerr=0, yerr=1) - - with pytest.raises(ValueError): - df.plot(yerr=np.random.randn(11)) - - df_err = DataFrame({"x": ["zzz"] * 12, "y": ["zzz"] * 12}) - with pytest.raises((ValueError, TypeError)): - df.plot(yerr=df_err) - - @pytest.mark.xfail(reason="Iterator is consumed", raises=ValueError) - @pytest.mark.slow - def test_errorbar_plot_iterator(self): - with warnings.catch_warnings(): - d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} - df = DataFrame(d) - - # yerr is iterator - ax = _check_plot_works(df.plot, 
yerr=itertools.repeat(0.1, len(df))) - self._check_has_errorbars(ax, xerr=0, yerr=2) - - @pytest.mark.slow - def test_errorbar_with_integer_column_names(self): - # test with integer column names - df = DataFrame(np.random.randn(10, 2)) - df_err = DataFrame(np.random.randn(10, 2)) - ax = _check_plot_works(df.plot, yerr=df_err) - self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(df.plot, y=0, yerr=1) - self._check_has_errorbars(ax, xerr=0, yerr=1) - - @pytest.mark.slow - def test_errorbar_with_partial_columns(self): - df = DataFrame(np.random.randn(10, 3)) - df_err = DataFrame(np.random.randn(10, 2), columns=[0, 2]) - kinds = ["line", "bar"] - for kind in kinds: - ax = _check_plot_works(df.plot, yerr=df_err, kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=2) - - ix = date_range("1/1/2000", periods=10, freq="M") - df.set_index(ix, inplace=True) - df_err.set_index(ix, inplace=True) - ax = _check_plot_works(df.plot, yerr=df_err, kind="line") - self._check_has_errorbars(ax, xerr=0, yerr=2) - - d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} - df = DataFrame(d) - d_err = {"x": np.ones(12) * 0.2, "z": np.ones(12) * 0.4} - df_err = DataFrame(d_err) - for err in [d_err, df_err]: - ax = _check_plot_works(df.plot, yerr=err) - self._check_has_errorbars(ax, xerr=0, yerr=1) - - @pytest.mark.slow - def test_errorbar_timeseries(self): - - with warnings.catch_warnings(): - d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} - d_err = {"x": np.ones(12) * 0.2, "y": np.ones(12) * 0.4} - - # check time-series plots - ix = date_range("1/1/2000", "1/1/2001", freq="M") - tdf = DataFrame(d, index=ix) - tdf_err = DataFrame(d_err, index=ix) - - kinds = ["line", "bar", "barh"] - for kind in kinds: - ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(tdf.plot, yerr=d_err, kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(tdf.plot, y="y", 
yerr=tdf_err["x"], kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(tdf.plot, y="y", yerr="x", kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=2) - - # _check_plot_works adds an ax so catch warning. see GH #13188 - axes = _check_plot_works( - tdf.plot, kind=kind, yerr=tdf_err, subplots=True - ) - self._check_has_errorbars(axes, xerr=0, yerr=1) - - def test_errorbar_asymmetrical(self): - - np.random.seed(0) - err = np.random.rand(3, 2, 5) - - # each column is [0, 1, 2, 3, 4], [3, 4, 5, 6, 7]... - df = DataFrame(np.arange(15).reshape(3, 5)).T - - ax = df.plot(yerr=err, xerr=err / 2) - - yerr_0_0 = ax.collections[1].get_paths()[0].vertices[:, 1] - expected_0_0 = err[0, :, 0] * np.array([-1, 1]) - tm.assert_almost_equal(yerr_0_0, expected_0_0) - - with pytest.raises(ValueError): - df.plot(yerr=err.T) - - tm.close() - - def test_table(self): - df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - _check_plot_works(df.plot, table=True) - _check_plot_works(df.plot, table=df) - - ax = df.plot() - assert len(ax.tables) == 0 - plotting.table(ax, df.T) - assert len(ax.tables) == 1 - - def test_errorbar_scatter(self): - df = DataFrame(np.random.randn(5, 2), index=range(5), columns=["x", "y"]) - df_err = DataFrame( - np.random.randn(5, 2) / 5, index=range(5), columns=["x", "y"] - ) - - ax = _check_plot_works(df.plot.scatter, x="x", y="y") - self._check_has_errorbars(ax, xerr=0, yerr=0) - ax = _check_plot_works(df.plot.scatter, x="x", y="y", xerr=df_err) - self._check_has_errorbars(ax, xerr=1, yerr=0) - - ax = _check_plot_works(df.plot.scatter, x="x", y="y", yerr=df_err) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(df.plot.scatter, x="x", y="y", xerr=df_err, yerr=df_err) - self._check_has_errorbars(ax, xerr=1, yerr=1) - - def _check_errorbar_color(containers, expected, 
has_err="has_xerr"): - lines = [] - errs = [c.lines for c in ax.containers if getattr(c, has_err, False)][0] - for el in errs: - if is_list_like(el): - lines.extend(el) - else: - lines.append(el) - err_lines = [x for x in lines if x in ax.collections] - self._check_colors( - err_lines, linecolors=np.array([expected] * len(err_lines)) - ) - - # GH 8081 - df = DataFrame(np.random.randn(10, 5), columns=["a", "b", "c", "d", "e"]) - ax = df.plot.scatter(x="a", y="b", xerr="d", yerr="e", c="red") - self._check_has_errorbars(ax, xerr=1, yerr=1) - _check_errorbar_color(ax.containers, "red", has_err="has_xerr") - _check_errorbar_color(ax.containers, "red", has_err="has_yerr") - - ax = df.plot.scatter(x="a", y="b", yerr="e", color="green") - self._check_has_errorbars(ax, xerr=0, yerr=1) - _check_errorbar_color(ax.containers, "green", has_err="has_yerr") - - @pytest.mark.slow - def test_sharex_and_ax(self): - # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, - # the axis in fig.get_axis() are sorted differently than pandas - # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - - plt.close("all") - gs, axes = _generate_4_axes_via_gridspec() - - df = DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [1, 2, 3, 4, 5, 6], - "c": [1, 2, 3, 4, 5, 6], - "d": [1, 2, 3, 4, 5, 6], - } - ) - - def _check(axes): - for ax in axes: - assert len(ax.lines) == 1 - self._check_visible(ax.get_yticklabels(), visible=True) - for ax in [axes[0], axes[2]]: - self._check_visible(ax.get_xticklabels(), visible=False) - self._check_visible(ax.get_xticklabels(minor=True), visible=False) - for ax in [axes[1], axes[3]]: - self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(minor=True), visible=True) - - for ax in axes: - df.plot(x="a", y="b", title="title", ax=ax, sharex=True) - gs.tight_layout(plt.gcf()) - _check(axes) - tm.close() - - gs, axes = _generate_4_axes_via_gridspec() - with 
tm.assert_produces_warning(UserWarning): - axes = df.plot(subplots=True, ax=axes, sharex=True) - _check(axes) - tm.close() - - gs, axes = _generate_4_axes_via_gridspec() - # without sharex, no labels should be touched! - for ax in axes: - df.plot(x="a", y="b", title="title", ax=ax) - - gs.tight_layout(plt.gcf()) - for ax in axes: - assert len(ax.lines) == 1 - self._check_visible(ax.get_yticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(minor=True), visible=True) - tm.close() - - @pytest.mark.slow - def test_sharey_and_ax(self): - # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, - # the axis in fig.get_axis() are sorted differently than pandas - # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - - gs, axes = _generate_4_axes_via_gridspec() - - df = DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [1, 2, 3, 4, 5, 6], - "c": [1, 2, 3, 4, 5, 6], - "d": [1, 2, 3, 4, 5, 6], - } - ) - - def _check(axes): - for ax in axes: - assert len(ax.lines) == 1 - self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(minor=True), visible=True) - for ax in [axes[0], axes[1]]: - self._check_visible(ax.get_yticklabels(), visible=True) - for ax in [axes[2], axes[3]]: - self._check_visible(ax.get_yticklabels(), visible=False) - - for ax in axes: - df.plot(x="a", y="b", title="title", ax=ax, sharey=True) - gs.tight_layout(plt.gcf()) - _check(axes) - tm.close() - - gs, axes = _generate_4_axes_via_gridspec() - with tm.assert_produces_warning(UserWarning): - axes = df.plot(subplots=True, ax=axes, sharey=True) - - gs.tight_layout(plt.gcf()) - _check(axes) - tm.close() - - gs, axes = _generate_4_axes_via_gridspec() - # without sharex, no labels should be touched! 
- for ax in axes: - df.plot(x="a", y="b", title="title", ax=ax) - - gs.tight_layout(plt.gcf()) - for ax in axes: - assert len(ax.lines) == 1 - self._check_visible(ax.get_yticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(minor=True), visible=True) - - @td.skip_if_no_scipy - def test_memory_leak(self): - """ Check that every plot type gets properly collected. """ - import weakref - import gc - - results = {} - for kind in plotting.PlotAccessor._all_kinds: - - args = {} - if kind in ["hexbin", "scatter", "pie"]: - df = self.hexbin_df - args = {"x": "A", "y": "B"} - elif kind == "area": - df = self.tdf.abs() - else: - df = self.tdf - - # Use a weakref so we can see if the object gets collected without - # also preventing it from being collected - results[kind] = weakref.proxy(df.plot(kind=kind, **args)) - - # have matplotlib delete all the figures - tm.close() - # force a garbage collection - gc.collect() - for key in results: - # check that every plot was collected - with pytest.raises(ReferenceError): - # need to actually access something to get an error - results[key].lines - - @pytest.mark.slow - def test_df_subplots_patterns_minorticks(self): - # GH 10657 - import matplotlib.pyplot as plt - - df = DataFrame( - np.random.randn(10, 2), - index=date_range("1/1/2000", periods=10), - columns=list("AB"), - ) - - # shared subplots - fig, axes = plt.subplots(2, 1, sharex=True) - axes = df.plot(subplots=True, ax=axes) - for ax in axes: - assert len(ax.lines) == 1 - self._check_visible(ax.get_yticklabels(), visible=True) - # xaxis of 1st ax must be hidden - self._check_visible(axes[0].get_xticklabels(), visible=False) - self._check_visible(axes[0].get_xticklabels(minor=True), visible=False) - self._check_visible(axes[1].get_xticklabels(), visible=True) - self._check_visible(axes[1].get_xticklabels(minor=True), visible=True) - tm.close() - - fig, axes = plt.subplots(2, 1) - with 
tm.assert_produces_warning(UserWarning): - axes = df.plot(subplots=True, ax=axes, sharex=True) - for ax in axes: - assert len(ax.lines) == 1 - self._check_visible(ax.get_yticklabels(), visible=True) - # xaxis of 1st ax must be hidden - self._check_visible(axes[0].get_xticklabels(), visible=False) - self._check_visible(axes[0].get_xticklabels(minor=True), visible=False) - self._check_visible(axes[1].get_xticklabels(), visible=True) - self._check_visible(axes[1].get_xticklabels(minor=True), visible=True) - tm.close() - - # not shared - fig, axes = plt.subplots(2, 1) - axes = df.plot(subplots=True, ax=axes) - for ax in axes: - assert len(ax.lines) == 1 - self._check_visible(ax.get_yticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(minor=True), visible=True) - tm.close() - - @pytest.mark.slow - def test_df_gridspec_patterns(self): - # GH 10819 - import matplotlib.pyplot as plt - import matplotlib.gridspec as gridspec - - ts = Series(np.random.randn(10), index=date_range("1/1/2000", periods=10)) - - df = DataFrame(np.random.randn(10, 2), index=ts.index, columns=list("AB")) - - def _get_vertical_grid(): - gs = gridspec.GridSpec(3, 1) - fig = plt.figure() - ax1 = fig.add_subplot(gs[:2, :]) - ax2 = fig.add_subplot(gs[2, :]) - return ax1, ax2 - - def _get_horizontal_grid(): - gs = gridspec.GridSpec(1, 3) - fig = plt.figure() - ax1 = fig.add_subplot(gs[:, :2]) - ax2 = fig.add_subplot(gs[:, 2]) - return ax1, ax2 - - for ax1, ax2 in [_get_vertical_grid(), _get_horizontal_grid()]: - ax1 = ts.plot(ax=ax1) - assert len(ax1.lines) == 1 - ax2 = df.plot(ax=ax2) - assert len(ax2.lines) == 2 - for ax in [ax1, ax2]: - self._check_visible(ax.get_yticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(minor=True), visible=True) - tm.close() - - # subplots=True - for ax1, ax2 in [_get_vertical_grid(), _get_horizontal_grid()]: - axes = 
df.plot(subplots=True, ax=[ax1, ax2]) - assert len(ax1.lines) == 1 - assert len(ax2.lines) == 1 - for ax in axes: - self._check_visible(ax.get_yticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(minor=True), visible=True) - tm.close() - - # vertical / subplots / sharex=True / sharey=True - ax1, ax2 = _get_vertical_grid() - with tm.assert_produces_warning(UserWarning): - axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) - assert len(axes[0].lines) == 1 - assert len(axes[1].lines) == 1 - for ax in [ax1, ax2]: - # yaxis are visible because there is only one column - self._check_visible(ax.get_yticklabels(), visible=True) - # xaxis of axes0 (top) are hidden - self._check_visible(axes[0].get_xticklabels(), visible=False) - self._check_visible(axes[0].get_xticklabels(minor=True), visible=False) - self._check_visible(axes[1].get_xticklabels(), visible=True) - self._check_visible(axes[1].get_xticklabels(minor=True), visible=True) - tm.close() - - # horizontal / subplots / sharex=True / sharey=True - ax1, ax2 = _get_horizontal_grid() - with tm.assert_produces_warning(UserWarning): - axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) - assert len(axes[0].lines) == 1 - assert len(axes[1].lines) == 1 - self._check_visible(axes[0].get_yticklabels(), visible=True) - # yaxis of axes1 (right) are hidden - self._check_visible(axes[1].get_yticklabels(), visible=False) - for ax in [ax1, ax2]: - # xaxis are visible because there is only one column - self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(minor=True), visible=True) - tm.close() - - # boxed - def _get_boxed_grid(): - gs = gridspec.GridSpec(3, 3) - fig = plt.figure() - ax1 = fig.add_subplot(gs[:2, :2]) - ax2 = fig.add_subplot(gs[:2, 2]) - ax3 = fig.add_subplot(gs[2, :2]) - ax4 = fig.add_subplot(gs[2, 2]) - return ax1, ax2, ax3, ax4 - - axes = _get_boxed_grid() - df 
= DataFrame(np.random.randn(10, 4), index=ts.index, columns=list("ABCD")) - axes = df.plot(subplots=True, ax=axes) - for ax in axes: - assert len(ax.lines) == 1 - # axis are visible because these are not shared - self._check_visible(ax.get_yticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(minor=True), visible=True) - tm.close() - - # subplots / sharex=True / sharey=True - axes = _get_boxed_grid() - with tm.assert_produces_warning(UserWarning): - axes = df.plot(subplots=True, ax=axes, sharex=True, sharey=True) - for ax in axes: - assert len(ax.lines) == 1 - for ax in [axes[0], axes[2]]: # left column - self._check_visible(ax.get_yticklabels(), visible=True) - for ax in [axes[1], axes[3]]: # right column - self._check_visible(ax.get_yticklabels(), visible=False) - for ax in [axes[0], axes[1]]: # top row - self._check_visible(ax.get_xticklabels(), visible=False) - self._check_visible(ax.get_xticklabels(minor=True), visible=False) - for ax in [axes[2], axes[3]]: # bottom row - self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible(ax.get_xticklabels(minor=True), visible=True) - tm.close() - - @pytest.mark.slow - def test_df_grid_settings(self): - # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 - self._check_grid_settings( - DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}), - plotting.PlotAccessor._dataframe_kinds, - kws={"x": "a", "y": "b"}, - ) - - def test_invalid_colormap(self): - df = DataFrame(randn(3, 2), columns=["A", "B"]) - - with pytest.raises(ValueError): - df.plot(colormap="invalid_colormap") - - def test_plain_axes(self): - - # supplied ax itself is a SubplotAxes, but figure contains also - # a plain Axes object (GH11556) - fig, ax = self.plt.subplots() - fig.add_axes([0.2, 0.2, 0.2, 0.2]) - Series(rand(10)).plot(ax=ax) - - # supplied ax itself is a plain Axes, but because the cmap keyword - # a new ax is created for the colorbar -> also 
multiples axes (GH11520) - df = DataFrame({"a": randn(8), "b": randn(8)}) - fig = self.plt.figure() - ax = fig.add_axes((0, 0, 1, 1)) - df.plot(kind="scatter", ax=ax, x="a", y="b", c="a", cmap="hsv") - - # other examples - fig, ax = self.plt.subplots() - from mpl_toolkits.axes_grid1 import make_axes_locatable - - divider = make_axes_locatable(ax) - cax = divider.append_axes("right", size="5%", pad=0.05) - Series(rand(10)).plot(ax=ax) - Series(rand(10)).plot(ax=cax) - - fig, ax = self.plt.subplots() - from mpl_toolkits.axes_grid1.inset_locator import inset_axes - - iax = inset_axes(ax, width="30%", height=1.0, loc=3) - Series(rand(10)).plot(ax=ax) - Series(rand(10)).plot(ax=iax) - - def test_passed_bar_colors(self): - import matplotlib as mpl - - color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] - colormap = mpl.colors.ListedColormap(color_tuples) - barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap) - assert color_tuples == [c.get_facecolor() for c in barplot.patches] - - def test_rcParams_bar_colors(self): - import matplotlib as mpl - - color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] - with mpl.rc_context(rc={"axes.prop_cycle": mpl.cycler("color", color_tuples)}): - barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar") - assert color_tuples == [c.get_facecolor() for c in barplot.patches] - - @pytest.mark.parametrize("method", ["line", "barh", "bar"]) - def test_secondary_axis_font_size(self, method): - # GH: 12565 - df = ( - pd.DataFrame(np.random.randn(15, 2), columns=list("AB")) - .assign(C=lambda df: df.B.cumsum()) - .assign(D=lambda df: df.C * 1.1) - ) - - fontsize = 20 - sy = ["C", "D"] - - kwargs = dict(secondary_y=sy, fontsize=fontsize, mark_right=True) - ax = getattr(df.plot, method)(**kwargs) - self._check_ticks_props(axes=ax.right_ax, ylabelsize=fontsize) - - @pytest.mark.slow - def test_x_string_values_ticks(self): - # Test if string plot index have a fixed xtick position - # GH: 7612, GH: 22334 - df = 
pd.DataFrame( - { - "sales": [3, 2, 3], - "visits": [20, 42, 28], - "day": ["Monday", "Tuesday", "Wednesday"], - } - ) - ax = df.plot.area(x="day") - ax.set_xlim(-1, 3) - xticklabels = [t.get_text() for t in ax.get_xticklabels()] - labels_position = dict(zip(xticklabels, ax.get_xticks())) - # Testing if the label stayed at the right position - assert labels_position["Monday"] == 0.0 - assert labels_position["Tuesday"] == 1.0 - assert labels_position["Wednesday"] == 2.0 - - @pytest.mark.slow - def test_x_multiindex_values_ticks(self): - # Test if multiindex plot index have a fixed xtick position - # GH: 15912 - index = pd.MultiIndex.from_product([[2012, 2013], [1, 2]]) - df = pd.DataFrame(np.random.randn(4, 2), columns=["A", "B"], index=index) - ax = df.plot() - ax.set_xlim(-1, 4) - xticklabels = [t.get_text() for t in ax.get_xticklabels()] - labels_position = dict(zip(xticklabels, ax.get_xticks())) - # Testing if the label stayed at the right position - assert labels_position["(2012, 1)"] == 0.0 - assert labels_position["(2012, 2)"] == 1.0 - assert labels_position["(2013, 1)"] == 2.0 - assert labels_position["(2013, 2)"] == 3.0 - - @pytest.mark.parametrize("kind", ["line", "area"]) - def test_xlim_plot_line(self, kind): - # test if xlim is set correctly in plot.line and plot.area - # GH 27686 - df = pd.DataFrame([2, 4], index=[1, 2]) - ax = df.plot(kind=kind) - xlims = ax.get_xlim() - assert xlims[0] < 1 - assert xlims[1] > 2 - - def test_xlim_plot_line_correctly_in_mixed_plot_type(self): - # test if xlim is set correctly when ax contains multiple different kinds - # of plots, GH 27686 - fig, ax = self.plt.subplots() - - indexes = ["k1", "k2", "k3", "k4"] - df = pd.DataFrame( - { - "s1": [1000, 2000, 1500, 2000], - "s2": [900, 1400, 2000, 3000], - "s3": [1500, 1500, 1600, 1200], - "secondary_y": [1, 3, 4, 3], - }, - index=indexes, - ) - df[["s1", "s2", "s3"]].plot.bar(ax=ax, stacked=False) - df[["secondary_y"]].plot(ax=ax, secondary_y=True) - - xlims = 
ax.get_xlim() - assert xlims[0] < 0 - assert xlims[1] > 3 - - # make sure axis labels are plotted correctly as well - xticklabels = [t.get_text() for t in ax.get_xticklabels()] - assert xticklabels == indexes - - def test_subplots_sharex_false(self): - # test when sharex is set to False, two plots should have different - # labels, GH 25160 - df = pd.DataFrame(np.random.rand(10, 2)) - df.iloc[5:, 1] = np.nan - df.iloc[:5, 0] = np.nan - - figs, axs = self.plt.subplots(2, 1) - df.plot.line(ax=axs, subplots=True, sharex=False) - - expected_ax1 = np.arange(4.5, 10, 0.5) - expected_ax2 = np.arange(-0.5, 5, 0.5) - - tm.assert_numpy_array_equal(axs[0].get_xticks(), expected_ax1) - tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2) - - @pytest.mark.parametrize("by", ["C", ["C", "D"]]) - @pytest.mark.parametrize("column", ["A", ["A", "B"], None]) - def test_hist_plot_by_argument(self, by, column, test_hist_with_by_df): - # GH 15079 - _check_plot_works(test_hist_with_by_df.plot.hist, column=column, by=by) - - @pytest.mark.slow - @pytest.mark.parametrize( - "by, column, layout, axes_num", - [ - (["C"], "A", (2, 2), 3), - ("C", "A", (2, 2), 3), - (["C"], ["A"], (1, 3), 3), - ("C", None, (3, 1), 3), - ("C", ["A", "B"], (3, 1), 3), - (["C", "D"], "A", (9, 1), 9), - (["C", "D"], "A", (3, 3), 9), - (["C", "D"], ["A"], (5, 2), 9), - (["C", "D"], ["A", "B"], (9, 1), 9), - (["C", "D"], None, (9, 1), 9), - (["C", "D"], ["A", "B"], (5, 2), 9), - ], - ) - def test_hist_plot_layout_with_by(self, by, column, layout, axes_num, test_hist_with_by_df): - # GH 15079 - # _check_plot_works adds an ax so catch warning. 
see GH #13188 - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - test_hist_with_by_df.plot.hist, column=column, by=by, layout=layout - ) - self._check_axes_shape(axes, axes_num=axes_num, layout=layout) - - def test_hist_plot_invalid_layout_with_by_raises(self, test_hist_with_by_df): - # GH 15079, test if error is raised when invalid layout is given - - # layout too small for all 3 plots - msg = "larger than required size" - with pytest.raises(ValueError, match=msg): - test_hist_with_by_df.plot.hist(column=["A", "B"], by="C", layout=(1, 1)) - - # invalid format for layout - msg = re.escape("Layout must be a tuple of (rows, columns)") - with pytest.raises(ValueError, match=msg): - test_hist_with_by_df.plot.hist(column=["A", "B"], by="C", layout=(1,)) - - msg = "At least one dimension of layout must be positive" - with pytest.raises(ValueError, match=msg): - test_hist_with_by_df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1)) - - @pytest.mark.slow - def test_axis_share_x_with_by(self, test_hist_with_by_df): - # GH 15079 - ax1, ax2, ax3 = test_hist_with_by_df.plot.hist(column="A", by="C", sharex=True) - - # share x - assert ax1._shared_x_axes.joined(ax1, ax2) - assert ax2._shared_x_axes.joined(ax1, ax2) - assert ax3._shared_x_axes.joined(ax1, ax3) - assert ax3._shared_x_axes.joined(ax2, ax3) - - # don't share y - assert not ax1._shared_y_axes.joined(ax1, ax2) - assert not ax2._shared_y_axes.joined(ax1, ax2) - assert not ax3._shared_y_axes.joined(ax1, ax3) - assert not ax3._shared_y_axes.joined(ax2, ax3) - - @pytest.mark.slow - def test_axis_share_y_with_by(self, test_hist_with_by_df): - # GH 15079 - ax1, ax2, ax3 = test_hist_with_by_df.plot.hist(column="A", by="C", sharey=True) - - # share y - assert ax1._shared_y_axes.joined(ax1, ax2) - assert ax2._shared_y_axes.joined(ax1, ax2) - assert ax3._shared_y_axes.joined(ax1, ax3) - assert ax3._shared_y_axes.joined(ax2, ax3) - - # don't share x - assert not ax1._shared_x_axes.joined(ax1, ax2) 
- assert not ax2._shared_x_axes.joined(ax1, ax2) - assert not ax3._shared_x_axes.joined(ax1, ax3) - assert not ax3._shared_x_axes.joined(ax2, ax3) - - @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) - def test_figure_shape_hist_with_by(self, figsize, test_hist_with_by_df): - # GH 15079 - axes = test_hist_with_by_df.plot.hist(column="A", by="C", figsize=figsize) - self._check_axes_shape(axes, axes_num=3, figsize=figsize) - - def test_plot_no_rows(self): - # GH 27758 - df = pd.DataFrame(columns=["foo"], dtype=int) - assert df.empty - ax = df.plot() - assert len(ax.get_lines()) == 1 - line = ax.get_lines()[0] - assert len(line.get_xdata()) == 0 - assert len(line.get_ydata()) == 0 - - def test_plot_no_numeric_data(self): - df = pd.DataFrame(["a", "b", "c"]) - with pytest.raises(TypeError): - df.plot() - - def test_missing_markers_legend(self): - # 14958 - df = pd.DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"]) - ax = df.plot(y=["A"], marker="x", linestyle="solid") - df.plot(y=["B"], marker="o", linestyle="dotted", ax=ax) - df.plot(y=["C"], marker="<", linestyle="dotted", ax=ax) - - self._check_legend_labels(ax, labels=["A", "B", "C"]) - self._check_legend_marker(ax, expected_markers=["x", "o", "<"]) - - def test_missing_markers_legend_using_style(self): - # 14563 - df = pd.DataFrame( - { - "A": [1, 2, 3, 4, 5, 6], - "B": [2, 4, 1, 3, 2, 4], - "C": [3, 3, 2, 6, 4, 2], - "X": [1, 2, 3, 4, 5, 6], - } - ) - - fig, ax = self.plt.subplots() - for kind in "ABC": - df.plot("X", kind, label=kind, ax=ax, style=".") - - self._check_legend_labels(ax, labels=["A", "B", "C"]) - self._check_legend_marker(ax, expected_markers=[".", ".", "."]) - - def test_colors_of_columns_with_same_name(self): - # ISSUE 11136 -> https://github.com/pandas-dev/pandas/issues/11136 - # Creating a DataFrame with duplicate column labels and testing colors of them. 
- df = pd.DataFrame({"b": [0, 1, 0], "a": [1, 2, 3]}) - df1 = pd.DataFrame({"a": [2, 4, 6]}) - df_concat = pd.concat([df, df1], axis=1) - result = df_concat.plot() - for legend, line in zip(result.get_legend().legendHandles, result.lines): - assert legend.get_color() == line.get_color() - - @pytest.mark.parametrize( - "index_name, old_label, new_label", - [ - (None, "", "new"), - ("old", "old", "new"), - (None, "", ""), - (None, "", 1), - (None, "", [1, 2]), - ], - ) - @pytest.mark.parametrize("kind", ["line", "area", "bar"]) - def test_xlabel_ylabel_dataframe_single_plot( - self, kind, index_name, old_label, new_label - ): - # GH 9093 - df = pd.DataFrame([[1, 2], [2, 5]], columns=["Type A", "Type B"]) - df.index.name = index_name - - # default is the ylabel is not shown and xlabel is index name - ax = df.plot(kind=kind) - assert ax.get_xlabel() == old_label - assert ax.get_ylabel() == "" - - # old xlabel will be overriden and assigned ylabel will be used as ylabel - ax = df.plot(kind=kind, ylabel=new_label, xlabel=new_label) - assert ax.get_ylabel() == str(new_label) - assert ax.get_xlabel() == str(new_label) - - @pytest.mark.parametrize( - "index_name, old_label, new_label", - [ - (None, "", "new"), - ("old", "old", "new"), - (None, "", ""), - (None, "", 1), - (None, "", [1, 2]), - ], - ) - @pytest.mark.parametrize("kind", ["line", "area", "bar"]) - def test_xlabel_ylabel_dataframe_subplots( - self, kind, index_name, old_label, new_label - ): - # GH 9093 - df = pd.DataFrame([[1, 2], [2, 5]], columns=["Type A", "Type B"]) - df.index.name = index_name - - # default is the ylabel is not shown and xlabel is index name - axes = df.plot(kind=kind, subplots=True) - assert all(ax.get_ylabel() == "" for ax in axes) - assert all(ax.get_xlabel() == old_label for ax in axes) - - # old xlabel will be overriden and assigned ylabel will be used as ylabel - axes = df.plot(kind=kind, ylabel=new_label, xlabel=new_label, subplots=True) - assert all(ax.get_ylabel() == str(new_label) 
for ax in axes) - assert all(ax.get_xlabel() == str(new_label) for ax in axes) - - -def _generate_4_axes_via_gridspec(): - import matplotlib.pyplot as plt - import matplotlib as mpl - import matplotlib.gridspec # noqa - - gs = mpl.gridspec.GridSpec(2, 2) - ax_tl = plt.subplot(gs[0, 0]) - ax_ll = plt.subplot(gs[1, 0]) - ax_tr = plt.subplot(gs[0, 1]) - ax_lr = plt.subplot(gs[1, 1]) - - return gs, [ax_tl, ax_ll, ax_tr, ax_lr] diff --git a/pandas/tests/plotting/test_hist_by.py b/pandas/tests/plotting/test_hist_by.py new file mode 100644 index 0000000000000..7371e23026cf3 --- /dev/null +++ b/pandas/tests/plotting/test_hist_by.py @@ -0,0 +1,112 @@ +import re + +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm +from pandas.tests.plotting.common import _check_axes_shape, _check_plot_works + + +@pytest.fixture(scope="module") +def test_hist_with_by_df(): + np.random.seed(0) + df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) + df["C"] = np.random.choice(["a", "b", "c"], 30) + df["D"] = np.random.choice(["a", "b", "c"], 30) + return df + + +@pytest.mark.parametrize("by", ["C", ["C", "D"]]) +@pytest.mark.parametrize("column", ["A", ["A", "B"], None]) +def test_hist_plot_by_argument(by, column, test_hist_with_by_df): + # GH 15079 + _check_plot_works(test_hist_with_by_df.plot.hist, column=column, by=by) + + +@pytest.mark.slow +@pytest.mark.parametrize( + "by, column, layout, axes_num", + [ + (["C"], "A", (2, 2), 3), + ("C", "A", (2, 2), 3), + (["C"], ["A"], (1, 3), 3), + ("C", None, (3, 1), 3), + ("C", ["A", "B"], (3, 1), 3), + (["C", "D"], "A", (9, 1), 9), + (["C", "D"], "A", (3, 3), 9), + (["C", "D"], ["A"], (5, 2), 9), + (["C", "D"], ["A", "B"], (9, 1), 9), + (["C", "D"], None, (9, 1), 9), + (["C", "D"], ["A", "B"], (5, 2), 9), + ], +) +def test_hist_plot_layout_with_by(by, column, layout, axes_num, test_hist_with_by_df): + # GH 15079 + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + test_hist_with_by_df.plot.hist, column=column, by=by, layout=layout + ) + _check_axes_shape(axes, axes_num=axes_num, layout=layout) + + +def test_hist_plot_invalid_layout_with_by_raises(test_hist_with_by_df): + # GH 15079, test if error is raised when invalid layout is given + + # layout too small for all 3 plots + msg = "larger than required size" + with pytest.raises(ValueError, match=msg): + test_hist_with_by_df.plot.hist(column=["A", "B"], by="C", layout=(1, 1)) + + # invalid format for layout + msg = re.escape("Layout must be a tuple of (rows, columns)") + with pytest.raises(ValueError, match=msg): + test_hist_with_by_df.plot.hist(column=["A", "B"], by="C", layout=(1,)) + + msg = "At least one dimension of layout must be positive" + with pytest.raises(ValueError, match=msg): + test_hist_with_by_df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1)) + + +@pytest.mark.slow +def test_axis_share_x_with_by(test_hist_with_by_df): + # GH 15079 + ax1, ax2, ax3 = test_hist_with_by_df.plot.hist(column="A", by="C", sharex=True) + + # share x + assert ax1._shared_x_axes.joined(ax1, ax2) + assert ax2._shared_x_axes.joined(ax1, ax2) + assert ax3._shared_x_axes.joined(ax1, ax3) + assert ax3._shared_x_axes.joined(ax2, ax3) + + # don't share y + assert not ax1._shared_y_axes.joined(ax1, ax2) + assert not ax2._shared_y_axes.joined(ax1, ax2) + assert not ax3._shared_y_axes.joined(ax1, ax3) + assert not ax3._shared_y_axes.joined(ax2, ax3) + + +@pytest.mark.slow +def test_axis_share_y_with_by(test_hist_with_by_df): + # GH 15079 + ax1, ax2, ax3 = test_hist_with_by_df.plot.hist(column="A", by="C", sharey=True) + + # share y + assert ax1._shared_y_axes.joined(ax1, ax2) + assert ax2._shared_y_axes.joined(ax1, ax2) + assert ax3._shared_y_axes.joined(ax1, ax3) + assert ax3._shared_y_axes.joined(ax2, ax3) + + # don't share x + assert not ax1._shared_x_axes.joined(ax1, ax2) + assert not 
ax2._shared_x_axes.joined(ax1, ax2) + assert not ax3._shared_x_axes.joined(ax1, ax3) + assert not ax3._shared_x_axes.joined(ax2, ax3) + + +@pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) +def test_figure_shape_hist_with_by(figsize, test_hist_with_by_df): + # GH 15079 + axes = test_hist_with_by_df.plot.hist(column="A", by="C", figsize=figsize) + _check_axes_shape(axes, axes_num=3, figsize=figsize) \ No newline at end of file From f7bcdb7d49aa88e63dc87620918c9022a5e24b5d Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Fri, 21 May 2021 20:25:12 +0200 Subject: [PATCH 103/142] revert change --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b540f163250d9..9f3ccb3e14116 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -273,6 +273,7 @@ change, as ``fsspec`` will still bring in the same packages as before. Other enhancements ^^^^^^^^^^^^^^^^^^ + - Compatibility with matplotlib 3.3.0 (:issue:`34850`) - :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:`32538`) - :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`) From aeb32e5b3bf9cb628bb152323c45b94100eec015 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Fri, 21 May 2021 20:57:57 +0200 Subject: [PATCH 104/142] rebase --- pandas/plotting/_core.py | 2 +- pandas/plotting/_matplotlib/core.py | 1 + pandas/plotting/_matplotlib/hist.py | 1 + pandas/tests/plotting/test_hist_by.py | 190 +++++++++++++------------- 4 files changed, 100 insertions(+), 94 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index db654a5ae66a3..63d64d8e027f2 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1278,7 +1278,7 @@ def hist(self, by=None, bins=10, **kwargs): by : str or sequence, optional Column in the DataFrame to group by. - .. versionadded:: 1.1.0 + .. 
versionadded:: 1.3.0 bins : int, default 10 Number of histogram bins to be used. diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 2521fd5cb2aba..4fc98f740b5a7 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -59,6 +59,7 @@ handle_shared_axes, table, ) +from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by if TYPE_CHECKING: from matplotlib.axes import Axes diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 295808c77094d..59e3aa4769287 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -30,6 +30,7 @@ maybe_adjust_figure, set_ticks_props, ) +from pandas.plotting._matplotlib.groupby import create_iter_data_given_by, reformat_hist_y_given_by if TYPE_CHECKING: from matplotlib.axes import Axes diff --git a/pandas/tests/plotting/test_hist_by.py b/pandas/tests/plotting/test_hist_by.py index 7371e23026cf3..84b7f9ee9b434 100644 --- a/pandas/tests/plotting/test_hist_by.py +++ b/pandas/tests/plotting/test_hist_by.py @@ -5,7 +5,7 @@ from pandas import DataFrame import pandas._testing as tm -from pandas.tests.plotting.common import _check_axes_shape, _check_plot_works +from pandas.tests.plotting.common import TestPlotBase, _check_plot_works @pytest.fixture(scope="module") @@ -17,96 +17,100 @@ def test_hist_with_by_df(): return df -@pytest.mark.parametrize("by", ["C", ["C", "D"]]) -@pytest.mark.parametrize("column", ["A", ["A", "B"], None]) -def test_hist_plot_by_argument(by, column, test_hist_with_by_df): +@td.skip_if_no_mpl +class TestDataFrameColor(TestPlotBase): + def setup_method(self, method): + TestPlotBase.setup_method(self, method) + import matplotlib as mpl + + mpl.rcdefaults() + self.hist_df = test_hist_with_by_df() + + @pytest.mark.parametrize("by", ["C", ["C", "D"]]) + @pytest.mark.parametrize("column", ["A", ["A", "B"], None]) + def test_hist_plot_by_argument(self, by, column): 
+ # GH 15079 + _check_plot_works(self.hist_df.plot.hist, column=column, by=by) + + @pytest.mark.slow + @pytest.mark.parametrize( + "by, column, layout, axes_num", + [ + (["C"], "A", (2, 2), 3), + ("C", "A", (2, 2), 3), + (["C"], ["A"], (1, 3), 3), + ("C", None, (3, 1), 3), + ("C", ["A", "B"], (3, 1), 3), + (["C", "D"], "A", (9, 1), 9), + (["C", "D"], "A", (3, 3), 9), + (["C", "D"], ["A"], (5, 2), 9), + (["C", "D"], ["A", "B"], (9, 1), 9), + (["C", "D"], None, (9, 1), 9), + (["C", "D"], ["A", "B"], (5, 2), 9), + ], + ) + def test_hist_plot_layout_with_by(self, by, column, layout, axes_num): + # GH 15079 + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + self.hist_df.plot.hist, column=column, by=by, layout=layout + ) + self._check_axes_shape(axes, axes_num=axes_num, layout=layout) + + def test_hist_plot_invalid_layout_with_by_raises(self): + # GH 15079, test if error is raised when invalid layout is given + + # layout too small for all 3 plots + msg = "larger than required size" + with pytest.raises(ValueError, match=msg): + self.hist_df.plot.hist(column=["A", "B"], by="C", layout=(1, 1)) + + # invalid format for layout + msg = re.escape("Layout must be a tuple of (rows, columns)") + with pytest.raises(ValueError, match=msg): + self.hist_df.plot.hist(column=["A", "B"], by="C", layout=(1,)) + + msg = "At least one dimension of layout must be positive" + with pytest.raises(ValueError, match=msg): + self.hist_df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1)) + + @pytest.mark.slow + def test_axis_share_x_with_by(self): + # GH 15079 + ax1, ax2, ax3 = self.hist_df.plot.hist(column="A", by="C", sharex=True) + + # share x + assert ax1._shared_x_axes.joined(ax1, ax2) + assert ax2._shared_x_axes.joined(ax1, ax2) + assert ax3._shared_x_axes.joined(ax1, ax3) + assert ax3._shared_x_axes.joined(ax2, ax3) + + # don't share y + assert not ax1._shared_y_axes.joined(ax1, ax2) + assert 
not ax2._shared_y_axes.joined(ax1, ax2) + assert not ax3._shared_y_axes.joined(ax1, ax3) + assert not ax3._shared_y_axes.joined(ax2, ax3) + + @pytest.mark.slow + def test_axis_share_y_with_by(self): + # GH 15079 + ax1, ax2, ax3 = self.hist_df.plot.hist(column="A", by="C", sharey=True) + + # share y + assert ax1._shared_y_axes.joined(ax1, ax2) + assert ax2._shared_y_axes.joined(ax1, ax2) + assert ax3._shared_y_axes.joined(ax1, ax3) + assert ax3._shared_y_axes.joined(ax2, ax3) + + # don't share x + assert not ax1._shared_x_axes.joined(ax1, ax2) + assert not ax2._shared_x_axes.joined(ax1, ax2) + assert not ax3._shared_x_axes.joined(ax1, ax3) + assert not ax3._shared_x_axes.joined(ax2, ax3) + + @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) + def test_figure_shape_hist_with_by(self, figsize): # GH 15079 - _check_plot_works(test_hist_with_by_df.plot.hist, column=column, by=by) - - -@pytest.mark.slow -@pytest.mark.parametrize( - "by, column, layout, axes_num", - [ - (["C"], "A", (2, 2), 3), - ("C", "A", (2, 2), 3), - (["C"], ["A"], (1, 3), 3), - ("C", None, (3, 1), 3), - ("C", ["A", "B"], (3, 1), 3), - (["C", "D"], "A", (9, 1), 9), - (["C", "D"], "A", (3, 3), 9), - (["C", "D"], ["A"], (5, 2), 9), - (["C", "D"], ["A", "B"], (9, 1), 9), - (["C", "D"], None, (9, 1), 9), - (["C", "D"], ["A", "B"], (5, 2), 9), - ], -) -def test_hist_plot_layout_with_by(by, column, layout, axes_num, test_hist_with_by_df): - # GH 15079 - # _check_plot_works adds an ax so catch warning. 
see GH #13188 - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - test_hist_with_by_df.plot.hist, column=column, by=by, layout=layout - ) - _check_axes_shape(axes, axes_num=axes_num, layout=layout) - - -def test_hist_plot_invalid_layout_with_by_raises(test_hist_with_by_df): - # GH 15079, test if error is raised when invalid layout is given - - # layout too small for all 3 plots - msg = "larger than required size" - with pytest.raises(ValueError, match=msg): - test_hist_with_by_df.plot.hist(column=["A", "B"], by="C", layout=(1, 1)) - - # invalid format for layout - msg = re.escape("Layout must be a tuple of (rows, columns)") - with pytest.raises(ValueError, match=msg): - test_hist_with_by_df.plot.hist(column=["A", "B"], by="C", layout=(1,)) - - msg = "At least one dimension of layout must be positive" - with pytest.raises(ValueError, match=msg): - test_hist_with_by_df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1)) - - -@pytest.mark.slow -def test_axis_share_x_with_by(test_hist_with_by_df): - # GH 15079 - ax1, ax2, ax3 = test_hist_with_by_df.plot.hist(column="A", by="C", sharex=True) - - # share x - assert ax1._shared_x_axes.joined(ax1, ax2) - assert ax2._shared_x_axes.joined(ax1, ax2) - assert ax3._shared_x_axes.joined(ax1, ax3) - assert ax3._shared_x_axes.joined(ax2, ax3) - - # don't share y - assert not ax1._shared_y_axes.joined(ax1, ax2) - assert not ax2._shared_y_axes.joined(ax1, ax2) - assert not ax3._shared_y_axes.joined(ax1, ax3) - assert not ax3._shared_y_axes.joined(ax2, ax3) - - -@pytest.mark.slow -def test_axis_share_y_with_by(test_hist_with_by_df): - # GH 15079 - ax1, ax2, ax3 = test_hist_with_by_df.plot.hist(column="A", by="C", sharey=True) - - # share y - assert ax1._shared_y_axes.joined(ax1, ax2) - assert ax2._shared_y_axes.joined(ax1, ax2) - assert ax3._shared_y_axes.joined(ax1, ax3) - assert ax3._shared_y_axes.joined(ax2, ax3) - - # don't share x - assert not ax1._shared_x_axes.joined(ax1, ax2) - assert not 
ax2._shared_x_axes.joined(ax1, ax2) - assert not ax3._shared_x_axes.joined(ax1, ax3) - assert not ax3._shared_x_axes.joined(ax2, ax3) - - -@pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) -def test_figure_shape_hist_with_by(figsize, test_hist_with_by_df): - # GH 15079 - axes = test_hist_with_by_df.plot.hist(column="A", by="C", figsize=figsize) - _check_axes_shape(axes, axes_num=3, figsize=figsize) \ No newline at end of file + axes = self.hist_df.plot.hist(column="A", by="C", figsize=figsize) + self._check_axes_shape(axes, axes_num=3, figsize=figsize) \ No newline at end of file From dc1795983ff8b0217dd4caab210b65ab98df610b Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Fri, 21 May 2021 21:19:14 +0200 Subject: [PATCH 105/142] fixup --- pandas/plotting/_matplotlib/groupby.py | 25 +++++++++++++++++++------ pandas/tests/plotting/test_hist_by.py | 12 ++++++++---- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 061f95aacec90..4de3ae0996483 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -1,16 +1,29 @@ -from typing import Dict, List, Optional, Union +from typing import ( + Dict, + List, + Optional, + Union, +) import numpy as np -from pandas._typing import FrameOrSeriesUnion, Label +from pandas._typing import ( + FrameOrSeriesUnion, + IndexLabel, +) from pandas.core.dtypes.missing import isna -from pandas import DataFrame, MultiIndex, Series, concat +from pandas import ( + DataFrame, + MultiIndex, + Series, + concat, +) def create_iter_data_given_by( - data: DataFrame, by: Optional[List[Label]] + data: DataFrame, by: Optional[List[IndexLabel]] ) -> Union[DataFrame, Dict[str, FrameOrSeriesUnion]]: """ Create data for iteration given `by` is assigned or not, and it is only @@ -58,7 +71,7 @@ def create_iter_data_given_by( def reconstruct_data_with_by( - data: DataFrame, by: Union[Label, List[Label]], cols: 
List[Label] + data: DataFrame, by: Union[IndexLabel, List[IndexLabel]], cols: List[IndexLabel] ) -> DataFrame: """ Internal function to group data, and reassign multiindex column names onto the @@ -101,7 +114,7 @@ def reconstruct_data_with_by( def reformat_hist_y_given_by( - y: Union[Series, np.array], by: Optional[Union[Label, List[Label]]] + y: Union[Series, np.array], by: Optional[Union[IndexLabel, List[IndexLabel]]] ) -> Union[Series, np.array]: """Internal function to reformat y given `by` is applied or not for hist plot. diff --git a/pandas/tests/plotting/test_hist_by.py b/pandas/tests/plotting/test_hist_by.py index 84b7f9ee9b434..1e126caf6ef50 100644 --- a/pandas/tests/plotting/test_hist_by.py +++ b/pandas/tests/plotting/test_hist_by.py @@ -3,12 +3,16 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame import pandas._testing as tm -from pandas.tests.plotting.common import TestPlotBase, _check_plot_works +from pandas.tests.plotting.common import ( + TestPlotBase, + _check_plot_works, +) -@pytest.fixture(scope="module") def test_hist_with_by_df(): np.random.seed(0) df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) @@ -111,6 +115,6 @@ def test_axis_share_y_with_by(self): @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) def test_figure_shape_hist_with_by(self, figsize): - # GH 15079 + # GH 15079 axes = self.hist_df.plot.hist(column="A", by="C", figsize=figsize) - self._check_axes_shape(axes, axes_num=3, figsize=figsize) \ No newline at end of file + self._check_axes_shape(axes, axes_num=3, figsize=figsize) From 4aee3e0d91cd3b137bc142fc376ed94215aab5a1 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Fri, 21 May 2021 21:20:01 +0200 Subject: [PATCH 106/142] black --- pandas/plotting/_matplotlib/hist.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 59e3aa4769287..603379c93e995 100644 --- 
a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -24,13 +24,16 @@ LinePlot, MPLPlot, ) +from pandas.plotting._matplotlib.groupby import ( + create_iter_data_given_by, + reformat_hist_y_given_by, +) from pandas.plotting._matplotlib.tools import ( create_subplots, flatten_axes, maybe_adjust_figure, set_ticks_props, ) -from pandas.plotting._matplotlib.groupby import create_iter_data_given_by, reformat_hist_y_given_by if TYPE_CHECKING: from matplotlib.axes import Axes From 4eb466fbf3ff8c15f192964921cfd78a843a27fd Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Fri, 21 May 2021 21:25:14 +0200 Subject: [PATCH 107/142] fixup --- pandas/plotting/_matplotlib/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 4fc98f740b5a7..03fb77c2d9166 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -60,6 +60,7 @@ table, ) from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by +from pandas._typing import IndexLabel if TYPE_CHECKING: from matplotlib.axes import Axes @@ -125,7 +126,7 @@ def __init__( table=False, layout=None, include_bool=False, - column: Optional[Label] = None, + column: IndexLabel | None = None, **kwds, ): From 51602240ece1edc8747155f66aabf60b11873026 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Sat, 22 May 2021 09:58:55 +0200 Subject: [PATCH 108/142] fix mypy --- pandas/plotting/_matplotlib/core.py | 4 ++-- pandas/plotting/_matplotlib/groupby.py | 10 ++++------ pandas/plotting/_matplotlib/hist.py | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 03fb77c2d9166..bfc6c5e228453 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -9,6 +9,7 @@ from matplotlib.artist import Artist import numpy as np +from pandas._typing import IndexLabel 
from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -42,6 +43,7 @@ from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0 from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters +from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by from pandas.plotting._matplotlib.style import get_standard_colors from pandas.plotting._matplotlib.timeseries import ( decorate_axes, @@ -59,8 +61,6 @@ handle_shared_axes, table, ) -from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by -from pandas._typing import IndexLabel if TYPE_CHECKING: from matplotlib.axes import Axes diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 4de3ae0996483..5e36391c1286c 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -1,7 +1,5 @@ from typing import ( Dict, - List, - Optional, Union, ) @@ -23,7 +21,7 @@ def create_iter_data_given_by( - data: DataFrame, by: Optional[List[IndexLabel]] + data: DataFrame, by: IndexLabel | None = None ) -> Union[DataFrame, Dict[str, FrameOrSeriesUnion]]: """ Create data for iteration given `by` is assigned or not, and it is only @@ -71,7 +69,7 @@ def create_iter_data_given_by( def reconstruct_data_with_by( - data: DataFrame, by: Union[IndexLabel, List[IndexLabel]], cols: List[IndexLabel] + data: DataFrame, by: IndexLabel, cols: IndexLabel ) -> DataFrame: """ Internal function to group data, and reassign multiindex column names onto the @@ -114,8 +112,8 @@ def reconstruct_data_with_by( def reformat_hist_y_given_by( - y: Union[Series, np.array], by: Optional[Union[IndexLabel, List[IndexLabel]]] -) -> Union[Series, np.array]: + y: Union[Series, np.ndarray], by: IndexLabel | None = None +) -> Union[Series, np.ndarray]: """Internal function to reformat y given `by` is applied or not for hist plot. 
If by is None, input y is 1-d with NaN removed; and if by is not None, groupby diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 603379c93e995..00b27705d558e 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -63,7 +63,7 @@ def _args_adjust(self): if is_list_like(self.bottom): self.bottom = np.array(self.bottom) - def _calculate_bins(self, data: DataFrame) -> np.array: + def _calculate_bins(self, data: DataFrame) -> np.ndarray: """Calculate bins given data""" values = data._convert(datetime=True)._get_numeric_data() values = np.ravel(values) From e2de0d395bd40fc603886614197bfc94b67600d4 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Sat, 22 May 2021 10:28:52 +0200 Subject: [PATCH 109/142] fix mypy --- pandas/plotting/_matplotlib/core.py | 3 ++- pandas/plotting/_matplotlib/groupby.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index bfc6c5e228453..a76a6c1871199 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -3,6 +3,7 @@ from typing import ( TYPE_CHECKING, Hashable, + Optional ) import warnings @@ -126,7 +127,7 @@ def __init__( table=False, layout=None, include_bool=False, - column: IndexLabel | None = None, + column: Optional[IndexLabel] = None, **kwds, ): diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 5e36391c1286c..0f15a1b34f01e 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -1,5 +1,6 @@ from typing import ( Dict, + Optional, Union, ) @@ -21,7 +22,7 @@ def create_iter_data_given_by( - data: DataFrame, by: IndexLabel | None = None + data: DataFrame, by: Optional[IndexLabel] = None ) -> Union[DataFrame, Dict[str, FrameOrSeriesUnion]]: """ Create data for iteration given `by` is assigned or not, and it is only @@ -112,7 +113,7 @@ 
def reconstruct_data_with_by( def reformat_hist_y_given_by( - y: Union[Series, np.ndarray], by: IndexLabel | None = None + y: Union[Series, np.ndarray], by: Optional[IndexLabel] = None ) -> Union[Series, np.ndarray]: """Internal function to reformat y given `by` is applied or not for hist plot. From b2b33ac65c914ff5be4979f0302ccf9cdc9e7b40 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Sat, 22 May 2021 10:39:04 +0200 Subject: [PATCH 110/142] fix mypy --- pandas/plotting/_matplotlib/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index a76a6c1871199..bfc6c5e228453 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -3,7 +3,6 @@ from typing import ( TYPE_CHECKING, Hashable, - Optional ) import warnings @@ -127,7 +126,7 @@ def __init__( table=False, layout=None, include_bool=False, - column: Optional[IndexLabel] = None, + column: IndexLabel | None = None, **kwds, ): From 1199a93639b63839ac48ec85b4c9c851e089c728 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Sat, 22 May 2021 13:04:14 +0200 Subject: [PATCH 111/142] fix mypy --- pandas/plotting/_matplotlib/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index bfc6c5e228453..e23a5d9ea8386 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -143,7 +143,7 @@ def __init__( if self.by and column is None: self.columns = [col for col in data.columns if col not in self.by] else: - self.columns = com.convert_to_list_like(column) + self.columns = com.maybe_make_list(column) self.kind = kind From c4a584261f508b6fc4efb35420be3798276d06e7 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Sat, 22 May 2021 13:05:14 +0200 Subject: [PATCH 112/142] fix mypy --- pandas/plotting/_matplotlib/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index e23a5d9ea8386..96288bd33be7c 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -101,7 +101,7 @@ def __init__( self, data, kind=None, - by=None, + by: IndexLabel | None = None, subplots=False, sharex=None, sharey=False, From 65564147cc92ff3e366c05eb158d5669f6ec942b Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Sat, 22 May 2021 13:32:17 +0200 Subject: [PATCH 113/142] fix mypy --- pandas/plotting/_matplotlib/groupby.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 0f15a1b34f01e..e8ed5ed2a8b12 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -11,7 +11,10 @@ IndexLabel, ) -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import ( + isna, + remove_na_arraylike, +) from pandas import ( DataFrame, @@ -121,8 +124,7 @@ def reformat_hist_y_given_by( will take place and input y is multi-dimensional array. 
""" if by is not None and len(y.shape) > 1: - notna = [col[~isna(col)] for col in y.T] - y = np.array(np.array(notna).T) + y = np.array([remove_na_arraylike(col) for col in y.T]).T else: - y = y[~isna(y)] + y = remove_na_arraylike(y) return y From 826f277c164a3f4c6a0307b31cc316c9a6593d66 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Sat, 22 May 2021 13:56:22 +0200 Subject: [PATCH 114/142] fix flake8 --- pandas/plotting/_matplotlib/groupby.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index e8ed5ed2a8b12..4145b3359bafa 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -11,10 +11,7 @@ IndexLabel, ) -from pandas.core.dtypes.missing import ( - isna, - remove_na_arraylike, -) +from pandas.core.dtypes.missing import remove_na_arraylike from pandas import ( DataFrame, From 891dc55ba5a6d6de93bf44e8a997873cfdb2b91a Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Sat, 22 May 2021 14:53:44 +0200 Subject: [PATCH 115/142] add by support for boxplot --- pandas/plotting/_matplotlib/boxplot.py | 15 ++++++++++++++- pandas/plotting/_matplotlib/core.py | 18 ++++++++++++++---- pandas/plotting/_matplotlib/groupby.py | 17 +++++++++++++---- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 21f30c1311e17..b3526845604cb 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -18,6 +18,7 @@ LinePlot, MPLPlot, ) +from pandas.plotting._matplotlib.groupby import create_iter_data_given_by from pandas.plotting._matplotlib.style import get_standard_colors from pandas.plotting._matplotlib.tools import ( create_subplots, @@ -135,10 +136,17 @@ def _make_plot(self): if self.subplots: self._return_obj = pd.Series(dtype=object) - for i, (label, y) in enumerate(self._iter_data()): + data = 
create_iter_data_given_by(self.data, self.by, self._kind) + for i, (label, y) in enumerate(self._iter_data(data=data)): ax = self._get_ax(i) kwds = self.kwds.copy() + # When by is applied, show title for subplots to know which group it is + # just like df.boxplot, and need to apply T on y to provide right input + if self.by is not None: + y = y.T + ax.set_title(pprint_thing(label)) + ret, bp = self._plot( ax, y, column_num=i, return_type=self.return_type, **kwds ) @@ -146,6 +154,11 @@ def _make_plot(self): self._return_obj[label] = ret label = [pprint_thing(label)] + + # When `by` is assigned, the ticklabels will become unique grouped + # values, instead of label which is used as subtitle in this case. + if self.by is not None: + label = self.data.columns.levels[0] self._set_ticklabels(ax, label) else: y = self.data.values.T diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 96288bd33be7c..c71d084844a8e 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -135,16 +135,26 @@ def __init__( self.data = data self.by = com.maybe_make_list(by) - if self.by: - self._grouped_data_size = len(data.groupby(self.by)) - # Assign the rest of columns into self.columns if by is explicitly defined # while column is not, so as to keep the same behaviour with current df.hist + # or df.boxplot. 
if self.by and column is None: - self.columns = [col for col in data.columns if col not in self.by] + self.columns = [ + col + for col in data.columns + if col not in self.by and is_numeric_dtype(data[col]) + ] else: self.columns = com.maybe_make_list(column) + # When `by` is explicitly assigned, grouped data size will be defined, and + # this will determine number of subplots to have, aka the size of `self.axes` + if self.by: + if self._kind == "hist": + self._grouped_data_size = len(data.groupby(self.by)) + elif self._kind == "box": + self._grouped_data_size = len(self.columns) + self.kind = kind self.sort_columns = sort_columns diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 4145b3359bafa..59ae63964a4d7 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -22,7 +22,7 @@ def create_iter_data_given_by( - data: DataFrame, by: Optional[IndexLabel] = None + data: DataFrame, by: Optional[IndexLabel] = None, kind: str = "hist" ) -> Union[DataFrame, Dict[str, FrameOrSeriesUnion]]: """ Create data for iteration given `by` is assigned or not, and it is only @@ -35,8 +35,9 @@ def create_iter_data_given_by( Parameters ---------- - data: reformatted grouped data from `_compute_plot_data` method + data: reformatted grouped data from `_compute_plot_data` method. by: list or None, value assigned to `by`. + kind: str, plot kind. This function is only used for `hist` and `box` plots. 
Returns ------- @@ -56,15 +57,23 @@ def create_iter_data_given_by( {'h1': DataFrame({'a': [1, 3, np.nan], 'b': [3, 4, np.nan]}), 'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})} """ + if kind == "hist": + level = 0 + elif kind == "box": + level = 1 + else: + raise ValueError("This function is only used for hist and box plot") + iter_data: Union[DataFrame, Dict[str, FrameOrSeriesUnion]] if not by: iter_data = data else: # Select sub-columns based on the value of first level of MI assert isinstance(data.columns, MultiIndex) - cols = data.columns.levels[0] + cols = data.columns.levels[level] iter_data = { - col: data.loc[:, data.columns.get_level_values(0) == col] for col in cols + col: data.loc[:, data.columns.get_level_values(level) == col] + for col in cols } return iter_data From 4c4a15899f2583bdd223b7030a2de0ed5f64a086 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Sat, 22 May 2021 14:55:41 +0200 Subject: [PATCH 116/142] doc --- pandas/plotting/_matplotlib/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index c71d084844a8e..dffe64da186ef 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -148,7 +148,7 @@ def __init__( self.columns = com.maybe_make_list(column) # When `by` is explicitly assigned, grouped data size will be defined, and - # this will determine number of subplots to have, aka the size of `self.axes` + # this will determine number of subplots to have, aka `self.nseries` if self.by: if self._kind == "hist": self._grouped_data_size = len(data.groupby(self.by)) From ea7e5b15594e6f865cf59d33e542c22366643ad8 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Sat, 22 May 2021 20:53:32 +0200 Subject: [PATCH 117/142] Add tests --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/tests/plotting/test_hist_box_by.py | 287 ++++++++++++++++++++++ pandas/tests/plotting/test_hist_by.py | 120 --------- 3 files 
changed, 288 insertions(+), 120 deletions(-) create mode 100644 pandas/tests/plotting/test_hist_box_by.py delete mode 100644 pandas/tests/plotting/test_hist_by.py diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index c26f8288f59ab..e29a5928e6abc 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -230,6 +230,7 @@ Other enhancements - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`) +- Add support for assigning values to ``by`` argument in :meth:``DataFrame.plot.hist`` and :meth:``DataFrame.plot.box`` (:issue:`15079`) .. --------------------------------------------------------------------------- diff --git a/pandas/tests/plotting/test_hist_box_by.py b/pandas/tests/plotting/test_hist_box_by.py new file mode 100644 index 0000000000000..43cb02270f338 --- /dev/null +++ b/pandas/tests/plotting/test_hist_box_by.py @@ -0,0 +1,287 @@ +import re + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame +import pandas._testing as tm +from pandas.tests.plotting.common import ( + TestPlotBase, + _check_plot_works, +) + + +def test_hist_box_with_by_df(): + np.random.seed(0) + df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) + df["C"] = np.random.choice(["a", "b", "c"], 30) + df["D"] = np.random.choice(["a", "b", "c"], 30) + return df + + +@td.skip_if_no_mpl +class TestHistWithBy(TestPlotBase): + def setup_method(self, method): + TestPlotBase.setup_method(self, method) + import matplotlib as mpl + + mpl.rcdefaults() + self.hist_df = test_hist_box_with_by_df() + + @pytest.mark.parametrize( + "by, column, titles, legends", + [ + ("C", "A", ["a", "b", "c"], [["A"]] 
* 3), + ("C", ["A", "B"], ["a", "b", "c"], [["A", "B"]] * 3), + ("C", None, ["a", "b", "c"], [["A", "B"]] * 3), + ( + ["C", "D"], + "A", + [ + "(a, a)", + "(a, b)", + "(a, c)", + "(b, a)", + "(b, b)", + "(b, c)", + "(c, a)", + "(c, b)", + "(c, c)", + ], + [["A"]] * 9, + ), + ( + ["C", "D"], + ["A", "B"], + [ + "(a, a)", + "(a, b)", + "(a, c)", + "(b, a)", + "(b, b)", + "(b, c)", + "(c, a)", + "(c, b)", + "(c, c)", + ], + [["A", "B"]] * 9, + ), + ( + ["C", "D"], + None, + [ + "(a, a)", + "(a, b)", + "(a, c)", + "(b, a)", + "(b, b)", + "(b, c)", + "(c, a)", + "(c, b)", + "(c, c)", + ], + [["A", "B"]] * 9, + ), + ], + ) + def test_hist_plot_by_argument(self, by, column, titles, legends): + # GH 15079 + axes = _check_plot_works(self.hist_df.plot.hist, column=column, by=by) + result_titles = [ax.get_title() for ax in axes] + result_legends = [[l.get_text() for l in ax.get_legend().texts] for ax in axes] + + assert result_legends == legends + assert result_titles == titles + + @pytest.mark.slow + @pytest.mark.parametrize( + "by, column, layout, axes_num", + [ + (["C"], "A", (2, 2), 3), + ("C", "A", (2, 2), 3), + (["C"], ["A"], (1, 3), 3), + ("C", None, (3, 1), 3), + ("C", ["A", "B"], (3, 1), 3), + (["C", "D"], "A", (9, 1), 9), + (["C", "D"], "A", (3, 3), 9), + (["C", "D"], ["A"], (5, 2), 9), + (["C", "D"], ["A", "B"], (9, 1), 9), + (["C", "D"], None, (9, 1), 9), + (["C", "D"], ["A", "B"], (5, 2), 9), + ], + ) + def test_hist_plot_layout_with_by(self, by, column, layout, axes_num): + # GH 15079 + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + self.hist_df.plot.hist, column=column, by=by, layout=layout + ) + self._check_axes_shape(axes, axes_num=axes_num, layout=layout) + + def test_hist_plot_invalid_layout_with_by_raises(self): + # GH 15079, test if error is raised when invalid layout is given + + # layout too small for all 3 plots + msg = "larger than required size" + with pytest.raises(ValueError, match=msg): + self.hist_df.plot.hist(column=["A", "B"], by="C", layout=(1, 1)) + + # invalid format for layout + msg = re.escape("Layout must be a tuple of (rows, columns)") + with pytest.raises(ValueError, match=msg): + self.hist_df.plot.hist(column=["A", "B"], by="C", layout=(1,)) + + msg = "At least one dimension of layout must be positive" + with pytest.raises(ValueError, match=msg): + self.hist_df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1)) + + @pytest.mark.slow + def test_axis_share_x_with_by(self): + # GH 15079 + ax1, ax2, ax3 = self.hist_df.plot.hist(column="A", by="C", sharex=True) + + # share x + assert ax1._shared_x_axes.joined(ax1, ax2) + assert ax2._shared_x_axes.joined(ax1, ax2) + assert ax3._shared_x_axes.joined(ax1, ax3) + assert ax3._shared_x_axes.joined(ax2, ax3) + + # don't share y + assert not ax1._shared_y_axes.joined(ax1, ax2) + assert not ax2._shared_y_axes.joined(ax1, ax2) + assert not ax3._shared_y_axes.joined(ax1, ax3) + assert not ax3._shared_y_axes.joined(ax2, ax3) + + @pytest.mark.slow + def test_axis_share_y_with_by(self): + # GH 15079 + ax1, ax2, ax3 = self.hist_df.plot.hist(column="A", by="C", sharey=True) + + # share y + assert ax1._shared_y_axes.joined(ax1, ax2) + assert ax2._shared_y_axes.joined(ax1, ax2) + assert ax3._shared_y_axes.joined(ax1, ax3) + assert ax3._shared_y_axes.joined(ax2, ax3) + + # don't share x + assert not ax1._shared_x_axes.joined(ax1, ax2) + assert not ax2._shared_x_axes.joined(ax1, ax2) + assert not ax3._shared_x_axes.joined(ax1, ax3) + assert not 
ax3._shared_x_axes.joined(ax2, ax3) + + @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) + def test_figure_shape_hist_with_by(self, figsize): + # GH 15079 + axes = self.hist_df.plot.hist(column="A", by="C", figsize=figsize) + self._check_axes_shape(axes, axes_num=3, figsize=figsize) + + +@td.skip_if_no_mpl +class TestBoxWithBy(TestPlotBase): + def setup_method(self, method): + TestPlotBase.setup_method(self, method) + import matplotlib as mpl + + mpl.rcdefaults() + self.box_df = test_hist_box_with_by_df() + + @pytest.mark.parametrize( + "by, column, titles, xticklabels", + [ + ("C", "A", ["A"], [["a", "b", "c"]]), + ( + ["C", "D"], + "A", + ["A"], + [ + [ + "('a', 'a')", + "('a', 'b')", + "('a', 'c')", + "('b', 'a')", + "('b', 'b')", + "('b', 'c')", + "('c', 'a')", + "('c', 'b')", + "('c', 'c')", + ] + ], + ), + ("C", ["A", "B"], ["A", "B"], [["a", "b", "c"]] * 2), + ( + ["C", "D"], + ["A", "B"], + ["A", "B"], + [ + [ + "('a', 'a')", + "('a', 'b')", + "('a', 'c')", + "('b', 'a')", + "('b', 'b')", + "('b', 'c')", + "('c', 'a')", + "('c', 'b')", + "('c', 'c')", + ] + ] + * 2, + ), + (["C"], None, ["A", "B"], [["a", "b", "c"]] * 2), + ], + ) + def test_box_plot_by_argument(self, by, column, titles, xticklabels): + # GH 15079 + axes = _check_plot_works(self.box_df.plot.box, column=column, by=by) + result_titles = [ax.get_title() for ax in axes] + result_xticklabels = [ + [i.get_text() for i in ax.get_xticklabels()] for ax in axes + ] + + assert result_xticklabels == xticklabels + assert result_titles == titles + + @pytest.mark.slow + @pytest.mark.parametrize( + "by, column, layout, axes_num", + [ + (["C"], "A", (1, 1), 1), + ("C", "A", (1, 1), 1), + ("C", None, (2, 1), 2), + ("C", ["A", "B"], (1, 2), 2), + (["C", "D"], "A", (1, 1), 1), + (["C", "D"], None, (1, 2), 2), + ], + ) + def test_box_plot_layout_with_by(self, by, column, layout, axes_num): + # GH 15079 + axes = _check_plot_works( + self.box_df.plot.box, column=column, by=by, layout=layout + ) + 
self._check_axes_shape(axes, axes_num=axes_num, layout=layout) + + def test_box_plot_invalid_layout_with_by_raises(self): + # GH 15079, test if error is raised when invalid layout is given + + # layout too small for all 3 plots + msg = "larger than required size" + with pytest.raises(ValueError, match=msg): + self.box_df.plot.box(column=["A", "B"], by=["C", "D"], layout=(1, 1)) + + # invalid format for layout + msg = re.escape("Layout must be a tuple of (rows, columns)") + with pytest.raises(ValueError, match=msg): + self.box_df.plot.box(column=["A", "B"], by="C", layout=(1,)) + + msg = "At least one dimension of layout must be positive" + with pytest.raises(ValueError, match=msg): + self.box_df.plot.box(column=["A", "B"], by="C", layout=(-1, -1)) + + @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) + def test_figure_shape_hist_with_by(self, figsize): + # GH 15079 + axes = self.box_df.plot.box(column="A", by="C", figsize=figsize) + self._check_axes_shape(axes, axes_num=1, figsize=figsize) diff --git a/pandas/tests/plotting/test_hist_by.py b/pandas/tests/plotting/test_hist_by.py deleted file mode 100644 index 1e126caf6ef50..0000000000000 --- a/pandas/tests/plotting/test_hist_by.py +++ /dev/null @@ -1,120 +0,0 @@ -import re - -import numpy as np -import pytest - -import pandas.util._test_decorators as td - -from pandas import DataFrame -import pandas._testing as tm -from pandas.tests.plotting.common import ( - TestPlotBase, - _check_plot_works, -) - - -def test_hist_with_by_df(): - np.random.seed(0) - df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) - df["C"] = np.random.choice(["a", "b", "c"], 30) - df["D"] = np.random.choice(["a", "b", "c"], 30) - return df - - -@td.skip_if_no_mpl -class TestDataFrameColor(TestPlotBase): - def setup_method(self, method): - TestPlotBase.setup_method(self, method) - import matplotlib as mpl - - mpl.rcdefaults() - self.hist_df = test_hist_with_by_df() - - @pytest.mark.parametrize("by", ["C", ["C", "D"]]) - 
@pytest.mark.parametrize("column", ["A", ["A", "B"], None]) - def test_hist_plot_by_argument(self, by, column): - # GH 15079 - _check_plot_works(self.hist_df.plot.hist, column=column, by=by) - - @pytest.mark.slow - @pytest.mark.parametrize( - "by, column, layout, axes_num", - [ - (["C"], "A", (2, 2), 3), - ("C", "A", (2, 2), 3), - (["C"], ["A"], (1, 3), 3), - ("C", None, (3, 1), 3), - ("C", ["A", "B"], (3, 1), 3), - (["C", "D"], "A", (9, 1), 9), - (["C", "D"], "A", (3, 3), 9), - (["C", "D"], ["A"], (5, 2), 9), - (["C", "D"], ["A", "B"], (9, 1), 9), - (["C", "D"], None, (9, 1), 9), - (["C", "D"], ["A", "B"], (5, 2), 9), - ], - ) - def test_hist_plot_layout_with_by(self, by, column, layout, axes_num): - # GH 15079 - # _check_plot_works adds an ax so catch warning. see GH #13188 - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - self.hist_df.plot.hist, column=column, by=by, layout=layout - ) - self._check_axes_shape(axes, axes_num=axes_num, layout=layout) - - def test_hist_plot_invalid_layout_with_by_raises(self): - # GH 15079, test if error is raised when invalid layout is given - - # layout too small for all 3 plots - msg = "larger than required size" - with pytest.raises(ValueError, match=msg): - self.hist_df.plot.hist(column=["A", "B"], by="C", layout=(1, 1)) - - # invalid format for layout - msg = re.escape("Layout must be a tuple of (rows, columns)") - with pytest.raises(ValueError, match=msg): - self.hist_df.plot.hist(column=["A", "B"], by="C", layout=(1,)) - - msg = "At least one dimension of layout must be positive" - with pytest.raises(ValueError, match=msg): - self.hist_df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1)) - - @pytest.mark.slow - def test_axis_share_x_with_by(self): - # GH 15079 - ax1, ax2, ax3 = self.hist_df.plot.hist(column="A", by="C", sharex=True) - - # share x - assert ax1._shared_x_axes.joined(ax1, ax2) - assert ax2._shared_x_axes.joined(ax1, ax2) - assert ax3._shared_x_axes.joined(ax1, ax3) - assert 
ax3._shared_x_axes.joined(ax2, ax3) - - # don't share y - assert not ax1._shared_y_axes.joined(ax1, ax2) - assert not ax2._shared_y_axes.joined(ax1, ax2) - assert not ax3._shared_y_axes.joined(ax1, ax3) - assert not ax3._shared_y_axes.joined(ax2, ax3) - - @pytest.mark.slow - def test_axis_share_y_with_by(self): - # GH 15079 - ax1, ax2, ax3 = self.hist_df.plot.hist(column="A", by="C", sharey=True) - - # share y - assert ax1._shared_y_axes.joined(ax1, ax2) - assert ax2._shared_y_axes.joined(ax1, ax2) - assert ax3._shared_y_axes.joined(ax1, ax3) - assert ax3._shared_y_axes.joined(ax2, ax3) - - # don't share x - assert not ax1._shared_x_axes.joined(ax1, ax2) - assert not ax2._shared_x_axes.joined(ax1, ax2) - assert not ax3._shared_x_axes.joined(ax1, ax3) - assert not ax3._shared_x_axes.joined(ax2, ax3) - - @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) - def test_figure_shape_hist_with_by(self, figsize): - # GH 15079 - axes = self.hist_df.plot.hist(column="A", by="C", figsize=figsize) - self._check_axes_shape(axes, axes_num=3, figsize=figsize) From 006588eae3c8301e65d2f19ea813542a705b682c Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Sat, 22 May 2021 21:00:16 +0200 Subject: [PATCH 118/142] flake8 --- pandas/tests/plotting/test_hist_box_by.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/plotting/test_hist_box_by.py b/pandas/tests/plotting/test_hist_box_by.py index 43cb02270f338..8ecfd5a467cff 100644 --- a/pandas/tests/plotting/test_hist_box_by.py +++ b/pandas/tests/plotting/test_hist_box_by.py @@ -90,7 +90,9 @@ def test_hist_plot_by_argument(self, by, column, titles, legends): # GH 15079 axes = _check_plot_works(self.hist_df.plot.hist, column=column, by=by) result_titles = [ax.get_title() for ax in axes] - result_legends = [[l.get_text() for l in ax.get_legend().texts] for ax in axes] + result_legends = [ + [legend.get_text() for legend in ax.get_legend().texts] for ax in axes + ] assert result_legends == legends 
assert result_titles == titles @@ -238,7 +240,7 @@ def test_box_plot_by_argument(self, by, column, titles, xticklabels): axes = _check_plot_works(self.box_df.plot.box, column=column, by=by) result_titles = [ax.get_title() for ax in axes] result_xticklabels = [ - [i.get_text() for i in ax.get_xticklabels()] for ax in axes + [label.get_text() for label in ax.get_xticklabels()] for ax in axes ] assert result_xticklabels == xticklabels From 4f0a1dc827d066a1a4373c8685cbe124067fd17f Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Sat, 22 May 2021 21:01:18 +0200 Subject: [PATCH 119/142] move file --- pandas/tests/plotting/{ => frame}/test_hist_box_by.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/plotting/{ => frame}/test_hist_box_by.py (100%) diff --git a/pandas/tests/plotting/test_hist_box_by.py b/pandas/tests/plotting/frame/test_hist_box_by.py similarity index 100% rename from pandas/tests/plotting/test_hist_box_by.py rename to pandas/tests/plotting/frame/test_hist_box_by.py From e1579e227f8cc5966e569fcf391497fd4be2eb28 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Mon, 24 May 2021 11:07:58 +0200 Subject: [PATCH 120/142] pprint label --- pandas/plotting/_matplotlib/boxplot.py | 2 +- .../tests/plotting/frame/test_hist_box_by.py | 36 +++++++++---------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index b3526845604cb..7eac2329f38b8 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -158,7 +158,7 @@ def _make_plot(self): # When `by` is assigned, the ticklabels will become unique grouped # values, instead of label which is used as subtitle in this case. 
if self.by is not None: - label = self.data.columns.levels[0] + label = [pprint_thing(col) for col in self.data.columns.levels[0]] self._set_ticklabels(ax, label) else: y = self.data.values.T diff --git a/pandas/tests/plotting/frame/test_hist_box_by.py b/pandas/tests/plotting/frame/test_hist_box_by.py index 8ecfd5a467cff..11d87fa6bcbd1 100644 --- a/pandas/tests/plotting/frame/test_hist_box_by.py +++ b/pandas/tests/plotting/frame/test_hist_box_by.py @@ -200,15 +200,15 @@ def setup_method(self, method): ["A"], [ [ - "('a', 'a')", - "('a', 'b')", - "('a', 'c')", - "('b', 'a')", - "('b', 'b')", - "('b', 'c')", - "('c', 'a')", - "('c', 'b')", - "('c', 'c')", + "(a, a)", + "(a, b)", + "(a, c)", + "(b, a)", + "(b, b)", + "(b, c)", + "(c, a)", + "(c, b)", + "(c, c)", ] ], ), @@ -219,15 +219,15 @@ def setup_method(self, method): ["A", "B"], [ [ - "('a', 'a')", - "('a', 'b')", - "('a', 'c')", - "('b', 'a')", - "('b', 'b')", - "('b', 'c')", - "('c', 'a')", - "('c', 'b')", - "('c', 'c')", + "(a, a)", + "(a, b)", + "(a, c)", + "(b, a)", + "(b, b)", + "(b, c)", + "(c, a)", + "(c, b)", + "(c, c)", ] ] * 2, From e6e96d384cedb27893beb4deca5bf17b9825b197 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Tue, 29 Jun 2021 17:57:33 +0200 Subject: [PATCH 121/142] parametrize tests --- .../tests/plotting/frame/test_hist_box_by.py | 46 ++++++++----------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/pandas/tests/plotting/frame/test_hist_box_by.py b/pandas/tests/plotting/frame/test_hist_box_by.py index 1383c113bd966..2856ab201ba06 100644 --- a/pandas/tests/plotting/frame/test_hist_box_by.py +++ b/pandas/tests/plotting/frame/test_hist_box_by.py @@ -123,22 +123,19 @@ def test_hist_plot_layout_with_by(self, by, column, layout, axes_num): ) self._check_axes_shape(axes, axes_num=axes_num, layout=layout) - def test_hist_plot_invalid_layout_with_by_raises(self): + @pytest.mark.parametrize( + "msg, by, layout", + [ + ("larger than required size", ["C", "D"], (1, 1)), + 
(re.escape("Layout must be a tuple of (rows, columns)"), "C", (1,)), + ("At least one dimension of layout must be positive", "C", (-1, -1)), + ], + ) + def test_hist_plot_invalid_layout_with_by_raises(self, msg, by, layout): # GH 15079, test if error is raised when invalid layout is given - # layout too small for all 3 plots - msg = "larger than required size" - with pytest.raises(ValueError, match=msg): - self.hist_df.plot.hist(column=["A", "B"], by="C", layout=(1, 1)) - - # invalid format for layout - msg = re.escape("Layout must be a tuple of (rows, columns)") - with pytest.raises(ValueError, match=msg): - self.hist_df.plot.hist(column=["A", "B"], by="C", layout=(1,)) - - msg = "At least one dimension of layout must be positive" with pytest.raises(ValueError, match=msg): - self.hist_df.plot.hist(column=["A", "B"], by="C", layout=(-1, -1)) + self.hist_df.plot.hist(column=["A", "B"], by=by, layout=layout) @pytest.mark.slow def test_axis_share_x_with_by(self): @@ -265,22 +262,19 @@ def test_box_plot_layout_with_by(self, by, column, layout, axes_num): ) self._check_axes_shape(axes, axes_num=axes_num, layout=layout) - def test_box_plot_invalid_layout_with_by_raises(self): + @pytest.mark.parametrize( + "msg, by, layout", + [ + ("larger than required size", ["C", "D"], (1, 1)), + (re.escape("Layout must be a tuple of (rows, columns)"), "C", (1,)), + ("At least one dimension of layout must be positive", "C", (-1, -1)), + ], + ) + def test_box_plot_invalid_layout_with_by_raises(self, msg, by, layout): # GH 15079, test if error is raised when invalid layout is given - # layout too small for all 3 plots - msg = "larger than required size" - with pytest.raises(ValueError, match=msg): - self.box_df.plot.box(column=["A", "B"], by=["C", "D"], layout=(1, 1)) - - # invalid format for layout - msg = re.escape("Layout must be a tuple of (rows, columns)") - with pytest.raises(ValueError, match=msg): - self.box_df.plot.box(column=["A", "B"], by="C", layout=(1,)) - - msg = "At least 
one dimension of layout must be positive" with pytest.raises(ValueError, match=msg): - self.box_df.plot.box(column=["A", "B"], by="C", layout=(-1, -1)) + self.box_df.plot.box(column=["A", "B"], by=by, layout=layout) @pytest.mark.parametrize("figsize", [(12, 8), (20, 10)]) def test_figure_shape_hist_with_by(self, figsize): From 52e47f1206c24df40ba625fd9165f24155071cb9 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Tue, 29 Jun 2021 19:20:35 +0200 Subject: [PATCH 122/142] Fix test --- pandas/plotting/_matplotlib/core.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 85e96195ba56d..517660cbd25bf 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -135,6 +135,11 @@ def __init__( self.data = data self.by = com.maybe_make_list(by) + # For `hist` plot, need to get grouped original data before `self.data` is + # updated later + if self.by and self._kind == "hist": + self._grouped = data.groupby(self.by) + # Assign the rest of columns into self.columns if by is explicitly defined # while column is not, so as to keep the same behaviour with current df.hist # or df.boxplot. 
@@ -296,7 +301,7 @@ def nseries(self) -> int: if self.data.ndim == 1: return 1 elif self.by and self._kind == "hist": - return len(self.data.groupby(self.by)) + return len(self._grouped) elif self.by and self._kind == "box": return len(self.columns) else: From bc2f2821d414aec21c5283de3f7f1f8bb8fa4a44 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 20:22:57 +0200 Subject: [PATCH 123/142] Code changes based on Marc reviews --- doc/source/whatsnew/v1.3.0.rst | 1 - doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/plotting/_core.py | 4 +++- pandas/plotting/_matplotlib/boxplot.py | 14 ++++++++------ pandas/plotting/_matplotlib/core.py | 16 ++++++++-------- pandas/plotting/_matplotlib/groupby.py | 14 +++++++------- pandas/plotting/_matplotlib/hist.py | 7 +++---- pandas/tests/plotting/frame/test_hist_box_by.py | 6 +++--- 8 files changed, 33 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 2f573b6f16141..60dc7096c9d1e 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -275,7 +275,6 @@ Other enhancements - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`) -- Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`) - Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`) - :meth:`Series.between` can now accept ``left`` or ``right`` as arguments to ``inclusive`` to include only the left or right boundary (:issue:`40245`) - 
:meth:`DataFrame.explode` now supports exploding multiple columns. Its ``column`` argument now also accepts a list of str or tuples for exploding on multiple columns at the same time (:issue:`39240`) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 81545ada63ce5..d652097eca22f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -29,7 +29,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`, :issue:`28373`) - .. --------------------------------------------------------------------------- diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 1ad08ba021392..c8b1984a2f77a 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1237,6 +1237,8 @@ def box(self, by=None, **kwargs): ---------- by : str or sequence Column in the DataFrame to group by. + + .. versionchanged:: 1.4.0 **kwargs Additional keywords are documented in :meth:`DataFrame.plot`. @@ -1279,7 +1281,7 @@ def hist(self, by=None, bins=10, **kwargs): by : str or sequence, optional Column in the DataFrame to group by. - .. versionadded:: 1.3.0 + .. versionadded:: 1.4.0 bins : int, default 10 Number of histogram bins to be used. 
diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 766ce47fbcf6d..25986af0942a7 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -137,19 +137,19 @@ def _make_plot(self): self._return_obj = pd.Series(dtype=object) # Re-create iterated data if `by` is assigned by users - if self.by is None: - data = self.data - else: - data = create_iter_data_given_by(self.data, self._kind) + data = ( + create_iter_data_given_by(self.data, self._kind) + if self.by + else self.data + ) for i, (label, y) in enumerate(self._iter_data(data=data)): ax = self._get_ax(i) kwds = self.kwds.copy() - ticklabels = [pprint_thing(label)] # When by is applied, show title for subplots to know which group it is # just like df.boxplot, and need to apply T on y to provide right input - if self.by is not None: + if self.by: y = y.T ax.set_title(pprint_thing(label)) @@ -158,6 +158,8 @@ def _make_plot(self): ticklabels = [ pprint_thing(col) for col in self.data.columns.levels[0] ] + else: + ticklabels = [pprint_thing(label)] ret, bp = self._plot( ax, y, column_num=i, return_type=self.return_type, **kwds diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 517660cbd25bf..6735191354fbc 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -135,15 +135,10 @@ def __init__( self.data = data self.by = com.maybe_make_list(by) - # For `hist` plot, need to get grouped original data before `self.data` is - # updated later - if self.by and self._kind == "hist": - self._grouped = data.groupby(self.by) - # Assign the rest of columns into self.columns if by is explicitly defined - # while column is not, so as to keep the same behaviour with current df.hist - # or df.boxplot. 
- if self.by and column is None: + # while column is not + # TODO: Might deprecate `column` argument in future PR (#28373) + if column is None: self.columns = [ col for col in data.columns @@ -152,6 +147,11 @@ def __init__( else: self.columns = com.maybe_make_list(column) + # For `hist` plot, need to get grouped original data before `self.data` is + # updated later + if self.by and self._kind == "hist": + self._grouped = data.groupby(self.by) + self.kind = kind self.sort_columns = sort_columns diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 874bf9d7963cc..3d37506907236 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -57,9 +57,10 @@ def create_iter_data_given_by( 'h2': DataFrame({'a': [np.nan, np.nan, 5], 'b': [np.nan, np.nan, 6]})} """ - # For `hist` plot, before transformation, the values in level 0 are - # actual subplot titles, and used for column subselection and iteration; - # For `box` plot, that's values in level 1 + # For `hist` plot, before transformation, the values in level 0 are values + # in groups and subplot titles, and later used for column subselection and + # iteration; For `box` plot, values in level 1 are column names to show, + # and are used for iteration and as subplots titles. 
if kind == "hist": level = 0 elif kind == "box": @@ -74,11 +75,10 @@ def create_iter_data_given_by( # Select sub-columns based on the value of first level of MI assert isinstance(data.columns, MultiIndex) - cols = data.columns.levels[level] - iter_data = { - col: data.loc[:, data.columns.get_level_values(level) == col] for col in cols + return { + col: data.loc[:, data.columns.get_level_values(level) == col] + for col in data.columns.levels[level] } - return iter_data def reconstruct_data_with_by( diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 181cea57fff49..7fdf1977b8089 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -101,10 +101,9 @@ def _make_plot(self): stacking_id = self._get_stacking_id() # Re-create iterated data if `by` is assigned by users - if self.by is None: - data = self.data - else: - data = create_iter_data_given_by(self.data, self._kind) + data = ( + create_iter_data_given_by(self.data, self._kind) if self.by else self.data + ) for i, (label, y) in enumerate(self._iter_data(data=data)): ax = self._get_ax(i) diff --git a/pandas/tests/plotting/frame/test_hist_box_by.py b/pandas/tests/plotting/frame/test_hist_box_by.py index 2856ab201ba06..25380fe3238cf 100644 --- a/pandas/tests/plotting/frame/test_hist_box_by.py +++ b/pandas/tests/plotting/frame/test_hist_box_by.py @@ -13,7 +13,7 @@ ) -def create_hist_box_with_by_df(): +def _create_hist_box_with_by_df(): np.random.seed(0) df = DataFrame(np.random.randn(30, 2), columns=["A", "B"]) df["C"] = np.random.choice(["a", "b", "c"], 30) @@ -28,7 +28,7 @@ def setup_method(self, method): import matplotlib as mpl mpl.rcdefaults() - self.hist_df = create_hist_box_with_by_df() + self.hist_df = _create_hist_box_with_by_df() @pytest.mark.parametrize( "by, column, titles, legends", @@ -185,7 +185,7 @@ def setup_method(self, method): import matplotlib as mpl mpl.rcdefaults() - self.box_df = create_hist_box_with_by_df() + 
self.box_df = _create_hist_box_with_by_df() @pytest.mark.parametrize( "by, column, titles, xticklabels", From ceeb3c5ee04c8ce992a29e97dc33d5c1826e0a33 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 20:24:44 +0200 Subject: [PATCH 124/142] update doc --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index d652097eca22f..8834e90a8b4b2 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -29,7 +29,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`, :issue:`28373`) +- Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`) - .. --------------------------------------------------------------------------- From 4fea841d3c4c536f559f7806c6816e9904d4c919 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 20:32:44 +0200 Subject: [PATCH 125/142] version change --- pandas/plotting/_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index c8b1984a2f77a..1834aa4ac4013 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1238,7 +1238,7 @@ def box(self, by=None, **kwargs): by : str or sequence Column in the DataFrame to group by. - .. versionchanged:: 1.4.0 + .. versionadded:: 1.4.0 **kwargs Additional keywords are documented in :meth:`DataFrame.plot`. 
From 3ea2603913fca84255f12a656be7b8c9b177a6de Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 21:03:01 +0200 Subject: [PATCH 126/142] Use self.by --- pandas/plotting/_matplotlib/core.py | 2 +- pandas/plotting/_matplotlib/groupby.py | 2 +- pandas/plotting/_matplotlib/hist.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 6735191354fbc..df112fa39f72c 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -451,7 +451,7 @@ def _compute_plot_data(self): data = data.to_frame(name=label) # GH15079 reconstruct data if by is defined - if self.by is not None: + if self.by: self.subplots = True data = reconstruct_data_with_by(self.data, by=self.by, cols=self.columns) diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 3d37506907236..a8853a66080cf 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -132,6 +132,6 @@ def reformat_hist_y_given_by( If by is None, input y is 1-d with NaN removed; and if by is not None, groupby will take place and input y is multi-dimensional array. 
""" - if by is not None and len(y.shape) > 1: + if by and len(y.shape) > 1: return np.array([remove_na_arraylike(col) for col in y.T]).T return remove_na_arraylike(y) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 7fdf1977b8089..961b13083293d 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -122,7 +122,7 @@ def _make_plot(self): # the bins is multi-dimension array now and each plot need only 1-d and # when by is applied, label should be columns that are grouped - if self.by is not None: + if self.by: kwds["bins"] = kwds["bins"][i] kwds["label"] = self.columns kwds.pop("color") @@ -139,7 +139,7 @@ def _make_plot(self): artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) # when by is applied, show title for subplots to know which group it is - if self.by is not None: + if self.by: ax.set_title(pprint_thing(label)) self._append_legend_handles_labels(artists[0], label) From 9f4813943ae2dd7adb60a8ade1cf95d786c8e012 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 21:05:31 +0200 Subject: [PATCH 127/142] better code --- pandas/plotting/_matplotlib/hist.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 961b13083293d..28cdf2bd57adb 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -53,12 +53,11 @@ def _args_adjust(self): # calculate bin number separately in different subplots # where subplots are created based on by argument if is_integer(self.bins): - if self.by is None: - self.bins = self._calculate_bins(self.data) - - else: + if self.by: grouped = self.data.groupby(self.by)[self.columns] self.bins = [self._calculate_bins(group) for key, group in grouped] + else: + self.bins = self._calculate_bins(self.data) if is_list_like(self.bottom): self.bottom = np.array(self.bottom) From 
b1094e35ab9a944c526338f0b9077b78600a7465 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 21:07:58 +0200 Subject: [PATCH 128/142] better inline comment --- pandas/plotting/_matplotlib/groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index a8853a66080cf..558ce3575c07a 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -73,7 +73,8 @@ def create_iter_data_given_by( iter_data: Dict[str, FrameOrSeriesUnion] - # Select sub-columns based on the value of first level of MI + # Select sub-columns based on the value of level of MI, and if `by` is + # assigned, data must be a MI DataFrame assert isinstance(data.columns, MultiIndex) return { col: data.loc[:, data.columns.get_level_values(level) == col] From 97bde5959803a1a5541bf413dbf0b9bc16247524 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 21:11:25 +0200 Subject: [PATCH 129/142] code changes based on Marc reviews --- pandas/plotting/_matplotlib/core.py | 6 +++--- pandas/plotting/_matplotlib/groupby.py | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index df112fa39f72c..0a4cbf70245ed 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -138,14 +138,14 @@ def __init__( # Assign the rest of columns into self.columns if by is explicitly defined # while column is not # TODO: Might deprecate `column` argument in future PR (#28373) - if column is None: + if column: + self.columns = com.maybe_make_list(column) + else: self.columns = [ col for col in data.columns if col not in self.by and is_numeric_dtype(data[col]) ] - else: - self.columns = com.maybe_make_list(column) # For `hist` plot, need to get grouped original data before `self.data` is # updated later diff --git 
a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 558ce3575c07a..69bb335445433 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -71,8 +71,6 @@ def create_iter_data_given_by( f"kind 'hist' and 'box' plots, but used with '{kind}'" ) - iter_data: Dict[str, FrameOrSeriesUnion] - # Select sub-columns based on the value of level of MI, and if `by` is # assigned, data must be a MI DataFrame assert isinstance(data.columns, MultiIndex) @@ -126,7 +124,7 @@ def reconstruct_data_with_by( def reformat_hist_y_given_by( - y: Union[Series, np.ndarray], by: Optional[IndexLabel] = None + y: Union[Series, np.ndarray], by: IndexLabel | None = None ) -> Union[Series, np.ndarray]: """Internal function to reformat y given `by` is applied or not for hist plot. From 444a964f883949da5ea60747374cd2406741063e Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 21:19:21 +0200 Subject: [PATCH 130/142] minor fix --- pandas/plotting/_matplotlib/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 69bb335445433..1af8257c8f8f9 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -1,6 +1,5 @@ from typing import ( Dict, - Optional, Union, ) From b66dad0e0f02462370d45a3c6ce6b61ac055dc55 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 21:50:28 +0200 Subject: [PATCH 131/142] mypy --- pandas/plotting/_matplotlib/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 1af8257c8f8f9..d46a57a935ee7 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -123,7 +123,7 @@ def reconstruct_data_with_by( def reformat_hist_y_given_by( - y: Union[Series, np.ndarray], by: IndexLabel | None = None + y: 
Union[Series, np.ndarray], by: IndexLabel | None ) -> Union[Series, np.ndarray]: """Internal function to reformat y given `by` is applied or not for hist plot. From 982f56237d1b19db078b3213a0a1a84b54f858d9 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 21:57:09 +0200 Subject: [PATCH 132/142] add future annotation --- pandas/plotting/_matplotlib/groupby.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index d46a57a935ee7..24ebaba943416 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import ( Dict, Union, From c76ad67431b073c05abfe3f0a42ea62b71942eb6 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 22:12:00 +0200 Subject: [PATCH 133/142] fix pre commit --- pandas/plotting/_matplotlib/groupby.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 24ebaba943416..0c37f6092b63a 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -1,10 +1,5 @@ from __future__ import annotations -from typing import ( - Dict, - Union, -) - import numpy as np from pandas._typing import ( @@ -24,7 +19,7 @@ def create_iter_data_given_by( data: DataFrame, kind: str = "hist" -) -> Dict[str, FrameOrSeriesUnion]: +) -> dict[str, FrameOrSeriesUnion]: """ Create data for iteration given `by` is assigned or not, and it is only used in both hist and boxplot. @@ -125,8 +120,8 @@ def reconstruct_data_with_by( def reformat_hist_y_given_by( - y: Union[Series, np.ndarray], by: IndexLabel | None -) -> Union[Series, np.ndarray]: + y: Series | np.ndarray, by: IndexLabel | None +) -> Series | np.ndarray: """Internal function to reformat y given `by` is applied or not for hist plot. 
If by is None, input y is 1-d with NaN removed; and if by is not None, groupby From 2c1aa33876701059b9a9e873e170ff7e81d61265 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 22:36:49 +0200 Subject: [PATCH 134/142] minor experimental fix --- pandas/plotting/_matplotlib/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 0a4cbf70245ed..b34f88d975abd 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -144,7 +144,7 @@ def __init__( self.columns = [ col for col in data.columns - if col not in self.by and is_numeric_dtype(data[col]) + if self.by and col not in self.by and is_numeric_dtype(data[col]) ] # For `hist` plot, need to get grouped original data before `self.data` is From 68965463e572c070213082d40e38c479f79579d5 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Wed, 30 Jun 2021 22:42:35 +0200 Subject: [PATCH 135/142] better doc string --- pandas/plotting/_matplotlib/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index b34f88d975abd..4949c765dd398 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -136,7 +136,7 @@ def __init__( self.by = com.maybe_make_list(by) # Assign the rest of columns into self.columns if by is explicitly defined - # while column is not + # while column is not, only need `columns` in hist/box plot. 
# TODO: Might deprecate `column` argument in future PR (#28373) if column: self.columns = com.maybe_make_list(column) From 3c5430249346006683d068428d00dc6d2318b339 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Thu, 1 Jul 2021 07:19:25 +0200 Subject: [PATCH 136/142] fixup doc fail --- pandas/plotting/_matplotlib/core.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 4949c765dd398..20701f92a690d 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -39,6 +39,7 @@ ) import pandas.core.common as com +from pandas.core.frame import DataFrame from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0 @@ -136,16 +137,17 @@ def __init__( self.by = com.maybe_make_list(by) # Assign the rest of columns into self.columns if by is explicitly defined - # while column is not, only need `columns` in hist/box plot. 
+ # while column is not, only need `columns` in hist/box plot when it's DF # TODO: Might deprecate `column` argument in future PR (#28373) - if column: - self.columns = com.maybe_make_list(column) - else: - self.columns = [ - col - for col in data.columns - if self.by and col not in self.by and is_numeric_dtype(data[col]) - ] + if isinstance(data, DataFrame): + if column: + self.columns = com.maybe_make_list(column) + else: + self.columns = [ + col + for col in data.columns + if self.by and col not in self.by and is_numeric_dtype(data[col]) + ] # For `hist` plot, need to get grouped original data before `self.data` is # updated later From 2d20178acebe9854ef9d4b80ecca397476a36a1b Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Thu, 1 Jul 2021 10:26:27 +0200 Subject: [PATCH 137/142] code change on Macro reviews --- pandas/plotting/_matplotlib/core.py | 18 +++++-- .../tests/plotting/frame/test_hist_box_by.py | 50 +++++++++++++++++++ 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 20701f92a690d..973a127f0c801 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -143,11 +143,16 @@ def __init__( if column: self.columns = com.maybe_make_list(column) else: - self.columns = [ - col - for col in data.columns - if self.by and col not in self.by and is_numeric_dtype(data[col]) - ] + if self.by: + self.columns = [ + col + for col in data.columns + if col not in self.by and is_numeric_dtype(data[col]) + ] + else: + self.columns = [ + col for col in data.columns if is_numeric_dtype(data[col]) + ] # For `hist` plot, need to get grouped original data before `self.data` is # updated later @@ -451,6 +456,9 @@ def _compute_plot_data(self): if label is None and data.name is None: label = "None" data = data.to_frame(name=label) + else: + cols = self.columns if self.by is None else self.columns + self.by + data = data.loc[:, cols] # GH15079 reconstruct 
data if by is defined if self.by: diff --git a/pandas/tests/plotting/frame/test_hist_box_by.py b/pandas/tests/plotting/frame/test_hist_box_by.py index 25380fe3238cf..a3014a9b3b5bb 100644 --- a/pandas/tests/plotting/frame/test_hist_box_by.py +++ b/pandas/tests/plotting/frame/test_hist_box_by.py @@ -97,6 +97,31 @@ def test_hist_plot_by_argument(self, by, column, titles, legends): assert result_legends == legends assert result_titles == titles + @pytest.mark.parametrize( + "by, column, legends, title", + [ + ([], ["A"], ["A"], None), + (None, "A", ["A"], "hist A"), + ([], ["A", "B"], ["A", "B"], "hist A and B"), + (None, ["A", "B"], ["A", "B"], "hist A and B"), + ], + ) + def test_hist_plot_with_none_empty_list_by(self, by, column, legends, title): + # GH 15079 + axes = _check_plot_works( + self.hist_df.plot.hist, column=column, by=by, title=title + ) + result_titles = axes.get_title() + result_legends = [legend.get_text() for legend in axes.get_legend().texts] + + assert result_legends == legends + + # Should be no title if it is not subplots + if title is None: + assert result_titles == "" + else: + assert result_titles == title + @pytest.mark.slow @pytest.mark.parametrize( "by, column, layout, axes_num", @@ -243,6 +268,31 @@ def test_box_plot_by_argument(self, by, column, titles, xticklabels): assert result_xticklabels == xticklabels assert result_titles == titles + @pytest.mark.parametrize( + "by, column, xticklabels, title", + [ + ([], ["A"], ["A"], None), + (None, "A", ["A"], "box A"), + ([], ["A", "B"], ["A", "B"], "box A and B"), + (None, ["A", "B"], ["A", "B"], "box A and B"), + ], + ) + def test_box_plot_with_none_empty_list_by(self, by, column, xticklabels, title): + # GH 15079 + axes = _check_plot_works( + self.box_df.plot.box, column=column, by=by, title=title + ) + result_titles = axes.get_title() + result_legends = [xtick.get_text() for xtick in axes.get_xticklabels()] + + assert result_legends == xticklabels + + # Should be no title if it is not 
subplots + if title is None: + assert result_titles == "" + else: + assert result_titles == title + @pytest.mark.slow @pytest.mark.parametrize( "by, column, layout, axes_num", From a169dfd3887175fc28cf8801aa6c054cc5e0eacb Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Thu, 1 Jul 2021 10:32:12 +0200 Subject: [PATCH 138/142] Add more tests --- pandas/tests/plotting/frame/test_hist_box_by.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/plotting/frame/test_hist_box_by.py b/pandas/tests/plotting/frame/test_hist_box_by.py index a3014a9b3b5bb..a0e32ee14650e 100644 --- a/pandas/tests/plotting/frame/test_hist_box_by.py +++ b/pandas/tests/plotting/frame/test_hist_box_by.py @@ -103,6 +103,7 @@ def test_hist_plot_by_argument(self, by, column, titles, legends): ([], ["A"], ["A"], None), (None, "A", ["A"], "hist A"), ([], ["A", "B"], ["A", "B"], "hist A and B"), + ([], None, ["A", "B"], "hist A and B"), (None, ["A", "B"], ["A", "B"], "hist A and B"), ], ) @@ -274,6 +275,7 @@ def test_box_plot_by_argument(self, by, column, titles, xticklabels): ([], ["A"], ["A"], None), (None, "A", ["A"], "box A"), ([], ["A", "B"], ["A", "B"], "box A and B"), + ([], None, ["A", "B"], "box A and B"), (None, ["A", "B"], ["A", "B"], "box A and B"), ], ) From d0b56ff1b63e3400b17b4e922b4261e89e72f6dd Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Thu, 1 Jul 2021 11:28:27 +0200 Subject: [PATCH 139/142] fixup --- pandas/plotting/_matplotlib/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 973a127f0c801..64b3c60f308a5 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -456,7 +456,7 @@ def _compute_plot_data(self): if label is None and data.name is None: label = "None" data = data.to_frame(name=label) - else: + elif self._kind in ("hist", "box"): cols = self.columns if self.by is None else self.columns + self.by data = data.loc[:, cols] 
From 143f286326731cd06ec16ff96fac89fe14dafcf0 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Mon, 12 Jul 2021 08:13:32 +0200 Subject: [PATCH 140/142] changes based on Jeff review --- pandas/plotting/_matplotlib/core.py | 6 +- .../tests/plotting/frame/test_hist_box_by.py | 58 ++++++------------- 2 files changed, 21 insertions(+), 43 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 9ab513d4383d2..85b00d4f2852c 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -135,10 +135,10 @@ def __init__( self.data = data - # if users assign an empty list or tuple, treat them as None - # then no group-by will be conducted. + # if users assign an empty list or tuple, raise `ValueError` + # similar to current `df.box` and `df.hist` APIs. if by in ([], ()): - by = None + raise ValueError("No group keys passed!") self.by = com.maybe_make_list(by) # Assign the rest of columns into self.columns if by is explicitly defined diff --git a/pandas/tests/plotting/frame/test_hist_box_by.py b/pandas/tests/plotting/frame/test_hist_box_by.py index e19bbeedbf22b..339120a503411 100644 --- a/pandas/tests/plotting/frame/test_hist_box_by.py +++ b/pandas/tests/plotting/frame/test_hist_box_by.py @@ -135,30 +135,19 @@ def test_hist_plot_by_0(self, by, column, titles, legends): assert result_titles == titles @pytest.mark.parametrize( - "by, column, legends, title", + "by, column", [ - ([], ["A"], ["A"], None), - (None, "A", ["A"], "hist A"), - ([], ["A", "B"], ["A", "B"], "hist A and B"), - ([], None, ["A", "B"], "hist A and B"), - (None, ["A", "B"], ["A", "B"], "hist A and B"), + ([], ["A"]), + ([], ["A", "B"]), + ((), None), + ((), ["A", "B"]), ], ) - def test_hist_plot_with_none_empty_list_by(self, by, column, legends, title): + def test_hist_plot_empty_list_string_tuple_by(self, by, column): # GH 15079 - axes = _check_plot_works( - self.hist_df.plot.hist, column=column, by=by, title=title - ) - 
result_titles = axes.get_title() - result_legends = [legend.get_text() for legend in axes.get_legend().texts] - - assert result_legends == legends - - # Should be no title if it is not subplots - if title is None: - assert result_titles == "" - else: - assert result_titles == title + msg = "No group keys passed" + with pytest.raises(ValueError, match=msg): + axes = _check_plot_works(self.hist_df.plot.hist, column=column, by=by) @pytest.mark.slow @pytest.mark.parametrize( @@ -346,30 +335,19 @@ def test_box_plot_by_0(self, by, column, titles, xticklabels): assert result_titles == titles @pytest.mark.parametrize( - "by, column, xticklabels, title", + "by, column", [ - ([], ["A"], ["A"], None), - (None, "A", ["A"], "box A"), - ([], ["A", "B"], ["A", "B"], "box A and B"), - ([], None, ["A", "B"], "box A and B"), - (None, ["A", "B"], ["A", "B"], "box A and B"), + ([], ["A"]), + ((), "A"), + ([], None), + ((), ["A", "B"]), ], ) - def test_box_plot_with_none_empty_list_by(self, by, column, xticklabels, title): + def test_box_plot_with_none_empty_list_by(self, by, column): # GH 15079 - axes = _check_plot_works( - self.box_df.plot.box, column=column, by=by, title=title - ) - result_titles = axes.get_title() - result_legends = [xtick.get_text() for xtick in axes.get_xticklabels()] - - assert result_legends == xticklabels - - # Should be no title if it is not subplots - if title is None: - assert result_titles == "" - else: - assert result_titles == title + msg = "No group keys passed" + with pytest.raises(ValueError, match=msg): + axes = _check_plot_works(self.box_df.plot.box, column=column, by=by) @pytest.mark.slow @pytest.mark.parametrize( From 283286fe356212c42e45ba1028a4b5c2f6a842b4 Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Mon, 12 Jul 2021 08:14:54 +0200 Subject: [PATCH 141/142] doc --- pandas/plotting/_core.py | 9 +++++++-- pandas/plotting/_matplotlib/groupby.py | 7 +------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/plotting/_core.py 
b/pandas/plotting/_core.py index bd1b8547102b7..e7d1ce869f511 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1238,7 +1238,10 @@ def box(self, by=None, **kwargs): by : str or sequence Column in the DataFrame to group by. - .. versionadded:: 1.4.0 + .. versionchanged:: 1.4.0 + + Previously, `by` was silently ignored and no groupings were made + **kwargs Additional keywords are documented in :meth:`DataFrame.plot`. @@ -1281,7 +1284,9 @@ def hist(self, by=None, bins=10, **kwargs): by : str or sequence, optional Column in the DataFrame to group by. - .. versionadded:: 1.4.0 + .. versionchanged:: 1.4.0 + + Previously, `by` was silently ignored and no groupings were made bins : int, default 10 Number of histogram bins to be used. diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index 1c144d916ec9c..37cc3186fe097 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -59,13 +59,8 @@ def create_iter_data_given_by( # and are used for iteration and as subplots titles. 
if kind == "hist": level = 0 - elif kind == "box": - level = 1 else: - raise ValueError( - f"create_iter_data_given_by can only be used with " - f"kind 'hist' and 'box' plots, but used with '{kind}'" - ) + level = 1 # Select sub-columns based on the value of level of MI, and if `by` is # assigned, data must be a MI DataFrame From f1aeee0d2725656a7f5a929f4dd8c63783b6ed8d Mon Sep 17 00:00:00 2001 From: kaiqi Dong Date: Mon, 12 Jul 2021 08:22:08 +0200 Subject: [PATCH 142/142] fix flake8 --- pandas/tests/plotting/frame/test_hist_box_by.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/plotting/frame/test_hist_box_by.py b/pandas/tests/plotting/frame/test_hist_box_by.py index 339120a503411..ba6d232733762 100644 --- a/pandas/tests/plotting/frame/test_hist_box_by.py +++ b/pandas/tests/plotting/frame/test_hist_box_by.py @@ -147,7 +147,7 @@ def test_hist_plot_empty_list_string_tuple_by(self, by, column): # GH 15079 msg = "No group keys passed" with pytest.raises(ValueError, match=msg): - axes = _check_plot_works(self.hist_df.plot.hist, column=column, by=by) + _check_plot_works(self.hist_df.plot.hist, column=column, by=by) @pytest.mark.slow @pytest.mark.parametrize( @@ -347,7 +347,7 @@ def test_box_plot_with_none_empty_list_by(self, by, column): # GH 15079 msg = "No group keys passed" with pytest.raises(ValueError, match=msg): - axes = _check_plot_works(self.box_df.plot.box, column=column, by=by) + _check_plot_works(self.box_df.plot.box, column=column, by=by) @pytest.mark.slow @pytest.mark.parametrize(