From 7bb4fad21f75e4b00e57651e58c1b18811066b79 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 26 Feb 2023 22:09:37 -0500 Subject: [PATCH 1/6] DEPR: observed=False default in groupby --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/frame.py | 2 +- pandas/core/groupby/groupby.py | 17 +++++++++- pandas/core/indexes/base.py | 6 +++- pandas/core/series.py | 2 +- .../tests/groupby/aggregate/test_aggregate.py | 4 +-- pandas/tests/groupby/test_apply.py | 2 +- pandas/tests/groupby/test_categorical.py | 31 +++++++++++++------ pandas/tests/groupby/test_groupby.py | 4 +-- pandas/tests/groupby/test_groupby_dropna.py | 6 ++-- pandas/tests/groupby/test_min_max.py | 2 +- pandas/tests/groupby/test_rank.py | 4 +-- pandas/tests/groupby/test_size.py | 2 +- .../tests/groupby/transform/test_transform.py | 2 +- 14 files changed, 58 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 45b5c16415f9d..ef6dc69e40e2d 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -93,7 +93,7 @@ Other API changes Deprecations ~~~~~~~~~~~~ - Deprecated accepting slices in :meth:`DataFrame.take`, call ``obj[slicer]`` or pass a sequence of integers instead (:issue:`51539`) -- +- Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`43999`) .. --------------------------------------------------------------------------- .. _whatsnew_210.performance: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 49416cc2d53c0..de5e7c371138d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8216,7 +8216,7 @@ def groupby( as_index: bool = True, sort: bool = True, group_keys: bool = True, - observed: bool = False, + observed: bool | lib.NoDefault = lib.no_default, dropna: bool = True, ) -> DataFrameGroupBy: from pandas.core.groupby.generic import DataFrameGroupBy diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 55e14bc11246b..5e6b5cc21f50b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -68,6 +68,7 @@ class providing the base-class of operations. cache_readonly, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ensure_dtype_can_hold_na from pandas.core.dtypes.common import ( @@ -905,7 +906,7 @@ def __init__( as_index: bool = True, sort: bool = True, group_keys: bool | lib.NoDefault = True, - observed: bool = False, + observed: bool | lib.NoDefault = lib.no_default, dropna: bool = True, ) -> None: self._selection = selection @@ -941,6 +942,18 @@ def __init__( self.grouper = grouper self.exclusions = frozenset(exclusions) if exclusions else frozenset() + if observed is lib.no_default: + if any(ping._passed_categorical for ping in grouper.groupings): + warnings.warn( + "The default of observed=False is deprecated and will be changed " + "to True in a future version of pandas. Pass observed=False to " + "retain current behavior or observed=True to adopt the future " + "default and silence this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + self.observed = False + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) @@ -2125,6 +2138,8 @@ def _value_counts( result_series.index.droplevel(levels), sort=self.sort, dropna=self.dropna, + # GH#43999 - deprecation of observed=False + observed=False, ).transform("sum") result_series /= indexed_group_size diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index acebe8a498f03..3f77ea4fed90a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -712,7 +712,11 @@ def _format_duplicate_message(self) -> DataFrame: duplicates = self[self.duplicated(keep="first")].unique() assert len(duplicates) - out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates] + out = ( + Series(np.arange(len(self))) + .groupby(self, observed=False) + .agg(list)[duplicates] + ) if self._is_multi: # test_format_duplicate_labels_message_multi # error: "Type[Index]" has no attribute "from_tuples" [attr-defined] diff --git a/pandas/core/series.py b/pandas/core/series.py index 06e9611c318cd..fd1fd313fc04f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1970,7 +1970,7 @@ def groupby( as_index: bool = True, sort: bool = True, group_keys: bool = True, - observed: bool = False, + observed: bool | lib.NoDefault = lib.no_default, dropna: bool = True, ) -> SeriesGroupBy: from pandas.core.groupby.generic import SeriesGroupBy diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index d658de4a7d7c3..ac08297912e3c 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1240,7 +1240,7 @@ def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() - result_df = input_df.groupby("cat").agg(grp_col_dict) + result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict) # create expected dataframe cat_index = pd.CategoricalIndex( @@ -1279,7 +1279,7 @@ def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() - result_df = input_df.groupby("cat").agg(grp_col_dict) + result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict) # create expected dataframe cat_index = pd.CategoricalIndex( diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5fa7ed15a01d4..efa6232a668b1 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -880,7 +880,7 @@ def test_apply_multi_level_name(category): df = DataFrame( {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))} ).set_index(["A", "B"]) - result = df.groupby("B").apply(lambda x: x.sum()) + result = df.groupby("B", observed=False).apply(lambda x: x.sum()) tm.assert_frame_equal(result, expected) assert df.index.names == ["A", "B"] diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index fa8df166d56ac..32a711bd659bf 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -739,7 +739,7 @@ def test_categorical_series(series, data): # Group the given series by a series with categorical data type such that group A # takes indices 0 and 3 and group B indices 1 and 2, obtaining the values mapped in # the given data. - groupby = series.groupby(Series(list("ABBA"), dtype="category")) + groupby = series.groupby(Series(list("ABBA"), dtype="category"), observed=False) result = groupby.aggregate(list) expected = Series(data, index=CategoricalIndex(data.keys())) tm.assert_series_equal(result, expected) @@ -1115,7 +1115,7 @@ def test_groupby_multiindex_categorical_datetime(): "values": np.arange(9), } ) - result = df.groupby(["key1", "key2"]).mean() + result = df.groupby(["key1", "key2"], observed=False).mean() idx = MultiIndex.from_product( [ @@ -1291,8 +1291,8 @@ def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): def test_groupby_categorical_series_dataframe_consistent(df_cat): # GH 20416 - expected = df_cat.groupby(["A", "B"])["C"].mean() - result = df_cat.groupby(["A", "B"]).mean()["C"] + expected = df_cat.groupby(["A", "B"], observed=False)["C"].mean() + result = df_cat.groupby(["A", "B"], observed=False).mean()["C"] tm.assert_series_equal(result, expected) @@ -1301,8 +1301,8 @@ def test_groupby_categorical_axis_1(code): # GH 13420 df = DataFrame({"a": [1, 2, 3, 4], "b": [-1, -2, -3, -4], "c": [5, 6, 7, 8]}) cat = Categorical.from_codes(code, categories=list("abc")) - result = df.groupby(cat, axis=1).mean() - expected = df.T.groupby(cat, axis=0).mean().T + result = df.groupby(cat, axis=1, observed=False).mean() + expected = df.T.groupby(cat, axis=0, observed=False).mean().T tm.assert_frame_equal(result, expected) @@ -1472,7 +1472,7 @@ def test_series_groupby_categorical_aggregation_getitem(): df = DataFrame(d) cat = pd.cut(df["foo"], np.linspace(0, 20, 5)) df["range"] = cat - groups = df.groupby(["range", "baz"], as_index=True, sort=True) + groups = df.groupby(["range", "baz"], as_index=True, sort=True, observed=False) result = groups["foo"].agg("mean") expected = groups.agg("mean")["foo"] tm.assert_series_equal(result, expected) @@ -1533,7 +1533,7 @@ def test_read_only_category_no_sort(): {"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))} ) expected = DataFrame(data={"a": [2.0, 6.0]}, index=CategoricalIndex(cats, name="b")) - result = df.groupby("b", sort=False).mean() + result = df.groupby("b", sort=False, observed=False).mean() tm.assert_frame_equal(result, expected) @@ -1577,7 +1577,7 @@ def test_sorted_missing_category_values(): dtype="category", ) - result = df.groupby(["bar", "foo"]).size().unstack() + result = df.groupby(["bar", "foo"], observed=False).size().unstack() tm.assert_frame_equal(result, expected) @@ -1742,7 +1742,7 @@ def test_groupby_categorical_indices_unused_categories(): "col": range(3), } ) - grouped = df.groupby("key", sort=False) + grouped = df.groupby("key", sort=False, observed=False) result = grouped.indices expected = { "b": np.array([0, 1], dtype="intp"), @@ -2007,3 +2007,14 @@ def test_many_categories(as_index, sort, index_kind, ordered): expected = DataFrame({"a": Series(index), "b": data}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("cat_columns", ["a", "b", ["a", "b"]]) +@pytest.mark.parametrize("keys", ["a", "b", ["a", "b"]]) +def test_groupby_default_depr(cat_columns, keys): + df = DataFrame({"a": [1, 1, 2, 3], "b": [4, 5, 6, 7]}) + df[cat_columns] = df[cat_columns].astype("category") + msg = "The default of observed=False is deprecated" + klass = FutureWarning if set(cat_columns) & set(keys) else None + with tm.assert_produces_warning(klass, match=msg): + df.groupby(keys) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e225ff5a0fa43..3da802b1766e7 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1905,7 +1905,7 @@ def test_empty_groupby( df = df.iloc[:0] - gb = df.groupby(keys, group_keys=False, dropna=dropna)[columns] + gb = df.groupby(keys, group_keys=False, dropna=dropna, observed=False)[columns] def get_result(**kwargs): if method == "attr": @@ -2602,7 +2602,7 @@ def test_datetime_categorical_multikey_groupby_indices(): "c": Categorical.from_codes([-1, 0, 1], categories=[0, 1]), } ) - result = df.groupby(["a", "b"]).indices + result = df.groupby(["a", "b"], observed=False).indices expected = { ("a", Timestamp("2018-01-01 00:00:00")): np.array([0]), ("b", Timestamp("2018-02-01 00:00:00")): np.array([1]), diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 31a8e7a7d36ac..1fab736453ea4 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -448,7 +448,7 @@ def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index): "a": [0, 1, 2, 3], } ) - gb = df.groupby("key", dropna=False, sort=False, as_index=as_index) + gb = df.groupby("key", dropna=False, sort=False, as_index=as_index, observed=False) if test_series: gb = gb["a"] result = gb.sum() @@ -666,7 +666,7 @@ def test_categorical_agg(): df = pd.DataFrame( {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))} ) - gb = df.groupby("x", dropna=False) + gb = df.groupby("x", dropna=False, observed=False) result = gb.agg(lambda x: x.sum()) expected = gb.sum() tm.assert_frame_equal(result, expected) @@ -678,7 +678,7 @@ def test_categorical_transform(): df = pd.DataFrame( {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))} ) - gb = df.groupby("x", dropna=False) + gb = df.groupby("x", dropna=False, observed=False) result = gb.transform(lambda x: x.sum()) expected = gb.transform("sum") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index 11f62c5d03c49..8602f8bdb1aa1 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -236,7 +236,7 @@ def test_min_max_nullable_uint64_empty_group(): # don't raise NotImplementedError from libgroupby cat = pd.Categorical([0] * 10, categories=[0, 1]) df = DataFrame({"A": cat, "B": pd.array(np.arange(10, dtype=np.uint64))}) - gb = df.groupby("A") + gb = df.groupby("A", observed=False) res = gb.min() diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index d0b848a567346..becf42ce78798 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -21,11 +21,11 @@ def test_rank_unordered_categorical_typeerror(): msg = "Cannot perform rank with non-ordered Categorical" - gb = ser.groupby(cat) + gb = ser.groupby(cat, observed=False) with pytest.raises(TypeError, match=msg): gb.rank() - gb2 = df.groupby(cat) + gb2 = df.groupby(cat, observed=False) with pytest.raises(TypeError, match=msg): gb2.rank() diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index c0c98562eda68..cb706dc2341dd 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -81,7 +81,7 @@ def test_size_period_index(): def test_size_on_categorical(as_index): df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"]) df["A"] = df["A"].astype("category") - result = df.groupby(["A", "B"], as_index=as_index).size() + result = df.groupby(["A", "B"], as_index=as_index, observed=False).size() expected = DataFrame( [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"] diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 8abcc52db0500..6f7613d5a6958 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1062,7 +1062,7 @@ def test_transform_absent_categories(func): x_cats = range(2) y = [1] df = DataFrame({"x": Categorical(x_vals, x_cats), "y": y}) - result = getattr(df.y.groupby(df.x), func)() + result = getattr(df.y.groupby(df.x, observed=False), func)() expected = df.y tm.assert_series_equal(result, expected) From 612ca05ae67f27ccd5f74951391b392b5bf70481 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 6 Mar 2023 18:48:27 -0500 Subject: [PATCH 2/6] Fixup docs --- doc/source/user_guide/10min.rst | 4 ++-- doc/source/user_guide/advanced.rst | 4 ++-- doc/source/user_guide/categorical.rst | 10 +++++----- doc/source/user_guide/groupby.rst | 2 +- doc/source/whatsnew/v0.15.0.rst | 2 +- doc/source/whatsnew/v0.19.0.rst | 2 +- doc/source/whatsnew/v0.20.0.rst | 4 ++-- doc/source/whatsnew/v0.22.0.rst | 6 +++--- pandas/core/shared_docs.py | 2 +- pandas/plotting/_matplotlib/boxplot.py | 2 +- 10 files changed, 19 insertions(+), 19 deletions(-) diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 6fc53fe09d791..7c98c99fecd5b 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -702,11 +702,11 @@ Sorting is per order in the categories, not lexical order: df.sort_values(by="grade") -Grouping by a categorical column also shows empty categories: +Grouping by a categorical column with ``observed=False`` also shows empty categories: .. ipython:: python - df.groupby("grade").size() + df.groupby("grade", observed=False).size() Plotting diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index ef08d709822d2..3ce54cfebf65a 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -800,8 +800,8 @@ Groupby operations on the index will preserve the index nature as well. .. ipython:: python - df2.groupby(level=0).sum() - df2.groupby(level=0).sum().index + df2.groupby(level=0, observed=True).sum() + df2.groupby(level=0, observed=True).sum().index Reindexing operations will return a resulting index based on the type of the passed indexer. Passing a list will return a plain-old ``Index``; indexing with diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 0b2224fe9bb32..e486235f044f5 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -607,7 +607,7 @@ even if some categories are not present in the data: s = pd.Series(pd.Categorical(["a", "b", "c", "c"], categories=["c", "a", "b", "d"])) s.value_counts() -``DataFrame`` methods like :meth:`DataFrame.sum` also show "unused" categories. +``DataFrame`` methods like :meth:`DataFrame.sum` also show "unused" categories when ``observed=False``. .. ipython:: python @@ -618,9 +618,9 @@ even if some categories are not present in the data: data=[[1, 2, 3], [4, 5, 6]], columns=pd.MultiIndex.from_arrays([["A", "B", "B"], columns]), ).T - df.groupby(level=1).sum() + df.groupby(level=1, observed=False).sum() -Groupby will also show "unused" categories: +Groupby will also show "unused" categories when ``observed=False``: .. ipython:: python @@ -628,7 +628,7 @@ Groupby will also show "unused" categories: ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"] ) df = pd.DataFrame({"cats": cats, "values": [1, 2, 2, 2, 3, 4, 5]}) - df.groupby("cats").mean() + df.groupby("cats", observed=False).mean() cats2 = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) df2 = pd.DataFrame( @@ -638,7 +638,7 @@ Groupby will also show "unused" categories: "values": [1, 2, 3, 4], } ) - df2.groupby(["cats", "B"]).mean() + df2.groupby(["cats", "B"], observed=False).mean() Pivot tables: diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index b5bf7ee25a50f..b64b1814e13f7 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1281,7 +1281,7 @@ can be used as group keys. If so, the order of the levels will be preserved: factor = pd.qcut(data, [0, 0.25, 0.5, 0.75, 1.0]) - data.groupby(factor).mean() + data.groupby(factor, observed=False).mean() .. _groupby.specify: diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index f52253687ecfd..67e91751e9527 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -85,7 +85,7 @@ For full docs, see the :ref:`categorical introduction ` and the "medium", "good", "very good"]) df["grade"] df.sort_values("grade") - df.groupby("grade").size() + df.groupby("grade", observed=False).size() - ``pandas.core.group_agg`` and ``pandas.core.factor_agg`` were removed. As an alternative, construct a dataframe and use ``df.groupby().agg()``. diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index feeb7b5ee30ce..ab17cacd830e5 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1134,7 +1134,7 @@ As a consequence, ``groupby`` and ``set_index`` also preserve categorical dtypes .. ipython:: python df = pd.DataFrame({"A": [0, 1], "B": [10, 11], "C": cat}) - df_grouped = df.groupby(by=["A", "C"]).first() + df_grouped = df.groupby(by=["A", "C"], observed=False).first() df_set_idx = df.set_index(["A", "C"]) **Previous behavior**: diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index b41a469fe0c1f..34a875f59e808 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -289,7 +289,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr .. code-block:: ipython - In [3]: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() + In [3]: df[df.chromosomes != '1'].groupby('chromosomes', observed=False, sort=False).sum() --------------------------------------------------------------------------- ValueError: items in new_categories are not the same as in old categories @@ -297,7 +297,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr .. ipython:: python - df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() + df[df.chromosomes != '1'].groupby('chromosomes', observed=False, sort=False).sum() .. _whatsnew_0200.enhancements.table_schema: diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index ec9769c22e76b..c494b4f286662 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -109,7 +109,7 @@ instead of ``NaN``. In [8]: grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) - In [9]: pd.Series([1, 2]).groupby(grouper).sum() + In [9]: pd.Series([1, 2]).groupby(grouper, observed=False).sum() Out[9]: a 3.0 b NaN @@ -120,14 +120,14 @@ instead of ``NaN``. .. ipython:: python grouper = pd.Categorical(["a", "a"], categories=["a", "b"]) - pd.Series([1, 2]).groupby(grouper).sum() + pd.Series([1, 2]).groupby(grouper, observed=False).sum() To restore the 0.21 behavior of returning ``NaN`` for unobserved groups, use ``min_count>=1``. .. ipython:: python - pd.Series([1, 2]).groupby(grouper).sum(min_count=1) + pd.Series([1, 2]).groupby(grouper, observed=False).sum(min_count=1) Resample ^^^^^^^^ diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 8df13200342c1..7f7f55b133f6f 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -145,7 +145,7 @@ If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. - .. deprecated:: 2.2.0 + .. deprecated:: 2.1.0 The default value will change to True in a future version of pandas. diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index e2f30da1b839c..1c5f122395fb5 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -253,7 +253,7 @@ def _grouped_plot_by_column( return_type=None, **kwargs, ): - grouped = data.groupby(by) + grouped = data.groupby(by, observed=False) if columns is None: if not isinstance(by, (list, tuple)): by = [by] From 32f4003728554ea5779027980d524403dd3cb880 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 26 Feb 2023 22:09:37 -0500 Subject: [PATCH 3/6] DEPR: observed=False default in groupby --- doc/source/user_guide/10min.rst | 4 +-- doc/source/user_guide/advanced.rst | 4 +-- doc/source/user_guide/categorical.rst | 10 +++--- doc/source/user_guide/groupby.rst | 2 +- doc/source/whatsnew/v0.15.0.rst | 2 +- doc/source/whatsnew/v0.19.0.rst | 2 +- doc/source/whatsnew/v0.20.0.rst | 4 +-- doc/source/whatsnew/v0.22.0.rst | 6 ++-- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/frame.py | 2 +- pandas/core/groupby/groupby.py | 17 +++++++++- pandas/core/indexes/base.py | 6 +++- pandas/core/series.py | 2 +- pandas/core/shared_docs.py | 5 +++ pandas/plotting/_matplotlib/boxplot.py | 2 +- .../tests/groupby/aggregate/test_aggregate.py | 4 +-- pandas/tests/groupby/test_apply.py | 2 +- pandas/tests/groupby/test_categorical.py | 32 +++++++++++++------ pandas/tests/groupby/test_groupby.py | 4 +-- pandas/tests/groupby/test_groupby_dropna.py | 6 ++-- pandas/tests/groupby/test_min_max.py | 2 +- pandas/tests/groupby/test_rank.py | 4 +-- pandas/tests/groupby/test_size.py | 2 +- .../tests/groupby/transform/test_transform.py | 2 +- 24 files changed, 82 insertions(+), 46 deletions(-) diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 6fc53fe09d791..7c98c99fecd5b 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -702,11 +702,11 @@ Sorting is per order in the categories, not lexical order: df.sort_values(by="grade") -Grouping by a categorical column also shows empty categories: +Grouping by a categorical column with ``observed=False`` also shows empty categories: .. ipython:: python - df.groupby("grade").size() + df.groupby("grade", observed=False).size() Plotting diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index ef08d709822d2..3ce54cfebf65a 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -800,8 +800,8 @@ Groupby operations on the index will preserve the index nature as well. .. ipython:: python - df2.groupby(level=0).sum() - df2.groupby(level=0).sum().index + df2.groupby(level=0, observed=True).sum() + df2.groupby(level=0, observed=True).sum().index Reindexing operations will return a resulting index based on the type of the passed indexer. Passing a list will return a plain-old ``Index``; indexing with diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 0b2224fe9bb32..e486235f044f5 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -607,7 +607,7 @@ even if some categories are not present in the data: s = pd.Series(pd.Categorical(["a", "b", "c", "c"], categories=["c", "a", "b", "d"])) s.value_counts() -``DataFrame`` methods like :meth:`DataFrame.sum` also show "unused" categories. +``DataFrame`` methods like :meth:`DataFrame.sum` also show "unused" categories when ``observed=False``. .. ipython:: python @@ -618,9 +618,9 @@ even if some categories are not present in the data: data=[[1, 2, 3], [4, 5, 6]], columns=pd.MultiIndex.from_arrays([["A", "B", "B"], columns]), ).T - df.groupby(level=1).sum() + df.groupby(level=1, observed=False).sum() -Groupby will also show "unused" categories: +Groupby will also show "unused" categories when ``observed=False``: .. ipython:: python @@ -628,7 +628,7 @@ Groupby will also show "unused" categories: ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"] ) df = pd.DataFrame({"cats": cats, "values": [1, 2, 2, 2, 3, 4, 5]}) - df.groupby("cats").mean() + df.groupby("cats", observed=False).mean() cats2 = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) df2 = pd.DataFrame( @@ -638,7 +638,7 @@ Groupby will also show "unused" categories: "values": [1, 2, 3, 4], } ) - df2.groupby(["cats", "B"]).mean() + df2.groupby(["cats", "B"], observed=False).mean() Pivot tables: diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index b5bf7ee25a50f..b64b1814e13f7 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1281,7 +1281,7 @@ can be used as group keys. If so, the order of the levels will be preserved: factor = pd.qcut(data, [0, 0.25, 0.5, 0.75, 1.0]) - data.groupby(factor).mean() + data.groupby(factor, observed=False).mean() .. _groupby.specify: diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index f52253687ecfd..67e91751e9527 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -85,7 +85,7 @@ For full docs, see the :ref:`categorical introduction ` and the "medium", "good", "very good"]) df["grade"] df.sort_values("grade") - df.groupby("grade").size() + df.groupby("grade", observed=False).size() - ``pandas.core.group_agg`` and ``pandas.core.factor_agg`` were removed. As an alternative, construct a dataframe and use ``df.groupby().agg()``. diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index feeb7b5ee30ce..ab17cacd830e5 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1134,7 +1134,7 @@ As a consequence, ``groupby`` and ``set_index`` also preserve categorical dtypes .. ipython:: python df = pd.DataFrame({"A": [0, 1], "B": [10, 11], "C": cat}) - df_grouped = df.groupby(by=["A", "C"]).first() + df_grouped = df.groupby(by=["A", "C"], observed=False).first() df_set_idx = df.set_index(["A", "C"]) **Previous behavior**: diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index b41a469fe0c1f..34a875f59e808 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -289,7 +289,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr .. code-block:: ipython - In [3]: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() + In [3]: df[df.chromosomes != '1'].groupby('chromosomes', observed=False, sort=False).sum() --------------------------------------------------------------------------- ValueError: items in new_categories are not the same as in old categories @@ -297,7 +297,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr .. ipython:: python - df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() + df[df.chromosomes != '1'].groupby('chromosomes', observed=False, sort=False).sum() .. _whatsnew_0200.enhancements.table_schema: diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index ec9769c22e76b..c494b4f286662 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -109,7 +109,7 @@ instead of ``NaN``. In [8]: grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) - In [9]: pd.Series([1, 2]).groupby(grouper).sum() + In [9]: pd.Series([1, 2]).groupby(grouper, observed=False).sum() Out[9]: a 3.0 b NaN @@ -120,14 +120,14 @@ instead of ``NaN``. .. ipython:: python grouper = pd.Categorical(["a", "a"], categories=["a", "b"]) - pd.Series([1, 2]).groupby(grouper).sum() + pd.Series([1, 2]).groupby(grouper, observed=False).sum() To restore the 0.21 behavior of returning ``NaN`` for unobserved groups, use ``min_count>=1``. .. ipython:: python - pd.Series([1, 2]).groupby(grouper).sum(min_count=1) + pd.Series([1, 2]).groupby(grouper, observed=False).sum(min_count=1) Resample ^^^^^^^^ diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index cb2ca6d16ec0a..9c62f34b02779 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -93,10 +93,10 @@ Other API changes Deprecations ~~~~~~~~~~~~ - Deprecating pinning ``group.name`` to each group in :meth:`SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`) +- Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`43999`) - Deprecated ``axis=1`` in :meth:`DataFrame.groupby` and in :class:`Grouper` constructor, do ``frame.T.groupby(...)`` instead (:issue:`51203`) - Deprecated passing a :class:`DataFrame` to :meth:`DataFrame.from_records`, use :meth:`DataFrame.set_index` or :meth:`DataFrame.drop` instead (:issue:`51353`) - Deprecated accepting slices in :meth:`DataFrame.take`, call ``obj[slicer]`` or pass a sequence of integers instead (:issue:`51539`) -- .. --------------------------------------------------------------------------- .. _whatsnew_210.performance: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 98acab52e62f0..b997cc5ba4371 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8254,7 +8254,7 @@ def groupby( as_index: bool = True, sort: bool = True, group_keys: bool = True, - observed: bool = False, + observed: bool | lib.NoDefault = lib.no_default, dropna: bool = True, ) -> DataFrameGroupBy: if axis is not lib.no_default: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 457352564f255..537435b326f11 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -68,6 +68,7 @@ class providing the base-class of operations. cache_readonly, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ensure_dtype_can_hold_na from pandas.core.dtypes.common import ( @@ -905,7 +906,7 @@ def __init__( as_index: bool = True, sort: bool = True, group_keys: bool | lib.NoDefault = True, - observed: bool = False, + observed: bool | lib.NoDefault = lib.no_default, dropna: bool = True, ) -> None: self._selection = selection @@ -941,6 +942,18 @@ def __init__( self.grouper = grouper self.exclusions = frozenset(exclusions) if exclusions else frozenset() + if observed is lib.no_default: + if any(ping._passed_categorical for ping in grouper.groupings): + warnings.warn( + "The default of observed=False is deprecated and will be changed " + "to True in a future version of pandas. Pass observed=False to " + "retain current behavior or observed=True to adopt the future " + "default and silence this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + self.observed = False + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) @@ -2125,6 +2138,8 @@ def _value_counts( result_series.index.droplevel(levels), sort=self.sort, dropna=self.dropna, + # GH#43999 - deprecation of observed=False + observed=False, ).transform("sum") result_series /= indexed_group_size diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index acebe8a498f03..3f77ea4fed90a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -712,7 +712,11 @@ def _format_duplicate_message(self) -> DataFrame: duplicates = self[self.duplicated(keep="first")].unique() assert len(duplicates) - out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates] + out = ( + Series(np.arange(len(self))) + .groupby(self, observed=False) + .agg(list)[duplicates] + ) if self._is_multi: # test_format_duplicate_labels_message_multi # error: "Type[Index]" has no attribute "from_tuples" [attr-defined] diff --git a/pandas/core/series.py b/pandas/core/series.py index 95ee3f1af58f1..9f06d7ad6d02c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1975,7 +1975,7 @@ def groupby( as_index: bool = True, sort: bool = True, group_keys: bool = True, - observed: bool = False, + observed: bool | lib.NoDefault = lib.no_default, dropna: bool = True, ) -> SeriesGroupBy: from pandas.core.groupby.generic import SeriesGroupBy diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 184b77c880238..7f7f55b133f6f 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -144,6 +144,11 @@ This only applies if any of the groupers are Categoricals. If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. + + .. deprecated:: 2.1.0 + + The default value will change to True in a future version of pandas. + dropna : bool, default True If True, and if group keys contain NA values, NA values together with row/column will be dropped. diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index e2f30da1b839c..1c5f122395fb5 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -253,7 +253,7 @@ def _grouped_plot_by_column( return_type=None, **kwargs, ): - grouped = data.groupby(by) + grouped = data.groupby(by, observed=False) if columns is None: if not isinstance(by, (list, tuple)): by = [by] diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 14bd466b052bf..205846ad694b2 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1250,7 +1250,7 @@ def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() - result_df = input_df.groupby("cat").agg(grp_col_dict) + result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict) # create expected dataframe cat_index = pd.CategoricalIndex( @@ -1289,7 +1289,7 @@ def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() - result_df = input_df.groupby("cat").agg(grp_col_dict) + result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict) # create expected dataframe cat_index = pd.CategoricalIndex( diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index a7ba1e8e81848..0699b7c1369f2 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -883,7 +883,7 @@ def test_apply_multi_level_name(category): df = DataFrame( {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))} ).set_index(["A", "B"]) - result = df.groupby("B").apply(lambda x: x.sum()) + result = df.groupby("B", observed=False).apply(lambda x: x.sum()) tm.assert_frame_equal(result, expected) assert df.index.names == ["A", "B"] diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index dbbfab14d5c76..e4dd07f790f47 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -739,7 +739,7 @@ def test_categorical_series(series, data): # Group the given series by a series with categorical data type such that group A # takes indices 0 and 3 and group B indices 1 and 2, obtaining the values mapped in # the given data. - groupby = series.groupby(Series(list("ABBA"), dtype="category")) + groupby = series.groupby(Series(list("ABBA"), dtype="category"), observed=False) result = groupby.aggregate(list) expected = Series(data, index=CategoricalIndex(data.keys())) tm.assert_series_equal(result, expected) @@ -1115,7 +1115,7 @@ def test_groupby_multiindex_categorical_datetime(): "values": np.arange(9), } ) - result = df.groupby(["key1", "key2"]).mean() + result = df.groupby(["key1", "key2"], observed=False).mean() idx = MultiIndex.from_product( [ @@ -1291,8 +1291,8 @@ def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): def test_groupby_categorical_series_dataframe_consistent(df_cat): # GH 20416 - expected = df_cat.groupby(["A", "B"])["C"].mean() - result = df_cat.groupby(["A", "B"]).mean()["C"] + expected = df_cat.groupby(["A", "B"], observed=False)["C"].mean() + result = df_cat.groupby(["A", "B"], observed=False).mean()["C"] tm.assert_series_equal(result, expected) @@ -1303,11 +1303,11 @@ def test_groupby_categorical_axis_1(code): cat = Categorical.from_codes(code, categories=list("abc")) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(cat, axis=1) + gb = df.groupby(cat, axis=1, observed=False) result = gb.mean() msg = "The 'axis' keyword in DataFrame.groupby is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - gb2 = df.T.groupby(cat, axis=0) + gb2 = df.T.groupby(cat, axis=0, observed=False) expected = gb2.mean().T tm.assert_frame_equal(result, expected) @@ -1478,7 +1478,7 @@ def test_series_groupby_categorical_aggregation_getitem(): df = DataFrame(d) cat = pd.cut(df["foo"], np.linspace(0, 20, 5)) df["range"] = cat - groups = df.groupby(["range", "baz"], as_index=True, sort=True) + groups = df.groupby(["range", "baz"], as_index=True, sort=True, observed=False) result = groups["foo"].agg("mean") expected = groups.agg("mean")["foo"] tm.assert_series_equal(result, expected) @@ -1539,7 +1539,7 @@ def test_read_only_category_no_sort(): {"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))} ) expected = DataFrame(data={"a": [2.0, 6.0]}, index=CategoricalIndex(cats, name="b")) - result = df.groupby("b", sort=False).mean() + result = df.groupby("b", sort=False, observed=False).mean() tm.assert_frame_equal(result, expected) @@ -1583,7 +1583,7 @@ def test_sorted_missing_category_values(): dtype="category", ) - result = df.groupby(["bar", "foo"]).size().unstack() + result = df.groupby(["bar", "foo"], observed=False).size().unstack() tm.assert_frame_equal(result, expected) @@ -1748,7 +1748,7 @@ def test_groupby_categorical_indices_unused_categories(): "col": range(3), } ) - grouped = df.groupby("key", sort=False) + grouped = df.groupby("key", sort=False, observed=False) result = grouped.indices expected = { "b": np.array([0, 1], dtype="intp"), @@ -2013,3 +2013,15 @@ def test_many_categories(as_index, sort, index_kind, ordered): expected = DataFrame({"a": Series(index), "b": data}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("cat_columns", ["a", "b", ["a", "b"]]) +@pytest.mark.parametrize("keys", ["a", "b", ["a", "b"]]) +def test_groupby_default_depr(cat_columns, keys): + # GH#43999 + df = DataFrame({"a": [1, 1, 2, 3], "b": [4, 5, 6, 7]}) + df[cat_columns] = df[cat_columns].astype("category") + msg = "The default of observed=False is deprecated" + klass = FutureWarning if set(cat_columns) & set(keys) else None + with tm.assert_produces_warning(klass, match=msg): + df.groupby(keys) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ea4bb42fb7ee1..f1dad7a22c789 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1926,7 +1926,7 @@ def test_empty_groupby( df = df.iloc[:0] - gb = df.groupby(keys, group_keys=False, dropna=dropna)[columns] + gb = df.groupby(keys, group_keys=False, dropna=dropna, observed=False)[columns] def get_result(**kwargs): if method == "attr": @@ -2638,7 +2638,7 @@ def test_datetime_categorical_multikey_groupby_indices(): "c": Categorical.from_codes([-1, 0, 1], categories=[0, 1]), } ) - result = df.groupby(["a", "b"]).indices + result = df.groupby(["a", "b"], observed=False).indices expected = { ("a", Timestamp("2018-01-01 00:00:00")): np.array([0]), ("b", Timestamp("2018-02-01 00:00:00")): np.array([1]), diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 31a8e7a7d36ac..1fab736453ea4 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -448,7 +448,7 @@ def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index): "a": [0, 1, 2, 3], } ) - gb = df.groupby("key", dropna=False, sort=False, as_index=as_index) + gb = df.groupby("key", dropna=False, sort=False, as_index=as_index, observed=False) if test_series: gb = gb["a"] result = gb.sum() @@ -666,7 +666,7 @@ def test_categorical_agg(): df = pd.DataFrame( {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))} ) - gb = df.groupby("x", dropna=False) + gb = df.groupby("x", dropna=False, observed=False) result = gb.agg(lambda x: x.sum()) expected = gb.sum() tm.assert_frame_equal(result, expected) @@ -678,7 +678,7 @@ def test_categorical_transform(): df = pd.DataFrame( {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))} ) - gb = df.groupby("x", dropna=False) + gb = df.groupby("x", dropna=False, observed=False) result = gb.transform(lambda x: x.sum()) expected = gb.transform("sum") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index 11f62c5d03c49..8602f8bdb1aa1 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -236,7 +236,7 @@ def test_min_max_nullable_uint64_empty_group(): # don't raise NotImplementedError from libgroupby cat = pd.Categorical([0] * 10, categories=[0, 1]) df = DataFrame({"A": cat, "B": pd.array(np.arange(10, dtype=np.uint64))}) - gb = df.groupby("A") + gb = df.groupby("A", observed=False) res = gb.min() diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 9f42f6ad72591..8c863dc2982ae 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -21,11 +21,11 @@ def test_rank_unordered_categorical_typeerror(): msg = "Cannot perform rank with non-ordered Categorical" - gb = ser.groupby(cat) + gb = ser.groupby(cat, observed=False) with pytest.raises(TypeError, match=msg): gb.rank() - gb2 = df.groupby(cat) + gb2 = df.groupby(cat, observed=False) with pytest.raises(TypeError, match=msg): gb2.rank() diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index e29f87992f8a1..7da6bc8a32013 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -83,7 +83,7 @@ def test_size_period_index(): def test_size_on_categorical(as_index): df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"]) df["A"] = df["A"].astype("category") - result = df.groupby(["A", "B"], as_index=as_index).size() + result = df.groupby(["A", "B"], as_index=as_index, observed=False).size() expected = DataFrame( [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"] diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 27ffeb9247556..d6d0b03a65ebb 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1078,7 +1078,7 @@ def test_transform_absent_categories(func): x_cats = range(2) y = [1] df = DataFrame({"x": Categorical(x_vals, x_cats), "y": y}) - result = getattr(df.y.groupby(df.x), func)() + result = getattr(df.y.groupby(df.x, observed=False), func)() expected = df.y tm.assert_series_equal(result, expected) From 12f93c61b48d855e75f98f0cf02544a2589249b0 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 6 Mar 2023 20:32:00 -0500 Subject: [PATCH 4/6] fixup --- pandas/core/groupby/groupby.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 537435b326f11..27884eb5a6502 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -923,7 +923,6 @@ def __init__( self.keys = keys self.sort = sort self.group_keys = group_keys - self.observed = observed self.dropna = dropna if grouper is None: @@ -933,15 +932,10 @@ def __init__( axis=axis, level=level, sort=sort, - observed=observed, + observed=False if observed is lib.no_default else observed, dropna=self.dropna, ) - self.obj = obj - self.axis = obj._get_axis_number(axis) - self.grouper = grouper - self.exclusions = frozenset(exclusions) if exclusions else frozenset() - if observed is lib.no_default: if any(ping._passed_categorical for ping in grouper.groupings): warnings.warn( @@ -952,7 +946,13 @@ def __init__( FutureWarning, stacklevel=find_stack_level(), ) - self.observed = False + observed = False + self.observed = observed + + self.obj = obj + self.axis = obj._get_axis_number(axis) + self.grouper = grouper + self.exclusions = frozenset(exclusions) if exclusions else frozenset() def __getattr__(self, attr: str): if attr in self._internal_names_set: From de4aecece112acfd08572978a631171f89af2c96 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 15 Mar 2023 23:14:35 -0400 Subject: [PATCH 5/6] Mention defaulting to True --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 9c62f34b02779..35032c2d60644 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -93,7 +93,7 @@ Other API changes Deprecations ~~~~~~~~~~~~ - Deprecating pinning ``group.name`` to each group in :meth:`SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`) -- Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`43999`) +- Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby`; this will default to ``True`` in a future version (:issue:`43999`) - Deprecated ``axis=1`` in :meth:`DataFrame.groupby` and in :class:`Grouper` constructor, do ``frame.T.groupby(...)`` instead (:issue:`51203`) - Deprecated passing a :class:`DataFrame` to :meth:`DataFrame.from_records`, use :meth:`DataFrame.set_index` or :meth:`DataFrame.drop` instead (:issue:`51353`) - Deprecated accepting slices in :meth:`DataFrame.take`, call ``obj[slicer]`` or pass a sequence of integers instead (:issue:`51539`) From 9d12c4e46d6e71884883587b47f765d25353c91d Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 16 Mar 2023 22:12:22 -0400 Subject: [PATCH 6/6] fixup --- pandas/core/groupby/groupby.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a6006cc9c2077..4f40728449d8a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -954,18 +954,6 @@ def __init__( self.grouper = grouper self.exclusions = frozenset(exclusions) if exclusions else frozenset() - if observed is lib.no_default: - if any(ping._passed_categorical for ping in grouper.groupings): - warnings.warn( - "The default of observed=False is deprecated and will be changed " - "to True in a future version of pandas. Pass observed=False to " - "retain current behavior or observed=True to adopt the future " - "default and silence this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) - self.observed = False - def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr)