add proper type when grouping by a Series (#708)

ClementPinard · web-flow · commit d873a4607408 · 2023-06-02T08:30:49.000-04:00
* remove Series from GroupByObjectNonScalar, add new overloads for Series.groupby and DataFrame.groupby

* Add tests for iteration over groupby

* Add dtype in tests

* address PR comments

* Use `bound=` instead of constraints for the ByT and SeriesByT typevars
diff --git a/pandas-stubs/_typing.pyi b/pandas-stubs/_typing.pyi
@@ -407,33 +407,51 @@ Function: TypeAlias = np.ufunc | Callable[..., Any]
 _HashableTa = TypeVar("_HashableTa", bound=Hashable)
 ByT = TypeVar(
     "ByT",
-    str,
-    bytes,
-    datetime.date,
-    datetime.datetime,
-    datetime.timedelta,
-    np.datetime64,
-    np.timedelta64,
-    bool,
-    int,
-    float,
-    complex,
-    Timestamp,
-    Timedelta,
-    Scalar,
-    Period,
-    Interval[int],
-    Interval[float],
-    Interval[Timestamp],
-    Interval[Timedelta],
-    tuple,
+    bound=str
+    | bytes
+    | datetime.date
+    | datetime.datetime
+    | datetime.timedelta
+    | np.datetime64
+    | np.timedelta64
+    | bool
+    | int
+    | float
+    | complex
+    | Timestamp
+    | Timedelta
+    | Scalar
+    | Period
+    | Interval[int]
+    | Interval[float]
+    | Interval[Timestamp]
+    | Interval[Timedelta]
+    | tuple,
+)
+# Use a distinct SeriesByT when grouping by a Series of known dtype.
+# Essentially, the intersection of the Series S1 TypeVar and the ByT TypeVar.
+SeriesByT = TypeVar(
+    "SeriesByT",
+    bound=str
+    | bytes
+    | datetime.date
+    | bool
+    | int
+    | float
+    | complex
+    | Timestamp
+    | Timedelta
+    | Period
+    | Interval[int]
+    | Interval[float]
+    | Interval[Timestamp]
+    | Interval[Timedelta],
 )
 GroupByObjectNonScalar: TypeAlias = (
     tuple
     | list[_HashableTa]
     | Function
     | list[Function]
-    | Series
     | list[Series]
     | np.ndarray
     | list[np.ndarray]
@@ -443,7 +461,7 @@ GroupByObjectNonScalar: TypeAlias = (
     | Grouper
     | list[Grouper]
 )
-GroupByObject: TypeAlias = Scalar | Index | GroupByObjectNonScalar
+GroupByObject: TypeAlias = Scalar | Index | GroupByObjectNonScalar | Series
 
 StataDateFormat: TypeAlias = Literal[
     "tc",
diff --git a/pandas-stubs/core/frame.pyi b/pandas-stubs/core/frame.pyi
@@ -107,6 +107,7 @@ from pandas._typing import (
     ReplaceMethod,
     Scalar,
     ScalarT,
+    SeriesByT,
     SortKind,
     StataDateFormat,
     StorageOptions,
@@ -1087,7 +1088,20 @@ class DataFrame(NDFrame, OpsMixin):
     @overload
     def groupby(
         self,
-        by: CategoricalIndex | Index,
+        by: Series[SeriesByT],
+        axis: Axis = ...,
+        level: Level | None = ...,
+        as_index: _bool = ...,
+        sort: _bool = ...,
+        group_keys: _bool = ...,
+        squeeze: _bool = ...,
+        observed: _bool = ...,
+        dropna: _bool = ...,
+    ) -> DataFrameGroupBy[SeriesByT]: ...
+    @overload
+    def groupby(
+        self,
+        by: CategoricalIndex | Index | Series,
         axis: Axis = ...,
         level: Level | None = ...,
         as_index: _bool = ...,
diff --git a/pandas-stubs/core/series.pyi b/pandas-stubs/core/series.pyi
@@ -130,6 +130,7 @@ from pandas._typing import (
     Renamer,
     ReplaceMethod,
     Scalar,
+    SeriesByT,
     SortKind,
     StrDtypeArg,
     TimedeltaDtypeArg,
@@ -635,7 +636,20 @@ class Series(IndexOpsMixin, NDFrame, Generic[S1]):
     @overload
     def groupby(
         self,
-        by: CategoricalIndex | Index,
+        by: Series[SeriesByT],
+        axis: AxisIndex = ...,
+        level: Level | None = ...,
+        as_index: _bool = ...,
+        sort: _bool = ...,
+        group_keys: _bool = ...,
+        squeeze: _bool = ...,
+        observed: _bool = ...,
+        dropna: _bool = ...,
+    ) -> SeriesGroupBy[S1, SeriesByT]: ...
+    @overload
+    def groupby(
+        self,
+        by: CategoricalIndex | Index | Series,
         axis: AxisIndex = ...,
         level: Level | None = ...,
         as_index: _bool = ...,
diff --git a/tests/test_frame.py b/tests/test_frame.py
@@ -985,6 +985,20 @@ def test_types_groupby_any() -> None:
     )
 
 
+def test_types_groupby_iter() -> None:
+    df = pd.DataFrame(data={"col1": [1, 1, 2], "col2": [3, 4, 5]})
+    series_groupby = pd.Series([True, True, False], dtype=bool)
+    first_group = next(iter(df.groupby(series_groupby)))
+    check(
+        assert_type(first_group[0], bool),
+        bool,
+    )
+    check(
+        assert_type(first_group[1], pd.DataFrame),
+        pd.DataFrame,
+    )
+
+
 def test_types_merge() -> None:
     df = pd.DataFrame(data={"col1": [1, 1, 2], "col2": [3, 4, 5]})
     df2 = pd.DataFrame(data={"col1": [1, 1, 2], "col2": [0, 1, 0]})
diff --git a/tests/test_series.py b/tests/test_series.py
@@ -732,6 +732,17 @@ def test_types_group_by_with_dropna_keyword() -> None:
     s.groupby(level=0).sum()
 
 
+def test_types_groupby_iter() -> None:
+    s = pd.Series([1, 1, 2], dtype=int)
+    series_groupby = pd.Series([True, True, False], dtype=bool)
+    first_group = next(iter(s.groupby(series_groupby)))
+    check(
+        assert_type(first_group[0], bool),
+        bool,
+    )
+    check(assert_type(first_group[1], "pd.Series[int]"), pd.Series, np.integer)
+
+
 def test_types_plot() -> None:
     s = pd.Series([0, 1, 1, 0, -10])
     if TYPE_CHECKING:  # skip pytest