diff --git a/doc/source/whatsnew/v1.3.5.rst b/doc/source/whatsnew/v1.3.5.rst
index dabd9a650f45b..048cd978c4478 100644
--- a/doc/source/whatsnew/v1.3.5.rst
+++ b/doc/source/whatsnew/v1.3.5.rst
@@ -19,6 +19,7 @@ Fixed regressions
 - Fixed regression in creating a :class:`DataFrame` from a timezone-aware :class:`Timestamp` scalar near a Daylight Savings Time transition (:issue:`42505`)
 - Fixed performance regression in :func:`read_csv` (:issue:`44106`)
 - Fixed regression in :meth:`Series.duplicated` and :meth:`Series.drop_duplicates` when Series has :class:`Categorical` dtype with boolean categories (:issue:`44351`)
+- Fixed regression in :meth:`.GroupBy.sum` with ``timedelta64[ns]`` dtype containing ``NaT`` failing to treat that value as NA (:issue:`42659`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
index 0450a3483d346..8eccd0eec8a1c 100644
--- a/pandas/_libs/groupby.pyi
+++ b/pandas/_libs/groupby.pyi
@@ -56,6 +56,7 @@ def group_add(
     values: np.ndarray,  # ndarray[complexfloating_t, ndim=2]
     labels: np.ndarray,  # const intp_t[:]
     min_count: int = ...,
+    datetimelike: bool = ...,
 ) -> None: ...
 def group_prod(
     out: np.ndarray,  # floating[:, ::1]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 078cb8e02e824..c0d1405e92518 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -495,7 +495,8 @@ def group_add(add_t[:, ::1] out,
               int64_t[::1] counts,
               ndarray[add_t, ndim=2] values,
               const intp_t[::1] labels,
-              Py_ssize_t min_count=0) -> None:
+              Py_ssize_t min_count=0,
+              bint datetimelike=False) -> None:
     """
     Only aggregates on axis=0 using Kahan summation
     """
@@ -557,7 +558,14 @@ def group_add(add_t[:, ::1] out,
                 val = values[i, j]
 
                 # not nan
-                if val == val:
+                # With dt64/td64 values, values have been cast to float64
+                # instead of int64 for group_add, but the logic
+                # is otherwise the same as in _treat_as_na
+                if val == val and not (
+                    add_t is float64_t
+                    and datetimelike
+                    and val == NPY_NAT
+                ):
                     nobs[lab, j] += 1
                     y = val - compensation[lab, j]
                     t = sumx[lab, j] + y
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 8223a04883738..7915e107afae6 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -531,6 +531,16 @@ def _call_cython_op(
                     result_mask=result_mask,
                     is_datetimelike=is_datetimelike,
                 )
+            elif self.how in ["add"]:
+                # We support datetimelike
+                func(
+                    result,
+                    counts,
+                    values,
+                    comp_ids,
+                    min_count,
+                    datetimelike=is_datetimelike,
+                )
             else:
                 func(result, counts, values, comp_ids, min_count)
         else:
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index c462db526b36d..39a3e82fc2d98 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -1162,3 +1162,27 @@ def test_mean_on_timedelta():
         pd.to_timedelta([4, 5]), name="time", index=Index(["A", "B"], name="cat")
     )
     tm.assert_series_equal(result, expected)
+
+
+def test_groupby_sum_timedelta_with_nat():
+    # GH#42659
+    df = DataFrame(
+        {
+            "a": [1, 1, 2, 2],
+            "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT],
+        }
+    )
+    td3 = pd.Timedelta(days=3)
+
+    gb = df.groupby("a")
+
+    res = gb.sum()
+    expected = DataFrame({"b": [td3, td3]}, index=Index([1, 2], name="a"))
+    tm.assert_frame_equal(res, expected)
+
+    res = gb["b"].sum()
+    tm.assert_series_equal(res, expected["b"])
+
+    res = gb["b"].sum(min_count=2)
+    expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index)
+    tm.assert_series_equal(res, expected)