diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index c7cb9705d7cb9..dfae1bff91ac8 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -50,7 +50,7 @@ def group_any_all( val_test: Literal["any", "all"], skipna: bool, ) -> None: ... -def group_add( +def group_sum( out: np.ndarray, # complexfloating_t[:, ::1] counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[complexfloating_t, ndim=2] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index db785bd962f96..06830a1d84c6e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -124,7 +124,7 @@ def group_median_float64( ndarray[intp_t] indexer float64_t* ptr - assert min_count == -1, "'min_count' only used in add and prod" + assert min_count == -1, "'min_count' only used in sum and prod" ngroups = len(counts) N, K = (values).shape @@ -502,7 +502,7 @@ def group_any_all( # ---------------------------------------------------------------------- -# group_add, group_prod, group_var, group_mean, group_ohlc +# group_sum, group_prod, group_var, group_mean, group_ohlc # ---------------------------------------------------------------------- ctypedef fused mean_t: @@ -511,17 +511,17 @@ ctypedef fused mean_t: complex64_t complex128_t -ctypedef fused add_t: +ctypedef fused sum_t: mean_t object @cython.wraparound(False) @cython.boundscheck(False) -def group_add( - add_t[:, ::1] out, +def group_sum( + sum_t[:, ::1] out, int64_t[::1] counts, - ndarray[add_t, ndim=2] values, + ndarray[sum_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=0, bint is_datetimelike=False, @@ -531,8 +531,8 @@ def group_add( """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - add_t val, t, y - add_t[:, ::1] sumx, compensation + sum_t val, t, y + sum_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) @@ -546,7 +546,7 @@ def group_add( N, K = (values).shape - if add_t is object: + if sum_t is 
object: # NB: this does not use 'compensation' like the non-object track does. for i in range(N): lab = labels[i] @@ -588,10 +588,10 @@ def group_add( # not nan # With dt64/td64 values, values have been cast to float64 - # instead if int64 for group_add, but the logic + # instead of int64 for group_sum, but the logic # is otherwise the same as in _treat_as_na if val == val and not ( - add_t is float64_t + sum_t is float64_t and is_datetimelike and val == NPY_NAT ): @@ -677,7 +677,7 @@ def group_var( int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - assert min_count == -1, "'min_count' only used in add and prod" + assert min_count == -1, "'min_count' only used in sum and prod" if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -745,7 +745,7 @@ def group_mean( Array containing unique label for each group, with its ordering matching up to the corresponding record in `values`. min_count : Py_ssize_t - Only used in add and prod. Always -1. + Only used in sum and prod. Always -1. is_datetimelike : bool True if `values` contains datetime-like entries.
mask : ndarray[bool, ndim=2], optional @@ -766,7 +766,7 @@ def group_mean( int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - assert min_count == -1, "'min_count' only used in add and prod" + assert min_count == -1, "'min_count' only used in sum and prod" if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -821,7 +821,7 @@ def group_ohlc( Py_ssize_t i, j, N, K, lab floating val - assert min_count == -1, "'min_count' only used in add and prod" + assert min_count == -1, "'min_count' only used in sum and prod" if len(labels) == 0: return diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9b4991d32692b..06422f8cc5cb0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1338,7 +1338,6 @@ def _resolve_numeric_only( if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): # GH#47500 - how = "sum" if how == "add" else how warnings.warn( f"{type(self).__name__}.{how} called with " f"numeric_only={numeric_only} and dtype {self.obj.dtype}. This will " @@ -1738,9 +1737,8 @@ def _cython_agg_general( kwd_name = "numeric_only" if how in ["any", "all"]: kwd_name = "bool_only" - kernel = "sum" if how == "add" else how raise NotImplementedError( - f"{type(self).__name__}.{kernel} does not implement {kwd_name}." + f"{type(self).__name__}.{how} does not implement {kwd_name}." 
) elif not is_ser: data = data.get_numeric_data(copy=False) @@ -2417,7 +2415,7 @@ def sum( result = self._agg_general( numeric_only=numeric_only, min_count=min_count, - alias="add", + alias="sum", npfunc=np.sum, ) @@ -4341,8 +4339,6 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde def warn_dropping_nuisance_columns_deprecated(cls, how: str, numeric_only) -> None: - if how == "add": - how = "sum" if numeric_only is not lib.no_default and not numeric_only: # numeric_only was specified and falsey but still dropped nuisance columns warnings.warn( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6dc4ccfa8e1ee..283e4a48657c5 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -121,7 +121,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: _CYTHON_FUNCTIONS = { "aggregate": { - "add": "group_add", + "sum": "group_sum", "prod": "group_prod", "min": "group_min", "max": "group_max", @@ -213,7 +213,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: values = ensure_float64(values) elif values.dtype.kind in ["i", "u"]: - if how in ["add", "var", "prod", "mean", "ohlc"] or ( + if how in ["sum", "var", "prod", "mean", "ohlc"] or ( self.kind == "transform" and self.has_dropped_na ): # result may still include NaN, so we have to cast @@ -241,7 +241,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): if isinstance(dtype, CategoricalDtype): # NotImplementedError for methods that can fall back to a # non-cython implementation. - if how in ["add", "prod", "cumsum", "cumprod"]: + if how in ["sum", "prod", "cumsum", "cumprod"]: raise TypeError(f"{dtype} type does not support {how} operations") elif how not in ["rank"]: # only "rank" is implemented in cython @@ -258,7 +258,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): # TODO: same for period_dtype? 
no for these methods with Period # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes - if how in ["add", "prod", "cumsum", "cumprod"]: + if how in ["sum", "prod", "cumsum", "cumprod"]: raise TypeError(f"datetime64 type does not support {how} operations") elif is_timedelta64_dtype(dtype): if how in ["prod", "cumprod"]: @@ -311,7 +311,7 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: """ how = self.how - if how in ["add", "cumsum", "sum", "prod"]: + if how in ["sum", "cumsum", "prod"]: if dtype == np.dtype(bool): return np.dtype(np.int64) elif how in ["mean", "median", "var"]: @@ -567,7 +567,7 @@ def _call_cython_op( result_mask=result_mask, is_datetimelike=is_datetimelike, ) - elif self.how in ["add"]: + elif self.how in ["sum"]: # We support datetimelike func( out=result, @@ -625,7 +625,7 @@ def _call_cython_op( # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here # Casting only needed for float16, bool, datetimelike, - # and self.how in ["add", "prod", "ohlc", "cumprod"] + # and self.how in ["sum", "prod", "ohlc", "cumprod"] res_dtype = self._get_result_dtype(orig_values.dtype) op_result = maybe_downcast_to_dtype(result, res_dtype) else: diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 869ed31b6a2d9..6c5a3ae67c78a 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -166,7 +166,7 @@ def test_cython_fail_agg(): ("mean", np.mean), ("median", np.median), ("var", np.var), - ("add", np.sum), + ("sum", np.sum), ("prod", np.prod), ("min", np.min), ("max", np.max), @@ -214,7 +214,7 @@ def test_cython_agg_empty_buckets_nanops(observed): grps = range(0, 25, 5) # add / sum result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( "sum", alt=None,
numeric_only=True ) intervals = pd.interval_range(0, 20, freq=5, inclusive="right") expected = DataFrame( diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index fbc3b385e5098..970d4f155ecfc 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -61,7 +61,7 @@ def test_custom_grouper(index): # check all cython functions work g.ohlc() # doesn't use _cython_agg_general - funcs = ["add", "mean", "prod", "min", "max", "var"] + funcs = ["sum", "mean", "prod", "min", "max", "var"] for f in funcs: g._cython_agg_general(f, alt=None, numeric_only=True) @@ -69,7 +69,7 @@ def test_custom_grouper(index): g = s.groupby(b) # check all cython functions work g.ohlc() # doesn't use _cython_agg_general - funcs = ["add", "mean", "prod", "min", "max", "var"] + funcs = ["sum", "mean", "prod", "min", "max", "var"] for f in funcs: g._cython_agg_general(f, alt=None, numeric_only=True) @@ -414,7 +414,7 @@ def test_resample_upsampling_picked_but_not_correct(): tm.assert_series_equal(result2, expected) -@pytest.mark.parametrize("f", ["add", "mean", "prod", "min", "max", "var"]) +@pytest.mark.parametrize("f", ["sum", "mean", "prod", "min", "max", "var"]) def test_resample_frame_basic_cy_funcs(f): df = tm.makeTimeDataFrame()