From 7e461a18d9f6928132afec6f48ce968b3e989ba6 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Mon, 3 Dec 2018 17:43:52 +0100 Subject: [PATCH 001/106] remove \n from docstring --- pandas/core/arrays/datetimes.py | 26 +++++++++++++------------- pandas/core/arrays/timedeltas.py | 16 ++++++++-------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index cfe3afcf3730a..b3df505d56d78 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -82,7 +82,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -1072,19 +1072,19 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") - year = _field_accessor('year', 'Y', "\n The year of the datetime\n") + year = _field_accessor('year', 'Y', "The year of the datetime") month = _field_accessor('month', 'M', - "\n The month as January=1, December=12 \n") - day = _field_accessor('day', 'D', "\nThe days of the datetime\n") - hour = _field_accessor('hour', 'h', "\nThe hours of the datetime\n") - minute = _field_accessor('minute', 'm', "\nThe minutes of the datetime\n") - second = _field_accessor('second', 's', "\nThe seconds of the datetime\n") + "The month as January=1, December=12") + day = _field_accessor('day', 'D', "The days of the datetime") + hour = _field_accessor('hour', 'h', "The hours of the datetime") + minute = _field_accessor('minute', 'm', "The minutes of the datetime") + second = _field_accessor('second', 's', "The seconds of the datetime") microsecond = _field_accessor('microsecond', 'us', - "\nThe microseconds of the datetime\n") + "The microseconds of the datetime") nanosecond = _field_accessor('nanosecond', 'ns', - "\nThe nanoseconds of the datetime\n") + "The nanoseconds of the datetime") weekofyear = _field_accessor('weekofyear', 'woy', - "\nThe week ordinal of the year\n") + "The week ordinal of the year") week = weekofyear _dayofweek_doc = """ The day of the week with Monday=0, Sunday=6. @@ -1129,12 +1129,12 @@ def date(self): "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0") dayofyear = _field_accessor('dayofyear', 'doy', - "\nThe ordinal day of the year\n") - quarter = _field_accessor('quarter', 'q', "\nThe quarter of the date\n") + "The ordinal day of the year") + quarter = _field_accessor('quarter', 'q', "The quarter of the date") days_in_month = _field_accessor( 'days_in_month', 'dim', - "\nThe number of days in the month\n") + "The number of days in the month") daysinmonth = days_in_month _is_month_doc = """ Indicates whether the date is the {first_or_last} day of the month. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 830283d31a929..4afc9f5483c2a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -59,7 +59,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -684,16 +684,16 @@ def to_pytimedelta(self): return tslibs.ints_to_pytimedelta(self.asi8) days = _field_accessor("days", "days", - "\nNumber of days for each element.\n") + "Number of days for each element.") seconds = _field_accessor("seconds", "seconds", - "\nNumber of seconds (>= 0 and less than 1 day) " - "for each element.\n") + "Number of seconds (>= 0 and less than 1 day) " + "for each element.") microseconds = _field_accessor("microseconds", "microseconds", - "\nNumber of microseconds (>= 0 and less " - "than 1 second) for each element.\n") + "Number of microseconds (>= 0 and less " + "than 1 second) for each element.") nanoseconds = _field_accessor("nanoseconds", "nanoseconds", - "\nNumber of nanoseconds (>= 0 and less " - "than 1 microsecond) for each element.\n") + "Number of nanoseconds (>= 0 and less " + "than 1 microsecond) for each element.") @property def components(self): From cf5c6c3cdfeb05416e8cdb3ebe4d92c6e87046ee Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 20 Oct 2019 19:25:57 +0200 Subject: [PATCH 002/106] Implement agg for DataFrame --- pandas/core/frame.py | 44 ++++++- pandas/core/groupby/generic.py | 212 +------------------------------ pandas/core/groupby/helper.py | 193 ++++++++++++++++++++++++++++ pandas/tests/frame/test_apply.py | 92 ++++++++++++++ 4 files changed, 335 insertions(+), 206 deletions(-) create mode 100644 pandas/core/groupby/helper.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c90bf4ba7151f..535c577c9fbb7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -100,6 +100,11 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs +from pandas.core.groupby.helper import ( + _is_multi_agg_with_relabel, + _maybe_mangle_lambdas, + _normalize_keyword_aggregation, +) from pandas.core.index import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes import base as ibase from pandas.core.indexes.datetimes import DatetimeIndex @@ -6616,9 +6621,22 @@ def _gotitem( **_shared_doc_kwargs ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, func, axis=0, *args, **kwargs): + def aggregate(self, func=None, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) + relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) + if relabeling: + func, indexes, order = _normalize_keyword_aggregation(kwargs) + reordered_indexes = [ + pair[0] for pair in sorted(zip(indexes, order), key=lambda t: t[1]) + ] + kwargs = {} + elif func is None: + # nicer error message + raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") + + func = _maybe_mangle_lambdas(func) + result = None try: result, how = self._aggregate(func, axis=axis, *args, **kwargs) @@ -6626,6 +6644,30 @@ def aggregate(self, func, axis=0, *args, **kwargs): pass if result is None: return self.apply(func, axis=axis, args=args, **kwargs) + + if relabeling: + + # create a function name and index mapping dictionary for each column + func_index_dict = OrderedDict() + idx = 0 + for func_name, funcs in func.items(): + func_index_dict[func_name] = reordered_indexes[idx : idx + len(funcs)] + idx = idx + len(funcs) + + # restructure the result + reordered_result = DataFrame(index=reordered_indexes) + + # when there are more than one column being used in aggregate, the order + # of result will be reversed, and in case the func is not used by other + # columns, there might be NaN values, so separate these two cases + if len(func) > 1: + for k, v in func_index_dict.items(): + reordered_result.loc[v, k] = result[k][::-1].dropna().values + else: + result.index = reordered_indexes + reordered_result = result + + result = reordered_result.reindex(indexes) return result def _aggregate(self, arg, axis=0, *args, **kwargs): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a78857423e7e0..e73983d7c1539 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -7,21 +7,10 @@ """ from collections import OrderedDict, abc, namedtuple import copy -import functools from functools import partial from textwrap import dedent import typing -from typing import ( - Any, - Callable, - FrozenSet, - Hashable, - Iterable, - Sequence, - Tuple, - Type, - Union, -) +from typing import Any, Callable, FrozenSet, Hashable, Iterable, Tuple, Type, Union import warnings import numpy as np @@ -40,10 +29,8 @@ ensure_platform_int, is_bool, is_datetimelike, - is_dict_like, is_integer_dtype, is_interval_dtype, - is_list_like, is_numeric_dtype, is_object_dtype, is_scalar, @@ -63,6 +50,11 @@ _transform_template, groupby, ) +from pandas.core.groupby.helper import ( + _is_multi_agg_with_relabel, + _maybe_mangle_lambdas, + _normalize_keyword_aggregation, +) from pandas.core.index import Index, MultiIndex, _all_indexes_same import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block @@ -870,7 +862,7 @@ def aggregate(self, func=None, *args, **kwargs): raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") func = _maybe_mangle_lambdas(func) - + print("here") result, how = self._aggregate(func, _level=_level, *args, **kwargs) if how is None: return result @@ -1758,196 +1750,6 @@ def groupby_series(obj, col=None): boxplot = boxplot_frame_groupby -def _is_multi_agg_with_relabel(**kwargs): - """ - Check whether kwargs passed to .agg look like multi-agg with relabeling. - - Parameters - ---------- - **kwargs : dict - - Returns - ------- - bool - - Examples - -------- - >>> _is_multi_agg_with_relabel(a='max') - False - >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), - ... a_min=('a', 'min')) - True - >>> _is_multi_agg_with_relabel() - False - """ - return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and kwargs - - -def _normalize_keyword_aggregation(kwargs): - """ - Normalize user-provided "named aggregation" kwargs. - - Transforms from the new ``Dict[str, NamedAgg]`` style kwargs - to the old OrderedDict[str, List[scalar]]]. - - Parameters - ---------- - kwargs : dict - - Returns - ------- - aggspec : dict - The transformed kwargs. - columns : List[str] - The user-provided keys. - col_idx_order : List[int] - List of columns indices. - - Examples - -------- - >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) - (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) - """ - if not PY36: - kwargs = OrderedDict(sorted(kwargs.items())) - - # Normalize the aggregation functions as Dict[column, List[func]], - # process normally, then fixup the names. - # TODO(Py35): When we drop python 3.5, change this to - # defaultdict(list) - # TODO: aggspec type: typing.OrderedDict[str, List[AggScalar]] - # May be hitting https://github.com/python/mypy/issues/5958 - # saying it doesn't have an attribute __name__ - aggspec = OrderedDict() - order = [] - columns, pairs = list(zip(*kwargs.items())) - - for name, (column, aggfunc) in zip(columns, pairs): - if column in aggspec: - aggspec[column].append(aggfunc) - else: - aggspec[column] = [aggfunc] - order.append((column, com.get_callable_name(aggfunc) or aggfunc)) - - # uniquify aggfunc name if duplicated in order list - uniquified_order = _make_unique(order) - - # GH 25719, due to aggspec will change the order of assigned columns in aggregation - # uniquified_aggspec will store uniquified order list and will compare it with order - # based on index - aggspec_order = [ - (column, com.get_callable_name(aggfunc) or aggfunc) - for column, aggfuncs in aggspec.items() - for aggfunc in aggfuncs - ] - uniquified_aggspec = _make_unique(aggspec_order) - - # get the new indice of columns by comparison - col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) - return aggspec, columns, col_idx_order - - -def _make_unique(seq): - """Uniquify aggfunc name of the pairs in the order list - - Examples: - -------- - >>> _make_unique([('a', ''), ('a', ''), ('b', '')]) - [('a', '_0'), ('a', '_1'), ('b', '')] - """ - return [ - (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) - if seq.count(pair) > 1 - else pair - for i, pair in enumerate(seq) - ] - - -# TODO: Can't use, because mypy doesn't like us setting __name__ -# error: "partial[Any]" has no attribute "__name__" -# the type is: -# typing.Sequence[Callable[..., ScalarResult]] -# -> typing.Sequence[Callable[..., ScalarResult]]: - - -def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: - """ - Possibly mangle a list of aggfuncs. - - Parameters - ---------- - aggfuncs : Sequence - - Returns - ------- - mangled: list-like - A new AggSpec sequence, where lambdas have been converted - to have unique names. - - Notes - ----- - If just one aggfunc is passed, the name will not be mangled. - """ - if len(aggfuncs) <= 1: - # don't mangle for .agg([lambda x: .]) - return aggfuncs - i = 0 - mangled_aggfuncs = [] - for aggfunc in aggfuncs: - if com.get_callable_name(aggfunc) == "": - aggfunc = functools.partial(aggfunc) - aggfunc.__name__ = "".format(i) - i += 1 - mangled_aggfuncs.append(aggfunc) - - return mangled_aggfuncs - - -def _maybe_mangle_lambdas(agg_spec: Any) -> Any: - """ - Make new lambdas with unique names. - - Parameters - ---------- - agg_spec : Any - An argument to GroupBy.agg. - Non-dict-like `agg_spec` are pass through as is. - For dict-like `agg_spec` a new spec is returned - with name-mangled lambdas. - - Returns - ------- - mangled : Any - Same type as the input. - - Examples - -------- - >>> _maybe_mangle_lambdas('sum') - 'sum' - - >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP - [, - .f(*args, **kwargs)>] - """ - is_dict = is_dict_like(agg_spec) - if not (is_dict or is_list_like(agg_spec)): - return agg_spec - mangled_aggspec = type(agg_spec)() # dict or OrderdDict - - if is_dict: - for key, aggfuncs in agg_spec.items(): - if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): - mangled_aggfuncs = _managle_lambda_list(aggfuncs) - else: - mangled_aggfuncs = aggfuncs - - mangled_aggspec[key] = mangled_aggfuncs - else: - mangled_aggspec = _managle_lambda_list(agg_spec) - - return mangled_aggspec - - def _recast_datetimelike_result(result: DataFrame) -> DataFrame: """ If we have date/time like in the original, then coerce dates diff --git a/pandas/core/groupby/helper.py b/pandas/core/groupby/helper.py new file mode 100644 index 0000000000000..0507b14993d78 --- /dev/null +++ b/pandas/core/groupby/helper.py @@ -0,0 +1,193 @@ +from collections import OrderedDict +import functools +from typing import Any, Sequence + +from pandas.compat import PY36 + +from pandas.core.dtypes.common import is_dict_like, is_list_like + +import pandas.core.common as com +from pandas.core.index import Index + + +def _is_multi_agg_with_relabel(**kwargs): + """ + Check whether kwargs passed to .agg look like multi-agg with relabeling. + + Parameters + ---------- + **kwargs : dict + + Returns + ------- + bool + + Examples + -------- + >>> _is_multi_agg_with_relabel(a='max') + False + >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), + ... a_min=('a', 'min')) + True + >>> _is_multi_agg_with_relabel() + False + """ + return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and kwargs + + +def _normalize_keyword_aggregation(kwargs): + """ + Normalize user-provided "named aggregation" kwargs. + + Transforms from the new ``Dict[str, NamedAgg]`` style kwargs + to the old OrderedDict[str, List[scalar]]]. + + Parameters + ---------- + kwargs : dict + + Returns + ------- + aggspec : dict + The transformed kwargs. + columns : List[str] + The user-provided keys. + col_idx_order : List[int] + List of columns indices. + + Examples + -------- + >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) + (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) + """ + if not PY36: + kwargs = OrderedDict(sorted(kwargs.items())) + + # Normalize the aggregation functions as Dict[column, List[func]], + # process normally, then fixup the names. + # TODO(Py35): When we drop python 3.5, change this to + # defaultdict(list) + # TODO: aggspec type: typing.OrderedDict[str, List[AggScalar]] + # May be hitting https://github.com/python/mypy/issues/5958 + # saying it doesn't have an attribute __name__ + aggspec = OrderedDict() + order = [] + columns, pairs = list(zip(*kwargs.items())) + + for name, (column, aggfunc) in zip(columns, pairs): + if column in aggspec: + aggspec[column].append(aggfunc) + else: + aggspec[column] = [aggfunc] + order.append((column, com.get_callable_name(aggfunc) or aggfunc)) + + # uniquify aggfunc name if duplicated in order list + uniquified_order = _make_unique(order) + + # GH 25719, due to aggspec will change the order of assigned columns in aggregation + # uniquified_aggspec will store uniquified order list and will compare it with order + # based on index + aggspec_order = [ + (column, com.get_callable_name(aggfunc) or aggfunc) + for column, aggfuncs in aggspec.items() + for aggfunc in aggfuncs + ] + uniquified_aggspec = _make_unique(aggspec_order) + + # get the new indice of columns by comparison + col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) + return aggspec, columns, col_idx_order + + +def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: + """ + Possibly mangle a list of aggfuncs. + + Parameters + ---------- + aggfuncs : Sequence + + Returns + ------- + mangled: list-like + A new AggSpec sequence, where lambdas have been converted + to have unique names. + + Notes + ----- + If just one aggfunc is passed, the name will not be mangled. + """ + if len(aggfuncs) <= 1: + # don't mangle for .agg([lambda x: .]) + return aggfuncs + i = 0 + mangled_aggfuncs = [] + for aggfunc in aggfuncs: + if com.get_callable_name(aggfunc) == "": + aggfunc = functools.partial(aggfunc) + aggfunc.__name__ = "".format(i) + i += 1 + mangled_aggfuncs.append(aggfunc) + + return mangled_aggfuncs + + +def _maybe_mangle_lambdas(agg_spec: Any) -> Any: + """ + Make new lambdas with unique names. + + Parameters + ---------- + agg_spec : Any + An argument to GroupBy.agg. + Non-dict-like `agg_spec` are pass through as is. + For dict-like `agg_spec` a new spec is returned + with name-mangled lambdas. + + Returns + ------- + mangled : Any + Same type as the input. + + Examples + -------- + >>> _maybe_mangle_lambdas('sum') + 'sum' + + >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP + [, + .f(*args, **kwargs)>] + """ + is_dict = is_dict_like(agg_spec) + if not (is_dict or is_list_like(agg_spec)): + return agg_spec + mangled_aggspec = type(agg_spec)() # dict or OrderdDict + + if is_dict: + for key, aggfuncs in agg_spec.items(): + if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): + mangled_aggfuncs = _managle_lambda_list(aggfuncs) + else: + mangled_aggfuncs = aggfuncs + + mangled_aggspec[key] = mangled_aggfuncs + else: + mangled_aggspec = _managle_lambda_list(agg_spec) + + return mangled_aggspec + + +def _make_unique(seq): + """Uniquify aggfunc name of the pairs in the order list + + Examples: + -------- + >>> _make_unique([('a', ''), ('a', ''), ('b', '')]) + [('a', '_0'), ('a', '_1'), ('b', '')] + """ + return [ + (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) + if seq.count(pair) > 1 + else pair + for i, pair in enumerate(seq) + ] diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index fe034504b8161..f1a1635a46606 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1,5 +1,6 @@ from collections import OrderedDict from datetime import datetime +import functools from itertools import chain import operator import warnings @@ -1346,3 +1347,94 @@ def test_frequency_is_original(self, num_cols): df = DataFrame(1, index=index, columns=range(num_cols)) df.apply(lambda x: x) assert index.freq == original.freq + + +class TestDataFrameNamedAggregate: + + # GH 26513 + def test_agg_relabel(self): + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + + # simplest case with one column, one func + result = df.agg(foo=("B", "sum")) + expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"])) + tm.assert_frame_equal(result, expected) + + # test on same column with different methods + result = df.agg(foo=("B", "sum"), bar=("B", "min")) + expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) + tm.assert_frame_equal(result, expected) + + # test on multiple columns with multiple methods + result = df.agg( + foo=("A", "sum"), + bar=("B", "mean"), + cat=("A", "min"), + dat=("B", "max"), + f=("A", "max"), + g=("C", "min"), + ) + expected = pd.DataFrame( + { + "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan], + "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0], + }, + index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]), + ) + tm.assert_frame_equal(result, expected) + + # test on partial, functools or more complex cases + result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) + expected = pd.DataFrame( + {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=("A", min), + bar=("A", np.min), + cat=("B", max), + dat=("C", "min"), + f=("B", np.sum), + ) + expected = pd.DataFrame( + { + "A": [1.0, 1.0, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, 10.0, np.nan, 4.0], + "C": [np.nan, np.nan, np.nan, 3.0, np.nan], + }, + index=pd.Index(["foo", "bar", "cat", "dat", "f"]), + ) + tm.assert_frame_equal(result, expected) + + def test_agg_namedtuple(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + result = df.agg( + foo=pd.NamedAgg("B", "sum"), + bar=pd.NamedAgg("B", min), + cat=pd.NamedAgg(column="B", aggfunc="count"), + fft=pd.NamedAgg("B", aggfunc="max"), + ) + expected = pd.DataFrame( + {"B": [10, 1, 4, 4]}, index=pd.Index(["foo", "bar", "cat", "fft"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=pd.NamedAgg("A", "min"), + bar=pd.NamedAgg(column="B", aggfunc="max"), + cat=pd.NamedAgg(column="A", aggfunc="max"), + ) + expected = pd.DataFrame( + {"A": [1.0, np.nan, 2.0], "B": [np.nan, 4.0, np.nan]}, + index=pd.Index(["foo", "bar", "cat"]), + ) + tm.assert_frame_equal(result, expected) + + def test_agg_raises(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + msg = "Must provide" + + with pytest.raises(TypeError, match=msg): + df.agg() From 4fb74b567c83427a20a4b7d84ca34cb349540cce Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 20 Oct 2019 19:34:00 +0200 Subject: [PATCH 003/106] remove unused import --- pandas/tests/frame/test_apply.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 68aeb5a81d45a..fe8f1d992347d 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1,6 +1,5 @@ from collections import OrderedDict from datetime import datetime -import functools from itertools import chain import operator import warnings From 97209beffd40b83dac8a8adbe769b120b199eb3f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 20 Oct 2019 20:50:33 +0200 Subject: [PATCH 004/106] remove print --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e73983d7c1539..5c1e2e7f3a131 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -862,7 +862,7 @@ def aggregate(self, func=None, *args, **kwargs): raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") func = _maybe_mangle_lambdas(func) - print("here") + result, how = self._aggregate(func, _level=_level, *args, **kwargs) if how is None: return result From ca273ff1aee5ee69a5f0428ebccb961df72c30db Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 20 Oct 2019 21:09:30 +0200 Subject: [PATCH 005/106] fix test --- pandas/tests/groupby/aggregate/test_aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index aa80c461a00e7..93730c8d31dc2 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, compat, concat from pandas.core.base import SpecificationError -from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas +from pandas.core.groupby.helper import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm From 1d2ab15728308d86efb2535864f35f6d6450e095 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 20 Oct 2019 21:34:14 +0200 Subject: [PATCH 006/106] fix typo --- pandas/tests/frame/test_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index fe8f1d992347d..aa40efe25f3c5 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1416,7 +1416,7 @@ def test_agg_namedtuple(self): fft=pd.NamedAgg("B", aggfunc="max"), ) expected = pd.DataFrame( - {"B": [10, 1, 4, 4]}, index=pd.Index(["foo", "bar", "cat", "fft"]) + {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) ) tm.assert_frame_equal(result, expected) From 3ca193c0656bddd19734e3314b76184cef9325bd Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 20 Oct 2019 22:19:35 +0200 Subject: [PATCH 007/106] add keyword agg for series --- pandas/core/series.py | 6 +++++- pandas/tests/frame/test_apply.py | 30 +++++++++++++++--------------- pandas/tests/series/test_apply.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 16 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index ea48b3603623a..1eb237cda73e4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3812,9 +3812,13 @@ def _gotitem(self, key, ndim, subset=None): **_shared_doc_kwargs ) @Appender(generic._shared_docs["aggregate"]) - def aggregate(self, func, axis=0, *args, **kwargs): + def aggregate(self, func=None, axis=0, *args, **kwargs): # Validate the axis parameter self._get_axis_number(axis) + + if func is None: + func = kwargs + result, how = self._aggregate(func, *args, **kwargs) if result is None: diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index aa40efe25f3c5..f79cd876b829d 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1347,6 +1347,20 @@ def test_frequency_is_original(self, num_cols): df.apply(lambda x: x) assert index.freq == original.freq + def test_apply_datetime_tz_issue(self): + # GH 29052 + + timestamps = [ + pd.Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), + pd.Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), + pd.Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), + ] + df = DataFrame(data=[0, 1, 2], index=timestamps) + result = df.apply(lambda x: x.name, axis=1) + expected = pd.Series(index=timestamps, data=timestamps) + + tm.assert_series_equal(result, expected) + class TestDataFrameNamedAggregate: @@ -1426,7 +1440,7 @@ def test_agg_namedtuple(self): cat=pd.NamedAgg(column="A", aggfunc="max"), ) expected = pd.DataFrame( - {"A": [1.0, np.nan, 2.0], "B": [np.nan, 4.0, np.nan]}, + {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]}, index=pd.Index(["foo", "bar", "cat"]), ) tm.assert_frame_equal(result, expected) @@ -1437,17 +1451,3 @@ def test_agg_raises(self): with pytest.raises(TypeError, match=msg): df.agg() - - def test_apply_datetime_tz_issue(self): - # GH 29052 - - timestamps = [ - pd.Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), - pd.Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), - pd.Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), - ] - df = DataFrame(data=[0, 1, 2], index=timestamps) - result = df.apply(lambda x: x.name, axis=1) - expected = pd.Series(index=timestamps, data=timestamps) - - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 65a0822bbc55f..fb32847941726 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -737,3 +737,34 @@ def test_apply_scaler_on_date_time_index_aware_series(self): series = tm.makeTimeSeries(nper=30).tz_localize("UTC") result = pd.Series(series.index).apply(lambda x: 1) assert_series_equal(result, pd.Series(np.ones(30), dtype="int64")) + + +class TestNamedAggregation: + def test_relabel_no_duplicated_method(self): + # this is to test there is no duplicated method used in agg + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) + + result = df["A"].agg(foo="sum") + expected = df["A"].agg({"foo": "sum"}) + tm.assert_series_equal(result, expected) + + result = df.B.agg(foo="min", bar="max") + expected = df.B.agg({"foo": "min", "bar": "max"}) + tm.assert_series_equal(result, expected) + + result = df.B.agg(foo=sum, bar=min, cat="max") + expected = df.B.agg({"foo": sum, "bar": min, "cat": "max"}) + tm.assert_series_equal(result, expected) + + def test_relabel_duplicated_method(self): + # this is to test with nested renaming, duplicated method can be used + # if they are assigned with different new names + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) + + result = df.A.agg(foo="sum", bar="sum") + expected = pd.Series([6, 6], index=pd.Index(["foo", "bar"]), name="A") + tm.assert_series_equal(result, expected) + + result = df.B.agg(foo=min, bar="min") + expected = pd.Series([1, 1], index=pd.Index(["foo", "bar"]), name="B") + tm.assert_series_equal(result, expected) From c8f80ed5493d27dbac95d0b813ed71ce720f71bc Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 20 Oct 2019 22:43:45 +0200 Subject: [PATCH 008/106] fix linting --- pandas/tests/groupby/aggregate/test_aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 93730c8d31dc2..f7aee7c3d5642 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -10,8 +10,8 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, compat, concat from pandas.core.base import SpecificationError -from pandas.core.groupby.helper import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping +from pandas.core.groupby.helper import _make_unique, _maybe_mangle_lambdas import pandas.util.testing as tm From 8c738e91dc488ccf04fa8eb26213525bca07c9d6 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 21 Oct 2019 08:41:55 +0200 Subject: [PATCH 009/106] fix PY35 issue --- pandas/core/frame.py | 3 +-- pandas/core/series.py | 7 ++++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 535c577c9fbb7..9d889e0284849 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6663,11 +6663,10 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): if len(func) > 1: for k, v in func_index_dict.items(): reordered_result.loc[v, k] = result[k][::-1].dropna().values + result = reordered_result.reindex(indexes) else: result.index = reordered_indexes - reordered_result = result - result = reordered_result.reindex(indexes) return result def _aggregate(self, arg, axis=0, *args, **kwargs): diff --git a/pandas/core/series.py b/pandas/core/series.py index 1eb237cda73e4..428b6c35ff1a3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3817,7 +3817,12 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): self._get_axis_number(axis) if func is None: - func = kwargs + # This is due to order issue of dictionary in PY35, e.g. if {"foo" + # : "sum", "bar": "min"}, then it will take "bar" first because it + # b is before f + func = OrderedDict() + for k, v in kwargs.items(): + func[k] = v result, how = self._aggregate(func, *args, **kwargs) if result is None: From d4d9ea419705281071a93bc5da4904064c65d672 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 21 Oct 2019 10:01:12 +0200 Subject: [PATCH 010/106] try to fix py35 order issue --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9d889e0284849..5695749393cfb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6665,7 +6665,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): reordered_result.loc[v, k] = result[k][::-1].dropna().values result = reordered_result.reindex(indexes) else: - result.index = reordered_indexes + result.index = indexes return result From 2a6de2795b84672f544ddd9b8a6549f8906267d5 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 21 Oct 2019 10:42:46 +0200 Subject: [PATCH 011/106] test if fixed --- pandas/core/frame.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5695749393cfb..fd9d2f314c8dc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6627,9 +6627,12 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: func, indexes, order = _normalize_keyword_aggregation(kwargs) - reordered_indexes = [ - pair[0] for pair in sorted(zip(indexes, order), key=lambda t: t[1]) - ] + if len(func) > 1: + reordered_indexes = [ + pair[0] for pair in sorted(zip(indexes, order), key=lambda t: t[1]) + ] + else: + reordered_indexes = list(kwargs.keys()) kwargs = {} elif func is None: # nicer error message From 21e09f9373870e3ec4a0437695e5df8246f1d466 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 21 Oct 2019 12:50:19 +0200 Subject: [PATCH 012/106] test again --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fd9d2f314c8dc..c9fb08fa64071 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6668,7 +6668,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): reordered_result.loc[v, k] = result[k][::-1].dropna().values result = reordered_result.reindex(indexes) else: - result.index = indexes + result.index = reordered_indexes return result From 058a8e982be014c28e007b188188273996d8b312 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 21 Oct 2019 12:59:12 +0200 Subject: [PATCH 013/106] simpler code --- pandas/core/series.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 428b6c35ff1a3..a545450750210 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3820,9 +3820,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # This is due to order issue of dictionary in PY35, e.g. if {"foo" # : "sum", "bar": "min"}, then it will take "bar" first because it # b is before f - func = OrderedDict() - for k, v in kwargs.items(): - func[k] = v + func = OrderedDict(kwargs.items()) result, how = self._aggregate(func, *args, **kwargs) if result is None: From 0da68d8935cfdbdc60b3b3fce1f4602848129f7f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 07:17:21 +0800 Subject: [PATCH 014/106] test py35 --- pandas/core/frame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c9fb08fa64071..0797fa25f095b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6632,6 +6632,8 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): pair[0] for pair in sorted(zip(indexes, order), key=lambda t: t[1]) ] else: + # if drop support to PY35, this could remove + kwargs = OrderedDict(kwargs.items()) reordered_indexes = list(kwargs.keys()) kwargs = {} elif func is None: @@ -6649,7 +6651,6 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): return self.apply(func, axis=axis, args=args, **kwargs) if relabeling: - # create a function name and index mapping dictionary for each column func_index_dict = OrderedDict() idx = 0 From 438398db6d77e520627750fe32fa535389ccf443 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 08:16:58 +0800 Subject: [PATCH 015/106] test PY35 --- pandas/core/frame.py | 15 ++++++--------- pandas/core/groupby/generic.py | 1 - pandas/core/series.py | 4 +++- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d91658657ad37..e2cbbd2c13f31 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6622,15 +6622,12 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: + if not PY36: + kwargs = OrderedDict(sorted(kwargs.items())) func, indexes, order = _normalize_keyword_aggregation(kwargs) - if len(func) > 1: - reordered_indexes = [ - pair[0] for pair in sorted(zip(indexes, order), key=lambda t: t[1]) - ] - else: - # if drop support to PY35, this could remove - kwargs = OrderedDict(kwargs.items()) - reordered_indexes = list(kwargs.keys()) + reordered_indexes = [ + pair[0] for pair in sorted(zip(indexes, order), key=lambda t: t[1]) + ] kwargs = {} elif func is None: # nicer error message @@ -6665,7 +6662,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): reordered_result.loc[v, k] = result[k][::-1].dropna().values result = reordered_result.reindex(indexes) else: - result.index = reordered_indexes + result.index = indexes return result diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 832901d1fb0ba..2e13874065cca 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -17,7 +17,6 @@ Hashable, Iterable, Optional, - Sequence, Tuple, Type, Union, diff --git a/pandas/core/series.py b/pandas/core/series.py index d1a60999b1ede..8d16d6a52267e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3816,7 +3816,9 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # This is due to order issue of dictionary in PY35, e.g. if {"foo" # : "sum", "bar": "min"}, then it will take "bar" first because it # b is before f - func = OrderedDict(kwargs.items()) + if not PY36: + kwargs = OrderedDict(sorted(kwargs.items())) + func = kwargs result, how = self._aggregate(func, *args, **kwargs) if result is None: From d47b790b1346a413882f7d6b900c8ad8d08adaff Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 08:54:51 +0800 Subject: [PATCH 016/106] try to fix py35 --- pandas/core/frame.py | 2 +- pandas/core/series.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e2cbbd2c13f31..c95e170df08e7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6662,7 +6662,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): reordered_result.loc[v, k] = result[k][::-1].dropna().values result = reordered_result.reindex(indexes) else: - result.index = indexes + result.index = reordered_indexes return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 8d16d6a52267e..bf1902d768883 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3819,6 +3819,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): if not PY36: kwargs = OrderedDict(sorted(kwargs.items())) func = kwargs + kwargs = {} result, how = self._aggregate(func, *args, **kwargs) if result is None: From 832b8d925662c28d5140947b9a5a7709cf002906 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 09:56:46 +0800 Subject: [PATCH 017/106] find py35 output --- pandas/core/frame.py | 9 ++++----- pandas/core/series.py | 4 +--- pandas/tests/frame/test_apply.py | 1 + 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c95e170df08e7..5433df56ea85f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6623,7 +6623,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: if not PY36: - kwargs = OrderedDict(sorted(kwargs.items())) + ids = list(sorted(list(kwargs))) func, indexes, order = _normalize_keyword_aggregation(kwargs) reordered_indexes = [ pair[0] for pair in sorted(zip(indexes, order), key=lambda t: t[1]) @@ -6652,17 +6652,16 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): idx = idx + len(funcs) # restructure the result - reordered_result = DataFrame(index=reordered_indexes) - + reordered_result = DataFrame(index=indexes) # when there are more than one column being used in aggregate, the order # of result will be reversed, and in case the func is not used by other # columns, there might be NaN values, so separate these two cases if len(func) > 1: for k, v in func_index_dict.items(): reordered_result.loc[v, k] = result[k][::-1].dropna().values - result = reordered_result.reindex(indexes) + result = reordered_result else: - result.index = reordered_indexes + result.index = ids return result diff --git a/pandas/core/series.py b/pandas/core/series.py index bf1902d768883..3ab4c390522d7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3816,9 +3816,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # This is due to order issue of dictionary in PY35, e.g. if {"foo" # : "sum", "bar": "min"}, then it will take "bar" first because it # b is before f - if not PY36: - kwargs = OrderedDict(sorted(kwargs.items())) - func = kwargs + func = OrderedDict(kwargs.items()) kwargs = {} result, how = self._aggregate(func, *args, **kwargs) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index f79cd876b829d..31efc914ca72f 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1433,6 +1433,7 @@ def test_agg_namedtuple(self): {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) ) tm.assert_frame_equal(result, expected) + assert all(result['B'] == [3, 1, 2, 2]) result = df.agg( foo=pd.NamedAgg("A", "min"), From 5a3b6903e6acc354cfb73452c44f2be2690621e1 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 11:32:02 +0800 Subject: [PATCH 018/106] test py35 --- pandas/core/frame.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5433df56ea85f..9d72002f52ebb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6661,7 +6661,10 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): reordered_result.loc[v, k] = result[k][::-1].dropna().values result = reordered_result else: - result.index = ids + if not PY36: + result.index = ids + else: + result.index = indexes return result From 4fb86f0d951fbe40f9246f3671d083bc3d50b4e1 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 15:27:26 +0800 Subject: [PATCH 019/106] retest py35 --- pandas/core/frame.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9d72002f52ebb..f90947744da83 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6622,8 +6622,6 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: - if not PY36: - ids = list(sorted(list(kwargs))) func, indexes, order = _normalize_keyword_aggregation(kwargs) reordered_indexes = [ pair[0] for pair in sorted(zip(indexes, order), key=lambda t: t[1]) @@ -6644,12 +6642,6 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): return self.apply(func, axis=axis, args=args, **kwargs) if relabeling: - # create a function name and index mapping dictionary for each column - func_index_dict = OrderedDict() - idx = 0 - for func_name, funcs in func.items(): - func_index_dict[func_name] = reordered_indexes[idx : idx + len(funcs)] - idx = idx + len(funcs) # restructure the result reordered_result = DataFrame(index=indexes) @@ -6657,15 +6649,14 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # of result will be reversed, and in case the func is not used by other # columns, there might be NaN values, so separate these two cases if len(func) > 1: - for k, v in func_index_dict.items(): - reordered_result.loc[v, k] = result[k][::-1].dropna().values - result = reordered_result + idx = 0 + for col, funcs in OrderedDict(func.items()): + v = reordered_indexes[idx:, idx + len(funcs)] + reordered_result.loc[v, col] = result[col][::-1].dropna().values + idx = idx + len(funcs) else: - if not PY36: - result.index = ids - else: - result.index = indexes - + reordered_result.iloc[:, 0] = result.values + result = reordered_result return result def _aggregate(self, arg, axis=0, *args, **kwargs): From a1369bfc8fc5657c5fe2476fbdf503475bfb411f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 16:20:40 +0800 Subject: [PATCH 020/106] retest py35 --- pandas/core/frame.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f90947744da83..6a4d3c55ba894 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6643,20 +6643,22 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): if relabeling: - # restructure the result - reordered_result = DataFrame(index=indexes) # when there are more than one column being used in aggregate, the order # of result will be reversed, and in case the func is not used by other # columns, there might be NaN values, so separate these two cases if len(func) > 1: + + # restructure the result + reordered_result = DataFrame(index=indexes) idx = 0 - for col, funcs in OrderedDict(func.items()): - v = reordered_indexes[idx:, idx + len(funcs)] + for col, funcs in func.items(): + v = reordered_indexes[idx: idx + len(funcs)] reordered_result.loc[v, col] = result[col][::-1].dropna().values idx = idx + len(funcs) + result = reordered_result else: - reordered_result.iloc[:, 0] = result.values - result = reordered_result + result.index = indexes + return result def _aggregate(self, arg, axis=0, *args, **kwargs): From ef981a3db0c35d8c8d91aa5c90325256ac739d3a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 16:57:25 +0800 Subject: [PATCH 021/106] try to fix py35 --- pandas/core/frame.py | 19 +++++++++---------- pandas/tests/frame/test_apply.py | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6a4d3c55ba894..433d7fe69c7d0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6646,18 +6646,17 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # when there are more than one column being used in aggregate, the order # of result will be reversed, and in case the func is not used by other # columns, there might be NaN values, so separate these two cases - if len(func) > 1: - # restructure the result - reordered_result = DataFrame(index=indexes) - idx = 0 - for col, funcs in func.items(): - v = reordered_indexes[idx: idx + len(funcs)] + reordered_result = DataFrame(index=indexes) + idx = 0 + for col, funcs in func.items(): + v = reordered_indexes[idx: idx + len(funcs)] + if len(func) > 1: reordered_result.loc[v, col] = result[col][::-1].dropna().values - idx = idx + len(funcs) - result = reordered_result - else: - result.index = indexes + else: + reordered_result.loc[v, col] = result[col].values + idx = idx + len(funcs) + result = reordered_result return result diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 31efc914ca72f..73a978643df3a 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1432,8 +1432,8 @@ def test_agg_namedtuple(self): expected = pd.DataFrame( {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) ) - tm.assert_frame_equal(result, expected) assert all(result['B'] == [3, 1, 2, 2]) + tm.assert_frame_equal(result, expected) result = df.agg( foo=pd.NamedAgg("A", "min"), From 82c89605e27eba73f3002e1921b26b6c40bca63d Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 17:34:17 +0800 Subject: [PATCH 022/106] try to fix py35 --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 433d7fe69c7d0..17eedd4e938dc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6647,7 +6647,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # of result will be reversed, and in case the func is not used by other # columns, there might be NaN values, so separate these two cases - reordered_result = DataFrame(index=indexes) + reordered_result = DataFrame(index=reordered_indexes) idx = 0 for col, funcs in func.items(): v = reordered_indexes[idx: idx + len(funcs)] @@ -6656,7 +6656,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): else: reordered_result.loc[v, col] = result[col].values idx = idx + len(funcs) - result = reordered_result + result = reordered_result.reindex(indexes) return result From c610391b5d8c5f4fff7b375a703f0f31a5c38810 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 18:28:27 +0800 Subject: [PATCH 023/106] try one more time --- pandas/core/frame.py | 8 +++++--- pandas/tests/frame/test_apply.py | 1 - 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 17eedd4e938dc..ca01e597a5a9c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6622,6 +6622,8 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: + if not PY36: + idx_35 = list(OrderedDict(kwargs)) func, indexes, order = _normalize_keyword_aggregation(kwargs) reordered_indexes = [ pair[0] for pair in sorted(zip(indexes, order), key=lambda t: t[1]) @@ -6646,8 +6648,9 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # when there are more than one column being used in aggregate, the order # of result will be reversed, and in case the func is not used by other # columns, there might be NaN values, so separate these two cases - - reordered_result = DataFrame(index=reordered_indexes) + if not PY36: + indexes = idx_35 + reordered_result = DataFrame(index=indexes) idx = 0 for col, funcs in func.items(): v = reordered_indexes[idx: idx + len(funcs)] @@ -6656,7 +6659,6 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): else: reordered_result.loc[v, col] = result[col].values idx = idx + len(funcs) - result = reordered_result.reindex(indexes) return result diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 73a978643df3a..f79cd876b829d 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1432,7 +1432,6 @@ def test_agg_namedtuple(self): expected = pd.DataFrame( {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) ) - assert all(result['B'] == [3, 1, 2, 2]) tm.assert_frame_equal(result, expected) result = df.agg( From 679ba59dccd5a4413826b6c04579c6913985b078 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 19:05:59 +0800 Subject: [PATCH 024/106] fix typo --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ca01e597a5a9c..a176780dd81b1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6659,7 +6659,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): else: reordered_result.loc[v, col] = result[col].values idx = idx + len(funcs) - + result = reordered_result return result def _aggregate(self, arg, axis=0, *args, **kwargs): From 2ee2628d290671469adbb8e2893110478c7dc57b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 19:38:11 +0800 Subject: [PATCH 025/106] py35 --- pandas/core/frame.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a176780dd81b1..972dcc3c89e6c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6622,8 +6622,6 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: - if not PY36: - idx_35 = list(OrderedDict(kwargs)) func, indexes, order = _normalize_keyword_aggregation(kwargs) reordered_indexes = [ pair[0] for pair in sorted(zip(indexes, order), key=lambda t: t[1]) @@ -6648,12 +6646,10 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # when there are more than one column being used in aggregate, the order # of result will be reversed, and in case the func is not used by other # columns, there might be NaN values, so separate these two cases - if not PY36: - indexes = idx_35 reordered_result = DataFrame(index=indexes) idx = 0 for col, funcs in func.items(): - v = reordered_indexes[idx: idx + len(funcs)] + v = reordered_indexes[idx : idx + len(funcs)] if len(func) > 1: reordered_result.loc[v, col] = result[col][::-1].dropna().values else: From 31f7033969c93f47035fdef51e9c52a4dcb6510f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 21:14:17 +0800 Subject: [PATCH 026/106] skip PY35 --- pandas/tests/frame/test_apply.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index f79cd876b829d..57e8b5f61364f 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas.compat import PY36 + from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -1375,7 +1377,10 @@ def test_agg_relabel(self): # test on same column with different methods result = df.agg(foo=("B", "sum"), bar=("B", "min")) - expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) + if not PY36: + expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) + else: + expected = pd.DataFrame({"B": [1, 10]}, index=pd.Index(["bar", "foo"])) tm.assert_frame_equal(result, expected) # test on multiple columns with multiple methods @@ -1429,9 +1434,14 @@ def test_agg_namedtuple(self): cat=pd.NamedAgg(column="B", aggfunc="count"), fft=pd.NamedAgg("B", aggfunc="max"), ) - expected = pd.DataFrame( - {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) - ) + if not PY36: + expected = pd.DataFrame( + {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) + ) + else: + expected = pd.DataFrame( + {"B": [1, 2, 2, 3]}, index=pd.Index(["bar", "cat", "fft", "foo"]) + ) tm.assert_frame_equal(result, expected) result = df.agg( From 2acb2446f93be259138f4cf8c1bfe463e1c6bda9 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 22:00:13 +0800 Subject: [PATCH 027/106] skip py35 --- pandas/tests/frame/test_apply.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 57e8b5f61364f..01222653eac67 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from pandas.compat import PY36 +from pandas.compat import PY35 from pandas.core.dtypes.dtypes import CategoricalDtype @@ -1377,7 +1377,7 @@ def test_agg_relabel(self): # test on same column with different methods result = df.agg(foo=("B", "sum"), bar=("B", "min")) - if not PY36: + if PY36: expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) else: expected = pd.DataFrame({"B": [1, 10]}, index=pd.Index(["bar", "foo"])) @@ -1434,7 +1434,7 @@ def test_agg_namedtuple(self): cat=pd.NamedAgg(column="B", aggfunc="count"), fft=pd.NamedAgg("B", aggfunc="max"), ) - if not PY36: + if PY36: expected = pd.DataFrame( {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) ) From dfbd67acb3970fad99e3c57b0333605945e50e93 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 22:26:03 +0800 Subject: [PATCH 028/106] fix typo --- pandas/tests/frame/test_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 01222653eac67..a3ce017d09392 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from pandas.compat import PY35 +from pandas.compat import PY36 from pandas.core.dtypes.dtypes import CategoricalDtype From ff5e60fe6e0231c26d613f740009b5b9637da23d Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 23 Oct 2019 23:13:26 +0800 Subject: [PATCH 029/106] skip all py35 --- pandas/tests/frame/test_apply.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index a3ce017d09392..e60b6aa67e5cd 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1400,14 +1400,22 @@ def test_agg_relabel(self): }, index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]), ) - tm.assert_frame_equal(result, expected) + if PY36: + tm.assert_frame_equal(result, expected) + else: + with pytest.xfail(reason="PY35"): + tm.assert_frame_equal(result, expected) # test on partial, functools or more complex cases result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) expected = pd.DataFrame( {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) ) - tm.assert_frame_equal(result, expected) + if PY36: + tm.assert_frame_equal(result, expected) + else: + with pytest.xfail(reason="PY35"): + tm.assert_frame_equal(result, expected) result = df.agg( foo=("A", min), @@ -1424,7 +1432,11 @@ def test_agg_relabel(self): }, index=pd.Index(["foo", "bar", "cat", "dat", "f"]), ) - tm.assert_frame_equal(result, expected) + if PY36: + tm.assert_frame_equal(result, expected) + else: + with pytest.xfail(reason="PY35"): + tm.assert_frame_equal(result, expected) def test_agg_namedtuple(self): df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) @@ -1453,7 +1465,11 @@ def test_agg_namedtuple(self): {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]}, index=pd.Index(["foo", "bar", "cat"]), ) - tm.assert_frame_equal(result, expected) + if PY36: + tm.assert_frame_equal(result, expected) + else: + with pytest.xfail(reason="PY35"): + tm.assert_frame_equal(result, expected) def test_agg_raises(self): df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) From 7c6c891b87fab1e1cb5216e5ffa9f6f42bc041d2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 24 Oct 2019 00:10:04 +0800 Subject: [PATCH 030/106] skip py35 for series --- pandas/tests/series/test_apply.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index fb32847941726..c1acebb255a7f 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat import PY36 + import pandas as pd from pandas import DataFrame, Index, Series, isna from pandas.conftest import _get_cython_table_params @@ -763,8 +765,16 @@ def test_relabel_duplicated_method(self): result = df.A.agg(foo="sum", bar="sum") expected = pd.Series([6, 6], index=pd.Index(["foo", "bar"]), name="A") - tm.assert_series_equal(result, expected) + if PY36: + tm.assert_frame_equal(result, expected) + else: + with pytest.xfail(reason="PY35"): + tm.assert_frame_equal(result, expected) result = df.B.agg(foo=min, bar="min") expected = pd.Series([1, 1], index=pd.Index(["foo", "bar"]), name="B") - tm.assert_series_equal(result, expected) + if PY36: + tm.assert_frame_equal(result, expected) + else: + with pytest.xfail(reason="PY35"): + tm.assert_frame_equal(result, expected) From 3e55fcbd49a7c93487100b84a71f4b163b67d95a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 24 Oct 2019 07:00:57 +0800 Subject: [PATCH 031/106] fix test --- pandas/tests/series/test_apply.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index c1acebb255a7f..125c7f78da69a 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -764,7 +764,7 @@ def test_relabel_duplicated_method(self): df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) result = df.A.agg(foo="sum", bar="sum") - expected = pd.Series([6, 6], index=pd.Index(["foo", "bar"]), name="A") + expected = pd.Series([6, 6], index=["foo", "bar"], name="A") if PY36: tm.assert_frame_equal(result, expected) else: @@ -772,7 +772,7 @@ def test_relabel_duplicated_method(self): tm.assert_frame_equal(result, expected) result = df.B.agg(foo=min, bar="min") - expected = pd.Series([1, 1], index=pd.Index(["foo", "bar"]), name="B") + expected = pd.Series([1, 1], index=["foo", "bar"], name="B") if PY36: tm.assert_frame_equal(result, expected) else: From 6d74b2957f8c55989e90877916dbef821a783980 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 24 Oct 2019 07:56:26 +0800 Subject: [PATCH 032/106] skip series py35 --- pandas/tests/series/test_apply.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 125c7f78da69a..f4ed5f72ddb7b 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -766,15 +766,15 @@ def test_relabel_duplicated_method(self): result = df.A.agg(foo="sum", bar="sum") expected = pd.Series([6, 6], index=["foo", "bar"], name="A") if PY36: - tm.assert_frame_equal(result, expected) + tm.assert_series_equal(result, expected) else: with pytest.xfail(reason="PY35"): - tm.assert_frame_equal(result, expected) + tm.assert_series_equal(result, expected) result = df.B.agg(foo=min, bar="min") expected = pd.Series([1, 1], index=["foo", "bar"], name="B") if PY36: - tm.assert_frame_equal(result, expected) + tm.assert_series_equal(result, expected) else: with pytest.xfail(reason="PY35"): - tm.assert_frame_equal(result, expected) + tm.assert_series_equal(result, expected) From 05af2de5698d5764df6004dbfdf079d645194660 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 8 Nov 2019 20:36:07 +0100 Subject: [PATCH 033/106] remove helper --- pandas/core/groupby/helper.py | 193 ------------------------------- pandas/tests/frame/test_apply.py | 44 ++----- 2 files changed, 10 insertions(+), 227 deletions(-) delete mode 100644 pandas/core/groupby/helper.py diff --git a/pandas/core/groupby/helper.py b/pandas/core/groupby/helper.py deleted file mode 100644 index 0507b14993d78..0000000000000 --- a/pandas/core/groupby/helper.py +++ /dev/null @@ -1,193 +0,0 @@ -from collections import OrderedDict -import functools -from typing import Any, Sequence - -from pandas.compat import PY36 - -from pandas.core.dtypes.common import is_dict_like, is_list_like - -import pandas.core.common as com -from pandas.core.index import Index - - -def _is_multi_agg_with_relabel(**kwargs): - """ - Check whether kwargs passed to .agg look like multi-agg with relabeling. - - Parameters - ---------- - **kwargs : dict - - Returns - ------- - bool - - Examples - -------- - >>> _is_multi_agg_with_relabel(a='max') - False - >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), - ... a_min=('a', 'min')) - True - >>> _is_multi_agg_with_relabel() - False - """ - return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and kwargs - - -def _normalize_keyword_aggregation(kwargs): - """ - Normalize user-provided "named aggregation" kwargs. - - Transforms from the new ``Dict[str, NamedAgg]`` style kwargs - to the old OrderedDict[str, List[scalar]]]. - - Parameters - ---------- - kwargs : dict - - Returns - ------- - aggspec : dict - The transformed kwargs. - columns : List[str] - The user-provided keys. - col_idx_order : List[int] - List of columns indices. - - Examples - -------- - >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) - (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) - """ - if not PY36: - kwargs = OrderedDict(sorted(kwargs.items())) - - # Normalize the aggregation functions as Dict[column, List[func]], - # process normally, then fixup the names. - # TODO(Py35): When we drop python 3.5, change this to - # defaultdict(list) - # TODO: aggspec type: typing.OrderedDict[str, List[AggScalar]] - # May be hitting https://github.com/python/mypy/issues/5958 - # saying it doesn't have an attribute __name__ - aggspec = OrderedDict() - order = [] - columns, pairs = list(zip(*kwargs.items())) - - for name, (column, aggfunc) in zip(columns, pairs): - if column in aggspec: - aggspec[column].append(aggfunc) - else: - aggspec[column] = [aggfunc] - order.append((column, com.get_callable_name(aggfunc) or aggfunc)) - - # uniquify aggfunc name if duplicated in order list - uniquified_order = _make_unique(order) - - # GH 25719, due to aggspec will change the order of assigned columns in aggregation - # uniquified_aggspec will store uniquified order list and will compare it with order - # based on index - aggspec_order = [ - (column, com.get_callable_name(aggfunc) or aggfunc) - for column, aggfuncs in aggspec.items() - for aggfunc in aggfuncs - ] - uniquified_aggspec = _make_unique(aggspec_order) - - # get the new indice of columns by comparison - col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) - return aggspec, columns, col_idx_order - - -def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: - """ - Possibly mangle a list of aggfuncs. - - Parameters - ---------- - aggfuncs : Sequence - - Returns - ------- - mangled: list-like - A new AggSpec sequence, where lambdas have been converted - to have unique names. - - Notes - ----- - If just one aggfunc is passed, the name will not be mangled. - """ - if len(aggfuncs) <= 1: - # don't mangle for .agg([lambda x: .]) - return aggfuncs - i = 0 - mangled_aggfuncs = [] - for aggfunc in aggfuncs: - if com.get_callable_name(aggfunc) == "": - aggfunc = functools.partial(aggfunc) - aggfunc.__name__ = "".format(i) - i += 1 - mangled_aggfuncs.append(aggfunc) - - return mangled_aggfuncs - - -def _maybe_mangle_lambdas(agg_spec: Any) -> Any: - """ - Make new lambdas with unique names. - - Parameters - ---------- - agg_spec : Any - An argument to GroupBy.agg. - Non-dict-like `agg_spec` are pass through as is. - For dict-like `agg_spec` a new spec is returned - with name-mangled lambdas. - - Returns - ------- - mangled : Any - Same type as the input. - - Examples - -------- - >>> _maybe_mangle_lambdas('sum') - 'sum' - - >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP - [, - .f(*args, **kwargs)>] - """ - is_dict = is_dict_like(agg_spec) - if not (is_dict or is_list_like(agg_spec)): - return agg_spec - mangled_aggspec = type(agg_spec)() # dict or OrderdDict - - if is_dict: - for key, aggfuncs in agg_spec.items(): - if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): - mangled_aggfuncs = _managle_lambda_list(aggfuncs) - else: - mangled_aggfuncs = aggfuncs - - mangled_aggspec[key] = mangled_aggfuncs - else: - mangled_aggspec = _managle_lambda_list(agg_spec) - - return mangled_aggspec - - -def _make_unique(seq): - """Uniquify aggfunc name of the pairs in the order list - - Examples: - -------- - >>> _make_unique([('a', ''), ('a', ''), ('b', '')]) - [('a', '_0'), ('a', '_1'), ('b', '')] - """ - return [ - (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) - if seq.count(pair) > 1 - else pair - for i, pair in enumerate(seq) - ] diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 73aa508377fe8..ad9fe76fe3202 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas.compat import PY36 - from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -1384,10 +1382,8 @@ def test_agg_relabel(self): # test on same column with different methods result = df.agg(foo=("B", "sum"), bar=("B", "min")) - if PY36: - expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) - else: - expected = pd.DataFrame({"B": [1, 10]}, index=pd.Index(["bar", "foo"])) + expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) + tm.assert_frame_equal(result, expected) # test on multiple columns with multiple methods @@ -1407,22 +1403,14 @@ def test_agg_relabel(self): }, index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]), ) - if PY36: - tm.assert_frame_equal(result, expected) - else: - with pytest.xfail(reason="PY35"): - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # test on partial, functools or more complex cases result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) expected = pd.DataFrame( {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) ) - if PY36: - tm.assert_frame_equal(result, expected) - else: - with pytest.xfail(reason="PY35"): - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.agg( foo=("A", min), @@ -1439,11 +1427,7 @@ def test_agg_relabel(self): }, index=pd.Index(["foo", "bar", "cat", "dat", "f"]), ) - if PY36: - tm.assert_frame_equal(result, expected) - else: - with pytest.xfail(reason="PY35"): - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_agg_namedtuple(self): df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) @@ -1453,14 +1437,10 @@ def test_agg_namedtuple(self): cat=pd.NamedAgg(column="B", aggfunc="count"), fft=pd.NamedAgg("B", aggfunc="max"), ) - if PY36: - expected = pd.DataFrame( - {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) - ) - else: - expected = pd.DataFrame( - {"B": [1, 2, 2, 3]}, index=pd.Index(["bar", "cat", "fft", "foo"]) - ) + + expected = pd.DataFrame( + {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) + ) tm.assert_frame_equal(result, expected) result = df.agg( @@ -1472,11 +1452,7 @@ def test_agg_namedtuple(self): {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]}, index=pd.Index(["foo", "bar", "cat"]), ) - if PY36: - tm.assert_frame_equal(result, expected) - else: - with pytest.xfail(reason="PY35"): - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_agg_raises(self): df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) From 6206fa4fc98bce4005f8966504031f9bf83322ac Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 8 Nov 2019 20:37:14 +0100 Subject: [PATCH 034/106] remove py36 --- pandas/tests/series/test_apply.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 0d6edf8ada3e3..bafdb0dd9ec3f 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas.compat import PY36 - import pandas as pd from pandas import DataFrame, Index, Series, isna from pandas.conftest import _get_cython_table_params From 34199ad68ffa0dd2049f51ea586e7d4be6829013 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 8 Nov 2019 21:07:54 +0100 Subject: [PATCH 035/106] put back imports --- pandas/core/frame.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2a90e89d2871b..006908fad01f0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -99,6 +99,11 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs +from pandas.core.groupby.generic import ( + _is_multi_agg_with_relabel, + _maybe_mangle_lambdas, + _normalize_keyword_aggregation, +) from pandas.core.index import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes import base as ibase from pandas.core.indexes.datetimes import DatetimeIndex From c56f05fe9da6047c3485b5d1eb6053bf20694743 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 8 Nov 2019 22:56:46 +0100 Subject: [PATCH 036/106] avoid circular dependency --- pandas/core/frame.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 006908fad01f0..1e535bce211a8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -99,11 +99,6 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.groupby.generic import ( - _is_multi_agg_with_relabel, - _maybe_mangle_lambdas, - _normalize_keyword_aggregation, -) from pandas.core.index import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes import base as ibase from pandas.core.indexes.datetimes import DatetimeIndex @@ -6594,6 +6589,11 @@ def _gotitem( ) @Appender(_shared_docs["aggregate"]) def aggregate(self, func=None, axis=0, *args, **kwargs): + from pandas.core.groupby.generic import ( + _is_multi_agg_with_relabel, + _maybe_mangle_lambdas, + _normalize_keyword_aggregation, + ) axis = self._get_axis_number(axis) relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) From d3f062080dc8befed3a4eebc0f63f7b5566735e9 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 8 Nov 2019 23:27:35 +0100 Subject: [PATCH 037/106] fix linting --- pandas/core/frame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1e535bce211a8..c67178e9e0cc3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6594,6 +6594,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): _maybe_mangle_lambdas, _normalize_keyword_aggregation, ) + axis = self._get_axis_number(axis) relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) From 89b8e6b3a2137b83a29bb66207d13ebdb25c2d57 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 20 Dec 2019 21:47:30 +0100 Subject: [PATCH 038/106] code change based on review --- pandas/core/base.py | 28 +++++ pandas/core/frame.py | 22 +--- pandas/core/groupby/util.py | 197 +++++++++++++++++++++++++++++++ pandas/core/series.py | 9 +- pandas/tests/frame/test_apply.py | 13 +- 5 files changed, 241 insertions(+), 28 deletions(-) create mode 100644 pandas/core/groupby/util.py diff --git a/pandas/core/base.py b/pandas/core/base.py index 381d45d829e62..c49c5a76aa0a2 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -35,6 +35,11 @@ from pandas.core.algorithms import duplicated, unique1d, value_counts from pandas.core.arrays import ExtensionArray from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.groupby.util import ( + _is_multi_agg_with_relabel, + _maybe_mangle_lambdas, + _normalize_keyword_aggregation, +) import pandas.core.nanops as nanops _shared_docs: Dict[str, str] = dict() @@ -289,6 +294,29 @@ def _try_aggregate_string_function(self, arg: str, *args, **kwargs): f"'{arg}' is not a valid function for '{type(self).__name__}' object" ) + def _reconstruct_func(self, func, *args, **kwargs): + + relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) + if relabeling: + func, columns, order = _normalize_keyword_aggregation(kwargs) + + elif isinstance(func, list) and len(func) > len(set(func)): + + # GH 28426 will raise error if duplicated function names are used and + # there is no reassigned name + raise SpecificationError( + "Function names must be unique if there is no new column " + "names assigned" + ) + elif func is None: + # nicer error message + raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") + + func = _maybe_mangle_lambdas(func) + if relabeling: + return relabeling, func, columns, order + return relabeling, func, None, None + def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9f934dd46e3a0..d443ddad68ec1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6412,26 +6412,11 @@ def _gotitem( ) @Appender(_shared_docs["aggregate"]) def aggregate(self, func=None, axis=0, *args, **kwargs): - from pandas.core.groupby.generic import ( - _is_multi_agg_with_relabel, - _maybe_mangle_lambdas, - _normalize_keyword_aggregation, - ) - axis = self._get_axis_number(axis) - relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) + relabeling, func, columns, order = self._reconstruct_func(func, *args, **kwargs) if relabeling: - func, indexes, order = _normalize_keyword_aggregation(kwargs) - reordered_indexes = [ - pair[0] for pair in sorted(zip(indexes, order), key=lambda t: t[1]) - ] kwargs = {} - elif func is None: - # nicer error message - raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") - - func = _maybe_mangle_lambdas(func) result = None try: @@ -6442,11 +6427,14 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): return self.apply(func, axis=axis, args=args, **kwargs) if relabeling: + reordered_indexes = [ + pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) + ] # when there are more than one column being used in aggregate, the order # of result will be reversed, and in case the func is not used by other # columns, there might be NaN values, so separate these two cases - reordered_result = DataFrame(index=indexes) + reordered_result = DataFrame(index=columns) idx = 0 for col, funcs in func.items(): v = reordered_indexes[idx : idx + len(funcs)] diff --git a/pandas/core/groupby/util.py b/pandas/core/groupby/util.py new file mode 100644 index 0000000000000..bcd00da5c60ff --- /dev/null +++ b/pandas/core/groupby/util.py @@ -0,0 +1,197 @@ +from collections import OrderedDict +from functools import partial +from typing import Any, Sequence + +from pandas.core.dtypes.common import is_dict_like, is_list_like + +import pandas.core.common as com +from pandas.core.indexes.api import Index + + +def _is_multi_agg_with_relabel(**kwargs) -> bool: + """ + Check whether kwargs passed to .agg look like multi-agg with relabeling. + + Parameters + ---------- + **kwargs : dict + + Returns + ------- + bool + + Examples + -------- + >>> _is_multi_agg_with_relabel(a='max') + False + >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), + ... a_min=('a', 'min')) + True + >>> _is_multi_agg_with_relabel() + False + """ + return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and ( + len(kwargs) > 0 + ) + + +def _normalize_keyword_aggregation(kwargs): + """ + Normalize user-provided "named aggregation" kwargs. + + Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs + to the old OrderedDict[str, List[scalar]]]. + + Parameters + ---------- + kwargs : dict + + Returns + ------- + aggspec : dict + The transformed kwargs. + columns : List[str] + The user-provided keys. + col_idx_order : List[int] + List of columns indices. + + Examples + -------- + >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) + (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) + """ + # Normalize the aggregation functions as Mapping[column, List[func]], + # process normally, then fixup the names. + # TODO(Py35): When we drop python 3.5, change this to + # defaultdict(list) + # TODO: aggspec type: typing.OrderedDict[str, List[AggScalar]] + # May be hitting https://github.com/python/mypy/issues/5958 + # saying it doesn't have an attribute __name__ + aggspec = OrderedDict() + order = [] + columns, pairs = list(zip(*kwargs.items())) + + for name, (column, aggfunc) in zip(columns, pairs): + if column in aggspec: + aggspec[column].append(aggfunc) + else: + aggspec[column] = [aggfunc] + order.append((column, com.get_callable_name(aggfunc) or aggfunc)) + + # uniquify aggfunc name if duplicated in order list + uniquified_order = _make_unique(order) + + # GH 25719, due to aggspec will change the order of assigned columns in aggregation + # uniquified_aggspec will store uniquified order list and will compare it with order + # based on index + aggspec_order = [ + (column, com.get_callable_name(aggfunc) or aggfunc) + for column, aggfuncs in aggspec.items() + for aggfunc in aggfuncs + ] + uniquified_aggspec = _make_unique(aggspec_order) + + # get the new indice of columns by comparison + col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) + return aggspec, columns, col_idx_order + + +def _make_unique(seq): + """Uniquify aggfunc name of the pairs in the order list + + Examples: + -------- + >>> _make_unique([('a', ''), ('a', ''), ('b', '')]) + [('a', '_0'), ('a', '_1'), ('b', '')] + """ + return [ + (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) + if seq.count(pair) > 1 + else pair + for i, pair in enumerate(seq) + ] + + +# TODO: Can't use, because mypy doesn't like us setting __name__ +# error: "partial[Any]" has no attribute "__name__" +# the type is: +# typing.Sequence[Callable[..., ScalarResult]] +# -> typing.Sequence[Callable[..., ScalarResult]]: + + +def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: + """ + Possibly mangle a list of aggfuncs. + + Parameters + ---------- + aggfuncs : Sequence + + Returns + ------- + mangled: list-like + A new AggSpec sequence, where lambdas have been converted + to have unique names. + + Notes + ----- + If just one aggfunc is passed, the name will not be mangled. + """ + if len(aggfuncs) <= 1: + # don't mangle for .agg([lambda x: .]) + return aggfuncs + i = 0 + mangled_aggfuncs = [] + for aggfunc in aggfuncs: + if com.get_callable_name(aggfunc) == "": + aggfunc = partial(aggfunc) + aggfunc.__name__ = f"" + i += 1 + mangled_aggfuncs.append(aggfunc) + + return mangled_aggfuncs + + +def _maybe_mangle_lambdas(agg_spec: Any) -> Any: + """ + Make new lambdas with unique names. + + Parameters + ---------- + agg_spec : Any + An argument to GroupBy.agg. + Non-dict-like `agg_spec` are pass through as is. + For dict-like `agg_spec` a new spec is returned + with name-mangled lambdas. + + Returns + ------- + mangled : Any + Same type as the input. + + Examples + -------- + >>> _maybe_mangle_lambdas('sum') + 'sum' + + >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP + [, + .f(*args, **kwargs)>] + """ + is_dict = is_dict_like(agg_spec) + if not (is_dict or is_list_like(agg_spec)): + return agg_spec + mangled_aggspec = type(agg_spec)() # dict or OrderdDict + + if is_dict: + for key, aggfuncs in agg_spec.items(): + if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): + mangled_aggfuncs = _managle_lambda_list(aggfuncs) + else: + mangled_aggfuncs = aggfuncs + + mangled_aggspec[key] = mangled_aggfuncs + else: + mangled_aggspec = _managle_lambda_list(agg_spec) + + return mangled_aggspec diff --git a/pandas/core/series.py b/pandas/core/series.py index e3a9e2f4ddcbe..70e3205ccc651 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3544,10 +3544,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): self._get_axis_number(axis) if func is None: - # This is due to order issue of dictionary in PY35, e.g. if {"foo" - # : "sum", "bar": "min"}, then it will take "bar" first because it - # b is before f - func = OrderedDict(kwargs.items()) + func = dict(kwargs.items()) kwargs = {} result, how = self._aggregate(func, *args, **kwargs) @@ -4394,9 +4391,7 @@ def to_period(self, freq=None, copy=True): hist = pandas.plotting.hist_series -Series._setup_axes( - ["index"], docs={"index": "The index (axis labels) of the Series."}, -) +Series._setup_axes(["index"], docs={"index": "The index (axis labels) of the Series."}) Series._add_numeric_operations() Series._add_series_only_operations() Series._add_series_or_dataframe_operations() diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index e4c5d7f624110..df5dff290cd2f 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1391,9 +1391,8 @@ def test_consistency_of_aggregates_of_columns_with_missing_values(self, df, meth class TestDataFrameNamedAggregate: - - # GH 26513 def test_agg_relabel(self): + # GH 26513 df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) # simplest case with one column, one func @@ -1407,7 +1406,9 @@ def test_agg_relabel(self): tm.assert_frame_equal(result, expected) - # test on multiple columns with multiple methods + def test_agg_relabel_multi_columns_multi_methods(self): + # GH 26513, test on multiple columns with multiple methods + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) result = df.agg( foo=("A", "sum"), bar=("B", "mean"), @@ -1426,7 +1427,9 @@ def test_agg_relabel(self): ) tm.assert_frame_equal(result, expected) - # test on partial, functools or more complex cases + def test_agg_relable_partial_functions(self): + # GH 26513, test on partial, functools or more complex cases + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) expected = pd.DataFrame( {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) @@ -1451,6 +1454,7 @@ def test_agg_relabel(self): tm.assert_frame_equal(result, expected) def test_agg_namedtuple(self): + # GH 26513 df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) result = df.agg( foo=pd.NamedAgg("B", "sum"), @@ -1476,6 +1480,7 @@ def test_agg_namedtuple(self): tm.assert_frame_equal(result, expected) def test_agg_raises(self): + # GH 26513 df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) msg = "Must provide" From 8aa1cc9a893313c0f84f4dfb543104c557b509bf Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 20 Dec 2019 21:54:53 +0100 Subject: [PATCH 039/106] remove util --- pandas/core/base.py | 2 +- pandas/core/groupby/util.py | 197 ------------------------------------ 2 files changed, 1 insertion(+), 198 deletions(-) delete mode 100644 pandas/core/groupby/util.py diff --git a/pandas/core/base.py b/pandas/core/base.py index c49c5a76aa0a2..ca1fa3fd7ad5c 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -35,7 +35,7 @@ from pandas.core.algorithms import duplicated, unique1d, value_counts from pandas.core.arrays import ExtensionArray from pandas.core.construction import create_series_with_explicit_dtype -from pandas.core.groupby.util import ( +from pandas.core.groupby.generic import ( _is_multi_agg_with_relabel, _maybe_mangle_lambdas, _normalize_keyword_aggregation, diff --git a/pandas/core/groupby/util.py b/pandas/core/groupby/util.py deleted file mode 100644 index bcd00da5c60ff..0000000000000 --- a/pandas/core/groupby/util.py +++ /dev/null @@ -1,197 +0,0 @@ -from collections import OrderedDict -from functools import partial -from typing import Any, Sequence - -from pandas.core.dtypes.common import is_dict_like, is_list_like - -import pandas.core.common as com -from pandas.core.indexes.api import Index - - -def _is_multi_agg_with_relabel(**kwargs) -> bool: - """ - Check whether kwargs passed to .agg look like multi-agg with relabeling. - - Parameters - ---------- - **kwargs : dict - - Returns - ------- - bool - - Examples - -------- - >>> _is_multi_agg_with_relabel(a='max') - False - >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), - ... a_min=('a', 'min')) - True - >>> _is_multi_agg_with_relabel() - False - """ - return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and ( - len(kwargs) > 0 - ) - - -def _normalize_keyword_aggregation(kwargs): - """ - Normalize user-provided "named aggregation" kwargs. - - Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs - to the old OrderedDict[str, List[scalar]]]. - - Parameters - ---------- - kwargs : dict - - Returns - ------- - aggspec : dict - The transformed kwargs. - columns : List[str] - The user-provided keys. - col_idx_order : List[int] - List of columns indices. - - Examples - -------- - >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) - (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) - """ - # Normalize the aggregation functions as Mapping[column, List[func]], - # process normally, then fixup the names. - # TODO(Py35): When we drop python 3.5, change this to - # defaultdict(list) - # TODO: aggspec type: typing.OrderedDict[str, List[AggScalar]] - # May be hitting https://github.com/python/mypy/issues/5958 - # saying it doesn't have an attribute __name__ - aggspec = OrderedDict() - order = [] - columns, pairs = list(zip(*kwargs.items())) - - for name, (column, aggfunc) in zip(columns, pairs): - if column in aggspec: - aggspec[column].append(aggfunc) - else: - aggspec[column] = [aggfunc] - order.append((column, com.get_callable_name(aggfunc) or aggfunc)) - - # uniquify aggfunc name if duplicated in order list - uniquified_order = _make_unique(order) - - # GH 25719, due to aggspec will change the order of assigned columns in aggregation - # uniquified_aggspec will store uniquified order list and will compare it with order - # based on index - aggspec_order = [ - (column, com.get_callable_name(aggfunc) or aggfunc) - for column, aggfuncs in aggspec.items() - for aggfunc in aggfuncs - ] - uniquified_aggspec = _make_unique(aggspec_order) - - # get the new indice of columns by comparison - col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) - return aggspec, columns, col_idx_order - - -def _make_unique(seq): - """Uniquify aggfunc name of the pairs in the order list - - Examples: - -------- - >>> _make_unique([('a', ''), ('a', ''), ('b', '')]) - [('a', '_0'), ('a', '_1'), ('b', '')] - """ - return [ - (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) - if seq.count(pair) > 1 - else pair - for i, pair in enumerate(seq) - ] - - -# TODO: Can't use, because mypy doesn't like us setting __name__ -# error: "partial[Any]" has no attribute "__name__" -# the type is: -# typing.Sequence[Callable[..., ScalarResult]] -# -> typing.Sequence[Callable[..., ScalarResult]]: - - -def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: - """ - Possibly mangle a list of aggfuncs. - - Parameters - ---------- - aggfuncs : Sequence - - Returns - ------- - mangled: list-like - A new AggSpec sequence, where lambdas have been converted - to have unique names. - - Notes - ----- - If just one aggfunc is passed, the name will not be mangled. - """ - if len(aggfuncs) <= 1: - # don't mangle for .agg([lambda x: .]) - return aggfuncs - i = 0 - mangled_aggfuncs = [] - for aggfunc in aggfuncs: - if com.get_callable_name(aggfunc) == "": - aggfunc = partial(aggfunc) - aggfunc.__name__ = f"" - i += 1 - mangled_aggfuncs.append(aggfunc) - - return mangled_aggfuncs - - -def _maybe_mangle_lambdas(agg_spec: Any) -> Any: - """ - Make new lambdas with unique names. - - Parameters - ---------- - agg_spec : Any - An argument to GroupBy.agg. - Non-dict-like `agg_spec` are pass through as is. - For dict-like `agg_spec` a new spec is returned - with name-mangled lambdas. - - Returns - ------- - mangled : Any - Same type as the input. - - Examples - -------- - >>> _maybe_mangle_lambdas('sum') - 'sum' - - >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP - [, - .f(*args, **kwargs)>] - """ - is_dict = is_dict_like(agg_spec) - if not (is_dict or is_list_like(agg_spec)): - return agg_spec - mangled_aggspec = type(agg_spec)() # dict or OrderdDict - - if is_dict: - for key, aggfuncs in agg_spec.items(): - if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): - mangled_aggfuncs = _managle_lambda_list(aggfuncs) - else: - mangled_aggfuncs = aggfuncs - - mangled_aggspec[key] = mangled_aggfuncs - else: - mangled_aggspec = _managle_lambda_list(agg_spec) - - return mangled_aggspec From 091ca75e99024a686f152c365af833ab36d92c09 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 20 Dec 2019 21:58:40 +0100 Subject: [PATCH 040/106] Add docstring --- pandas/core/base.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/base.py b/pandas/core/base.py index ca1fa3fd7ad5c..8780162a74565 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -295,7 +295,14 @@ def _try_aggregate_string_function(self, arg: str, *args, **kwargs): ) def _reconstruct_func(self, func, *args, **kwargs): + """ + This is the internal function to reconstruct func given if there is relabeling + or not. And also normalize the keyword to get new order of columns; + If relabeling is True, will return relabeling, reconstructed func, column + names, and the reconstructed order of columns. + If relabeling is False, the columns and order will be None. + """ relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: func, columns, order = _normalize_keyword_aggregation(kwargs) From c2d510413704230bb301709c158f0bb268748e1a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 20 Dec 2019 22:10:03 +0100 Subject: [PATCH 041/106] fix circular import --- pandas/core/base.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 8780162a74565..fed9a17eb748c 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -35,11 +35,6 @@ from pandas.core.algorithms import duplicated, unique1d, value_counts from pandas.core.arrays import ExtensionArray from pandas.core.construction import create_series_with_explicit_dtype -from pandas.core.groupby.generic import ( - _is_multi_agg_with_relabel, - _maybe_mangle_lambdas, - _normalize_keyword_aggregation, -) import pandas.core.nanops as nanops _shared_docs: Dict[str, str] = dict() @@ -303,6 +298,12 @@ def _reconstruct_func(self, func, *args, **kwargs): names, and the reconstructed order of columns. If relabeling is False, the columns and order will be None. """ + from pandas.core.groupby.generic import ( + _is_multi_agg_with_relabel, + _maybe_mangle_lambdas, + _normalize_keyword_aggregation, + ) + relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: func, columns, order = _normalize_keyword_aggregation(kwargs) From 0484f5eb7bdbbc75348c03407f37dfd55420a815 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Jan 2020 21:05:30 +0100 Subject: [PATCH 042/106] reorg and deduplicate --- pandas/core/aggregation.py | 225 +++++++++++++++++++++++++++++++++ pandas/core/base.py | 36 ------ pandas/core/frame.py | 3 +- pandas/core/groupby/generic.py | 209 ++---------------------------- 4 files changed, 235 insertions(+), 238 deletions(-) create mode 100644 pandas/core/aggregation.py diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py new file mode 100644 index 0000000000000..3c4b76437e2d5 --- /dev/null +++ b/pandas/core/aggregation.py @@ -0,0 +1,225 @@ +from collections import abc, defaultdict +from functools import partial +from typing import Any, List, Sequence + +from pandas.core.dtypes.common import is_dict_like, is_list_like + +from pandas.core.base import SpecificationError +import pandas.core.common as com +from pandas.core.indexes.api import Index + + +def reconstruct_func(self, func, *args, **kwargs): + """ + This is the internal function to reconstruct func given if there is relabeling + or not. And also normalize the keyword to get new order of columns; + + If relabeling is True, will return relabeling, reconstructed func, column + names, and the reconstructed order of columns. + If relabeling is False, the columns and order will be None. + """ + relabeling = func is None and is_multi_agg_with_relabel(**kwargs) + if relabeling: + func, columns, order = normalize_keyword_aggregation(kwargs) + + elif isinstance(func, list) and len(func) > len(set(func)): + + # GH 28426 will raise error if duplicated function names are used and + # there is no reassigned name + raise SpecificationError( + "Function names must be unique if there is no new column " "names assigned" + ) + elif func is None: + # nicer error message + raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") + + func = maybe_mangle_lambdas(func) + if not relabeling: + columns = None + order = None + + return relabeling, func, columns, order + + +def is_multi_agg_with_relabel(**kwargs) -> bool: + """ + Check whether kwargs passed to .agg look like multi-agg with relabeling. + + Parameters + ---------- + **kwargs : dict + + Returns + ------- + bool + + Examples + -------- + >>> _is_multi_agg_with_relabel(a='max') + False + >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), + ... a_min=('a', 'min')) + True + >>> _is_multi_agg_with_relabel() + False + """ + return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and ( + len(kwargs) > 0 + ) + + +def normalize_keyword_aggregation(kwargs): + """ + Normalize user-provided "named aggregation" kwargs. + + Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs + to the old Dict[str, List[scalar]]]. + + Parameters + ---------- + kwargs : dict + + Returns + ------- + aggspec : dict + The transformed kwargs. + columns : List[str] + The user-provided keys. + col_idx_order : List[int] + List of columns indices. + + Examples + -------- + >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) + ({'input': ['sum']}, ('output',), [('input', 'sum')]) + """ + # Normalize the aggregation functions as Mapping[column, List[func]], + # process normally, then fixup the names. + # TODO: aggspec type: typing.Dict[str, List[AggScalar]] + # May be hitting https://github.com/python/mypy/issues/5958 + # saying it doesn't have an attribute __name__ + aggspec = defaultdict(list) + order = [] + columns, pairs = list(zip(*kwargs.items())) + + for name, (column, aggfunc) in zip(columns, pairs): + aggspec[column].append(aggfunc) + order.append((column, com.get_callable_name(aggfunc) or aggfunc)) + + # uniquify aggfunc name if duplicated in order list + uniquified_order = _make_unique(order) + + # GH 25719, due to aggspec will change the order of assigned columns in aggregation + # uniquified_aggspec will store uniquified order list and will compare it with order + # based on index + aggspec_order = [ + (column, com.get_callable_name(aggfunc) or aggfunc) + for column, aggfuncs in aggspec.items() + for aggfunc in aggfuncs + ] + uniquified_aggspec = _make_unique(aggspec_order) + + # get the new indice of columns by comparison + col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) + return aggspec, columns, col_idx_order + + +def _make_unique(seq): + """Uniquify aggfunc name of the pairs in the order list + + Examples: + -------- + >>> _make_unique([('a', ''), ('a', ''), ('b', '')]) + [('a', '_0'), ('a', '_1'), ('b', '')] + """ + return [ + (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) + if seq.count(pair) > 1 + else pair + for i, pair in enumerate(seq) + ] + + +# TODO: Can't use, because mypy doesn't like us setting __name__ +# error: "partial[Any]" has no attribute "__name__" +# the type is: +# typing.Sequence[Callable[..., ScalarResult]] +# -> typing.Sequence[Callable[..., ScalarResult]]: + + +def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: + """ + Possibly mangle a list of aggfuncs. + + Parameters + ---------- + aggfuncs : Sequence + + Returns + ------- + mangled: list-like + A new AggSpec sequence, where lambdas have been converted + to have unique names. + + Notes + ----- + If just one aggfunc is passed, the name will not be mangled. + """ + if len(aggfuncs) <= 1: + # don't mangle for .agg([lambda x: .]) + return aggfuncs + i = 0 + mangled_aggfuncs = [] + for aggfunc in aggfuncs: + if com.get_callable_name(aggfunc) == "": + aggfunc = partial(aggfunc) + aggfunc.__name__ = f"" + i += 1 + mangled_aggfuncs.append(aggfunc) + + return mangled_aggfuncs + + +def maybe_mangle_lambdas(agg_spec: Any) -> Any: + """ + Make new lambdas with unique names. + + Parameters + ---------- + agg_spec : Any + An argument to GroupBy.agg. + Non-dict-like `agg_spec` are pass through as is. + For dict-like `agg_spec` a new spec is returned + with name-mangled lambdas. + + Returns + ------- + mangled : Any + Same type as the input. + + Examples + -------- + >>> _maybe_mangle_lambdas('sum') + 'sum' + + >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP + [, + .f(*args, **kwargs)>] + """ + is_dict = is_dict_like(agg_spec) + if not (is_dict or is_list_like(agg_spec)): + return agg_spec + mangled_aggspec = type(agg_spec)() # dict or OrderdDict + + if is_dict: + for key, aggfuncs in agg_spec.items(): + if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): + mangled_aggfuncs = _managle_lambda_list(aggfuncs) + else: + mangled_aggfuncs = aggfuncs + + mangled_aggspec[key] = mangled_aggfuncs + else: + mangled_aggspec = _managle_lambda_list(agg_spec) + + return mangled_aggspec diff --git a/pandas/core/base.py b/pandas/core/base.py index 8ffd67f89454f..d38dbec684f35 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -294,42 +294,6 @@ def _try_aggregate_string_function(self, arg: str, *args, **kwargs): f"'{arg}' is not a valid function for '{type(self).__name__}' object" ) - def _reconstruct_func(self, func, *args, **kwargs): - """ - This is the internal function to reconstruct func given if there is relabeling - or not. And also normalize the keyword to get new order of columns; - - If relabeling is True, will return relabeling, reconstructed func, column - names, and the reconstructed order of columns. - If relabeling is False, the columns and order will be None. - """ - from pandas.core.groupby.generic import ( - _is_multi_agg_with_relabel, - _maybe_mangle_lambdas, - _normalize_keyword_aggregation, - ) - - relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) - if relabeling: - func, columns, order = _normalize_keyword_aggregation(kwargs) - - elif isinstance(func, list) and len(func) > len(set(func)): - - # GH 28426 will raise error if duplicated function names are used and - # there is no reassigned name - raise SpecificationError( - "Function names must be unique if there is no new column " - "names assigned" - ) - elif func is None: - # nicer error message - raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") - - func = _maybe_mangle_lambdas(func) - if relabeling: - return relabeling, func, columns, order - return relabeling, func, None, None - def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 18687339ce2e2..cfc340b14210e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -97,6 +97,7 @@ from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor +from pandas.core.aggregation import reconstruct_func from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor @@ -6621,7 +6622,7 @@ def _gotitem( def aggregate(self, func=None, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) - relabeling, func, columns, order = self._reconstruct_func(func, *args, **kwargs) + relabeling, func, columns, order = reconstruct_func(func, *args, **kwargs) if relabeling: kwargs = {} diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c49677fa27a31..6f25495d1fcc7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -53,6 +53,12 @@ ) from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna +from pandas.core.aggregation import ( + is_multi_agg_with_relabel, + maybe_mangle_lambdas, + normalize_keyword_aggregation, + reconstruct_func, +) import pandas.core.algorithms as algorithms from pandas.core.base import DataError, SpecificationError import pandas.core.common as com @@ -249,7 +255,7 @@ def aggregate(self, func=None, *args, **kwargs): elif isinstance(func, abc.Iterable): # Catch instances of lists / tuples # but not the class list / tuple itself. - func = _maybe_mangle_lambdas(func) + func = maybe_mangle_lambdas(func) ret = self._aggregate_multiple_funcs(func) if relabeling: ret.columns = columns @@ -918,24 +924,9 @@ class DataFrameGroupBy(GroupBy): @Appender(_shared_docs["aggregate"]) def aggregate(self, func=None, *args, **kwargs): - relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) + relabeling, func, columns, order = reconstruct_func(func, *args, **kwargs) if relabeling: - func, columns, order = _normalize_keyword_aggregation(kwargs) - kwargs = {} - elif isinstance(func, list) and len(func) > len(set(func)): - - # GH 28426 will raise error if duplicated function names are used and - # there is no reassigned name - raise SpecificationError( - "Function names must be unique if there is no new column " - "names assigned" - ) - elif func is None: - # nicer error message - raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") - - func = _maybe_mangle_lambdas(func) result, how = self._aggregate(func, *args, **kwargs) if how is None: @@ -1860,190 +1851,6 @@ def groupby_series(obj, col=None): boxplot = boxplot_frame_groupby -def _is_multi_agg_with_relabel(**kwargs) -> bool: - """ - Check whether kwargs passed to .agg look like multi-agg with relabeling. - - Parameters - ---------- - **kwargs : dict - - Returns - ------- - bool - - Examples - -------- - >>> _is_multi_agg_with_relabel(a='max') - False - >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), - ... a_min=('a', 'min')) - True - >>> _is_multi_agg_with_relabel() - False - """ - return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and ( - len(kwargs) > 0 - ) - - -def _normalize_keyword_aggregation(kwargs): - """ - Normalize user-provided "named aggregation" kwargs. - - Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs - to the old Dict[str, List[scalar]]]. - - Parameters - ---------- - kwargs : dict - - Returns - ------- - aggspec : dict - The transformed kwargs. - columns : List[str] - The user-provided keys. - col_idx_order : List[int] - List of columns indices. - - Examples - -------- - >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) - ({'input': ['sum']}, ('output',), [('input', 'sum')]) - """ - # Normalize the aggregation functions as Mapping[column, List[func]], - # process normally, then fixup the names. - # TODO: aggspec type: typing.Dict[str, List[AggScalar]] - # May be hitting https://github.com/python/mypy/issues/5958 - # saying it doesn't have an attribute __name__ - aggspec = defaultdict(list) - order = [] - columns, pairs = list(zip(*kwargs.items())) - - for name, (column, aggfunc) in zip(columns, pairs): - aggspec[column].append(aggfunc) - order.append((column, com.get_callable_name(aggfunc) or aggfunc)) - - # uniquify aggfunc name if duplicated in order list - uniquified_order = _make_unique(order) - - # GH 25719, due to aggspec will change the order of assigned columns in aggregation - # uniquified_aggspec will store uniquified order list and will compare it with order - # based on index - aggspec_order = [ - (column, com.get_callable_name(aggfunc) or aggfunc) - for column, aggfuncs in aggspec.items() - for aggfunc in aggfuncs - ] - uniquified_aggspec = _make_unique(aggspec_order) - - # get the new indice of columns by comparison - col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) - return aggspec, columns, col_idx_order - - -def _make_unique(seq): - """Uniquify aggfunc name of the pairs in the order list - - Examples: - -------- - >>> _make_unique([('a', ''), ('a', ''), ('b', '')]) - [('a', '_0'), ('a', '_1'), ('b', '')] - """ - return [ - (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) - if seq.count(pair) > 1 - else pair - for i, pair in enumerate(seq) - ] - - -# TODO: Can't use, because mypy doesn't like us setting __name__ -# error: "partial[Any]" has no attribute "__name__" -# the type is: -# typing.Sequence[Callable[..., ScalarResult]] -# -> typing.Sequence[Callable[..., ScalarResult]]: - - -def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: - """ - Possibly mangle a list of aggfuncs. - - Parameters - ---------- - aggfuncs : Sequence - - Returns - ------- - mangled: list-like - A new AggSpec sequence, where lambdas have been converted - to have unique names. - - Notes - ----- - If just one aggfunc is passed, the name will not be mangled. - """ - if len(aggfuncs) <= 1: - # don't mangle for .agg([lambda x: .]) - return aggfuncs - i = 0 - mangled_aggfuncs = [] - for aggfunc in aggfuncs: - if com.get_callable_name(aggfunc) == "": - aggfunc = partial(aggfunc) - aggfunc.__name__ = f"" - i += 1 - mangled_aggfuncs.append(aggfunc) - - return mangled_aggfuncs - - -def _maybe_mangle_lambdas(agg_spec: Any) -> Any: - """ - Make new lambdas with unique names. - - Parameters - ---------- - agg_spec : Any - An argument to GroupBy.agg. - Non-dict-like `agg_spec` are pass through as is. - For dict-like `agg_spec` a new spec is returned - with name-mangled lambdas. - - Returns - ------- - mangled : Any - Same type as the input. - - Examples - -------- - >>> _maybe_mangle_lambdas('sum') - 'sum' - - >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP - [, - .f(*args, **kwargs)>] - """ - is_dict = is_dict_like(agg_spec) - if not (is_dict or is_list_like(agg_spec)): - return agg_spec - mangled_aggspec = type(agg_spec)() # dict or OrderdDict - - if is_dict: - for key, aggfuncs in agg_spec.items(): - if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): - mangled_aggfuncs = _managle_lambda_list(aggfuncs) - else: - mangled_aggfuncs = aggfuncs - - mangled_aggspec[key] = mangled_aggfuncs - else: - mangled_aggspec = _managle_lambda_list(agg_spec) - - return mangled_aggspec - - def _recast_datetimelike_result(result: DataFrame) -> DataFrame: """ If we have date/time like in the original, then coerce dates From 425c8021d2d5bb44f300a40a0ca8255bc4a0a274 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Jan 2020 21:06:46 +0100 Subject: [PATCH 043/106] remove used imports --- pandas/core/groupby/generic.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6f25495d1fcc7..6fc9ee054d494 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -53,12 +53,7 @@ ) from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna -from pandas.core.aggregation import ( - is_multi_agg_with_relabel, - maybe_mangle_lambdas, - normalize_keyword_aggregation, - reconstruct_func, -) +from pandas.core.aggregation import maybe_mangle_lambdas, reconstruct_func import pandas.core.algorithms as algorithms from pandas.core.base import DataError, SpecificationError import pandas.core.common as com From d5c2c6c059ebf2b1638c8b84410033248eea9713 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Jan 2020 21:29:02 +0100 Subject: [PATCH 044/106] fix linting --- pandas/core/aggregation.py | 6 +++--- pandas/core/groupby/generic.py | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 3c4b76437e2d5..3b24eba42832e 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -1,6 +1,6 @@ -from collections import abc, defaultdict +from collections import defaultdict from functools import partial -from typing import Any, List, Sequence +from typing import Any, Sequence from pandas.core.dtypes.common import is_dict_like, is_list_like @@ -9,7 +9,7 @@ from pandas.core.indexes.api import Index -def reconstruct_func(self, func, *args, **kwargs): +def reconstruct_func(func, *args, **kwargs): """ This is the internal function to reconstruct func given if there is relabeling or not. And also normalize the keyword to get new order of columns; diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6fc9ee054d494..35cc9ecfbdb17 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -5,7 +5,7 @@ These are user facing as the result of the ``df.groupby(...)`` operations, which here returns a DataFrameGroupBy object. """ -from collections import abc, defaultdict, namedtuple +from collections import abc, namedtuple import copy from functools import partial from textwrap import dedent @@ -42,10 +42,8 @@ ensure_int64, ensure_platform_int, is_bool, - is_dict_like, is_integer_dtype, is_interval_dtype, - is_list_like, is_numeric_dtype, is_object_dtype, is_scalar, From 8bb9714f6d2b606d1b101668b856ef67855d0dae Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Jan 2020 21:53:19 +0100 Subject: [PATCH 045/106] fix wrong import --- pandas/core/aggregation.py | 2 +- pandas/tests/groupby/aggregate/test_aggregate.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 3b24eba42832e..e038da3caa630 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -27,7 +27,7 @@ def reconstruct_func(func, *args, **kwargs): # GH 28426 will raise error if duplicated function names are used and # there is no reassigned name raise SpecificationError( - "Function names must be unique if there is no new column " "names assigned" + "Function names must be unique if there is no new column names assigned" ) elif func is None: # nicer error message diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0d8379407fef7..ca5a444680399 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -8,8 +8,8 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat +from pandas.core.aggregation import _make_unique, maybe_mangle_lambdas from pandas.core.base import SpecificationError -from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -633,14 +633,14 @@ def test_lambda_named_agg(func): class TestLambdaMangling: def test_maybe_mangle_lambdas_passthrough(self): - assert _maybe_mangle_lambdas("mean") == "mean" - assert _maybe_mangle_lambdas(lambda x: x).__name__ == "" + assert maybe_mangle_lambdas("mean") == "mean" + assert maybe_mangle_lambdas(lambda x: x).__name__ == "" # don't mangel single lambda. - assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == "" + assert maybe_mangle_lambdas([lambda x: x])[0].__name__ == "" def test_maybe_mangle_lambdas_listlike(self): aggfuncs = [lambda x: 1, lambda x: 2] - result = _maybe_mangle_lambdas(aggfuncs) + result = maybe_mangle_lambdas(aggfuncs) assert result[0].__name__ == "" assert result[1].__name__ == "" assert aggfuncs[0](None) == result[0](None) @@ -648,13 +648,13 @@ def test_maybe_mangle_lambdas_listlike(self): def test_maybe_mangle_lambdas(self): func = {"A": [lambda x: 0, lambda x: 1]} - result = _maybe_mangle_lambdas(func) + result = maybe_mangle_lambdas(func) assert result["A"][0].__name__ == "" assert result["A"][1].__name__ == "" def test_maybe_mangle_lambdas_args(self): func = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]} - result = _maybe_mangle_lambdas(func) + result = maybe_mangle_lambdas(func) assert result["A"][0].__name__ == "" assert result["A"][1].__name__ == "" @@ -664,7 +664,7 @@ def test_maybe_mangle_lambdas_args(self): def test_maybe_mangle_lambdas_named(self): func = {"C": np.mean, "D": {"foo": np.mean, "bar": np.mean}} - result = _maybe_mangle_lambdas(func) + result = maybe_mangle_lambdas(func) assert result == func def test_basic(self): From 0545231301ab6be111663f131885168918d09d04 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Jan 2020 21:56:59 +0100 Subject: [PATCH 046/106] isort --- pandas/tests/groupby/aggregate/test_aggregate.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index d4daa43ee76e7..fb31760a7f7cd 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -8,10 +8,8 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat -from pandas.core.aggregation import _make_unique, maybe_mangle_lambdas - import pandas._testing as tm - +from pandas.core.aggregation import _make_unique, maybe_mangle_lambdas from pandas.core.base import SpecificationError from pandas.core.groupby.grouper import Grouping From 0a278899a36b1f140f4af68a249fd5ca31309f2c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Jan 2020 22:22:50 +0100 Subject: [PATCH 047/106] fix mypy --- pandas/core/aggregation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index e038da3caa630..3605107fceebb 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -1,6 +1,6 @@ from collections import defaultdict from functools import partial -from typing import Any, Sequence +from typing import Any, DefaultDict, Sequence from pandas.core.dtypes.common import is_dict_like, is_list_like @@ -98,7 +98,7 @@ def normalize_keyword_aggregation(kwargs): # TODO: aggspec type: typing.Dict[str, List[AggScalar]] # May be hitting https://github.com/python/mypy/issues/5958 # saying it doesn't have an attribute __name__ - aggspec = defaultdict(list) + aggspec: DefaultDict = defaultdict(list) order = [] columns, pairs = list(zip(*kwargs.items())) From a66053ef5d2f655e1f11bfe2a2caee6e273a08f2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 6 Jan 2020 22:05:53 +0100 Subject: [PATCH 048/106] Code change based on review --- pandas/core/frame.py | 19 +++++++------------ pandas/core/series.py | 1 - 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cfc340b14210e..b8d7a8cadb15d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6623,8 +6623,6 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) relabeling, func, columns, order = reconstruct_func(func, *args, **kwargs) - if relabeling: - kwargs = {} result = None try: @@ -6639,18 +6637,15 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) ] - # when there are more than one column being used in aggregate, the order - # of result will be reversed, and in case the func is not used by other - # columns, there might be NaN values, so separate these two cases + # This is to keep the order to columns occurrence unchanged, and also + # keep the order of new columns occurrence unchanged reordered_result = DataFrame(index=columns) idx = 0 - for col, funcs in func.items(): - v = reordered_indexes[idx : idx + len(funcs)] - if len(func) > 1: - reordered_result.loc[v, col] = result[col][::-1].dropna().values - else: - reordered_result.loc[v, col] = result[col].values - idx = idx + len(funcs) + for col, fun in func.items(): + s = result[col] + s.index = reordered_indexes[idx : idx + len(fun)] + reordered_result[col] = s.reindex(columns) + idx = idx + len(fun) result = reordered_result return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 9fd36054ea480..084d71f85d215 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3632,7 +3632,6 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): if func is None: func = dict(kwargs.items()) - kwargs = {} result, how = self._aggregate(func, *args, **kwargs) if result is None: From 7311ef041c6991f3d2709537fd9ae9b38ae4f721 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 6 Jan 2020 22:51:37 +0100 Subject: [PATCH 049/106] dropna --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b8d7a8cadb15d..648fa2caba576 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6642,7 +6642,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): reordered_result = DataFrame(index=columns) idx = 0 for col, fun in func.items(): - s = result[col] + s = result[col].dropna() s.index = reordered_indexes[idx : idx + len(fun)] reordered_result[col] = s.reindex(columns) idx = idx + len(fun) From da2ff3758b483a824a768fe8ead0750861aadba8 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 7 Jan 2020 09:29:19 +0100 Subject: [PATCH 050/106] fix logic --- pandas/core/frame.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 648fa2caba576..f3cdc5df24121 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6641,8 +6641,15 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # keep the order of new columns occurrence unchanged reordered_result = DataFrame(index=columns) idx = 0 + + # The reason is self._aggregate outputs different type of result if + # any column is only used once in aggregation + mask = True if any([len(v) == 1 for v in func.values()]) else False for col, fun in func.items(): - s = result[col].dropna() + if mask: + s = result[col] + else: + s = result[col][::-1].dropna() s.index = reordered_indexes[idx : idx + len(fun)] reordered_result[col] = s.reindex(columns) idx = idx + len(fun) From bcc5bc342c59eb9331fb41cfeb8daf146cb74644 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 7 Jan 2020 09:59:44 +0100 Subject: [PATCH 051/106] fix logic --- pandas/core/frame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f3cdc5df24121..35e66a27be88c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6645,8 +6645,9 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # The reason is self._aggregate outputs different type of result if # any column is only used once in aggregation mask = True if any([len(v) == 1 for v in func.values()]) else False + print(mask) for col, fun in func.items(): - if mask: + if not mask: s = result[col] else: s = result[col][::-1].dropna() From 0825027c7743872af2926c62bed3883bd32527e1 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 7 Jan 2020 10:00:01 +0100 Subject: [PATCH 052/106] remove unused --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 35e66a27be88c..fddda4de5ffd1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6645,7 +6645,6 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # The reason is self._aggregate outputs different type of result if # any column is only used once in aggregation mask = True if any([len(v) == 1 for v in func.values()]) else False - print(mask) for col, fun in func.items(): if not mask: s = result[col] From d3c35f59517483cec2fb614461a1e95e809f134a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 7 Jan 2020 10:26:11 +0100 Subject: [PATCH 053/106] fix linting --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fddda4de5ffd1..74b88e298bbf4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6644,7 +6644,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # The reason is self._aggregate outputs different type of result if # any column is only used once in aggregation - mask = True if any([len(v) == 1 for v in func.values()]) else False + mask = True if any(len(v) == 1 for v in func.values()) else False for col, fun in func.items(): if not mask: s = result[col] From cef2b50010e30b500c983dad4f8477fade8d05af Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 7 Jan 2020 15:51:31 +0100 Subject: [PATCH 054/106] simpler python --- pandas/core/frame.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 74b88e298bbf4..62976b221a1ab 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6646,10 +6646,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): # any column is only used once in aggregation mask = True if any(len(v) == 1 for v in func.values()) else False for col, fun in func.items(): - if not mask: - s = result[col] - else: - s = result[col][::-1].dropna() + s = result[col][::-1].dropna() if mask else result[col] s.index = reordered_indexes[idx : idx + len(fun)] reordered_result[col] = s.reindex(columns) idx = idx + len(fun) From 3da2e2adbbcd873b1f67b8bde7f8dc0e49764f21 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 21 Jan 2020 09:39:52 +0100 Subject: [PATCH 055/106] fix merge error --- pandas/core/aggregation.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 295b2267b88df..79b87f146b9a7 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -16,12 +16,15 @@ def is_multi_agg_with_relabel(**kwargs) -> bool: """ Check whether kwargs passed to .agg look like multi-agg with relabeling. + Parameters ---------- **kwargs : dict + Returns ------- bool + Examples -------- >>> is_multi_agg_with_relabel(a='max') @@ -42,9 +45,11 @@ def normalize_keyword_aggregation(kwargs: dict) -> Tuple[dict, List[str], List[i Normalize user-provided "named aggregation" kwargs. Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs to the old Dict[str, List[scalar]]]. + Parameters ---------- kwargs : dict + Returns ------- aggspec : dict @@ -53,6 +58,7 @@ def normalize_keyword_aggregation(kwargs: dict) -> Tuple[dict, List[str], List[i The user-provided keys. col_idx_order : List[int] List of columns indices. + Examples -------- >>> normalize_keyword_aggregation({'output': ('input', 'sum')}) @@ -93,6 +99,7 @@ def _make_unique_kwarg_list( seq: Sequence[Tuple[Any, Any]] ) -> Sequence[Tuple[Any, Any]]: """Uniquify aggfunc name of the pairs in the order list + Examples: -------- >>> kwarg_list = [('a', ''), ('a', ''), ('b', '')] @@ -117,14 +124,17 @@ def _make_unique_kwarg_list( def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: """ Possibly mangle a list of aggfuncs. + Parameters ---------- aggfuncs : Sequence + Returns ------- mangled: list-like A new AggSpec sequence, where lambdas have been converted to have unique names. + Notes ----- If just one aggfunc is passed, the name will not be mangled. @@ -147,6 +157,7 @@ def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: def maybe_mangle_lambdas(agg_spec: Any) -> Any: """ Make new lambdas with unique names. + Parameters ---------- agg_spec : Any @@ -154,10 +165,12 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: Non-dict-like `agg_spec` are pass through as is. For dict-like `agg_spec` a new spec is returned with name-mangled lambdas. + Returns ------- mangled : Any Same type as the input. + Examples -------- >>> maybe_mangle_lambdas('sum') From 3ce91fc065da5d2f1bb546f53848fe6173bd8040 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 21 Jan 2020 10:04:52 +0100 Subject: [PATCH 056/106] fixup --- pandas/core/aggregation.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 79b87f146b9a7..6ab557d6e31e0 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -9,10 +9,42 @@ from pandas.core.dtypes.common import is_dict_like, is_list_like +from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.indexes.api import Index +def reconstruct_func(func, *args, **kwargs): + """ + This is the internal function to reconstruct func given if there is relabeling + or not. And also normalize the keyword to get new order of columns; + If relabeling is True, will return relabeling, reconstructed func, column + names, and the reconstructed order of columns. + If relabeling is False, the columns and order will be None. + """ + relabeling = func is None and is_multi_agg_with_relabel(**kwargs) + if relabeling: + func, columns, order = normalize_keyword_aggregation(kwargs) + + elif isinstance(func, list) and len(func) > len(set(func)): + + # GH 28426 will raise error if duplicated function names are used and + # there is no reassigned name + raise SpecificationError( + "Function names must be unique if there is no new column " "names assigned" + ) + elif func is None: + # nicer error message + raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") + + func = maybe_mangle_lambdas(func) + if not relabeling: + columns = None + order = None + + return relabeling, func, columns, order + + def is_multi_agg_with_relabel(**kwargs) -> bool: """ Check whether kwargs passed to .agg look like multi-agg with relabeling. From 1426ee27d9195d256fb76d4c24f968d4de789c59 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 21 Jan 2020 10:40:56 +0100 Subject: [PATCH 057/106] fix annotation --- pandas/core/aggregation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 6ab557d6e31e0..bf3df9388a457 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -5,7 +5,7 @@ from collections import defaultdict from functools import partial -from typing import Any, DefaultDict, List, Sequence, Tuple +from typing import Any, DefaultDict, List, Optional, Sequence, Tuple from pandas.core.dtypes.common import is_dict_like, is_list_like @@ -23,6 +23,9 @@ def reconstruct_func(func, *args, **kwargs): If relabeling is False, the columns and order will be None. """ relabeling = func is None and is_multi_agg_with_relabel(**kwargs) + columns: Optional[List[int]] = None + order: Optional[List[int]] = None + if relabeling: func, columns, order = normalize_keyword_aggregation(kwargs) @@ -31,16 +34,13 @@ def reconstruct_func(func, *args, **kwargs): # GH 28426 will raise error if duplicated function names are used and # there is no reassigned name raise SpecificationError( - "Function names must be unique if there is no new column " "names assigned" + "Function names must be unique if there is no new column names assigned" ) elif func is None: # nicer error message raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") func = maybe_mangle_lambdas(func) - if not relabeling: - columns = None - order = None return relabeling, func, columns, order From 5893a0eb93829aa63819a91eb849a71afd923242 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 21 Jan 2020 11:04:33 +0100 Subject: [PATCH 058/106] fix annotation --- pandas/core/aggregation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index bf3df9388a457..b8431f1d2f5a1 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -23,7 +23,7 @@ def reconstruct_func(func, *args, **kwargs): If relabeling is False, the columns and order will be None. """ relabeling = func is None and is_multi_agg_with_relabel(**kwargs) - columns: Optional[List[int]] = None + columns: Optional[List[str]] = None order: Optional[List[int]] = None if relabeling: From 0f55073834e7c088b60b57891b543007d572e0be Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 25 Jan 2020 20:41:45 +0100 Subject: [PATCH 059/106] move code --- pandas/core/aggregation.py | 8 ++++---- pandas/core/groupby/generic.py | 20 ++------------------ 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index b8431f1d2f5a1..508c1d522c323 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -26,10 +26,7 @@ def reconstruct_func(func, *args, **kwargs): columns: Optional[List[str]] = None order: Optional[List[int]] = None - if relabeling: - func, columns, order = normalize_keyword_aggregation(kwargs) - - elif isinstance(func, list) and len(func) > len(set(func)): + if isinstance(func, list) and len(func) > len(set(func)): # GH 28426 will raise error if duplicated function names are used and # there is no reassigned name @@ -40,6 +37,9 @@ def reconstruct_func(func, *args, **kwargs): # nicer error message raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") + if relabeling: + func, columns, order = normalize_keyword_aggregation(kwargs) + func = maybe_mangle_lambdas(func) return relabeling, func, columns, order diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 98cdcd0f2b6ee..374fcd86c04fc 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -55,6 +55,7 @@ is_multi_agg_with_relabel, maybe_mangle_lambdas, normalize_keyword_aggregation, + reconstruct_func, ) import pandas.core.algorithms as algorithms from pandas.core.base import DataError, SpecificationError @@ -921,24 +922,7 @@ class DataFrameGroupBy(GroupBy): @Appender(_shared_docs["aggregate"]) def aggregate(self, func=None, *args, **kwargs): - relabeling = func is None and is_multi_agg_with_relabel(**kwargs) - if relabeling: - func, columns, order = normalize_keyword_aggregation(kwargs) - - kwargs = {} - elif isinstance(func, list) and len(func) > len(set(func)): - - # GH 28426 will raise error if duplicated function names are used and - # there is no reassigned name - raise SpecificationError( - "Function names must be unique if there is no new column " - "names assigned" - ) - elif func is None: - # nicer error message - raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") - - func = maybe_mangle_lambdas(func) + relabeling, func, columns, order = reconstruct_func(func, *args, **kwargs) result, how = self._aggregate(func, *args, **kwargs) if how is None: From 90d52bad4d6c4e5ccb5324e72c9561966cd9ccb2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 25 Jan 2020 20:59:00 +0100 Subject: [PATCH 060/106] move it back --- pandas/core/aggregation.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 508c1d522c323..d9edee9690e79 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -17,16 +17,25 @@ def reconstruct_func(func, *args, **kwargs): """ This is the internal function to reconstruct func given if there is relabeling - or not. And also normalize the keyword to get new order of columns; + or not and also normalize the keyword to get new order of columns. + If relabeling is True, will return relabeling, reconstructed func, column names, and the reconstructed order of columns. If relabeling is False, the columns and order will be None. + + Parameters + ---------- + func: aggregated function + **kwargs: dict """ relabeling = func is None and is_multi_agg_with_relabel(**kwargs) columns: Optional[List[str]] = None order: Optional[List[int]] = None - if isinstance(func, list) and len(func) > len(set(func)): + if relabeling: + func, columns, order = normalize_keyword_aggregation(kwargs) + + elif isinstance(func, list) and len(func) > len(set(func)): # GH 28426 will raise error if duplicated function names are used and # there is no reassigned name @@ -37,9 +46,6 @@ def reconstruct_func(func, *args, **kwargs): # nicer error message raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") - if relabeling: - func, columns, order = normalize_keyword_aggregation(kwargs) - func = maybe_mangle_lambdas(func) return relabeling, func, columns, order From 381a69749d7378974e2320d3a230a0ddeb2b322f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 25 Jan 2020 21:02:17 +0100 Subject: [PATCH 061/106] fixup --- pandas/core/aggregation.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index d9edee9690e79..08d6aeba254e2 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -32,20 +32,22 @@ def reconstruct_func(func, *args, **kwargs): columns: Optional[List[str]] = None order: Optional[List[int]] = None + if not relabeling: + if isinstance(func, list) and len(func) > len(set(func)): + + # GH 28426 will raise error if duplicated function names are used and + # there is no reassigned name + raise SpecificationError( + "Function names must be unique if there is no new column names" + " assigned" + ) + elif func is None: + # nicer error message + raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") + if relabeling: func, columns, order = normalize_keyword_aggregation(kwargs) - elif isinstance(func, list) and len(func) > len(set(func)): - - # GH 28426 will raise error if duplicated function names are used and - # there is no reassigned name - raise SpecificationError( - "Function names must be unique if there is no new column names assigned" - ) - elif func is None: - # nicer error message - raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") - func = maybe_mangle_lambdas(func) return relabeling, func, columns, order From 238b4cc7feefec3475694c4bd5abb662f5a94954 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 25 Jan 2020 21:38:56 +0100 Subject: [PATCH 062/106] add docstring --- pandas/core/aggregation.py | 19 ++++++++++++++++++- pandas/core/groupby/generic.py | 2 -- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 08d6aeba254e2..d811327af29ea 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -14,7 +14,9 @@ from pandas.core.indexes.api import Index -def reconstruct_func(func, *args, **kwargs): +def reconstruct_func( + func: Optional[List[Any]], *args, **kwargs: dict +) -> Tuple[bool, Any, List[str], List[int]]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. @@ -27,6 +29,21 @@ def reconstruct_func(func, *args, **kwargs): ---------- func: aggregated function **kwargs: dict + + Returns + ------- + relabelling: bool, if there is relabelling or not + func: normalized and mangled func + columns: list of column names + order: list of columns indices + + Examples + -------- + >>> reconstruct_func({"foo": ("col", "min")}) + True, {"col": ["min"]}, ("foo",), [0] + + >>> reconstruct_func("min") + False, "min", None, None """ relabeling = func is None and is_multi_agg_with_relabel(**kwargs) columns: Optional[List[str]] = None diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 374fcd86c04fc..05446d96570ea 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -52,9 +52,7 @@ from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna from pandas.core.aggregation import ( - is_multi_agg_with_relabel, maybe_mangle_lambdas, - normalize_keyword_aggregation, reconstruct_func, ) import pandas.core.algorithms as algorithms From f8e1891f89dd8a84e18d9b1275d43cbcf5b0bdef Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 25 Jan 2020 22:40:51 +0100 Subject: [PATCH 063/106] add func --- pandas/core/aggregation.py | 2 +- pandas/core/frame.py | 42 +++++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index d811327af29ea..4dd84f1d7c33a 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -16,7 +16,7 @@ def reconstruct_func( func: Optional[List[Any]], *args, **kwargs: dict -) -> Tuple[bool, Any, List[str], List[int]]: +) -> Tuple[bool, Any, Optional[List[str]], Optional[List[int]]]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5eb36b976b362..96de67ba8b243 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6756,26 +6756,34 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): return self.apply(func, axis=axis, args=args, **kwargs) if relabeling: - reordered_indexes = [ - pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) - ] + result = self._relabel_result(result, func, columns, order) - # This is to keep the order to columns occurrence unchanged, and also - # keep the order of new columns occurrence unchanged - reordered_result = DataFrame(index=columns) - idx = 0 - - # The reason is self._aggregate outputs different type of result if - # any column is only used once in aggregation - mask = True if any(len(v) == 1 for v in func.values()) else False - for col, fun in func.items(): - s = result[col][::-1].dropna() if mask else result[col] - s.index = reordered_indexes[idx : idx + len(fun)] - reordered_result[col] = s.reindex(columns) - idx = idx + len(fun) - result = reordered_result return result + @staticmethod + def _relabel_result(result, func, columns, order): + """Internal function to reorder result if relabelling.""" + + reordered_indexes = [ + pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) + ] + + # This is to keep the order to columns occurrence unchanged, and also + # keep the order of new columns occurrence unchanged + reordered_result = DataFrame(index=columns) + idx = 0 + + # The reason is self._aggregate outputs different type of result if + # any column is only used once in aggregation + mask = isinstance(result, ABCDataFrame) and result.isna().any().any() + + for col, fun in func.items(): + s = result[col][::-1].dropna() if mask else result[col] + s.index = reordered_indexes[idx: idx + len(fun)] + reordered_result[col] = s.reindex(columns) + idx = idx + len(fun) + return reordered_result + def _aggregate(self, arg, axis=0, *args, **kwargs): if axis == 1: # NDFrame.aggregate returns a tuple, and we need to transpose From 66e9b3863e62bc495e478dc9fb6894ed494c86b1 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 25 Jan 2020 22:41:36 +0100 Subject: [PATCH 064/106] isort --- pandas/core/groupby/generic.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 05446d96570ea..1fa16d48b0f49 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -51,10 +51,7 @@ ) from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna -from pandas.core.aggregation import ( - maybe_mangle_lambdas, - reconstruct_func, -) +from pandas.core.aggregation import maybe_mangle_lambdas, reconstruct_func import pandas.core.algorithms as algorithms from pandas.core.base import DataError, SpecificationError import pandas.core.common as com From f4d8a4f73b2d12cf2f5c863f2c24f86627addd6b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 26 Jan 2020 08:36:50 +0100 Subject: [PATCH 065/106] fix linting --- pandas/core/aggregation.py | 2 +- pandas/core/frame.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 4dd84f1d7c33a..b40e79fd28294 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -15,7 +15,7 @@ def reconstruct_func( - func: Optional[List[Any]], *args, **kwargs: dict + func: Optional[List[Any], dict], *args, **kwargs: dict ) -> Tuple[bool, Any, Optional[List[str]], Optional[List[int]]]: """ This is the internal function to reconstruct func given if there is relabeling diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 96de67ba8b243..c2813a55ab585 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6779,7 +6779,7 @@ def _relabel_result(result, func, columns, order): for col, fun in func.items(): s = result[col][::-1].dropna() if mask else result[col] - s.index = reordered_indexes[idx: idx + len(fun)] + s.index = reordered_indexes[idx : idx + len(fun)] reordered_result[col] = s.reindex(columns) idx = idx + len(fun) return reordered_result From c3e34a03c747b298fa7729f9603b70e94009f9a7 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 26 Jan 2020 09:11:04 +0100 Subject: [PATCH 066/106] fix linting --- pandas/core/aggregation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index b40e79fd28294..b3ea76c353c14 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -5,7 +5,7 @@ from collections import defaultdict from functools import partial -from typing import Any, DefaultDict, List, Optional, Sequence, Tuple +from typing import Any, DefaultDict, List, Optional, Sequence, Tuple, Union from pandas.core.dtypes.common import is_dict_like, is_list_like @@ -15,7 +15,7 @@ def reconstruct_func( - func: Optional[List[Any], dict], *args, **kwargs: dict + func: Optional[Union[List[Any], dict]], *args, **kwargs: dict ) -> Tuple[bool, Any, Optional[List[str]], Optional[List[int]]]: """ This is the internal function to reconstruct func given if there is relabeling From 88c775176d578d570e63157a9c79da60906358f3 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 2 Feb 2020 21:06:48 +0100 Subject: [PATCH 067/106] code change on JR reviews --- pandas/core/aggregation.py | 21 ++++ pandas/tests/frame/test_apply.py | 98 ------------------ pandas/tests/frame/test_apply_relabeling.py | 103 +++++++++++++++++++ pandas/tests/series/test_apply.py | 31 ------ pandas/tests/series/test_apply_relabeling.py | 33 ++++++ 5 files changed, 157 insertions(+), 129 deletions(-) create mode 100644 pandas/tests/frame/test_apply_relabeling.py create mode 100644 pandas/tests/series/test_apply_relabeling.py diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index b3ea76c353c14..dac500ef91ee1 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -12,6 +12,7 @@ from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.indexes.api import Index +from pandas.core.dtypes.generic import ABCDataFrame def reconstruct_func( @@ -253,3 +254,23 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: mangled_aggspec = _managle_lambda_list(agg_spec) return mangled_aggspec + + +def _relabel_result(result, reordered_result, func, columns, order): + """Internal function to reorder result if relabelling for dataframe.agg.""" + + reordered_indexes = [ + pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) + ] + idx = 0 + + # The reason is self._aggregate outputs different type of result if + # any column is only used once in aggregation + mask = isinstance(result, ABCDataFrame) and result.isna().any().any() + + for col, fun in func.items(): + s = result[col][::-1].dropna() if mask else result[col] + s.index = reordered_indexes[idx : idx + len(fun)] + reordered_result[col] = s.reindex(columns) + idx = idx + len(fun) + return reordered_result diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index c922ee470056d..e98f74e133ea9 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1404,101 +1404,3 @@ def test_consistency_of_aggregates_of_columns_with_missing_values(self, df, meth tm.assert_series_equal( none_in_first_column_result, none_in_second_column_result ) - - -class TestDataFrameNamedAggregate: - def test_agg_relabel(self): - # GH 26513 - df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) - - # simplest case with one column, one func - result = df.agg(foo=("B", "sum")) - expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"])) - tm.assert_frame_equal(result, expected) - - # test on same column with different methods - result = df.agg(foo=("B", "sum"), bar=("B", "min")) - expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) - - tm.assert_frame_equal(result, expected) - - def test_agg_relabel_multi_columns_multi_methods(self): - # GH 26513, test on multiple columns with multiple methods - df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) - result = df.agg( - foo=("A", "sum"), - bar=("B", "mean"), - cat=("A", "min"), - dat=("B", "max"), - f=("A", "max"), - g=("C", "min"), - ) - expected = pd.DataFrame( - { - "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan], - "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan], - "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0], - }, - index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]), - ) - tm.assert_frame_equal(result, expected) - - def test_agg_relable_partial_functions(self): - # GH 26513, test on partial, functools or more complex cases - df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) - result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) - expected = pd.DataFrame( - {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) - ) - tm.assert_frame_equal(result, expected) - - result = df.agg( - foo=("A", min), - bar=("A", np.min), - cat=("B", max), - dat=("C", "min"), - f=("B", np.sum), - ) - expected = pd.DataFrame( - { - "A": [1.0, 1.0, np.nan, np.nan, np.nan], - "B": [np.nan, np.nan, 10.0, np.nan, 4.0], - "C": [np.nan, np.nan, np.nan, 3.0, np.nan], - }, - index=pd.Index(["foo", "bar", "cat", "dat", "f"]), - ) - tm.assert_frame_equal(result, expected) - - def test_agg_namedtuple(self): - # GH 26513 - df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) - result = df.agg( - foo=pd.NamedAgg("B", "sum"), - bar=pd.NamedAgg("B", min), - cat=pd.NamedAgg(column="B", aggfunc="count"), - fft=pd.NamedAgg("B", aggfunc="max"), - ) - - expected = pd.DataFrame( - {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) - ) - tm.assert_frame_equal(result, expected) - - result = df.agg( - foo=pd.NamedAgg("A", "min"), - bar=pd.NamedAgg(column="B", aggfunc="max"), - cat=pd.NamedAgg(column="A", aggfunc="max"), - ) - expected = pd.DataFrame( - {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]}, - index=pd.Index(["foo", "bar", "cat"]), - ) - tm.assert_frame_equal(result, expected) - - def test_agg_raises(self): - # GH 26513 - df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) - msg = "Must provide" - - with pytest.raises(TypeError, match=msg): - df.agg() diff --git a/pandas/tests/frame/test_apply_relabeling.py b/pandas/tests/frame/test_apply_relabeling.py new file mode 100644 index 0000000000000..15292b8024c2f --- /dev/null +++ b/pandas/tests/frame/test_apply_relabeling.py @@ -0,0 +1,103 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestDataFrameNamedAggregate: + def test_agg_relabel(self): + # GH 26513 + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + + # simplest case with one column, one func + result = df.agg(foo=("B", "sum")) + expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"])) + tm.assert_frame_equal(result, expected) + + # test on same column with different methods + result = df.agg(foo=("B", "sum"), bar=("B", "min")) + expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) + + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_multi_columns_multi_methods(self): + # GH 26513, test on multiple columns with multiple methods + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.agg( + foo=("A", "sum"), + bar=("B", "mean"), + cat=("A", "min"), + dat=("B", "max"), + f=("A", "max"), + g=("C", "min"), + ) + expected = pd.DataFrame( + { + "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan], + "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0], + }, + index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]), + ) + tm.assert_frame_equal(result, expected) + + def test_agg_relable_partial_functions(self): + # GH 26513, test on partial, functools or more complex cases + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) + expected = pd.DataFrame( + {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=("A", min), + bar=("A", np.min), + cat=("B", max), + dat=("C", "min"), + f=("B", np.sum), + ) + expected = pd.DataFrame( + { + "A": [1.0, 1.0, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, 10.0, np.nan, 4.0], + "C": [np.nan, np.nan, np.nan, 3.0, np.nan], + }, + index=pd.Index(["foo", "bar", "cat", "dat", "f"]), + ) + tm.assert_frame_equal(result, expected) + + def test_agg_namedtuple(self): + # GH 26513 + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + result = df.agg( + foo=pd.NamedAgg("B", "sum"), + bar=pd.NamedAgg("B", min), + cat=pd.NamedAgg(column="B", aggfunc="count"), + fft=pd.NamedAgg("B", aggfunc="max"), + ) + + expected = pd.DataFrame( + {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=pd.NamedAgg("A", "min"), + bar=pd.NamedAgg(column="B", aggfunc="max"), + cat=pd.NamedAgg(column="A", aggfunc="max"), + ) + expected = pd.DataFrame( + {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]}, + index=pd.Index(["foo", "bar", "cat"]), + ) + tm.assert_frame_equal(result, expected) + + def test_agg_raises(self): + # GH 26513 + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + msg = "Must provide" + + with pytest.raises(TypeError, match=msg): + df.agg() diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 0b319f7a623c6..a4c55a80a9f0f 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -787,34 +787,3 @@ def test_map_float_to_string_precision(self): result = ser.map(lambda val: str(val)).to_dict() expected = {0: "0.3333333333333333"} assert result == expected - - -class TestNamedAggregation: - def test_relabel_no_duplicated_method(self): - # this is to test there is no duplicated method used in agg - df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) - - result = df["A"].agg(foo="sum") - expected = df["A"].agg({"foo": "sum"}) - tm.assert_series_equal(result, expected) - - result = df.B.agg(foo="min", bar="max") - expected = df.B.agg({"foo": "min", "bar": "max"}) - tm.assert_series_equal(result, expected) - - result = df.B.agg(foo=sum, bar=min, cat="max") - expected = df.B.agg({"foo": sum, "bar": min, "cat": "max"}) - tm.assert_series_equal(result, expected) - - def test_relabel_duplicated_method(self): - # this is to test with nested renaming, duplicated method can be used - # if they are assigned with different new names - df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) - - result = df.A.agg(foo="sum", bar="sum") - expected = pd.Series([6, 6], index=["foo", "bar"], name="A") - tm.assert_series_equal(result, expected) - - result = df.B.agg(foo=min, bar="min") - expected = pd.Series([1, 1], index=["foo", "bar"], name="B") - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_apply_relabeling.py b/pandas/tests/series/test_apply_relabeling.py new file mode 100644 index 0000000000000..5fb6b6688e7b7 --- /dev/null +++ b/pandas/tests/series/test_apply_relabeling.py @@ -0,0 +1,33 @@ +import pandas as pd +import pandas._testing as tm + + +class TestNamedAggregation: + def test_relabel_no_duplicated_method(self): + # this is to test there is no duplicated method used in agg + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) + + result = df["A"].agg(foo="sum") + expected = df["A"].agg({"foo": "sum"}) + tm.assert_series_equal(result, expected) + + result = df.B.agg(foo="min", bar="max") + expected = df.B.agg({"foo": "min", "bar": "max"}) + tm.assert_series_equal(result, expected) + + result = df.B.agg(foo=sum, bar=min, cat="max") + expected = df.B.agg({"foo": sum, "bar": min, "cat": "max"}) + tm.assert_series_equal(result, expected) + + def test_relabel_duplicated_method(self): + # this is to test with nested renaming, duplicated method can be used + # if they are assigned with different new names + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) + + result = df.A.agg(foo="sum", bar="sum") + expected = pd.Series([6, 6], index=["foo", "bar"], name="A") + tm.assert_series_equal(result, expected) + + result = df.B.agg(foo=min, bar="min") + expected = pd.Series([1, 1], index=["foo", "bar"], name="B") + tm.assert_series_equal(result, expected) From e2b957a284ad993e19a680f3324550e44bef860f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 2 Feb 2020 21:08:25 +0100 Subject: [PATCH 068/106] move --- pandas/core/frame.py | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b311f562b7acc..84179ad48e0ac 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -103,7 +103,7 @@ from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor -from pandas.core.aggregation import reconstruct_func +from pandas.core.aggregation import reconstruct_func, _relabel_result from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor @@ -6782,34 +6782,13 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): return self.apply(func, axis=axis, args=args, **kwargs) if relabeling: - result = self._relabel_result(result, func, columns, order) + # This is to keep the order to columns occurrence unchanged, and also + # keep the order of new columns occurrence unchanged + reordered_result = DataFrame(index=columns) + result = _relabel_result(result, reordered_result, func, columns, order) return result - @staticmethod - def _relabel_result(result, func, columns, order): - """Internal function to reorder result if relabelling.""" - - reordered_indexes = [ - pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) - ] - - # This is to keep the order to columns occurrence unchanged, and also - # keep the order of new columns occurrence unchanged - reordered_result = DataFrame(index=columns) - idx = 0 - - # The reason is self._aggregate outputs different type of result if - # any column is only used once in aggregation - mask = isinstance(result, ABCDataFrame) and result.isna().any().any() - - for col, fun in func.items(): - s = result[col][::-1].dropna() if mask else result[col] - s.index = reordered_indexes[idx : idx + len(fun)] - reordered_result[col] = s.reindex(columns) - idx = idx + len(fun) - return reordered_result - def _aggregate(self, arg, axis=0, *args, **kwargs): if axis == 1: # NDFrame.aggregate returns a tuple, and we need to transpose From 99f75b2892aa6932e67d1d89560a4c703095f2b7 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 2 Feb 2020 21:09:14 +0100 Subject: [PATCH 069/106] linting --- pandas/core/aggregation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index dac500ef91ee1..842e590371dc8 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -8,11 +8,11 @@ from typing import Any, DefaultDict, List, Optional, Sequence, Tuple, Union from pandas.core.dtypes.common import is_dict_like, is_list_like +from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.indexes.api import Index -from pandas.core.dtypes.generic import ABCDataFrame def reconstruct_func( From 30b729637b56e731c85fdb006d0ea83276bfc4d3 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 2 Feb 2020 21:39:34 +0100 Subject: [PATCH 070/106] isort --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 84179ad48e0ac..c7aff43d43d5c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -103,7 +103,7 @@ from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor -from pandas.core.aggregation import reconstruct_func, _relabel_result +from pandas.core.aggregation import _relabel_result, reconstruct_func from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor From baea583fb048888faa4108d2c1b7cf2229936e27 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 12 Feb 2020 08:44:54 +0100 Subject: [PATCH 071/106] code change --- pandas/core/aggregation.py | 12 +++++++----- pandas/tests/series/test_apply_relabeling.py | 12 ++++++------ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 842e590371dc8..46f4e6d4c5229 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -5,7 +5,9 @@ from collections import defaultdict from functools import partial -from typing import Any, DefaultDict, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, DefaultDict, List, Optional, Sequence, Tuple, Union + +from pandas._typing import Label from pandas.core.dtypes.common import is_dict_like, is_list_like from pandas.core.dtypes.generic import ABCDataFrame @@ -16,8 +18,8 @@ def reconstruct_func( - func: Optional[Union[List[Any], dict]], *args, **kwargs: dict -) -> Tuple[bool, Any, Optional[List[str]], Optional[List[int]]]: + func: Optional[Union[List, dict]], *args, **kwargs +) -> Tuple[bool, Optional[Callable], Optional[List[Label]], Optional[List[int]]]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. @@ -47,8 +49,8 @@ def reconstruct_func( False, "min", None, None """ relabeling = func is None and is_multi_agg_with_relabel(**kwargs) - columns: Optional[List[str]] = None - order: Optional[List[int]] = None + columns: Optional[List[str]] + order: Optional[List[int]] if not relabeling: if isinstance(func, list) and len(func) > len(set(func)): diff --git a/pandas/tests/series/test_apply_relabeling.py b/pandas/tests/series/test_apply_relabeling.py index 5fb6b6688e7b7..0b8d2c4e1f26d 100644 --- a/pandas/tests/series/test_apply_relabeling.py +++ b/pandas/tests/series/test_apply_relabeling.py @@ -11,12 +11,12 @@ def test_relabel_no_duplicated_method(self): expected = df["A"].agg({"foo": "sum"}) tm.assert_series_equal(result, expected) - result = df.B.agg(foo="min", bar="max") - expected = df.B.agg({"foo": "min", "bar": "max"}) + result = df["B"].agg(foo="min", bar="max") + expected = df["B"].agg({"foo": "min", "bar": "max"}) tm.assert_series_equal(result, expected) - result = df.B.agg(foo=sum, bar=min, cat="max") - expected = df.B.agg({"foo": sum, "bar": min, "cat": "max"}) + result = df["B"].agg(foo=sum, bar=min, cat="max") + expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"}) tm.assert_series_equal(result, expected) def test_relabel_duplicated_method(self): @@ -24,10 +24,10 @@ def test_relabel_duplicated_method(self): # if they are assigned with different new names df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) - result = df.A.agg(foo="sum", bar="sum") + result = df["A"].agg(foo="sum", bar="sum") expected = pd.Series([6, 6], index=["foo", "bar"], name="A") tm.assert_series_equal(result, expected) - result = df.B.agg(foo=min, bar="min") + result = df["B"].agg(foo=min, bar="min") expected = pd.Series([1, 1], index=["foo", "bar"], name="B") tm.assert_series_equal(result, expected) From 04bffe6f05a32fcfd525acdd623693efa2d44a79 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 12 Feb 2020 08:48:27 +0100 Subject: [PATCH 072/106] add docstring --- pandas/core/aggregation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 46f4e6d4c5229..00997d7cfa090 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -31,7 +31,8 @@ def reconstruct_func( Parameters ---------- func: aggregated function - **kwargs: dict + **kwargs: dict, kwargs used in is_multi_agg_with_relabel and + normalize_keyword_aggregation function for relabelling Returns ------- From 42091c3434106785fc215d3ba58894b36c5dbb3c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 12 Feb 2020 09:08:00 +0100 Subject: [PATCH 073/106] add None back --- pandas/core/aggregation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 00997d7cfa090..da3ea379834f5 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -50,8 +50,8 @@ def reconstruct_func( False, "min", None, None """ relabeling = func is None and is_multi_agg_with_relabel(**kwargs) - columns: Optional[List[str]] - order: Optional[List[int]] + columns: Optional[List[str]] = None + order: Optional[List[int]] = None if not relabeling: if isinstance(func, list) and len(func) > len(set(func)): From fc13e193a1c5770da832c9f8c30f4b6f0c05aab8 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 12 Feb 2020 09:31:52 +0100 Subject: [PATCH 074/106] fix annotation --- pandas/core/aggregation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index da3ea379834f5..10d66b79c719d 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -5,7 +5,7 @@ from collections import defaultdict from functools import partial -from typing import Any, Callable, DefaultDict, List, Optional, Sequence, Tuple, Union +from typing import Any, DefaultDict, List, Optional, Sequence, Tuple, Union from pandas._typing import Label @@ -19,7 +19,7 @@ def reconstruct_func( func: Optional[Union[List, dict]], *args, **kwargs -) -> Tuple[bool, Optional[Callable], Optional[List[Label]], Optional[List[int]]]: +) -> Tuple[bool, Any, Optional[List[Label]], Optional[List[int]]]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. From 1403426fe9662a2788611b5f7773e235e811f8f1 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 12 Feb 2020 09:36:22 +0100 Subject: [PATCH 075/106] better annotation --- pandas/core/aggregation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 10d66b79c719d..bd46dc565c137 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -5,7 +5,7 @@ from collections import defaultdict from functools import partial -from typing import Any, DefaultDict, List, Optional, Sequence, Tuple, Union +from typing import Any, DefaultDict, Dict, List, Optional, Sequence, Tuple, Union from pandas._typing import Label @@ -18,8 +18,8 @@ def reconstruct_func( - func: Optional[Union[List, dict]], *args, **kwargs -) -> Tuple[bool, Any, Optional[List[Label]], Optional[List[int]]]: + func: Optional[Union[List, Dict]], *args, **kwargs +) -> Tuple[bool, Optional[List, Dict], Optional[List[Label]], Optional[List[int]]]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. From 3d9655e4a5ba1c705403aa217097a1602c0ee460 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 12 Feb 2020 10:03:24 +0100 Subject: [PATCH 076/106] fix annotation --- pandas/core/aggregation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index bd46dc565c137..7c705b1720259 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -19,7 +19,9 @@ def reconstruct_func( func: Optional[Union[List, Dict]], *args, **kwargs -) -> Tuple[bool, Optional[List, Dict], Optional[List[Label]], Optional[List[int]]]: +) -> Tuple[ + bool, Optional[Union[List, Dict]], Optional[List[Label]], Optional[List[int]] +]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. From d78c57c59b893772965b36eb2186f005e18e384c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 12 Feb 2020 10:25:48 +0100 Subject: [PATCH 077/106] fix annotation --- pandas/core/aggregation.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 7c705b1720259..fddf858918d5d 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -7,8 +7,6 @@ from functools import partial from typing import Any, DefaultDict, Dict, List, Optional, Sequence, Tuple, Union -from pandas._typing import Label - from pandas.core.dtypes.common import is_dict_like, is_list_like from pandas.core.dtypes.generic import ABCDataFrame @@ -20,7 +18,7 @@ def reconstruct_func( func: Optional[Union[List, Dict]], *args, **kwargs ) -> Tuple[ - bool, Optional[Union[List, Dict]], Optional[List[Label]], Optional[List[int]] + bool, Optional[Union[List, Dict]], Optional[List[str]], Optional[List[int]] ]: """ This is the internal function to reconstruct func given if there is relabeling From 04879281311d67c95a3288b62584391b17ca7822 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 16 Feb 2020 17:12:15 +0100 Subject: [PATCH 078/106] fix linting --- pandas/core/aggregation.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index fddf858918d5d..7a8d912d148f3 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -17,9 +17,7 @@ def reconstruct_func( func: Optional[Union[List, Dict]], *args, **kwargs -) -> Tuple[ - bool, Optional[Union[List, Dict]], Optional[List[str]], Optional[List[int]] -]: +) -> Tuple[bool, Optional[Union[List, Dict]], Optional[List[str]], Optional[List[int]]]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. From cd8d00f66a02df84b9ce820b39e17867b38a7d36 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 30 Mar 2020 20:40:26 +0200 Subject: [PATCH 079/106] simpler python --- pandas/core/aggregation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 7a1b0f7948b5a..6c9e2b2556170 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -7,6 +7,8 @@ from functools import partial from typing import Any, DefaultDict, Dict, List, Optional, Sequence, Tuple, Union +import numpy as np + from pandas.core.dtypes.common import is_dict_like, is_list_like from pandas.core.dtypes.generic import ABCDataFrame @@ -268,7 +270,7 @@ def _relabel_result(result, reordered_result, func, columns, order): # The reason is self._aggregate outputs different type of result if # any column is only used once in aggregation - mask = isinstance(result, ABCDataFrame) and result.isna().any().any() + mask = isinstance(result, ABCDataFrame) and np.any(result.isna()) for col, fun in func.items(): s = result[col][::-1].dropna() if mask else result[col] From 6dddd5513fb132159af84ed0970affdbf8d583be Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 30 Mar 2020 20:43:11 +0200 Subject: [PATCH 080/106] simpler python --- pandas/core/aggregation.py | 2 +- pandas/core/frame.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 6c9e2b2556170..f04ccc4a502f8 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -18,7 +18,7 @@ def reconstruct_func( - func: Optional[Union[List, Dict]], *args, **kwargs + func: Optional[Union[List, Dict]], **kwargs ) -> Tuple[bool, Optional[Union[List, Dict]], Optional[List[str]], Optional[List[int]]]: """ This is the internal function to reconstruct func given if there is relabeling diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b570216646d05..b6c1e54fea8ac 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6769,7 +6769,7 @@ def _gotitem( def aggregate(self, func=None, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) - relabeling, func, columns, order = reconstruct_func(func, *args, **kwargs) + relabeling, func, columns, order = reconstruct_func(func, **kwargs) result = None try: From 96dc3edd69982723f89aed5129b7ff3bef4e34c5 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 30 Mar 2020 21:13:51 +0200 Subject: [PATCH 081/106] fixup --- pandas/core/aggregation.py | 4 ++-- pandas/core/groupby/generic.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index f04ccc4a502f8..ebd04a08fd6d6 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -59,8 +59,8 @@ def reconstruct_func( # GH 28426 will raise error if duplicated function names are used and # there is no reassigned name raise SpecificationError( - "Function names must be unique if there is no new column names" - " assigned" + "Function names must be unique if there is no new column names " + "assigned" ) elif func is None: # nicer error message diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f971418646082..1d2b03e637a95 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -907,7 +907,7 @@ class DataFrameGroupBy(GroupBy): @Appender(_shared_docs["aggregate"]) def aggregate(self, func=None, *args, **kwargs): - relabeling, func, columns, order = reconstruct_func(func, *args, **kwargs) + relabeling, func, columns, order = reconstruct_func(func, **kwargs) result, how = self._aggregate(func, *args, **kwargs) if how is None: From 075b85b9f4ebfea97c407fa8a503ee84f78c0649 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 30 Mar 2020 21:51:46 +0200 Subject: [PATCH 082/106] simplification --- pandas/core/aggregation.py | 8 ++++---- pandas/core/frame.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index ebd04a08fd6d6..513bb32c746e7 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -260,7 +260,7 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: return mangled_aggspec -def _relabel_result(result, reordered_result, func, columns, order): +def _relabel_result(result, func, columns, order): """Internal function to reorder result if relabelling for dataframe.agg.""" reordered_indexes = [ @@ -271,10 +271,10 @@ def _relabel_result(result, reordered_result, func, columns, order): # The reason is self._aggregate outputs different type of result if # any column is only used once in aggregation mask = isinstance(result, ABCDataFrame) and np.any(result.isna()) - + reordered_result_in_dict = {} for col, fun in func.items(): s = result[col][::-1].dropna() if mask else result[col] s.index = reordered_indexes[idx : idx + len(fun)] - reordered_result[col] = s.reindex(columns) + reordered_result_in_dict[col] = s.reindex(columns) idx = idx + len(fun) - return reordered_result + return reordered_result_in_dict diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b6c1e54fea8ac..bfb9ad40f0a97 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6782,8 +6782,8 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): if relabeling: # This is to keep the order to columns occurrence unchanged, and also # keep the order of new columns occurrence unchanged - reordered_result = DataFrame(index=columns) - result = _relabel_result(result, reordered_result, func, columns, order) + result_in_dict = _relabel_result(result, func, columns, order) + result = DataFrame(result_in_dict, index=columns) return result From a44471ce841cab916ea9de50da63cfd0cb0c426f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 30 Mar 2020 21:55:17 +0200 Subject: [PATCH 083/106] better docs --- pandas/core/aggregation.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 513bb32c746e7..b76180b96c28a 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -260,18 +260,20 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: return mangled_aggspec -def _relabel_result(result, func, columns, order): - """Internal function to reorder result if relabelling for dataframe.agg.""" +def _relabel_result(result, func, columns, order) -> Dict: + """Internal function to reorder result if relabelling is True for + dataframe.agg, and return the reordered result in dict.""" reordered_indexes = [ pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) ] + reordered_result_in_dict = {} idx = 0 # The reason is self._aggregate outputs different type of result if # any column is only used once in aggregation mask = isinstance(result, ABCDataFrame) and np.any(result.isna()) - reordered_result_in_dict = {} + for col, fun in func.items(): s = result[col][::-1].dropna() if mask else result[col] s.index = reordered_indexes[idx : idx + len(fun)] From 2fb4b27fcb12cf9eccb4d13a1de7d60cf694c0be Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 10 Apr 2020 21:04:56 +0200 Subject: [PATCH 084/106] add docs --- pandas/core/aggregation.py | 42 ++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index d65b47da09f66..3a92e68160a5b 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -5,12 +5,22 @@ from collections import defaultdict from functools import partial -from typing import Any, DefaultDict, Dict, List, Optional, Sequence, Tuple, Union +from typing import ( + Any, + Callable, + DefaultDict, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, +) import numpy as np from pandas.core.dtypes.common import is_dict_like, is_list_like -from pandas.core.dtypes.generic import ABCDataFrame +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.base import SpecificationError import pandas.core.common as com @@ -259,10 +269,34 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: return mangled_aggspec -def _relabel_result(result, func, columns, order) -> Dict: +def _relabel_result( + result: Union[ABCSeries, ABCDataFrame], + func: Dict[str, List[Union[Callable, str]]], + columns: Tuple, + order: List[int], +) -> Dict[str, ABCSeries]: """Internal function to reorder result if relabelling is True for - dataframe.agg, and return the reordered result in dict.""" + dataframe.agg, and return the reordered result in dict. + Parameters: + ---------- + result: Result from aggregation + func: Dict of (column name, funcs) + columns: New columns name for relabelling + order: New order for relabelling + + Examples: + --------- + >>> result = DataFrame({"A": [np.nan, 2, np.nan], + ... "C": [6, np.nan, np.nan], "B": [np.nan, 4, 2.5]}) # doctest: +SKIP + >>> funcs = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]} + >>> columns = ("foo", "aab", "bar", "dat") + >>> order = [0, 1, 2, 3] + >>> _relabel_result(result, func, columns, order) # doctest: +SKIP + dict(A=Series([2.0, NaN, NaN, NaN], index=columns), + C=Series([NaN, 6.0, NaN, NaN], index=columns), + B=Series([NaN, NaN, 2.5, 4.0], index=columns)) + """ reordered_indexes = [ pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) ] From 5e04185799051c19eab28866ca0c4485dfc0c496 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 10 Apr 2020 21:07:02 +0200 Subject: [PATCH 085/106] focs --- pandas/core/aggregation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 3a92e68160a5b..ae58a739c644a 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -293,9 +293,9 @@ def _relabel_result( >>> columns = ("foo", "aab", "bar", "dat") >>> order = [0, 1, 2, 3] >>> _relabel_result(result, func, columns, order) # doctest: +SKIP - dict(A=Series([2.0, NaN, NaN, NaN], index=columns), - C=Series([NaN, 6.0, NaN, NaN], index=columns), - B=Series([NaN, NaN, 2.5, 4.0], index=columns)) + dict(A=Series([2.0, NaN, NaN, NaN], index=["foo", "aab", "bar", "dat"]), + C=Series([NaN, 6.0, NaN, NaN], index=["foo", "aab", "bar", "dat"]), + B=Series([NaN, NaN, 2.5, 4.0], index=["foo", "aab", "bar", "dat"])) """ reordered_indexes = [ pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) From 7f4839eb947d5bd84cd0b8332c9a73e5c2bc7159 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 10 Apr 2020 21:57:17 +0200 Subject: [PATCH 086/106] fix doc --- pandas/core/aggregation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index ae58a739c644a..f94e0be3dc774 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -53,7 +53,7 @@ def reconstruct_func( Examples -------- - >>> reconstruct_func({"foo": ("col", "min")}) + >>> reconstruct_func(None, {"foo": ("col", "min")}) True, {"col": ["min"]}, ("foo",), [0] >>> reconstruct_func("min") From 65d578b351f2c3746fe8d679c9c8e803b51871ff Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 10 Apr 2020 22:27:50 +0200 Subject: [PATCH 087/106] fixup --- pandas/core/aggregation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index f94e0be3dc774..205ddb75be24b 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -53,7 +53,7 @@ def reconstruct_func( Examples -------- - >>> reconstruct_func(None, {"foo": ("col", "min")}) + >>> reconstruct_func(None, **{"foo": ("col", "min")}) True, {"col": ["min"]}, ("foo",), [0] >>> reconstruct_func("min") From 3e6a06cdca1a8691af189d82525ad7e0d4c4c4ec Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 10 Apr 2020 22:59:24 +0200 Subject: [PATCH 088/106] fix up --- pandas/core/aggregation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 205ddb75be24b..3b83828f5a457 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -54,7 +54,7 @@ def reconstruct_func( Examples -------- >>> reconstruct_func(None, **{"foo": ("col", "min")}) - True, {"col": ["min"]}, ("foo",), [0] + (True, defaultdict(None, {'col': ['min']}), ('foo',), array([0])) >>> reconstruct_func("min") False, "min", None, None From 865144717e6aaff7d84d55b78ce1c1997e9954d2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 10 Apr 2020 23:29:10 +0200 Subject: [PATCH 089/106] fix doctest --- pandas/core/aggregation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 3b83828f5a457..00e7c626b114f 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -57,7 +57,7 @@ def reconstruct_func( (True, defaultdict(None, {'col': ['min']}), ('foo',), array([0])) >>> reconstruct_func("min") - False, "min", None, None + (False, "min", None, None) """ relabeling = func is None and is_multi_agg_with_relabel(**kwargs) columns: Optional[List[str]] = None From a7439fe83c6d1171f4fe4f273febec093b4884c4 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Apr 2020 09:06:58 +0200 Subject: [PATCH 090/106] doctest --- pandas/core/aggregation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 00e7c626b114f..b944cc4594424 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -57,7 +57,7 @@ def reconstruct_func( (True, defaultdict(None, {'col': ['min']}), ('foo',), array([0])) >>> reconstruct_func("min") - (False, "min", None, None) + (False, 'min', None, None) """ relabeling = func is None and is_multi_agg_with_relabel(**kwargs) columns: Optional[List[str]] = None From 9fd8ec5cdcf6f46f870260beb85f82e3a03e94d5 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Apr 2020 13:23:21 +0200 Subject: [PATCH 091/106] rebuild --- pandas/core/aggregation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index b944cc4594424..7d622312f0bd5 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -78,7 +78,6 @@ def reconstruct_func( if relabeling: func, columns, order = normalize_keyword_aggregation(kwargs) - func = maybe_mangle_lambdas(func) return relabeling, func, columns, order From 0546224441ecef532cc895916b425c120f65aa44 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 16 Jun 2020 22:36:38 +0200 Subject: [PATCH 092/106] cleaner code --- pandas/core/aggregation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 184870ca04063..74d512082b9ef 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -306,6 +306,7 @@ def _relabel_result( reorder_mask = len(result.columns) > 1 for col, fun in func.items(): + s = result[col].dropna() # In the `_aggregate`, the callable names are obtained and used in `result`, and # these names are ordered alphabetically. e.g. @@ -320,8 +321,6 @@ def _relabel_result( # e.g. if df.agg(c1=("C2", sum), c2=("C2", lambda x: min(x))), correct order is # [sum, ], but in `result`, it will be [, sum], and we need to # reorder so that aggregated values map to their functions. - fun = [com.get_callable_name(f) if not isinstance(f, str) else f for f in fun] - s = result[col].dropna() # If there is only one column being used for aggregation, not need to reorder # since the index is not sorted, e.g. @@ -330,6 +329,7 @@ def _relabel_result( # mean 1.5 # mean 1.5 if reorder_mask: + fun = [com.get_callable_name(f) if not isinstance(f, str) else f for f in fun] col_idx_order = Index(s.index).get_indexer(fun) s = s[col_idx_order] From f5f0e68bf9fd0f425aa075a987bcd4c85cb15555 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 16 Jun 2020 22:54:12 +0200 Subject: [PATCH 093/106] rename --- pandas/tests/frame/apply/{test_apply.py => test_frame_apply.py} | 0 pandas/tests/series/apply/{test_apply.py => test_series_apply.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/frame/apply/{test_apply.py => test_frame_apply.py} (100%) rename pandas/tests/series/apply/{test_apply.py => test_series_apply.py} (100%) diff --git a/pandas/tests/frame/apply/test_apply.py b/pandas/tests/frame/apply/test_frame_apply.py similarity index 100% rename from pandas/tests/frame/apply/test_apply.py rename to pandas/tests/frame/apply/test_frame_apply.py diff --git a/pandas/tests/series/apply/test_apply.py b/pandas/tests/series/apply/test_series_apply.py similarity index 100% rename from pandas/tests/series/apply/test_apply.py rename to pandas/tests/series/apply/test_series_apply.py From 54ff96244931b60607bd5f996211bec0d1fab483 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 16 Jun 2020 22:56:42 +0200 Subject: [PATCH 094/106] linting --- pandas/core/aggregation.py | 10 +++++----- pandas/core/groupby/generic.py | 2 -- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 74d512082b9ef..2cdf3fb85454f 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -17,8 +17,6 @@ Union, ) -import numpy as np - from pandas._typing import Label from pandas.core.dtypes.common import is_dict_like, is_list_like @@ -329,12 +327,14 @@ def _relabel_result( # mean 1.5 # mean 1.5 if reorder_mask: - fun = [com.get_callable_name(f) if not isinstance(f, str) else f for f in fun] + fun = [ + com.get_callable_name(f) if not isinstance(f, str) else f for f in fun + ] col_idx_order = Index(s.index).get_indexer(fun) s = s[col_idx_order] - # assign the new user-provided "named aggregation" as index names, and reindex it - # based on the whole user-provided names. + # assign the new user-provided "named aggregation" as index names, and reindex + # it based on the whole user-provided names. s.index = reordered_indexes[idx : idx + len(fun)] reordered_result_in_dict[col] = s.reindex(columns, copy=False) idx = idx + len(fun) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ae33942ead6ac..64b12f9147e06 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -54,9 +54,7 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core.aggregation import ( - is_multi_agg_with_relabel, maybe_mangle_lambdas, - normalize_keyword_aggregation, reconstruct_func, validate_func_kwargs, ) From ac570230cb5244b62239ba4a4c680f6a2c967934 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 16 Jun 2020 23:34:50 +0200 Subject: [PATCH 095/106] init --- pandas/tests/frame/apply/__init__.py | 0 pandas/tests/series/apply/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/tests/frame/apply/__init__.py create mode 100644 pandas/tests/series/apply/__init__.py diff --git a/pandas/tests/frame/apply/__init__.py b/pandas/tests/frame/apply/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/series/apply/__init__.py b/pandas/tests/series/apply/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 484e42c02453297707411a1dc66ce0bc8203d35a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 17 Jun 2020 08:52:20 +0200 Subject: [PATCH 096/106] better doc --- pandas/core/aggregation.py | 8 ++++---- pandas/tests/frame/apply/test_apply_relabeling.py | 9 +++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 2cdf3fb85454f..77ea3fdea9b2d 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -311,17 +311,17 @@ def _relabel_result( # C2 C1 # 1 NaN # amax NaN 4.0 - # max NaN 2.0 + # max NaN 4.0 # sum 18.0 6.0 # Therefore, the order of functions for each column could be shuffled # accordingly so need to get the callable name if it is not parsed names, and # reorder the aggregated result for each column. # e.g. if df.agg(c1=("C2", sum), c2=("C2", lambda x: min(x))), correct order is # [sum, ], but in `result`, it will be [, sum], and we need to - # reorder so that aggregated values map to their functions. + # reorder so that aggregated values map to their functions regarding the order. - # If there is only one column being used for aggregation, not need to reorder - # since the index is not sorted, e.g. + # However there is only one column being used for aggregation, not need to + # reorder since the index is not sorted, and keep as is in `funcs`, e.g. # A # min 1.0 # mean 1.5 diff --git a/pandas/tests/frame/apply/test_apply_relabeling.py b/pandas/tests/frame/apply/test_apply_relabeling.py index 83824b2675ac6..965f69753bdc7 100644 --- a/pandas/tests/frame/apply/test_apply_relabeling.py +++ b/pandas/tests/frame/apply/test_apply_relabeling.py @@ -57,14 +57,15 @@ def test_agg_relabel_partial_functions(self): cat=("B", max), dat=("C", "min"), f=("B", np.sum), + kk=("B", lambda x: min(x)), ) expected = pd.DataFrame( { - "A": [1.0, 1.0, np.nan, np.nan, np.nan], - "B": [np.nan, np.nan, 4.0, np.nan, 10.0], - "C": [np.nan, np.nan, np.nan, 3.0, np.nan], + "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, 4.0, np.nan, 10.0, 1.0], + "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan], }, - index=pd.Index(["foo", "bar", "cat", "dat", "f"]), + index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]), ) tm.assert_frame_equal(result, expected) From 47e6598efd95eee444fc21e66287d744ac235760 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 17 Jun 2020 09:20:40 +0200 Subject: [PATCH 097/106] complex case --- pandas/core/aggregation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 77ea3fdea9b2d..aef51b25ff7e6 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -23,6 +23,7 @@ from pandas.core.base import SpecificationError import pandas.core.common as com +from pandas.core.frame import DataFrame from pandas.core.indexes.api import Index from pandas.core.series import FrameOrSeriesUnion, Series @@ -302,7 +303,7 @@ def _relabel_result( reordered_result_in_dict: Dict[Label, Series] = {} idx = 0 - reorder_mask = len(result.columns) > 1 + reorder_mask = isinstance(result, DataFrame) and len(result.columns) > 1 for col, fun in func.items(): s = result[col].dropna() From f28b452eb5264c3a179f34463b8e178b90b41275 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 17 Jun 2020 09:23:13 +0200 Subject: [PATCH 098/106] linting --- pandas/core/aggregation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index aef51b25ff7e6..c065618fb7c73 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -23,7 +23,6 @@ from pandas.core.base import SpecificationError import pandas.core.common as com -from pandas.core.frame import DataFrame from pandas.core.indexes.api import Index from pandas.core.series import FrameOrSeriesUnion, Series @@ -303,7 +302,7 @@ def _relabel_result( reordered_result_in_dict: Dict[Label, Series] = {} idx = 0 - reorder_mask = isinstance(result, DataFrame) and len(result.columns) > 1 + reorder_mask = not isinstance(result, Series) and len(result.columns) > 1 for col, fun in func.items(): s = result[col].dropna() From 849338347413611797017a6c03ccc64911fd81b2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 25 Jun 2020 20:48:22 +0200 Subject: [PATCH 099/106] add typing --- pandas/core/aggregation.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index c065618fb7c73..d93807113713c 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -28,19 +28,33 @@ def reconstruct_func( - func: Optional[Union[List, Dict]], **kwargs + func: Optional[ + Union[ + Union[Callable, str], + List[Union[Callable, str]], + Dict[Label, Union[Union[Callable, str], List[Union[Callable, str]]]], + ] + ], + **kwargs, ) -> Tuple[bool, Optional[Union[List, Dict]], Optional[List[str]], Optional[List[int]]]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. + If named aggregation is applied, `func` will be None, and kwargs contains the + column and aggregation function information to be parsed; + If named aggregation is not applied, `func` is either string (e.g. 'min') or + Callable, or list of them (e.g. ['min', np.max]), or the dictionary of column name + and str/Callable/list of them (e.g. {'A': 'min'}, or {'A': [np.min, lambda x: x]}) + If relabeling is True, will return relabeling, reconstructed func, column names, and the reconstructed order of columns. If relabeling is False, the columns and order will be None. Parameters ---------- - func: aggregated function + func: agg function (e.g. 'min' or Callable) or list of agg functions + (e.g. ['min', np.max]) or dictionary (e.g. {'A': ['min', np.max]}). **kwargs: dict, kwargs used in is_multi_agg_with_relabel and normalize_keyword_aggregation function for relabelling From c75c882bdab46825320cdc7763b97ed97d1eb373 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 25 Jun 2020 20:55:56 +0200 Subject: [PATCH 100/106] black --- pandas/core/aggregation.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index d93807113713c..f63a2c8936cb0 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -36,7 +36,18 @@ def reconstruct_func( ] ], **kwargs, -) -> Tuple[bool, Optional[Union[List, Dict]], Optional[List[str]], Optional[List[int]]]: +) -> Tuple[ + bool, + Optional[ + Union[ + Union[Callable, str], + List[Union[Callable, str]], + Dict[Label, Union[Union[Callable, str], List[Union[Callable, str]]]], + ] + ], + Optional[List[str]], + Optional[List[int]], +]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. @@ -94,6 +105,7 @@ def reconstruct_func( func, columns, order = normalize_keyword_aggregation(kwargs) func = maybe_mangle_lambdas(func) + print(func) return relabeling, func, columns, order From fa61db7f48c506c6611ac027ecbab4b202449814 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 25 Jun 2020 21:23:25 +0200 Subject: [PATCH 101/106] remove line --- pandas/core/aggregation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index f63a2c8936cb0..de9e1ecb4e817 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -105,7 +105,6 @@ def reconstruct_func( func, columns, order = normalize_keyword_aggregation(kwargs) func = maybe_mangle_lambdas(func) - print(func) return relabeling, func, columns, order From 26b380a3b475a7a5261c2d5fc788e6dc93e5526f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 5 Jul 2020 11:01:01 +0200 Subject: [PATCH 102/106] simplify annotation --- pandas/core/aggregation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index de9e1ecb4e817..17001784caccc 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -32,7 +32,7 @@ def reconstruct_func( Union[ Union[Callable, str], List[Union[Callable, str]], - Dict[Label, Union[Union[Callable, str], List[Union[Callable, str]]]], + Dict[Label, Union[Callable, str, List[Union[Callable, str]]]], ] ], **kwargs, @@ -42,7 +42,7 @@ def reconstruct_func( Union[ Union[Callable, str], List[Union[Callable, str]], - Dict[Label, Union[Union[Callable, str], List[Union[Callable, str]]]], + Dict[Label, Union[Callable, str, List[Union[Callable, str]]]], ] ], Optional[List[str]], From 44405e88727f675915b532f1864bb5516d96db0e Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 10 Jul 2020 18:01:39 +0200 Subject: [PATCH 103/106] deprivate relabel_result --- pandas/core/aggregation.py | 2 +- pandas/core/frame.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 17001784caccc..dcad48967b97f 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -293,7 +293,7 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: return mangled_aggspec -def _relabel_result( +def relabel_result( result: FrameOrSeriesUnion, func: Dict[str, List[Union[Callable, str]]], columns: Tuple, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bdae9d72e0bf9..10539ab74b4aa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -114,7 +114,7 @@ from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor -from pandas.core.aggregation import _relabel_result, reconstruct_func +from pandas.core.aggregation import reconstruct_func, relabel_result from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor @@ -7322,7 +7322,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): if relabeling: # This is to keep the order to columns occurrence unchanged, and also # keep the order of new columns occurrence unchanged - result_in_dict = _relabel_result(result, func, columns, order) + result_in_dict = relabel_result(result, func, columns, order) result = DataFrame(result_in_dict, index=columns) return result From faea906b10849e4e83cb4e09965cbbe3fcb179c3 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 10 Jul 2020 18:15:17 +0200 Subject: [PATCH 104/106] cleaner annotations --- pandas/_typing.py | 8 ++++++++ pandas/core/aggregation.py | 22 +++------------------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 4892abc5f6f51..8e98833ad37f7 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -96,3 +96,11 @@ # DataFrame::sort_index, among others ValueKeyFunc = Optional[Callable[["Series"], Union["Series", AnyArrayLike]]] IndexKeyFunc = Optional[Callable[["Index"], Union["Index", AnyArrayLike]]] + +# types of `func` kwarg for DataFrame.aggregate and Series.aggregate +AggFuncTypeBase = Union[Callable, str] +AggFuncType = Union[ + AggFuncTypeBase, + List[AggFuncTypeBase], + Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], +] diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index dcad48967b97f..182a7a81c6dea 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -17,7 +17,7 @@ Union, ) -from pandas._typing import Label +from pandas._typing import Label, AggFuncType from pandas.core.dtypes.common import is_dict_like, is_list_like @@ -28,25 +28,9 @@ def reconstruct_func( - func: Optional[ - Union[ - Union[Callable, str], - List[Union[Callable, str]], - Dict[Label, Union[Callable, str, List[Union[Callable, str]]]], - ] - ], - **kwargs, + func: Optional[AggFuncType], **kwargs, ) -> Tuple[ - bool, - Optional[ - Union[ - Union[Callable, str], - List[Union[Callable, str]], - Dict[Label, Union[Callable, str, List[Union[Callable, str]]]], - ] - ], - Optional[List[str]], - Optional[List[int]], + bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]], ]: """ This is the internal function to reconstruct func given if there is relabeling From 3d205242ae7f74bb7fdf234251fdef7933f824ca Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 10 Jul 2020 18:25:06 +0200 Subject: [PATCH 105/106] fix import sorting --- pandas/core/aggregation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 182a7a81c6dea..891048ae82dfd 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -17,7 +17,7 @@ Union, ) -from pandas._typing import Label, AggFuncType +from pandas._typing import AggFuncType, Label from pandas.core.dtypes.common import is_dict_like, is_list_like From 05921afef8d10eabd0917ba0fbeeefc18ae7d511 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 10 Jul 2020 18:53:29 +0200 Subject: [PATCH 106/106] move defined annotation inside aggregation.py --- pandas/_typing.py | 8 -------- pandas/core/aggregation.py | 10 +++++++++- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 8e98833ad37f7..4892abc5f6f51 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -96,11 +96,3 @@ # DataFrame::sort_index, among others ValueKeyFunc = Optional[Callable[["Series"], Union["Series", AnyArrayLike]]] IndexKeyFunc = Optional[Callable[["Index"], Union["Index", AnyArrayLike]]] - -# types of `func` kwarg for DataFrame.aggregate and Series.aggregate -AggFuncTypeBase = Union[Callable, str] -AggFuncType = Union[ - AggFuncTypeBase, - List[AggFuncTypeBase], - Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], -] diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 891048ae82dfd..16c4a9f862d79 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -17,7 +17,7 @@ Union, ) -from pandas._typing import AggFuncType, Label +from pandas._typing import Label from pandas.core.dtypes.common import is_dict_like, is_list_like @@ -26,6 +26,14 @@ from pandas.core.indexes.api import Index from pandas.core.series import FrameOrSeriesUnion, Series +# types of `func` kwarg for DataFrame.aggregate and Series.aggregate +AggFuncTypeBase = Union[Callable, str] +AggFuncType = Union[ + AggFuncTypeBase, + List[AggFuncTypeBase], + Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], +] + def reconstruct_func( func: Optional[AggFuncType], **kwargs,