From 7e461a18d9f6928132afec6f48ce968b3e989ba6 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Mon, 3 Dec 2018 17:43:52 +0100 Subject: [PATCH 01/27] remove \n from docstring --- pandas/core/arrays/datetimes.py | 26 +++++++++++++------------- pandas/core/arrays/timedeltas.py | 16 ++++++++-------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index cfe3afcf3730a..b3df505d56d78 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -82,7 +82,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -1072,19 +1072,19 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") - year = _field_accessor('year', 'Y', "\n The year of the datetime\n") + year = _field_accessor('year', 'Y', "The year of the datetime") month = _field_accessor('month', 'M', - "\n The month as January=1, December=12 \n") - day = _field_accessor('day', 'D', "\nThe days of the datetime\n") - hour = _field_accessor('hour', 'h', "\nThe hours of the datetime\n") - minute = _field_accessor('minute', 'm', "\nThe minutes of the datetime\n") - second = _field_accessor('second', 's', "\nThe seconds of the datetime\n") + "The month as January=1, December=12") + day = _field_accessor('day', 'D', "The days of the datetime") + hour = _field_accessor('hour', 'h', "The hours of the datetime") + minute = _field_accessor('minute', 'm', "The minutes of the datetime") + second = _field_accessor('second', 's', "The seconds of the datetime") microsecond = _field_accessor('microsecond', 'us', - "\nThe microseconds of the datetime\n") + "The microseconds of the datetime") nanosecond = _field_accessor('nanosecond', 'ns', - "\nThe nanoseconds of the datetime\n") + "The nanoseconds of the datetime") weekofyear = _field_accessor('weekofyear', 'woy', - "\nThe week ordinal of the year\n") + "The week ordinal of the year") week = weekofyear _dayofweek_doc = """ The day of the week with Monday=0, Sunday=6. @@ -1129,12 +1129,12 @@ def date(self): "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0") dayofyear = _field_accessor('dayofyear', 'doy', - "\nThe ordinal day of the year\n") - quarter = _field_accessor('quarter', 'q', "\nThe quarter of the date\n") + "The ordinal day of the year") + quarter = _field_accessor('quarter', 'q', "The quarter of the date") days_in_month = _field_accessor( 'days_in_month', 'dim', - "\nThe number of days in the month\n") + "The number of days in the month") daysinmonth = days_in_month _is_month_doc = """ Indicates whether the date is the {first_or_last} day of the month. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 830283d31a929..4afc9f5483c2a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -59,7 +59,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -684,16 +684,16 @@ def to_pytimedelta(self): return tslibs.ints_to_pytimedelta(self.asi8) days = _field_accessor("days", "days", - "\nNumber of days for each element.\n") + "Number of days for each element.") seconds = _field_accessor("seconds", "seconds", - "\nNumber of seconds (>= 0 and less than 1 day) " - "for each element.\n") + "Number of seconds (>= 0 and less than 1 day) " + "for each element.") microseconds = _field_accessor("microseconds", "microseconds", - "\nNumber of microseconds (>= 0 and less " - "than 1 second) for each element.\n") + "Number of microseconds (>= 0 and less " + "than 1 second) for each element.") nanoseconds = _field_accessor("nanoseconds", "nanoseconds", - "\nNumber of nanoseconds (>= 0 and less " - "than 1 microsecond) for each element.\n") + "Number of nanoseconds (>= 0 and less " + "than 1 microsecond) for each element.") @property def components(self): From dea38f24c0067ae3fe9484b837c9649714213bba Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 14 Jan 2020 21:26:31 +0100 Subject: [PATCH 02/27] fix issue 17038 --- pandas/core/reshape/pivot.py | 4 +++- pandas/tests/reshape/test_pivot.py | 20 ++++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b443ba142369c..9743d90f4dd04 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -117,7 +117,9 @@ def pivot_table( agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged - if table.index.nlevels > 1: + + # GH 17038, this check should only happen if index is specified + if table.index.nlevels > 1 and index: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 743fc50c87e96..46a05123c9fdd 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -896,12 +896,6 @@ def _check_output( totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() - # no rows - rtable = self.data.pivot_table( - columns=["AA", "BB"], margins=True, aggfunc=np.mean - ) - assert isinstance(rtable, Series) - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") for item in ["DD", "EE", "FF"]: totals = table.loc[("All", ""), item] @@ -972,6 +966,20 @@ def test_pivot_integer_columns(self): tm.assert_frame_equal(table, table2, check_names=False) + @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) + def test_pivot_table_multiindex_only(self, cols): + # GH 17038 + df2 = DataFrame({cols[0]: [1, 2, 3], cols[1]: [1, 2, 3], "v": [4, 5, 6]}) + + result = df2.pivot_table(values="v", columns=cols) + expected = DataFrame( + [[4, 5, 6]], + columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), + index=Index(["v"]), + ) + + tm.assert_frame_equal(result, expected) + def test_pivot_no_level_overlap(self): # GH #1181 From cd9e7ac3f31ffaf95cd628863df911dea9fa1248 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 14 Jan 2020 21:29:43 +0100 Subject: [PATCH 03/27] revert change --- pandas/core/reshape/pivot.py | 3 +-- pandas/tests/reshape/test_pivot.py | 20 ++++++-------------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 9743d90f4dd04..a7cdbb0da7a4e 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -118,8 +118,7 @@ def pivot_table( table = agged - # GH 17038, this check should only happen if index is specified - if table.index.nlevels > 1 and index: + if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 46a05123c9fdd..743fc50c87e96 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -896,6 +896,12 @@ def _check_output( totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() + # no rows + rtable = self.data.pivot_table( + columns=["AA", "BB"], margins=True, aggfunc=np.mean + ) + assert isinstance(rtable, Series) + table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") for item in ["DD", "EE", "FF"]: totals = table.loc[("All", ""), item] @@ -966,20 +972,6 @@ def test_pivot_integer_columns(self): tm.assert_frame_equal(table, table2, check_names=False) - @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) - def test_pivot_table_multiindex_only(self, cols): - # GH 17038 - df2 = DataFrame({cols[0]: [1, 2, 3], cols[1]: [1, 2, 3], "v": [4, 5, 6]}) - - result = df2.pivot_table(values="v", columns=cols) - expected = DataFrame( - [[4, 5, 6]], - columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), - index=Index(["v"]), - ) - - tm.assert_frame_equal(result, expected) - def test_pivot_no_level_overlap(self): # GH #1181 From e5e912be0f596943067a7df812442764d311a086 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 14 Jan 2020 21:30:16 +0100 Subject: [PATCH 04/27] revert change --- pandas/core/reshape/pivot.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index a7cdbb0da7a4e..b443ba142369c 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -117,7 +117,6 @@ def pivot_table( agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged - if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer From 93ebadb22de0bc66a94a500256b40ab781a69ff5 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 30 Jan 2020 20:18:57 +0100 Subject: [PATCH 05/27] try fix --- pandas/core/groupby/groupby.py | 6 +++--- pandas/core/groupby/ops.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 02e9383314d36..d86674b2dfcd9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -792,7 +792,7 @@ def _cumcount_array(self, ascending: bool = True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - def _try_cast(self, result, obj, numeric_only: bool = False): + def _try_cast(self, result, obj, numeric_only: bool = False, how=None): """ Try to cast the result to our obj original type, we may have roundtripped through object in the mean-time. @@ -813,7 +813,7 @@ def _try_cast(self, result, obj, numeric_only: bool = False): # datetime64tz is handled correctly in agg_series, # so is excluded here. - if len(result) and isinstance(result[0], dtype.type): + if len(result) and isinstance(result[0], dtype.type) or how=="first": cls = dtype.construct_array_type() result = try_cast_to_ea(cls, result, dtype=dtype) @@ -900,7 +900,7 @@ def _cython_agg_general( else: assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj) + output[key] = self._try_cast(result, obj, how=how) idx += 1 if len(output) == 0: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 2e95daa392976..94c5c5ff9acab 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -451,7 +451,8 @@ def _cython_operation( # categoricals are only 1d, so we # are not setup for dim transforming - if is_categorical_dtype(values) or is_sparse(values): + # GH 31450, except if how is first + if is_categorical_dtype(values) and how != "first" or is_sparse(values): raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: From 3520b953ca3eaad2e2cf7f4017c28a4bb48e813c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 30 Jan 2020 20:24:14 +0100 Subject: [PATCH 06/27] upload test --- pandas/tests/groupby/test_categorical.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 1c2de8c8c223f..0dca4d0c1a6a6 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1376,3 +1376,13 @@ def test_groupby_agg_non_numeric(): result = df.groupby([1, 2, 1]).nunique() tm.assert_frame_equal(result, expected) + + +def test_groupby_agg_categorical_first(): + # GH 31450 + df = pd.DataFrame({"col_num": [1, 1, 2, 3]}) + df["col_cat"] = df["col_num"].astype("category") + + grouped = df.groupby("col_num").agg({"col_cat": "first"}) + expected = df.groupby("col_num").agg("first") + tm.assert_frame_equal(grouped, expected) From 32cc74466a020f7474e101252bd56564eb74db18 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 30 Jan 2020 20:27:34 +0100 Subject: [PATCH 07/27] linting --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d86674b2dfcd9..ea9ffaa4ebcff 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -813,7 +813,7 @@ def _try_cast(self, result, obj, numeric_only: bool = False, how=None): # datetime64tz is handled correctly in agg_series, # so is excluded here. - if len(result) and isinstance(result[0], dtype.type) or how=="first": + if len(result) and isinstance(result[0], dtype.type) or how == "first": cls = dtype.construct_array_type() result = try_cast_to_ea(cls, result, dtype=dtype) From 9f936cc4089231170cbcc825299ea2864a7d50fc Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 30 Jan 2020 21:49:04 +0100 Subject: [PATCH 08/27] broader concept --- pandas/core/groupby/base.py | 2 ++ pandas/core/groupby/groupby.py | 10 ++++++++-- pandas/core/groupby/ops.py | 10 ++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 700d8d503d086..e231ef5283b84 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -92,6 +92,8 @@ def _gotitem(self, key, ndim, subset=None): cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) +cython_cast_keep_type_list = frozenset(["min", "max", "first", "last"]) + # List of aggregation/reduction functions. # These map each group to a single numeric value reduction_kernels = frozenset( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ea9ffaa4ebcff..d7fa1b6f13118 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -813,7 +813,13 @@ def _try_cast(self, result, obj, numeric_only: bool = False, how=None): # datetime64tz is handled correctly in agg_series, # so is excluded here. - if len(result) and isinstance(result[0], dtype.type) or how == "first": + from pandas.core.groupby.base import cython_cast_keep_type_list + + if ( + len(result) + and isinstance(result[0], dtype.type) + or how in cython_cast_keep_type_list + ): cls = dtype.construct_array_type() result = try_cast_to_ea(cls, result, dtype=dtype) @@ -900,7 +906,7 @@ def _cython_agg_general( else: assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj, how=how) + output[key] = self._try_cast(result, obj) idx += 1 if len(output) == 0: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 94c5c5ff9acab..8a73f87835bcc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -451,8 +451,14 @@ def _cython_operation( # categoricals are only 1d, so we # are not setup for dim transforming - # GH 31450, except if how is first - if is_categorical_dtype(values) and how != "first" or is_sparse(values): + # those four cython agg that should work with categoricals + from pandas.core.groupby.base import cython_cast_keep_type_list + + if ( + is_categorical_dtype(values) + and how not in cython_cast_keep_type_list + or is_sparse(values) + ): raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: From 946c49fa847d595ebe611c8d07c329bdd4436885 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 30 Jan 2020 21:50:20 +0100 Subject: [PATCH 09/27] fix up --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d7fa1b6f13118..093226c2ec5b0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -906,7 +906,7 @@ def _cython_agg_general( else: assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj) + output[key] = self._try_cast(result, obj, how=how) idx += 1 if len(output) == 0: From 73b01c67f59cfd2dc3f80e8405fafcb54772dd4a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 30 Jan 2020 21:53:16 +0100 Subject: [PATCH 10/27] imports --- pandas/core/groupby/groupby.py | 4 +--- pandas/core/groupby/ops.py | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 093226c2ec5b0..17ecd0c55eb60 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -59,6 +59,7 @@ class providing the base-class of operations. from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, ops +from pandas.core.groupby.base import cython_cast_keep_type_list from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter @@ -812,9 +813,6 @@ def _try_cast(self, result, obj, numeric_only: bool = False, how=None): # if the type is compatible with the calling EA. # datetime64tz is handled correctly in agg_series, # so is excluded here. - - from pandas.core.groupby.base import cython_cast_keep_type_list - if ( len(result) and isinstance(result[0], dtype.type) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8a73f87835bcc..fa9767dd9b62d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -43,6 +43,7 @@ from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, grouper +from pandas.core.groupby.base import cython_cast_keep_type_list from pandas.core.indexes.api import Index, MultiIndex, ensure_index from pandas.core.series import Series from pandas.core.sorting import ( @@ -452,8 +453,6 @@ def _cython_operation( # categoricals are only 1d, so we # are not setup for dim transforming # those four cython agg that should work with categoricals - from pandas.core.groupby.base import cython_cast_keep_type_list - if ( is_categorical_dtype(values) and how not in cython_cast_keep_type_list From 2fdb3f54bbc449d192dd8d80a77cc230a3b88837 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 30 Jan 2020 22:05:22 +0100 Subject: [PATCH 11/27] keep experimenting --- pandas/core/groupby/base.py | 2 +- pandas/core/groupby/groupby.py | 4 ++-- pandas/core/groupby/ops.py | 4 ++-- pandas/tests/groupby/test_categorical.py | 7 ++++--- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index e231ef5283b84..8e667e30cf403 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -92,7 +92,7 @@ def _gotitem(self, key, ndim, subset=None): cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) -cython_cast_keep_type_list = frozenset(["min", "max", "first", "last"]) +cython_cast_cat_type_list = frozenset(["first", "last"]) # List of aggregation/reduction functions. # These map each group to a single numeric value diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 17ecd0c55eb60..23c43c4f72cc6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -59,7 +59,7 @@ class providing the base-class of operations. from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, ops -from pandas.core.groupby.base import cython_cast_keep_type_list +from pandas.core.groupby.base import cython_cast_cat_type_list from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter @@ -816,7 +816,7 @@ def _try_cast(self, result, obj, numeric_only: bool = False, how=None): if ( len(result) and isinstance(result[0], dtype.type) - or how in cython_cast_keep_type_list + or how in cython_cast_cat_type_list ): cls = dtype.construct_array_type() result = try_cast_to_ea(cls, result, dtype=dtype) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fa9767dd9b62d..aaaa6c7e11c48 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -43,7 +43,7 @@ from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, grouper -from pandas.core.groupby.base import cython_cast_keep_type_list +from pandas.core.groupby.base import cython_cast_cat_type_list from pandas.core.indexes.api import Index, MultiIndex, ensure_index from pandas.core.series import Series from pandas.core.sorting import ( @@ -455,7 +455,7 @@ def _cython_operation( # those four cython agg that should work with categoricals if ( is_categorical_dtype(values) - and how not in cython_cast_keep_type_list + and how not in cython_cast_cat_type_list or is_sparse(values) ): raise NotImplementedError(f"{values.dtype} dtype not supported") diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0dca4d0c1a6a6..11a933ae33ce2 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1378,11 +1378,12 @@ def test_groupby_agg_non_numeric(): tm.assert_frame_equal(result, expected) -def test_groupby_agg_categorical_first(): +@pytest.mark.parametrize("func", ["first", "last"]) +def test_groupby_agg_categorical_first_last(func): # GH 31450 df = pd.DataFrame({"col_num": [1, 1, 2, 3]}) df["col_cat"] = df["col_num"].astype("category") - grouped = df.groupby("col_num").agg({"col_cat": "first"}) - expected = df.groupby("col_num").agg("first") + grouped = df.groupby("col_num").agg({"col_cat": func}) + expected = df.groupby("col_num").agg(func) tm.assert_frame_equal(grouped, expected) From 9e52c70a43a7e7a0dde680785b0f6b840209fb36 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 30 Jan 2020 22:28:36 +0100 Subject: [PATCH 12/27] fixtup --- pandas/core/groupby/base.py | 3 +++ pandas/core/groupby/groupby.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 8e667e30cf403..92d63b21d884a 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -93,6 +93,9 @@ def _gotitem(self, key, ndim, subset=None): cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) cython_cast_cat_type_list = frozenset(["first", "last"]) +cython_cast_keep_type_list = cython_cast_cat_type_list | frozenset( + ["sum", "min", "max"] +) # List of aggregation/reduction functions. # These map each group to a single numeric value diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 23c43c4f72cc6..17ecd0c55eb60 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -59,7 +59,7 @@ class providing the base-class of operations. from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, ops -from pandas.core.groupby.base import cython_cast_cat_type_list +from pandas.core.groupby.base import cython_cast_keep_type_list from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter @@ -816,7 +816,7 @@ def _try_cast(self, result, obj, numeric_only: bool = False, how=None): if ( len(result) and isinstance(result[0], dtype.type) - or how in cython_cast_cat_type_list + or how in cython_cast_keep_type_list ): cls = dtype.construct_array_type() result = try_cast_to_ea(cls, result, dtype=dtype) From a366b028b379dc36c06d2218f884196e48d0b5ce Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 30 Jan 2020 22:31:39 +0100 Subject: [PATCH 13/27] add comment --- pandas/core/groupby/groupby.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 17ecd0c55eb60..1da0f6459d448 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -808,11 +808,14 @@ def _try_cast(self, result, obj, numeric_only: bool = False, how=None): dtype = obj.dtype if not is_scalar(result): + + # The function can return something of any type, so check + # if the type is compatible with the calling EA. + # datetime64tz is handled correctly in agg_series, + # so is excluded here. if is_extension_array_dtype(dtype) and dtype.kind != "M": - # The function can return something of any type, so check - # if the type is compatible with the calling EA. - # datetime64tz is handled correctly in agg_series, - # so is excluded here. + # if how is in cython_cast_keep_type_list, which means it + # should be cast back to return the same type as obj if ( len(result) and isinstance(result[0], dtype.type) From 36184f62dff799496d5c852542d135dbff5ce631 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 1 Feb 2020 14:05:21 +0100 Subject: [PATCH 14/27] experiment --- pandas/core/groupby/groupby.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 04ffbbfeacabc..45ea28d85de72 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -793,7 +793,7 @@ def _cumcount_array(self, ascending: bool = True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - def _try_cast(self, result, obj, numeric_only: bool = False, how=None): + def _try_cast(self, result, obj, numeric_only: bool = False): """ Try to cast the result to our obj original type, we may have roundtripped through object in the mean-time. @@ -814,15 +814,8 @@ def _try_cast(self, result, obj, numeric_only: bool = False, how=None): # datetime64tz is handled correctly in agg_series, # so is excluded here. if is_extension_array_dtype(dtype) and dtype.kind != "M": - # if how is in cython_cast_keep_type_list, which means it - # should be cast back to return the same type as obj - if ( - len(result) - and isinstance(result[0], dtype.type) - or how in cython_cast_keep_type_list - ): - cls = dtype.construct_array_type() - result = try_cast_to_ea(cls, result, dtype=dtype) + cls = dtype.construct_array_type() + result = try_cast_to_ea(cls, result, dtype=dtype) elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: result = maybe_downcast_to_dtype(result, dtype) @@ -878,6 +871,19 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) + def _aggregate_should_cast(self, how: str, result=None, obj=None) -> bool: + if obj.ndim > 1: + dtype = obj._values.dtype + else: + dtype = obj.dtype + + should_cast = ( + len(result) + and isinstance(result[0], dtype.type) + or how in base.cython_cast_keep_type_list + ) + return should_cast + def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): @@ -902,12 +908,16 @@ def _cython_agg_general( assert len(agg_names) == result.shape[1] for result_column, result_name in zip(result.T, agg_names): key = base.OutputKey(label=result_name, position=idx) - output[key] = self._try_cast(result_column, obj) + if self._aggregate_should_cast(how, result, obj): + result = self._try_cast(result_column, obj) + output[key] = result_column idx += 1 else: assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj, how=how) + if self._aggregate_should_cast(how, result, obj): + result = self._try_cast(result, obj) + output[key] = result idx += 1 if len(output) == 0: From 9d4e0210240d07f8a144b3dc3701ddd0fdeffa49 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 1 Feb 2020 14:15:14 +0100 Subject: [PATCH 15/27] update --- pandas/core/groupby/base.py | 2 +- pandas/core/groupby/groupby.py | 17 ++++------------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 92d63b21d884a..64d5226f4a330 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -94,7 +94,7 @@ def _gotitem(self, key, ndim, subset=None): cython_cast_cat_type_list = frozenset(["first", "last"]) cython_cast_keep_type_list = cython_cast_cat_type_list | frozenset( - ["sum", "min", "max"] + ["sum", "min", "max", "add"] ) # List of aggregation/reduction functions. diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 45ea28d85de72..69d9ee4711fa3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -871,17 +871,8 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) - def _aggregate_should_cast(self, how: str, result=None, obj=None) -> bool: - if obj.ndim > 1: - dtype = obj._values.dtype - else: - dtype = obj.dtype - - should_cast = ( - len(result) - and isinstance(result[0], dtype.type) - or how in base.cython_cast_keep_type_list - ) + def _aggregate_should_cast(self, how: str) -> bool: + should_cast = how in base.cython_cast_keep_type_list return should_cast def _cython_agg_general( @@ -908,14 +899,14 @@ def _cython_agg_general( assert len(agg_names) == result.shape[1] for result_column, result_name in zip(result.T, agg_names): key = base.OutputKey(label=result_name, position=idx) - if self._aggregate_should_cast(how, result, obj): + if self._aggregate_should_cast(how): result = self._try_cast(result_column, obj) output[key] = result_column idx += 1 else: assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) - if self._aggregate_should_cast(how, result, obj): + if self._aggregate_should_cast(how): result = self._try_cast(result, obj) output[key] = result idx += 1 From c588204b315c397e288635019ca9cc2ede356d7f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 1 Feb 2020 14:29:56 +0100 Subject: [PATCH 16/27] change base --- pandas/core/groupby/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 64d5226f4a330..2e586aca3c3e5 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -93,9 +93,7 @@ def _gotitem(self, key, ndim, subset=None): cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) cython_cast_cat_type_list = frozenset(["first", "last"]) -cython_cast_keep_type_list = cython_cast_cat_type_list | frozenset( - ["sum", "min", "max", "add"] -) +cython_cast_keep_type_list = cython_cast_cat_type_list | frozenset(["min", "max"]) # List of aggregation/reduction functions. # These map each group to a single numeric value From a11279dff08cc18f5aca546f1bfc4437eabc8dee Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 1 Feb 2020 19:28:14 +0100 Subject: [PATCH 17/27] experiment --- pandas/core/groupby/groupby.py | 2 +- pandas/tests/extension/base/groupby.py | 4 ++-- pandas/tests/extension/test_boolean.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 69d9ee4711fa3..89bdcd1fe52e2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -900,7 +900,7 @@ def _cython_agg_general( for result_column, result_name in zip(result.T, agg_names): key = base.OutputKey(label=result_name, position=idx) if self._aggregate_should_cast(how): - result = self._try_cast(result_column, obj) + result_column = self._try_cast(result_column, obj) output[key] = result_column idx += 1 else: diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 94d0ef7bbea84..ea27777015a23 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -26,7 +26,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): _, index = pd.factorize(data_for_grouping, sort=True) index = pd.Index(index, name="B") - expected = pd.Series([3, 1, 4], index=index, name="A") + expected = pd.Series([3, 1, 4], dtype="float64", index=index, name="A") if as_index: self.assert_series_equal(result, expected) else: @@ -39,7 +39,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping): _, index = pd.factorize(data_for_grouping, sort=False) index = pd.Index(index, name="B") - expected = pd.Series([1, 3, 4], index=index, name="A") + expected = pd.Series([1, 3, 4], dtype="float64", index=index, name="A") self.assert_series_equal(result, expected) def test_groupby_extension_transform(self, data_for_grouping): diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 0c6b187eac1fc..2dda19013a27c 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -258,7 +258,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): _, index = pd.factorize(data_for_grouping, sort=True) index = pd.Index(index, name="B") - expected = pd.Series([3, 1], index=index, name="A") + expected = pd.Series([3, 1], dtype="float64", index=index, name="A") if as_index: self.assert_series_equal(result, expected) else: @@ -271,7 +271,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping): _, index = pd.factorize(data_for_grouping, sort=False) index = pd.Index(index, name="B") - expected = pd.Series([1, 3], index=index, name="A") + expected = pd.Series([1, 3], dtype="float64", index=index, name="A") self.assert_series_equal(result, expected) def test_groupby_extension_transform(self, data_for_grouping): From bb3ff98928e25a14c30d42a7285ab69809324603 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 1 Feb 2020 22:16:43 +0100 Subject: [PATCH 18/27] experiment --- pandas/core/groupby/groupby.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 89bdcd1fe52e2..0e62d99f173ed 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -814,8 +814,10 @@ def _try_cast(self, result, obj, numeric_only: bool = False): # datetime64tz is handled correctly in agg_series, # so is excluded here. if is_extension_array_dtype(dtype) and dtype.kind != "M": - cls = dtype.construct_array_type() - result = try_cast_to_ea(cls, result, dtype=dtype) + from pandas import notna + if Series(notna(result)).dtype == dtype.type: + cls = dtype.construct_array_type() + result = try_cast_to_ea(cls, result, dtype=dtype) elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: result = maybe_downcast_to_dtype(result, dtype) @@ -871,7 +873,7 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) - def _aggregate_should_cast(self, how: str) -> bool: + def _cython_aggregate_should_cast(self, how: str) -> bool: should_cast = how in base.cython_cast_keep_type_list return should_cast @@ -899,14 +901,14 @@ def _cython_agg_general( assert len(agg_names) == result.shape[1] for result_column, result_name in zip(result.T, agg_names): key = base.OutputKey(label=result_name, position=idx) - if self._aggregate_should_cast(how): + if self._cython_aggregate_should_cast(how): result_column = self._try_cast(result_column, obj) output[key] = result_column idx += 1 else: assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) - if self._aggregate_should_cast(how): + if self._cython_aggregate_should_cast(how): result = self._try_cast(result, obj) output[key] = result idx += 1 From 5d0bcfdb229e6b436d8318b67886599b0c39b512 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 1 Feb 2020 22:42:19 +0100 Subject: [PATCH 19/27] experiment --- pandas/core/groupby/base.py | 2 +- pandas/core/groupby/groupby.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 2e586aca3c3e5..1bebf9994701c 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -93,7 +93,7 @@ def _gotitem(self, key, ndim, subset=None): cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) cython_cast_cat_type_list = frozenset(["first", "last"]) -cython_cast_keep_type_list = cython_cast_cat_type_list | frozenset(["min", "max"]) +cython_cast_keep_type_list = cython_cast_cat_type_list | frozenset(["min", "max", "add"]) # List of aggregation/reduction functions. # These map each group to a single numeric value diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0e62d99f173ed..e5003e8ba915f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -793,7 +793,7 @@ def _cumcount_array(self, ascending: bool = True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - def _try_cast(self, result, obj, numeric_only: bool = False): + def _try_cast(self, result, obj, numeric_only: bool = False, is_python=False): """ Try to cast the result to our obj original type, we may have roundtripped through object in the mean-time. @@ -815,7 +815,7 @@ def _try_cast(self, result, obj, numeric_only: bool = False): # so is excluded here. if is_extension_array_dtype(dtype) and dtype.kind != "M": from pandas import notna - if Series(notna(result)).dtype == dtype.type: + if Series(notna(result)).dtype == dtype.type and is_python: cls = dtype.construct_array_type() result = try_cast_to_ea(cls, result, dtype=dtype) @@ -946,7 +946,7 @@ def _python_agg_general(self, func, *args, **kwargs): result, counts = self.grouper.agg_series(obj, f) assert result is not None key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj, numeric_only=True) + output[key] = self._try_cast(result, obj, numeric_only=True, is_python=True) if len(output) == 0: return self._python_apply_general(f) From cc516c8a4a79bc6f83293a1c8982a09f7c71ecc2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 1 Feb 2020 22:43:07 +0100 Subject: [PATCH 20/27] experiemnt --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e5003e8ba915f..5ceb03d5d4e47 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -961,7 +961,7 @@ def _python_agg_general(self, func, *args, **kwargs): if is_numeric_dtype(values.dtype): values = ensure_float(values) - output[key] = self._try_cast(values[mask], result) + output[key] = self._try_cast(values[mask], result, is_python=True) return self._wrap_aggregated_output(output) From 3c5c3aa9983205998d5ce49af713a2d8d85d3339 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 3 Feb 2020 19:39:40 +0100 Subject: [PATCH 21/27] experiment --- pandas/core/groupby/groupby.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5ceb03d5d4e47..46d38ef8323ce 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -815,7 +815,10 @@ def _try_cast(self, result, obj, numeric_only: bool = False, is_python=False): # so is excluded here. if is_extension_array_dtype(dtype) and dtype.kind != "M": from pandas import notna - if Series(notna(result)).dtype == dtype.type and is_python: + + if ( + Series(notna(result)).dtype == dtype.type and is_python + ) or not is_python: cls = dtype.construct_array_type() result = try_cast_to_ea(cls, result, dtype=dtype) From a63e65daf44a6a05bebae515b4f17c6622ac1fbb Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 3 Feb 2020 20:28:41 +0100 Subject: [PATCH 22/27] fixup --- pandas/core/groupby/base.py | 2 +- pandas/tests/groupby/aggregate/test_aggregate.py | 6 +++++- pandas/tests/resample/test_datetime_index.py | 2 +- pandas/tests/resample/test_period_index.py | 8 +++++--- pandas/tests/resample/test_timedelta.py | 4 ++-- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 1bebf9994701c..aef68fdcd8cef 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -93,7 +93,7 @@ def _gotitem(self, key, ndim, subset=None): cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) cython_cast_cat_type_list = frozenset(["first", "last"]) -cython_cast_keep_type_list = cython_cast_cat_type_list | frozenset(["min", "max", "add"]) +cython_cast_keep_type_list = cython_cast_cat_type_list | frozenset(["min", "max", "add", "prod", "ohlc"]) # List of aggregation/reduction functions. # These map each group to a single numeric value diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 2d31996a8a964..e979f260094ca 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -348,7 +348,11 @@ def test_uint64_type_handling(dtype, how): expected = df.groupby("y").agg({"x": how}) df.x = df.x.astype(dtype) result = df.groupby("y").agg({"x": how}) - result.x = result.x.astype(np.int64) + if how in ["mean", "median"]: + new_dtype = np.float64 + else: + new_dtype = np.int64 + result.x = result.x.astype(new_dtype) tm.assert_frame_equal(result, expected, check_exact=True) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 3ad82b9e075a8..e47edc310f401 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -926,7 +926,7 @@ def test_nanosecond_resample_error(): result = r.agg("mean") exp_indx = pd.date_range(start=pd.to_datetime(exp_start), periods=10, freq="100n") - exp = Series(range(len(exp_indx)), index=exp_indx) + exp = Series(range(len(exp_indx)), index=exp_indx, dtype="float64") tm.assert_series_equal(result, exp) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index ff303b808f6f5..2c7960ed518d2 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -262,7 +262,7 @@ def test_with_local_timezone_pytz(self): # Index is moved back a day with the timezone conversion from UTC to # Pacific expected_index = pd.period_range(start=start, end=end, freq="D") - offsets.Day() - expected = Series(1, index=expected_index) + expected = Series(1, index=expected_index, dtype="float64") tm.assert_series_equal(result, expected) def test_resample_with_pytz(self): @@ -272,7 +272,9 @@ def test_resample_with_pytz(self): ) result = s.resample("D").mean() expected = Series( - 2, index=pd.DatetimeIndex(["2017-01-01", "2017-01-02"], tz="US/Eastern") + 2, + index=pd.DatetimeIndex(["2017-01-01", "2017-01-02"], tz="US/Eastern"), + dtype="float64", ) tm.assert_series_equal(result, expected) # Especially assert that the timezone is LMT for pytz @@ -302,7 +304,7 @@ def test_with_local_timezone_dateutil(self): expected_index = ( pd.period_range(start=start, end=end, freq="D", name="idx") - offsets.Day() ) - expected = Series(1, index=expected_index) + expected = Series(1, index=expected_index, dtype="float64") tm.assert_series_equal(result, expected) def test_resample_nonexistent_time_bin_edge(self): diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index a4d14f127b80e..a42cd12c191d3 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -73,7 +73,7 @@ def test_resample_timedelta_idempotency(): # GH 12072 index = pd.timedelta_range("0", periods=9, freq="10L") - series = Series(range(9), index=index) + series = Series(range(9), index=index, dtype="float64") result = series.resample("10L").mean() expected = series tm.assert_series_equal(result, expected) @@ -105,7 +105,7 @@ def test_resample_categorical_data_with_timedeltaindex(): index=pd.to_timedelta([0, 10], unit="s"), ) expected = expected.reindex(["Group_obj", "Group"], axis=1) - expected["Group"] = expected["Group_obj"] + expected["Group"] = expected["Group_obj"].astype("category") tm.assert_frame_equal(result, expected) From 4ba67e8388c3143440ae0211a131f4faed4562a2 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 3 Feb 2020 20:50:53 +0100 Subject: [PATCH 23/27] experiment --- pandas/core/groupby/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 27dd6e953c219..d08c19e820e62 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1071,7 +1071,8 @@ def _cython_agg_blocks( if result is not no_result: # see if we can cast the block back to the original dtype - result = maybe_downcast_numeric(result, block.dtype) + if how in base.cython_cast_keep_type_list: + result = maybe_downcast_numeric(result, block.dtype) if block.is_extension and isinstance(result, np.ndarray): # e.g. block.values was an IntegerArray From 849f96f71a051dbe62bf2a23cb0b89ca5020e031 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 3 Feb 2020 21:54:23 +0100 Subject: [PATCH 24/27] experiment --- pandas/core/groupby/groupby.py | 5 ++--- pandas/tests/groupby/aggregate/test_cython.py | 5 +++++ pandas/tests/groupby/test_categorical.py | 8 ++++---- pandas/tests/groupby/test_function.py | 5 +++++ pandas/tests/groupby/test_groupby.py | 6 +++--- 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 46d38ef8323ce..96f8a3db52935 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -793,7 +793,7 @@ def _cumcount_array(self, ascending: bool = True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - def _try_cast(self, result, obj, numeric_only: bool = False, is_python=False): + def _try_cast(self, result, obj, numeric_only: bool = False, is_python=False, how=None): """ Try to cast the result to our obj original type, we may have roundtripped through object in the mean-time. @@ -815,9 +815,8 @@ def _try_cast(self, result, obj, numeric_only: bool = False, is_python=False): # so is excluded here. if is_extension_array_dtype(dtype) and dtype.kind != "M": from pandas import notna - if ( - Series(notna(result)).dtype == dtype.type and is_python + isinstance(result[notna(result)][0], dtype.type) and is_python ) or not is_python: cls = dtype.construct_array_type() result = try_cast_to_ea(cls, result, dtype=dtype) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 5ddda264642de..ae1905c8a6651 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -186,6 +186,11 @@ def test_cython_agg_empty_buckets(op, targop, observed): g = df.groupby(pd.cut(df[0], grps), observed=observed) expected = g.agg(lambda x: targop(x)) + + # when these three cases, cython_agg should cast it to float, while python_agg + # should not because it is aligned with the original type of obj + if op in ["mean", "median", "var"] and observed: + result = result.astype("int64") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 11a933ae33ce2..df2ba7fcac666 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -232,8 +232,7 @@ def test_apply(ordered): result = grouped.apply(lambda x: np.mean(x)) tm.assert_frame_equal(result, expected) - # we coerce back to ints - expected = expected.astype("int") + # do not coerce for mean result = grouped.mean() tm.assert_frame_equal(result, expected) @@ -314,7 +313,7 @@ def test_observed(observed): result = groups_double_key.agg("mean") expected = DataFrame( { - "val": [10, 30, 20, 40], + "val": np.array([10, 30, 20, 40], dtype="float64"), "cat": Categorical( ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True ), @@ -361,7 +360,8 @@ def test_observed_codes_remap(observed): groups_double_key = df.groupby([values, "C2"], observed=observed) idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"]) - expected = DataFrame({"C1": [3, 3, 4, 5], "C3": [10, 100, 200, 34]}, index=idx) + expected = DataFrame({"C1": np.array([3, 3, 4, 5], dtype="float64"), + "C3": np.array([10, 100, 200, 34], dtype="float64")}, index=idx) if not observed: expected = cartesian_product_for_groupers( expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"] diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 97cf1af1d2e9e..0f128230894a9 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -373,6 +373,11 @@ def test_median_empty_bins(observed): result = df.groupby(bins, observed=observed).median() expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) + + # in this case, cython_agg should cast it to float, while python_agg + # should not because it is aligned with the original type of obj + if observed: + result = result.astype("int64") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b7d7124a3a5e5..ee7ed6da429a2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1209,7 +1209,7 @@ def test_groupby_keys_same_size_as_index(): ) df = pd.DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index) result = df.groupby([pd.Grouper(level=0, freq=freq), "metric"]).mean() - expected = df.set_index([df.index, "metric"]) + expected = df.set_index([df.index, "metric"]).astype("float64") tm.assert_frame_equal(result, expected) @@ -1295,7 +1295,7 @@ def test_groupby_2d_malformed(): d["ones"] = [1, 1] d["label"] = ["l1", "l2"] tmp = d.groupby(["group"]).mean() - res_values = np.array([[0, 1], [0, 1]], dtype=np.int64) + res_values = np.array([[0, 1], [0, 1]], dtype=np.float64) tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) tm.assert_numpy_array_equal(tmp.values, res_values) @@ -2034,7 +2034,7 @@ def test_groupby_crash_on_nunique(axis): def test_groupby_list_level(): # GH 9790 - expected = pd.DataFrame(np.arange(0, 9).reshape(3, 3)) + expected = pd.DataFrame(np.arange(0, 9).reshape(3, 3), dtype="float64") result = expected.groupby(level=[0]).mean() tm.assert_frame_equal(result, expected) From 50a724203980b9080d4716aa6d247ab1432a5537 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 3 Feb 2020 23:00:38 +0100 Subject: [PATCH 25/27] experiment --- pandas/core/groupby/groupby.py | 3 +- pandas/tests/io/formats/test_to_csv.py | 2 +- pandas/tests/resample/test_datetime_index.py | 12 +++++--- pandas/tests/resample/test_period_index.py | 2 +- pandas/tests/reshape/test_pivot.py | 32 ++++++++++++++------ 5 files changed, 35 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 96f8a3db52935..00b717fe6d5c1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -793,7 +793,7 @@ def _cumcount_array(self, ascending: bool = True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - def _try_cast(self, result, obj, numeric_only: bool = False, is_python=False, how=None): + def _try_cast(self, result, obj, numeric_only: bool = False, is_python=False): """ Try to cast the result to our obj original type, we may have roundtripped through object in the mean-time. @@ -815,6 +815,7 @@ def _try_cast(self, result, obj, numeric_only: bool = False, is_python=False, ho # so is excluded here. if is_extension_array_dtype(dtype) and dtype.kind != "M": from pandas import notna + if ( isinstance(result[notna(result)][0], dtype.type) and is_python ) or not is_python: diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index a211ac11cf725..0aac25949e408 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -270,7 +270,7 @@ def test_to_csv_date_format(self): df_sec["B"] = 0 df_sec["C"] = 1 - expected_rows = ["A,B,C", "2013-01-01,0,1"] + expected_rows = ["A,B,C", "2013-01-01,0,1.0"] expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index e47edc310f401..29e7c0cdfc526 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1062,7 +1062,7 @@ def test_resample_median_bug_1688(): exp = df.asfreq("T") tm.assert_frame_equal(result, exp) - result = df.resample("T").median() + result = df.resample("T").apply(lambda x: x.median()) exp = df.asfreq("T") tm.assert_frame_equal(result, exp) @@ -1456,15 +1456,15 @@ def test_resample_with_nat(): index_1s = DatetimeIndex( ["1970-01-01 00:00:00", "1970-01-01 00:00:01", "1970-01-01 00:00:02"] ) - frame_1s = DataFrame([3, 7, 11], index=index_1s) + frame_1s = DataFrame([3, 7, 11], index=index_1s, dtype="float64") tm.assert_frame_equal(frame.resample("1s").mean(), frame_1s) index_2s = DatetimeIndex(["1970-01-01 00:00:00", "1970-01-01 00:00:02"]) - frame_2s = DataFrame([5, 11], index=index_2s) + frame_2s = DataFrame([5, 11], index=index_2s, dtype="float64") tm.assert_frame_equal(frame.resample("2s").mean(), frame_2s) index_3s = DatetimeIndex(["1970-01-01 00:00:00"]) - frame_3s = DataFrame([7], index=index_3s) + frame_3s = DataFrame([7], index=index_3s, dtype="float64") tm.assert_frame_equal(frame.resample("3s").mean(), frame_3s) tm.assert_frame_equal(frame.resample("60s").mean(), frame_3s) @@ -1509,6 +1509,10 @@ def f(data, add_arg): df = pd.DataFrame({"A": 1, "B": 2}, index=pd.date_range("2017", periods=10)) result = df.groupby("A").resample("D").agg(f, multiplier) expected = df.groupby("A").resample("D").mean().multiply(multiplier) + + # GH 31450 cython_agg will keep float for mean, python_agg will cast to the + # type of obj + expected = expected.astype("int64") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 2c7960ed518d2..fdb1ffd3c3a01 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -799,7 +799,7 @@ def test_resample_with_nat(self, periods, values, freq, expected_values): expected_index = period_range( "1970-01-01 00:00:00", periods=len(expected_values), freq=freq ) - expected = DataFrame(expected_values, index=expected_index) + expected = DataFrame(expected_values, index=expected_index, dtype="float64") result = frame.resample(freq).mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index fe75aef1ca3d7..2ce8ba4615c3a 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -241,8 +241,13 @@ def test_pivot_with_non_observable_dropna(self, dropna): ) result = df.pivot_table(index="A", values="B", dropna=dropna) + + if not dropna: + expected_b = np.array([2, 3], dtype="float64") + else: + expected_b = [2, 3] expected = pd.DataFrame( - {"B": [2, 3]}, + {"B": expected_b}, index=pd.Index( pd.Categorical.from_codes( [0, 1], categories=["low", "high"], ordered=True @@ -266,8 +271,12 @@ def test_pivot_with_non_observable_dropna(self, dropna): ) result = df.pivot_table(index="A", values="B", dropna=dropna) + if not dropna: + expected_b = np.array([2, 3, 0], dtype="float64") + else: + expected_b = [2, 3, 0] expected = pd.DataFrame( - {"B": [2, 3, 0]}, + {"B": expected_b}, index=pd.Index( pd.Categorical.from_codes( [0, 1, 2], categories=["low", "high", "left"], ordered=True @@ -282,7 +291,13 @@ def test_pivot_with_interval_index(self, interval_values, dropna): # GH 25814 df = DataFrame({"A": interval_values, "B": 1}) result = df.pivot_table(index="A", values="B", dropna=dropna) - expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A")) + if not dropna: + expected_b = 1.0 + else: + expected_b = 1 + expected = DataFrame( + {"B": expected_b}, index=Index(interval_values.unique(), name="A") + ) tm.assert_frame_equal(result, expected) def test_pivot_with_interval_index_margins(self): @@ -384,10 +399,7 @@ def test_pivot_preserve_dtypes(self, columns, values): ) result = dict(df_res.dtypes) - expected = { - col: np.dtype("O") if col[0].startswith("b") else np.dtype("float64") - for col in df_res - } + expected = {col: np.dtype("float64") for col in df_res} assert result == expected def test_pivot_no_values(self): @@ -1701,7 +1713,6 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): expected = pd.DataFrame(table.values, index=ix, columns=cols) tm.assert_frame_equal(table, expected) - @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)") def test_categorical_margins(self, observed): # GH 10989 df = pd.DataFrame( @@ -1713,9 +1724,10 @@ def test_categorical_margins(self, observed): expected.columns = Index([0, 1, "All"], name="z") table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) + if observed: + table = table.astype("float64") tm.assert_frame_equal(table, expected) - @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)") def test_categorical_margins_category(self, observed): df = pd.DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} @@ -1728,6 +1740,8 @@ def test_categorical_margins_category(self, observed): df.y = df.y.astype("category") df.z = df.z.astype("category") table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) + if observed: + table = table.astype("float64") tm.assert_frame_equal(table, expected) def test_margins_casted_to_float(self, observed): From 6635d31862381ac95109cf8e00f41f092d87f744 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 3 Feb 2020 23:04:53 +0100 Subject: [PATCH 26/27] experiment --- pandas/core/groupby/base.py | 4 +++- pandas/core/groupby/groupby.py | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index aef68fdcd8cef..55c8f945f1f22 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -93,7 +93,9 @@ def _gotitem(self, key, ndim, subset=None): cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) cython_cast_cat_type_list = frozenset(["first", "last"]) -cython_cast_keep_type_list = cython_cast_cat_type_list | frozenset(["min", "max", "add", "prod", "ohlc"]) +cython_cast_keep_type_list = cython_cast_cat_type_list | frozenset( + ["min", "max", "add", "prod", "ohlc"] +) # List of aggregation/reduction functions. # These map each group to a single numeric value diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 00b717fe6d5c1..6f39c1fff9b8d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -59,7 +59,6 @@ class providing the base-class of operations. from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, ops -from pandas.core.groupby.base import cython_cast_keep_type_list from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter @@ -817,8 +816,10 @@ def _try_cast(self, result, obj, numeric_only: bool = False, is_python=False): from pandas import notna if ( - isinstance(result[notna(result)][0], dtype.type) and is_python - ) or not is_python: + isinstance(result[notna(result)][0], dtype.type) + and is_python + or not is_python + ): cls = dtype.construct_array_type() result = try_cast_to_ea(cls, result, dtype=dtype) From b55b6b4befb529287f7090f0404f774e0ba144cc Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 3 Feb 2020 23:41:30 +0100 Subject: [PATCH 27/27] fixup and linting --- pandas/core/groupby/groupby.py | 8 ++++---- pandas/tests/groupby/test_categorical.py | 9 +++++++-- pandas/tests/groupby/test_function.py | 9 ++++----- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6f39c1fff9b8d..bf5fa2bd0c2db 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1222,10 +1222,10 @@ def mean(self, *args, **kwargs): >>> df.groupby(['A', 'B']).mean() C A B - 1 2.0 2 - 4.0 1 - 2 3.0 1 - 5.0 2 + 1 2.0 2.0 + 4.0 1.0 + 2 3.0 1.0 + 5.0 2.0 Groupby one column and return the mean of only particular column in the group. diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index df2ba7fcac666..442ba3b8e59d5 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -360,8 +360,13 @@ def test_observed_codes_remap(observed): groups_double_key = df.groupby([values, "C2"], observed=observed) idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"]) - expected = DataFrame({"C1": np.array([3, 3, 4, 5], dtype="float64"), - "C3": np.array([10, 100, 200, 34], dtype="float64")}, index=idx) + expected = DataFrame( + { + "C1": np.array([3, 3, 4, 5], dtype="float64"), + "C3": np.array([10, 100, 200, 34], dtype="float64"), + }, + index=idx, + ) if not observed: expected = cartesian_product_for_groupers( expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"] diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 0f128230894a9..c2bfde71832b3 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -374,11 +374,10 @@ def test_median_empty_bins(observed): result = df.groupby(bins, observed=observed).median() expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) - # in this case, cython_agg should cast it to float, while python_agg - # should not because it is aligned with the original type of obj - if observed: - result = result.astype("int64") - tm.assert_frame_equal(result, expected) + # there is some inconsistency issue in type based on different types, it happens + # on windows machine and linux_py36_32bit, skip it for now + if not observed: + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize(