Skip to content
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,7 @@ Groupby/resample/rolling
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
- Bug in :meth:`DataFrameGroupBy.agg` that raised ``AttributeError`` when passed a dictionary and the DataFrame had duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
- Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
- Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
Expand Down
35 changes: 30 additions & 5 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,8 +471,30 @@ def compute_dict_like(

keys += [key] * len(key_data)
results += key_data
else:
elif is_groupby:
# key used for column selection and output

df = selected_obj
results, keys = [], []
for key, how in func.items():
cols = df[key]

if cols.ndim == 1:
series_list = [obj._gotitem(key, ndim=1, subset=cols)]
else:
series_list = []
for index in range(cols.shape[1]):
col = cols.iloc[:, index]

series = obj._gotitem(key, ndim=1, subset=col)
series_list.append(series)

for series in series_list:
result = getattr(series, op_name)(how, **kwargs)
results.append(result)
keys.append(key)

else:
results = [
getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs)
for key, how in func.items()
Expand All @@ -496,11 +518,14 @@ def wrap_results_dict_like(
is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data]

if all(is_ndframe):
results = dict(zip(result_index, result_data))
results = [result for result in result_data if not result.empty]
keys_to_use: Iterable[Hashable]
keys_to_use = [k for k in result_index if not results[k].empty]
keys_to_use = [k for k, v in zip(result_index, result_data) if not v.empty]
# Have to check, if at least one DataFrame is not empty.
keys_to_use = keys_to_use if keys_to_use != [] else result_index
if keys_to_use == []:
keys_to_use = result_index
results = result_data

if selected_obj.ndim == 2:
# keys are columns, so we can preserve names
ktu = Index(keys_to_use)
Expand All @@ -509,7 +534,7 @@ def wrap_results_dict_like(

axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1
result = concat(
{k: results[k] for k in keys_to_use},
results,
axis=axis,
keys=keys_to_use,
)
Expand Down
118 changes: 118 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1662,3 +1662,121 @@ def func(x):
msg = "length must not be 0"
with pytest.raises(ValueError, match=msg):
df.groupby("A", observed=False).agg(func)


def test_groupby_aggregation_duplicate_columns_single_dict_value():
    # GH#55041: a dict mapping one key to a single aggregation must cover
    # every column sharing that label, not raise on the duplicate.
    frame = DataFrame(
        data=[[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]],
        columns=["a", "b", "c", "c"],
    )
    computed = frame.groupby("a").agg({"c": "sum"})

    # Both duplicate "c" columns appear in the output, summed per group.
    expected = DataFrame(
        data=[[7, 9], [5, 6]],
        columns=["c", "c"],
        index=Index([1, 2], name="a"),
    )
    tm.assert_frame_equal(computed, expected)


def test_groupby_aggregation_duplicate_columns_multiple_dict_values():
    # GH#55041: a list of aggregations for a duplicated label applies each
    # function to every column carrying that label, in order.
    frame = DataFrame(
        data=[[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]],
        columns=["a", "b", "c", "c"],
    )
    computed = frame.groupby("a").agg({"c": ["sum", "min", "max", "min"]})

    expected_columns = MultiIndex(
        levels=[["c"], ["sum", "min", "max"]],
        codes=[[0] * 8, [0, 1, 2, 1, 0, 1, 2, 1]],
    )
    expected = DataFrame(
        data=[[7, 3, 4, 3, 9, 4, 5, 4], [5, 5, 5, 5, 6, 6, 6, 6]],
        columns=expected_columns,
        index=Index([1, 2], name="a"),
    )
    tm.assert_frame_equal(computed, expected)


def test_groupby_aggregation_duplicate_columns_some_empty_result():
    # GH#55041: a key mapped to an empty function list contributes nothing;
    # only the non-empty "c" aggregation survives in the result.
    frame = DataFrame(
        data=[
            [1, 9843, 43, 54, 7867],
            [2, 940, 9, -34, 44],
            [1, -34, -546, -549358, 0],
            [2, 244, -33, -100, 44],
        ],
        columns=["a", "b", "b", "c", "c"],
    )
    computed = frame.groupby("a").agg({"b": [], "c": ["var"]})

    expected = DataFrame(
        data=[[1.509268e11, 30944844.5], [2.178000e03, 0.0]],
        columns=MultiIndex(levels=[["c"], ["var"]], codes=[[0, 0], [0, 0]]),
        index=Index([1, 2], name="a"),
    )
    tm.assert_frame_equal(computed, expected)


def test_groupby_aggregation_multi_index_duplicate_columns():
    # GH#55041: duplicated column keys under a MultiIndex are all aggregated
    # when selected via a single-tuple dict key.
    col_index = MultiIndex(
        levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]],
        codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]],
    )
    row_index = MultiIndex(
        levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]],
        codes=[[0, 0, 0, 1], [0, 1, 1, 0]],
    )
    frame = DataFrame(
        data=[
            [1, -9843, 43, 54, 7867],
            [2, 940, 9, -34, 44],
            [1, -34, 546, -549358, 0],
            [2, 244, -33, -100, 44],
        ],
        columns=col_index,
        index=row_index,
    )
    computed = frame.groupby(level=0).agg({("level1.1", "level2.2"): "min"})

    expected = DataFrame(
        data=[[-9843, 9], [244, -33]],
        columns=MultiIndex(
            levels=[["level1.1"], ["level2.2"]], codes=[[0, 0], [0, 0]]
        ),
        index=Index(["level1.1", "level1.2"]),
    )
    tm.assert_frame_equal(computed, expected)


def test_groupby_aggregation_func_list_multi_index_duplicate_columns():
    # GH#55041: a list of functions applied to a duplicated MultiIndex column
    # key yields one output column per (duplicate column, function) pair.
    col_index = MultiIndex(
        levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]],
        codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]],
    )
    row_index = MultiIndex(
        levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]],
        codes=[[0, 0, 0, 1], [0, 1, 1, 0]],
    )
    frame = DataFrame(
        data=[
            [1, -9843, 43, 54, 7867],
            [2, 940, 9, -34, 44],
            [1, -34, 546, -549358, 0],
            [2, 244, -33, -100, 44],
        ],
        columns=col_index,
        index=row_index,
    )
    computed = frame.groupby(level=0).agg(
        {("level1.1", "level2.2"): ["min", "max"]}
    )

    expected = DataFrame(
        data=[[-9843, 940, 9, 546], [244, 244, -33, -33]],
        columns=MultiIndex(
            levels=[["level1.1"], ["level2.2"], ["min", "max"]],
            codes=[[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 1]],
        ),
        index=Index(["level1.1", "level1.2"]),
    )
    tm.assert_frame_equal(computed, expected)