Skip to content

Owo #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
{

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might want to delete this or convert it into a testcase

"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" a b c d group\n",
"0 0.855664 0.237612 0.660391 0.896628 0\n",
"1 0.695109 0.026930 0.315360 0.784887 0\n",
"2 0.807515 0.301360 0.400504 0.055916 1\n",
"3 0.077397 0.571981 0.429654 0.180142 1\n",
"<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f43485b0190>\n",
"group\n",
"0 0.618052\n",
"1 0.235534\n",
"dtype: float64\n",
" a_sum a_mean b_mean c_sum d_range diff_a_b\n",
"group \n",
"0 1.550773 0.775387 0.132271 0.975751 0.111741 0.618052\n",
"1 0.884912 0.442456 0.436670 0.830158 0.124226 0.235534\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"df = pd.DataFrame(np.random.rand(4,4), columns=list('abcd'))\n",
"df['group'] = [0, 0, 1, 1]\n",
"\n",
"print(df)\n",
"\n",
"print(df.groupby('group'))\n",
"\n",
"print(df.groupby('group')['a'].max() - df.groupby('group')['b'].max())\n",
"\n",
"print(df.groupby('group').agg(\n",
" diff_a_b=(['a', 'b'], lambda x: x['a'].max() - x['b'].max())\n",
" a_sum=('a', 'sum'),\n",
" a_mean=('a', 'mean'),\n",
" b_mean=('b', 'mean'),\n",
" c_sum=('c', 'sum'),\n",
" d_range=('d', lambda x: x.max() - x.min()),\n",
"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
10 changes: 10 additions & 0 deletions doc/source/user_guide/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -680,6 +680,16 @@ must be either implemented on GroupBy or available via :ref:`dispatching

.. _groupby.aggregate.cython:


Aggregating multiple columns
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. ipython:: python

   grouped.agg(diff_c_d=(['C', 'D'], lambda x: x['C'].max() - x['D'].max()))



Cython-optimized aggregation functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,11 @@ Groupby/resample/rolling
- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`)
- Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`)

Groupby Aggregations
^^^^^^^^^^^^^^^^^^^^

- Added functionality to perform aggregations on multiple columns

Reshaping
^^^^^^^^^

Expand Down
28 changes: 23 additions & 5 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,8 +352,20 @@ def _aggregate(self, arg, *args, **kwargs):
raise SpecificationError("nested renamer is not supported")
elif isinstance(obj, ABCSeries):
raise SpecificationError("nested renamer is not supported")
elif isinstance(obj, ABCDataFrame) and k not in obj.columns:
raise KeyError(f"Column '{k}' does not exist!")
elif isinstance(obj, ABCDataFrame):

# OWO CHANGES
# Original check
if (k not in obj.columns):
# Check if list thingy
try:
keys = np.frombuffer(k, dtype=np.dtype('<U1'))
for key in keys:
# Check keys
if (key not in obj.columns):
raise KeyError(f"Column '{key}' does not exist!")
except TypeError:
raise KeyError(f"Column '{k}' does not exist!")

arg = new_arg

Expand Down Expand Up @@ -393,7 +405,15 @@ def _agg(arg, func):
"""
result = {}
for fname, agg_how in arg.items():
result[fname] = func(fname, agg_how)
# OWO CHANGES
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps encapsulate the changes in a new agg function to keep some consistency (agg_multi_d1? or something)

try:
items = np.frombuffer(fname, dtype=np.dtype('<U1'))
_obj = {}
for item in items:
_obj[item] = self._gotitem(item, ndim=1, subset=None)
Copy link
Author

@fpunny fpunny Apr 3, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is kinda bad... but hopefully with a formal PR, we can get feedback on how to do this better (maybe pipeline method would work better if we are to keep internal consistency)

result[fname] = agg_how[0](_obj)
except TypeError:
result[fname] = func(fname, agg_how)
return result

# set the final keys
Expand Down Expand Up @@ -424,11 +444,9 @@ def _agg(arg, func):

# no selection
else:

try:
result = _agg(arg, _agg_1dim)
except SpecificationError:

# we are aggregating expecting all 1d-returns
# but we have 2d
result = _agg(arg, _agg_2dim)
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,11 +908,17 @@ class DataFrameGroupBy(GroupBy):
)
@Appender(_shared_docs["aggregate"])
def aggregate(self, func=None, *args, **kwargs):

relabeling = func is None and is_multi_agg_with_relabel(**kwargs)
if relabeling:
func, columns, order = normalize_keyword_aggregation(kwargs)
# OWO CHANGES
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should make this into a new normalize_keyword_aggregation

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While we're at it, check if second field of tuple is a lambda, since native aggregation methods wouldn't work obviously

from types import LambdaType
for k, v in list(kwargs.items()):
if isinstance(v[0], list) & isinstance(v[1], LambdaType):
# v[0] is the first parameter given (the column(s) to group)
# v[1] is the 2nd parameter given and the operation to be done to the column(s)
kwargs[k] = (np.array(v[0]).tobytes(),) + v[1:]

func, columns, order = normalize_keyword_aggregation(kwargs)
kwargs = {}
elif isinstance(func, list) and len(func) > len(set(func)):

Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,30 @@ def test_mangled(self):
)
tm.assert_frame_equal(result, expected)

def test_agg_multiple_columns(self):
    """Named aggregation with a list of columns and a lambda per output."""
    frame = pd.DataFrame(
        {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}
    )
    # Each output name maps to (list of input columns, callable over them).
    spec = {
        "add": (["B", "C"], lambda x: x["B"].max() + x["C"].min()),
        "minus": (["C", "B"], lambda x: x["B"].max() - x["C"].min()),
    }
    result = frame.groupby("A").agg(**spec)

    # Group 0: max(B)=2, min(C)=3; group 1: max(B)=4, min(C)=5.
    expected_index = pd.Index([0, 1], name="A")
    expected = pd.DataFrame(
        {"add": [5, 9], "minus": [-1, -1]}, index=expected_index
    )
    tm.assert_frame_equal(result, expected)

def test_agg_multi_missing_column_raises(self):
    """Listing a column that is not in the frame raises ``KeyError``."""
    frame = pd.DataFrame(
        {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}
    )
    # "D" is requested in the column list but is absent from the frame.
    spec = {"minus": (["D", "C"], lambda x: x["D"].max() - x["C"].min())}
    expected_msg = "Column 'D' does not exist"
    with pytest.raises(KeyError, match=expected_msg):
        frame.groupby("A").agg(**spec)

def test_agg_multi_missing_key_raises(self):
    """A lambda that reads a column outside its list raises ``KeyError``."""
    frame = pd.DataFrame(
        {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}
    )
    # The column list is valid, but the callable looks up "D".
    spec = {"minus": (["B", "C"], lambda x: x["D"].max() - x["D"].min())}
    with pytest.raises(KeyError, match="D"):
        frame.groupby("A").agg(**spec)

@pytest.mark.parametrize(
"agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3",
Expand Down