From 2800fa0e5236920ebb9c3cae46f871a2e2f76d04 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 4 May 2014 16:24:16 -0500 Subject: [PATCH 1/4] add the grouping code --- pandas/core/groupby.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ce64ed754180d..497902224a19d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1917,8 +1917,14 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): any_callable = any(callable(g) or isinstance(g, dict) for g in keys) any_arraylike = any(isinstance(g, (list, tuple, Series, np.ndarray)) for g in keys) + # sugar for df.reset_index().groupby(['a', 'b']) where b was in index_names + from_col, from_idx, from_both = _from_index_and_columns(obj, keys) try: + if from_idx and from_col: + to_exclude = set(obj.index.names) - from_idx + obj = obj.reset_index() + group_axis = obj._get_axis(axis) if isinstance(obj, DataFrame): all_in_columns = all(g in obj.columns for g in keys) else: @@ -1940,6 +1946,12 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): groupings = [] exclusions = [] + + if from_col and from_idx: + # don't include those just there becaues of the reset_index + if to_exclude: + exclusions += list(to_exclude) + for i, (gpr, level) in enumerate(zip(keys, levels)): name = None try: @@ -1969,6 +1981,29 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): return grouper, exclusions, obj +def _from_index_and_columns(obj, keys): + """ + keys is already listlike + """ + if not all(isinstance(g, compat.string_types) for g in keys): + # TODO: Handle mix of callables and strings. + return None, None, None + ks = set(keys) + from_idx = ks & set(obj.index.names) + from_col = ks & set(obj.columns) + + # check for ambiguity: + from_both = from_idx & from_col + if from_both: + from warnings import warn + msg = ("Found {0} in both the columns and index labels. 
" + "Grouping by the columns".format(from_both)) + warn(msg) + + # don't need to do anything if the only ones from either are in both + return from_col, from_idx - from_both, from_both + + def _is_label_like(val): return isinstance(val, compat.string_types) or np.isscalar(val) From 45dc31b78b01813f0b0eaf66c4a8920f3ea62482 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 4 May 2014 16:40:29 -0500 Subject: [PATCH 2/4] add tests --- pandas/core/groupby.py | 4 +++- pandas/tests/test_groupby.py | 31 ++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 497902224a19d..2d14262ce6e02 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1985,7 +1985,9 @@ def _from_index_and_columns(obj, keys): """ keys is already listlike """ - if not all(isinstance(g, compat.string_types) for g in keys): + not_all_string = not all(isinstance(g, compat.string_types) for g in keys) + not_df = not isinstance(obj, DataFrame) + if not_all_string or not_df: # TODO: Handle mix of callables and strings. 
return None, None, None ks = set(keys) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 1b70ae0309b10..18c4aaf8bc76f 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -11,7 +11,8 @@ from pandas.core.common import rands from pandas.core.api import Categorical, DataFrame from pandas.core.groupby import (SpecificationError, DataError, - _nargsort, _lexsort_indexer) + _nargsort, _lexsort_indexer, + _from_index_and_columns) from pandas.core.series import Series from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, @@ -4168,6 +4169,34 @@ def test_nargsort(self): expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) assert_equal(result, expected) + def test_from_index_and_columns(self): + # allowing by to spread across index and col names GH #5677 + df = DataFrame([[1, 2, 3, 4]], columns=['c1', 'c2', 'i1', 'i2']) + df = df.set_index(['i1', 'i2']) + + keys = ['c1'] + from_col, from_idx, from_both = _from_index_and_columns(df, keys) + self.assertEqual(from_col, set(['c1'])) + self.assertEqual(from_idx, set([])) + self.assertEqual(from_both, set([])) + + keys = ['c1', 'i1'] + from_col, from_idx, from_both = _from_index_and_columns(df, keys) + self.assertEqual(from_col, set(['c1'])) + self.assertEqual(from_idx, set(['i1'])) + self.assertEqual(from_both, set([])) + + df.index.names = ['i1', 'c1'] + keys = ['c1', 'i1'] + with tm.assert_produces_warning(UserWarning): + from_col, from_idx, from_both = _from_index_and_columns(df, keys) + self.assertEqual(from_col, set(['c1'])) + self.assertEqual(from_idx, set(['i1'])) + self.assertEqual(from_both, set(['c1'])) + + res = _from_index_and_columns(df['c1'], 'i1') + self.assertEqual(res, (None, None, None)) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() From 8a79ea150318b816d0bfd3fde95cb8a5fdd2fd7d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 4 May 2014 18:47:30 
-0500 Subject: [PATCH 3/4] refactor index resetting --- pandas/core/groupby.py | 10 +++------- pandas/tests/test_groupby.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2d14262ce6e02..7f377d7cb425a 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1922,9 +1922,10 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): try: if from_idx and from_col: - to_exclude = set(obj.index.names) - from_idx - obj = obj.reset_index() + # check the drop part... + obj = obj.reset_index(level=list(from_idx)).reset_index(drop=True) group_axis = obj._get_axis(axis) + if isinstance(obj, DataFrame): all_in_columns = all(g in obj.columns for g in keys) else: @@ -1947,11 +1948,6 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): groupings = [] exclusions = [] - if from_col and from_idx: - # don't include those just there becaues of the reset_index - if to_exclude: - exclusions += list(to_exclude) - for i, (gpr, level) in enumerate(zip(keys, levels)): name = None try: diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 18c4aaf8bc76f..2fd35734e0265 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -4169,6 +4169,21 @@ def test_nargsort(self): expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) assert_equal(result, expected) + def test_by_index_cols(self): + df = DataFrame([[1, 2, 'x', 'a', 'a'], + [1, 3, 'x', 'a', 'b'], + [1, 4, 'x', 'b', 'a'], + [1, 5, 'y', 'b', 'b']], + columns=['c1', 'c2', 'g1', 'i1', 'i2']) + df = df.set_index(['i1', 'i2']) + df.index.names = ['i1', 'g1'] + result = df.groupby(by=['g1', 'i1']).mean() + idx = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'b')], + names=['g1', 'i1']) + expected = DataFrame([[1, 2.5], [1, 4], [1, 5]], + index=idx, columns=['c1', 'c2']) + assert_frame_equal(result, expected) + def test_from_index_and_columns(self): # 
allowing by to spread across index and col names GH #5677 df = DataFrame([[1, 2, 3, 4]], columns=['c1', 'c2', 'i1', 'i2']) From 6be446b9d6a73d675998a72273e5ebcab0086fe9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 May 2014 06:38:06 -0500 Subject: [PATCH 4/4] add test [ci skip] --- pandas/core/groupby.py | 4 ++-- pandas/tests/test_groupby.py | 19 +++++++++++++------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7f377d7cb425a..9047f1504aa1a 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1995,8 +1995,8 @@ def _from_index_and_columns(obj, keys): if from_both: from warnings import warn msg = ("Found {0} in both the columns and index labels. " - "Grouping by the columns".format(from_both)) - warn(msg) + "Grouping by the columns".format(from_both)) + warn(msg, FutureWarning) # don't need to do anything if the only ones from either are in both return from_col, from_idx - from_both, from_both diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 2fd35734e0265..ae06e60724264 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -4171,19 +4171,26 @@ def test_nargsort(self): def test_by_index_cols(self): df = DataFrame([[1, 2, 'x', 'a', 'a'], - [1, 3, 'x', 'a', 'b'], - [1, 4, 'x', 'b', 'a'], - [1, 5, 'y', 'b', 'b']], + [2, 3, 'x', 'a', 'b'], + [3, 4, 'x', 'b', 'a'], + [4, 5, 'y', 'b', 'b']], columns=['c1', 'c2', 'g1', 'i1', 'i2']) df = df.set_index(['i1', 'i2']) - df.index.names = ['i1', 'g1'] + df.index.set_names(['i1', 'g1'], inplace=True) result = df.groupby(by=['g1', 'i1']).mean() idx = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'b')], names=['g1', 'i1']) - expected = DataFrame([[1, 2.5], [1, 4], [1, 5]], + expected = DataFrame([[1.5, 2.5], [3, 4], [4, 5]], index=idx, columns=['c1', 'c2']) assert_frame_equal(result, expected) + with tm.assert_produces_warning(FutureWarning): + result = 
df.groupby('g1').mean() + expected = DataFrame([[2, 3], [4, 5]], + index=['x', 'y'], columns=['c1', 'c2']) + expected.index.set_names(['g1'], inplace=True) + assert_frame_equal(result, expected) + def test_from_index_and_columns(self): # allowing by to spread across index and col names GH #5677 df = DataFrame([[1, 2, 3, 4]], columns=['c1', 'c2', 'i1', 'i2']) @@ -4203,7 +4210,7 @@ def test_from_index_and_columns(self): df.index.names = ['i1', 'c1'] keys = ['c1', 'i1'] - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(FutureWarning): from_col, from_idx, from_both = _from_index_and_columns(df, keys) self.assertEqual(from_col, set(['c1'])) self.assertEqual(from_idx, set(['i1']))