diff --git a/RELEASE.rst b/RELEASE.rst index b5dd3eef68dea..1e7880016cdee 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -48,6 +48,8 @@ pandas 0.11.1 - Add iterator to ``Series.str`` (GH3638_) - ``pd.set_option()`` now allows N option, value pairs (GH3667_). - Added keyword parameters for different types of scatter_matrix subplots + - A ``filter`` method on grouped Series or DataFrames returns a subset of + the original (GH3680_, GH919_) **Improvements to existing features** diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index bc2ff9bbe1013..c5e38a72ec3e9 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -41,6 +41,12 @@ following: - Standardizing data (zscore) within group - Filling NAs within groups with a value derived from each group + - **Filtration**: discard some groups, according to a group-wise computation + that evaluates True or False. Some examples: + + - Discarding data that belongs to groups with only a few members + - Filtering out data based on the group sum or mean + - Some combination of the above: GroupBy will examine the results of the apply step and try to return a sensibly combined result if it doesn't fit into either of the above two categories @@ -489,6 +495,39 @@ and that the transformed data contains no NAs. grouped_trans.count() # counts after transformation grouped_trans.size() # Verify non-NA count equals group size +.. _groupby.filter: + +Filtration +---------- + +The ``filter`` method returns a subset of the original object. Suppose we +want to take only elements that belong to groups with a group sum greater +than 2. + +.. ipython:: python + + s = Series([1, 1, 2, 3, 3, 3]) + s.groupby(s).filter(lambda x: x.sum() > 2) + +The argument of ``filter`` must be a function that, applied to the group as a +whole, returns ``True`` or ``False``. + +Another useful operation is filtering out elements that belong to groups +with only a couple members. + +.. 
ipython:: python + + df = DataFrame({'A': arange(8), 'B': list('aabbbbcc')}) + df.groupby('B').filter(lambda x: len(x) > 2) + +Alternatively, instead of dropping the offending groups, we can return a +like-indexed object where the groups that do not pass the filter are filled +with NaNs. + +.. ipython:: python + + df.groupby('B').filter(lambda x: len(x) > 2, dropna=False) + .. _groupby.dispatch: Dispatching to instance methods diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index b2fee1acbc4d6..0641ffae542c0 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -237,6 +237,35 @@ Enhancements pd.get_option('a.b') pd.get_option('b.c') + - The ``filter`` method for group objects returns a subset of the original + object. Suppose we want to take only elements that belong to groups with a + group sum greater than 2. + + .. ipython:: python + + s = Series([1, 1, 2, 3, 3, 3]) + s.groupby(s).filter(lambda x: x.sum() > 2) + + The argument of ``filter`` must be a function that, applied to the group as a + whole, returns ``True`` or ``False``. + + Another useful operation is filtering out elements that belong to groups + with only a couple members. + + .. ipython:: python + + df = DataFrame({'A': arange(8), 'B': list('aabbbbcc')}) + df.groupby('B').filter(lambda x: len(x) > 2) + + Alternatively, instead of dropping the offending groups, we can return a + like-indexed object where the groups that do not pass the filter are + filled with NaNs. + + .. 
ipython:: python + + df.groupby('B').filter(lambda x: len(x) > 2, dropna=False) + + Bug Fixes ~~~~~~~~~ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 64606a6e644f9..0be5d438e5e7c 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1558,6 +1558,42 @@ def transform(self, func, *args, **kwargs): result = _possibly_downcast_to_dtype(result, dtype) return self.obj.__class__(result,index=self.obj.index,name=self.obj.name) + def filter(self, func, dropna=True, *args, **kwargs): + """ + Return a copy of a Series excluding elements from groups that + do not satisfy the boolean criterion specified by func. + + Parameters + ---------- + func : function + To apply to each group. Should return True or False. + dropna : Drop groups that do not pass the filter. True by default; + if False, groups that evaluate False are filled with NaNs. + + Example + ------- + >>> grouped.filter(lambda x: x.mean() > 0) + + Returns + ------- + filtered : Series + """ + if isinstance(func, basestring): + wrapper = lambda x: getattr(x, func)(*args, **kwargs) + else: + wrapper = lambda x: func(x, *args, **kwargs) + + indexers = [self.obj.index.get_indexer(group.index) \ + if wrapper(group) else [] for _ , group in self] + + if len(indexers) == 0: + filtered = self.obj.take([]) # because np.concatenate would fail + else: + filtered = self.obj.take(np.concatenate(indexers)) + if dropna: + return filtered + else: + return filtered.reindex(self.obj.index) # Fill with NaNs. 
class NDFrameGroupBy(GroupBy): @@ -1928,47 +1964,22 @@ def transform(self, func, *args, **kwargs): obj = self._obj_with_exclusions gen = self.grouper.get_iterator(obj, axis=self.axis) - - if isinstance(func, basestring): - fast_path = lambda group: getattr(group, func)(*args, **kwargs) - slow_path = lambda group: group.apply(lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis) - else: - fast_path = lambda group: func(group, *args, **kwargs) - slow_path = lambda group: group.apply(lambda x: func(x, *args, **kwargs), axis=self.axis) + fast_path, slow_path = self._define_paths(func, *args, **kwargs) path = None for name, group in gen: object.__setattr__(group, 'name', name) - # decide on a fast path if path is None: - - path = slow_path + # Try slow path and fast path. try: - res = slow_path(group) - - # if we make it here, test if we can use the fast path - try: - res_fast = fast_path(group) - - # compare that we get the same results - if res.shape == res_fast.shape: - res_r = res.values.ravel() - res_fast_r = res_fast.values.ravel() - mask = notnull(res_r) - if (res_r[mask] == res_fast_r[mask]).all(): - path = fast_path - - except: - pass + path, res = self._choose_path(fast_path, slow_path, group) except TypeError: return self._transform_item_by_item(obj, fast_path) except Exception: # pragma: no cover res = fast_path(group) path = fast_path - else: - res = path(group) # broadcasting @@ -1988,6 +1999,35 @@ def transform(self, func, *args, **kwargs): concatenated.sort_index(inplace=True) return concatenated + def _define_paths(self, func, *args, **kwargs): + if isinstance(func, basestring): + fast_path = lambda group: getattr(group, func)(*args, **kwargs) + slow_path = lambda group: group.apply(lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis) + else: + fast_path = lambda group: func(group, *args, **kwargs) + slow_path = lambda group: group.apply(lambda x: func(x, *args, **kwargs), axis=self.axis) + return fast_path, slow_path + + def 
_choose_path(self, fast_path, slow_path, group): + path = slow_path + res = slow_path(group) + + # if we make it here, test if we can use the fast path + try: + res_fast = fast_path(group) + + # compare that we get the same results + if res.shape == res_fast.shape: + res_r = res.values.ravel() + res_fast_r = res_fast.values.ravel() + mask = notnull(res_r) + if (res_r[mask] == res_fast_r[mask]).all(): + path = fast_path + + except: + pass + return path, res + def _transform_item_by_item(self, obj, wrapper): # iterate through columns output = {} @@ -2008,6 +2048,63 @@ def _transform_item_by_item(self, obj, wrapper): return DataFrame(output, index=obj.index, columns=columns) + def filter(self, func, dropna=True, *args, **kwargs): + """ + Return a copy of a DataFrame excluding elements from groups that + do not satisfy the boolean criterion specified by func. + + Parameters + ---------- + func : function + Function to apply to each subframe. Should return True or False. + dropna : Drop groups that do not pass the filter. True by default; + if False, groups that evaluate False are filled with NaNs. + + Note + ---- + Each subframe is endowed with the attribute 'name' in case you need to know + which group you are working on. + + Example + -------- + >>> grouped = df.groupby(lambda x: mapping[x]) + >>> grouped.filter(lambda x: x['A'].sum() + x['B'].sum() > 0) + """ + from pandas.tools.merge import concat + + indexers = [] + + obj = self._obj_with_exclusions + gen = self.grouper.get_iterator(obj, axis=self.axis) + + fast_path, slow_path = self._define_paths(func, *args, **kwargs) + + path = None + for name, group in gen: + object.__setattr__(group, 'name', name) + + if path is None: + # Try slow path and fast path. 
+ try: + path, res = self._choose_path(fast_path, slow_path, group) + except Exception: # pragma: no cover + res = fast_path(group) + path = fast_path + else: + res = path(group) + + if res: + indexers.append(self.obj.index.get_indexer(group.index)) + + if len(indexers) == 0: + filtered = self.obj.take([]) # because np.concatenate would fail + else: + filtered = self.obj.take(np.concatenate(indexers)) + if dropna: + return filtered + else: + return filtered.reindex(self.obj.index) # Fill with NaNs. + class DataFrameGroupBy(NDFrameGroupBy): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index cf62b16a9dd2a..f3a608b82e756 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2498,6 +2498,155 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None + def test_filter_series(self): + import pandas as pd + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + assert_series_equal( + grouped.filter(lambda x: x.mean() < 10), expected_odd) + assert_series_equal( + grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. 
+ assert_series_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(s.index)) + assert_series_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(s.index)) + + def test_filter_single_column_df(self): + import pandas as pd + df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) + grouper = df[0].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10), expected_odd) + assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. + assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(df.index)) + assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(df.index)) + + def test_filter_multi_column_df(self): + import pandas as pd + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2]) + assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), expected) + + def test_filter_mixed_df(self): + import pandas as pd + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, + index=[1, 2]) + assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() > 10), expected) + + def test_filter_out_all_groups(self): + import pandas as pd + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + assert_series_equal( + grouped.filter(lambda x: x.mean() > 1000), s[[]]) + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = 
df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() > 1000), df.ix[[]]) + + def test_filter_out_no_groups(self): + import pandas as pd + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + filtered = grouped.filter(lambda x: x.mean() > 0) + filtered.sort() # was sorted by group + s.sort() # was sorted arbitrarily + assert_series_equal(filtered, s) + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + filtered = grouped.filter(lambda x: x['A'].mean() > 0) + assert_frame_equal(filtered.sort(), df) + + def test_filter_condition_raises(self): + import pandas as pd + def raise_if_sum_is_zero(x): + if x.sum() == 0: + raise ValueError + else: + return x.sum() > 0 + s = pd.Series([-1,0,1,2]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + self.assertRaises(ValueError, + lambda: grouped.filter(raise_if_sum_is_zero)) + + def test_filter_against_workaround(self): + np.random.seed(0) + # Series of ints + s = Series(np.random.randint(0,100,1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype('bool')] + new_way = grouped.filter(f) + assert_series_equal(new_way.order(), old_way.order()) + + # Series of floats + s = 100*Series(np.random.random(1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype('bool')] + new_way = grouped.filter(f) + assert_series_equal(new_way.order(), old_way.order()) + + # Set up DataFrame of ints, floats, strings. 
+ from string import ascii_lowercase + letters = np.array(list(ascii_lowercase)) + N = 1000 + random_letters = letters.take(np.random.randint(0, 26, N)) + df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), + 'floats': N/10*Series(np.random.random(N)), + 'letters': Series(random_letters)}) + + # Group by ints; filter on floats. + grouped = df.groupby('ints') + old_way = df[grouped.floats.\ + transform(lambda x: x.mean() > N/20).astype('bool')] + new_way = grouped.filter(lambda x: x['floats'].mean() > N/20) + assert_frame_equal(new_way.sort(), old_way.sort()) + + # Group by floats (rounded); filter on strings. + grouper = df.floats.apply(lambda x: np.round(x, -1)) + grouped = df.groupby(grouper) + old_way = df[grouped.letters.\ + transform(lambda x: len(x) < N/10).astype('bool')] + new_way = grouped.filter( + lambda x: len(x.letters) < N/10) + assert_frame_equal(new_way.sort(), old_way.sort()) + + # Group by strings; filter on ints. + grouped = df.groupby('letters') + old_way = df[grouped.ints.\ + transform(lambda x: x.mean() > N/20).astype('bool')] + new_way = grouped.filter(lambda x: x['ints'].mean() > N/20) + assert_frame_equal(new_way.sort_index(), old_way.sort_index()) def assert_fp_equal(a, b): assert((np.abs(a - b) < 1e-12).all())