BUG/API: groupby head and tail act like filter, since they don't aggregate; fixes column selection

hayd · hayd · commit cdc51d4919d8 · 2014-03-03T15:44:50.000-08:00
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -587,7 +587,7 @@ def head(self, n=5):
         """
         Returns first n rows of each group.
 
-        Essentially equivalent to ``.apply(lambda x: x.head(n))``
+        Essentially equivalent to ``.apply(lambda x: x.head(n))`` except ignores as_index flag.
 
         Example
         -------
@@ -599,17 +599,15 @@ def head(self, n=5):
         0  1  2
         2  5  6
         >>> df.groupby('A').head(1)
-             A  B
-        A
-        1 0  1  2
-        5 2  5  6
+           A  B
+        0  1  2
+        2  5  6
 
         """
+        obj = self._selected_obj
         rng = np.arange(self.grouper._max_groupsize, dtype='int64')
         in_head = self._cumcount_array(rng) < n
-        head = self.obj[in_head]
-        if self.as_index:
-            head.index = self._index_with_as_index(in_head)
+        head = obj[in_head]
         return head
 
     def tail(self, n=5):
@@ -628,17 +626,15 @@ def tail(self, n=5):
         0  1  2
         2  5  6
         >>> df.groupby('A').head(1)
-             A  B
-        A
-        1 0  1  2
-        5 2  5  6
+           A  B
+        0  1  2
+        2  5  6
 
         """
+        obj = self._selected_obj
         rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
         in_tail = self._cumcount_array(rng, ascending=False) > -n
-        tail = self.obj[in_tail]
-        if self.as_index:
-            tail.index = self._index_with_as_index(in_tail)
+        tail = obj[in_tail]
         return tail
 
     def _cumcount_array(self, arr, **kwargs):
@@ -654,6 +650,13 @@ def _cumcount_array(self, arr, **kwargs):
                 cumcounts[v] = arr[len(v)-1::-1]
         return cumcounts
 
+    @cache_readonly
+    def _selected_obj(self):
+        if self._selection is None or isinstance(self.obj, Series):
+            return self.obj
+        else:
+            return self.obj[self._selection]
+        
     def _index_with_as_index(self, b):
         """
         Take boolean mask of index to be returned from apply, if as_index=True
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -1315,12 +1315,10 @@ def test_groupby_as_index_apply(self):
         g_not_as = df.groupby('user_id', as_index=False)
 
         res_as = g_as.head(2).index
-        exp_as = MultiIndex.from_tuples([(1, 0), (2, 1), (1, 2), (3, 4)])
-        assert_index_equal(res_as, exp_as)
-
         res_not_as = g_not_as.head(2).index
-        exp_not_as = Index([0, 1, 2, 4])
-        assert_index_equal(res_not_as, exp_not_as)
+        exp = Index([0, 1, 2, 4])
+        assert_index_equal(res_as, exp)
+        assert_index_equal(res_not_as, exp)
 
         res_as_apply = g_as.apply(lambda x: x.head(2)).index
         res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
@@ -1355,11 +1353,8 @@ def test_groupby_head_tail(self):
         assert_frame_equal(df, g_not_as.head(7)) # contains all
         assert_frame_equal(df, g_not_as.tail(7))
 
-        # as_index=True, yuck
-        # prepend the A column as an index, in a roundabout way
-        df_as = df.copy()
-        df_as.index = df.set_index('A', append=True,
-                                        drop=False).index.swaplevel(0, 1)
+        # as_index=True, (used to be different)
+        df_as = df
 
         assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
         assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))
@@ -1373,6 +1368,18 @@ def test_groupby_head_tail(self):
         assert_frame_equal(df_as, g_as.head(7)) # contains all
         assert_frame_equal(df_as, g_as.tail(7))
 
+        # test with selection
+        assert_frame_equal(g_as[[]].head(1), df_as.loc[[0,2], []])
+        assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0,2], ['A']])
+        assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0,2], ['B']])
+        assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0,2]])
+
+        assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0,2], []])
+        assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0,2], ['A']])
+        assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0,2], ['B']])
+        assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0,2]])
+
+
     def test_groupby_multiple_key(self):
         df = tm.makeTimeDataFrame()
         grouped = df.groupby([lambda x: x.year,