Skip to content

Commit f562308

Browse files
authored
BUG: bug in groupby on empty frame with multi groupers (#16090)
* TST: separate out groupby/test_nth * BUG: bug in groupby on empty frame with multi groupers xref #14784 closes #16064
1 parent d313e4d commit f562308

File tree

4 files changed

+255
-229
lines changed

4 files changed

+255
-229
lines changed

doc/source/whatsnew/v0.20.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1627,7 +1627,7 @@ Indexing
16271627
- Bug in the HTML display with a ``MultiIndex`` and truncation (:issue:`14882`)
16281628
- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
16291629
- Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`)
1630-
- Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`)
1630+
- Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16064`)
16311631

16321632
I/O
16331633
^^^

pandas/core/indexes/multi.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1645,10 +1645,11 @@ def _get_labels_for_sorting(self):
16451645
"""
16461646
from pandas.core.categorical import Categorical
16471647

1648-
return [Categorical.from_codes(label,
1649-
np.arange(np.array(label).max() + 1,
1650-
dtype=label.dtype),
1651-
ordered=True)
1648+
def cats(label):
1649+
return np.arange(np.array(label).max() + 1 if len(label) else 0,
1650+
dtype=label.dtype)
1651+
1652+
return [Categorical.from_codes(label, cats(label), ordered=True)
16521653
for label in self.labels]
16531654

16541655
def sortlevel(self, level=0, ascending=True, sort_remaining=True):

pandas/tests/groupby/test_groupby.py

Lines changed: 1 addition & 224 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from numpy import nan
1010

1111
from pandas import (date_range, bdate_range, Timestamp,
12-
isnull, Index, MultiIndex, DataFrame, Series,
12+
Index, MultiIndex, DataFrame, Series,
1313
concat, Panel, DatetimeIndex)
1414
from pandas.errors import UnsupportedFunctionCall, PerformanceWarning
1515
from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
@@ -87,229 +87,6 @@ def test_select_bad_cols(self):
8787
# will have to rethink regex if you change message!
8888
g[['A', 'C']]
8989

90-
def test_first_last_nth(self):
91-
# tests for first / last / nth
92-
grouped = self.df.groupby('A')
93-
first = grouped.first()
94-
expected = self.df.loc[[1, 0], ['B', 'C', 'D']]
95-
expected.index = Index(['bar', 'foo'], name='A')
96-
expected = expected.sort_index()
97-
assert_frame_equal(first, expected)
98-
99-
nth = grouped.nth(0)
100-
assert_frame_equal(nth, expected)
101-
102-
last = grouped.last()
103-
expected = self.df.loc[[5, 7], ['B', 'C', 'D']]
104-
expected.index = Index(['bar', 'foo'], name='A')
105-
assert_frame_equal(last, expected)
106-
107-
nth = grouped.nth(-1)
108-
assert_frame_equal(nth, expected)
109-
110-
nth = grouped.nth(1)
111-
expected = self.df.loc[[2, 3], ['B', 'C', 'D']].copy()
112-
expected.index = Index(['foo', 'bar'], name='A')
113-
expected = expected.sort_index()
114-
assert_frame_equal(nth, expected)
115-
116-
# it works!
117-
grouped['B'].first()
118-
grouped['B'].last()
119-
grouped['B'].nth(0)
120-
121-
self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
122-
self.assertTrue(isnull(grouped['B'].first()['foo']))
123-
self.assertTrue(isnull(grouped['B'].last()['foo']))
124-
self.assertTrue(isnull(grouped['B'].nth(0)['foo']))
125-
126-
# v0.14.0 whatsnew
127-
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
128-
g = df.groupby('A')
129-
result = g.first()
130-
expected = df.iloc[[1, 2]].set_index('A')
131-
assert_frame_equal(result, expected)
132-
133-
expected = df.iloc[[1, 2]].set_index('A')
134-
result = g.nth(0, dropna='any')
135-
assert_frame_equal(result, expected)
136-
137-
def test_first_last_nth_dtypes(self):
138-
139-
df = self.df_mixed_floats.copy()
140-
df['E'] = True
141-
df['F'] = 1
142-
143-
# tests for first / last / nth
144-
grouped = df.groupby('A')
145-
first = grouped.first()
146-
expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']]
147-
expected.index = Index(['bar', 'foo'], name='A')
148-
expected = expected.sort_index()
149-
assert_frame_equal(first, expected)
150-
151-
last = grouped.last()
152-
expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']]
153-
expected.index = Index(['bar', 'foo'], name='A')
154-
expected = expected.sort_index()
155-
assert_frame_equal(last, expected)
156-
157-
nth = grouped.nth(1)
158-
expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']]
159-
expected.index = Index(['bar', 'foo'], name='A')
160-
expected = expected.sort_index()
161-
assert_frame_equal(nth, expected)
162-
163-
# GH 2763, first/last shifting dtypes
164-
idx = lrange(10)
165-
idx.append(9)
166-
s = Series(data=lrange(11), index=idx, name='IntCol')
167-
self.assertEqual(s.dtype, 'int64')
168-
f = s.groupby(level=0).first()
169-
self.assertEqual(f.dtype, 'int64')
170-
171-
def test_nth(self):
172-
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
173-
g = df.groupby('A')
174-
175-
assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
176-
assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
177-
assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
178-
assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
179-
assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
180-
assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
181-
assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
182-
assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
183-
assert_frame_equal(g[['B']].nth(0),
184-
df.loc[[0, 2], ['A', 'B']].set_index('A'))
185-
186-
exp = df.set_index('A')
187-
assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
188-
assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])
189-
190-
exp['B'] = np.nan
191-
assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
192-
assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])
193-
194-
# out of bounds, regression from 0.13.1
195-
# GH 6621
196-
df = DataFrame({'color': {0: 'green',
197-
1: 'green',
198-
2: 'red',
199-
3: 'red',
200-
4: 'red'},
201-
'food': {0: 'ham',
202-
1: 'eggs',
203-
2: 'eggs',
204-
3: 'ham',
205-
4: 'pork'},
206-
'two': {0: 1.5456590000000001,
207-
1: -0.070345000000000005,
208-
2: -2.4004539999999999,
209-
3: 0.46206000000000003,
210-
4: 0.52350799999999997},
211-
'one': {0: 0.56573799999999996,
212-
1: -0.9742360000000001,
213-
2: 1.033801,
214-
3: -0.78543499999999999,
215-
4: 0.70422799999999997}}).set_index(['color',
216-
'food'])
217-
218-
result = df.groupby(level=0, as_index=False).nth(2)
219-
expected = df.iloc[[-1]]
220-
assert_frame_equal(result, expected)
221-
222-
result = df.groupby(level=0, as_index=False).nth(3)
223-
expected = df.loc[[]]
224-
assert_frame_equal(result, expected)
225-
226-
# GH 7559
227-
# from the vbench
228-
df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
229-
s = df[1]
230-
g = df[0]
231-
expected = s.groupby(g).first()
232-
expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
233-
assert_series_equal(expected2, expected, check_names=False)
234-
self.assertTrue(expected.name, 0)
235-
self.assertEqual(expected.name, 1)
236-
237-
# validate first
238-
v = s[g == 1].iloc[0]
239-
self.assertEqual(expected.iloc[0], v)
240-
self.assertEqual(expected2.iloc[0], v)
241-
242-
# this is NOT the same as .first (as sorted is default!)
243-
# as it keeps the order in the series (and not the group order)
244-
# related GH 7287
245-
expected = s.groupby(g, sort=False).first()
246-
result = s.groupby(g, sort=False).nth(0, dropna='all')
247-
assert_series_equal(result, expected)
248-
249-
# doc example
250-
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
251-
g = df.groupby('A')
252-
result = g.B.nth(0, dropna=True)
253-
expected = g.B.first()
254-
assert_series_equal(result, expected)
255-
256-
# test multiple nth values
257-
df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
258-
columns=['A', 'B'])
259-
g = df.groupby('A')
260-
261-
assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
262-
assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
263-
assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
264-
assert_frame_equal(
265-
g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
266-
assert_frame_equal(
267-
g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
268-
assert_frame_equal(
269-
g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
270-
assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
271-
assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))
272-
273-
business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
274-
freq='B')
275-
df = DataFrame(1, index=business_dates, columns=['a', 'b'])
276-
# get the first, fourth and last two business days for each month
277-
key = (df.index.year, df.index.month)
278-
result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
279-
expected_dates = pd.to_datetime(
280-
['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
281-
'2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
282-
'2014/6/27', '2014/6/30'])
283-
expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
284-
assert_frame_equal(result, expected)
285-
286-
def test_nth_multi_index(self):
287-
# PR 9090, related to issue 8979
288-
# test nth on MultiIndex, should match .first()
289-
grouped = self.three_group.groupby(['A', 'B'])
290-
result = grouped.nth(0)
291-
expected = grouped.first()
292-
assert_frame_equal(result, expected)
293-
294-
def test_nth_multi_index_as_expected(self):
295-
# PR 9090, related to issue 8979
296-
# test nth on MultiIndex
297-
three_group = DataFrame(
298-
{'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
299-
'foo', 'foo', 'foo'],
300-
'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
301-
'two', 'two', 'one'],
302-
'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
303-
'dull', 'shiny', 'shiny', 'shiny']})
304-
grouped = three_group.groupby(['A', 'B'])
305-
result = grouped.nth(0)
306-
expected = DataFrame(
307-
{'C': ['dull', 'dull', 'dull', 'dull']},
308-
index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'],
309-
['one', 'two', 'one', 'two']],
310-
names=['A', 'B']))
311-
assert_frame_equal(result, expected)
312-
31390
def test_group_selection_cache(self):
31491
# GH 12839 nth, head, and tail should return same result consistently
31592
df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])

0 commit comments

Comments
 (0)