From 238e30e2df55632b708faf45d67a0c6fc97f822d Mon Sep 17 00:00:00 2001 From: Evan Wright Date: Tue, 28 Apr 2015 09:01:17 -0400 Subject: [PATCH 1/2] BUG: null group spills into final group when grouping on a categorical --- doc/source/whatsnew/v0.16.1.txt | 2 +- pandas/lib.pyx | 20 ++++++++++++-------- pandas/tests/test_categorical.py | 7 +++++++ 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 2ddf77d99d51d..d00ce29fbfe92 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -218,7 +218,7 @@ Bug Fixes - Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`) - +- Bug causing elements with a null group to spill into the final group when grouping by a ``Categorical`` (:issue:`9603`) - Bug in invalid attribute access on a ``TimedeltaIndex`` incorrectly raised ``ValueError`` instead of ``AttributeError`` (:issue:`9680`) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 0d53b19425c2f..de966d6e03ee2 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1306,9 +1306,10 @@ def duplicated(ndarray[object] values, take_last=False): def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): cdef: - Py_ssize_t i, group_size, n, lab, start + Py_ssize_t i, group_size, n, start + int64_t lab object slobj - ndarray[int64_t] starts + ndarray[int64_t] starts, ends n = len(labels) @@ -1318,13 +1319,16 @@ def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): start = 0 group_size = 0 for i in range(n): - group_size += 1 lab = labels[i] - if i == n - 1 or lab != labels[i + 1]: - starts[lab] = start - ends[lab] = start + group_size - start += group_size - group_size = 0 + if lab < 0: + start += 1 + else: + group_size += 1 + if i == n - 1 or lab != labels[i + 1]: + starts[lab] = start + ends[lab] = start + group_size + start += group_size + group_size = 0 return starts, ends diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 66e411d1eaddb..5a5401c8da3ca 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1841,6 +1841,13 @@ def f(x): tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) tm.assert_frame_equal(df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']]) + # GH 9603 + df = pd.DataFrame({'a': [1, 0, 0, 0]}) + c = pd.cut(df.a, [0, 1, 2, 3, 4]) + result = df.groupby(c).apply(len) + expected = pd.Series([1, 0, 0, 0], index=c.values.categories) + tm.assert_series_equal(result, expected) + def test_pivot_table(self): raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"], ordered=True) From d194c99db9df23e8a47f1a317492adab38efe306 Mon Sep 17 00:00:00 2001 From: Evan Wright Date: Wed, 29 Apr 2015 15:26:30 -0400 Subject: [PATCH 2/2] Fix missing index name in test --- pandas/tests/test_categorical.py | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 pandas/tests/test_categorical.py diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py old mode 100644 new mode 100755 index 5a5401c8da3ca..c03fd93f6173f --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1846,6 +1846,7 @@ def f(x): c = pd.cut(df.a, [0, 1, 2, 3, 4]) result = df.groupby(c).apply(len) expected = pd.Series([1, 0, 0, 0], index=c.values.categories) + expected.index.name = 'a' tm.assert_series_equal(result, expected) def test_pivot_table(self):