pandas-dev · shoyer · Apr 29, 2015 · Apr 28, 2015 · Apr 29, 2015 · evanpw
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -218,7 +218,7 @@ Bug Fixes
 - Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`)
 
 
-
+- Bug in ``groupby`` where elements with a null group key spilled into the final group when grouping by a ``Categorical`` (:issue:`9603`)
 
 
 - Bug in invalid attribute access on a ``TimedeltaIndex`` incorrectly raised ``ValueError`` instead of ``AttributeError`` (:issue:`9680`)

diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -1306,9 +1306,10 @@ def duplicated(ndarray[object] values, take_last=False):
 
 def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
     cdef:
-        Py_ssize_t i, group_size, n, lab, start
+        Py_ssize_t i, group_size, n, start
+        int64_t lab
         object slobj
-        ndarray[int64_t] starts
+        ndarray[int64_t] starts, ends
 
     n = len(labels)
 
@@ -1318,13 +1319,16 @@ def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
     start = 0
     group_size = 0
     for i in range(n):
-        group_size += 1
         lab = labels[i]
-        if i == n - 1 or lab != labels[i + 1]:
-            starts[lab] = start
-            ends[lab] = start + group_size
-            start += group_size
-            group_size = 0
+        if lab < 0:
+            start += 1
+        else:
+            group_size += 1
+            if i == n - 1 or lab != labels[i + 1]:
+                starts[lab] = start
+                ends[lab] = start + group_size
+                start += group_size
+                group_size = 0
 
     return starts, ends
 

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -1841,6 +1841,13 @@ def f(x):
         tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']])
         tm.assert_frame_equal(df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']])
 
+        # GH 9603
+        df = pd.DataFrame({'a': [1, 0, 0, 0]})
+        c = pd.cut(df.a, [0, 1, 2, 3, 4])
+        result = df.groupby(c).apply(len)
+        expected = pd.Series([1, 0, 0, 0], index=c.values.categories)
+        tm.assert_series_equal(result, expected)
+
     def test_pivot_table(self):
 
         raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"], ordered=True)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -218,7 +218,7 @@ Bug Fixes
		- Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`)



		- Bug in ``groupby`` where elements with a null group key spilled into the final group when grouping by a ``Categorical`` (:issue:`9603`)


		- Bug in invalid attribute access on a ``TimedeltaIndex`` incorrectly raised ``ValueError`` instead of ``AttributeError`` (:issue:`9680`)
Expand Down