BUG,TST,CLN: improve handling of Series.value_counts's argument 'dropna' (GH9443)

Kodiologist · jreback · commit 0270484ed923 · 2015-03-08T12:05:23.000-04:00
- Fixed bug in Series.value_counts when excluding NaN for categorical
  type Series with dropna=True.
- Series.value_counts and Series.describe for categorical type will now
  put NaN entries at the end.
- Series.describe for categorical type will now give counts and frequencies
  of 0, not NA, for unused categories.
- Added Categorical.value_counts and Categorical.dropna for internal use.
diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
@@ -297,6 +297,9 @@ Backwards incompatible API changes
      p / 0
      p // 0
 
+- ``Series.value_counts`` and ``Series.describe`` for categorical data will now put NaN entries at the end (:issue:`9443`)
+- ``Series.describe`` for categorical data will now give counts and frequencies of 0, not NA, for unused categories (:issue:`9443`)
+
 
 Indexing Changes
 ~~~~~~~~~~~~~~~~
@@ -535,3 +538,9 @@ Bug Fixes
 - Fixed bug with reading CSV files from Amazon S3 on python 3 raising a TypeError (:issue:`9452`)
 
 - Bug in the Google BigQuery reader where the 'jobComplete' key may be present but False in the query results (:issue:`8728`)
+
+
+
+
+
+- Bug in ``Series.value_counts`` when excluding NaN for categorical type ``Series`` with ``dropna=True`` (:issue:`9443`)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -202,57 +202,57 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
     from pandas.tools.tile import cut
     from pandas.tseries.period import PeriodIndex
 
-    is_period = com.is_period_arraylike(values)
     values = Series(values).values
-    is_category = com.is_categorical_dtype(values.dtype)
 
     if bins is not None:
         try:
             cat, bins = cut(values, bins, retbins=True)
         except TypeError:
             raise TypeError("bins argument only works with numeric data.")
         values = cat.codes
-    elif is_category:
-        bins = values.categories
-        cat = values
-        values = cat.codes
 
-    dtype = values.dtype
+    if com.is_categorical_dtype(values.dtype):
+        result = values.value_counts(dropna)
 
-    if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)) or is_period:
-        if is_period:
-            values = PeriodIndex(values)
+    else:
 
-        values = values.view(np.int64)
-        keys, counts = htable.value_count_int64(values)
+        dtype = values.dtype
+        is_period = com.is_period_arraylike(values)
 
-        if dropna:
-            from pandas.tslib import iNaT
-            msk = keys != iNaT
-            keys, counts = keys[msk], counts[msk]
-        # convert the keys back to the dtype we came in
-        keys = keys.astype(dtype)
+        if com.is_datetime_or_timedelta_dtype(dtype) or is_period:
 
-    elif com.is_integer_dtype(dtype):
-        values = com._ensure_int64(values)
-        keys, counts = htable.value_count_int64(values)
+            if is_period:
+                values = PeriodIndex(values)
 
-    else:
-        values = com._ensure_object(values)
-        mask = com.isnull(values)
-        keys, counts = htable.value_count_object(values, mask)
-        if not dropna:
-            keys = np.insert(keys, 0, np.NaN)
-            counts = np.insert(counts, 0, mask.sum())
+            values = values.view(np.int64)
+            keys, counts = htable.value_count_int64(values)
+
+            if dropna:
+                from pandas.tslib import iNaT
+                msk = keys != iNaT
+                keys, counts = keys[msk], counts[msk]
+
+            # convert the keys back to the dtype we came in
+            keys = keys.astype(dtype)
+
+        elif com.is_integer_dtype(dtype):
+            values = com._ensure_int64(values)
+            keys, counts = htable.value_count_int64(values)
 
-    result = Series(counts, index=com._values_from_object(keys))
-    if bins is not None:
-        # TODO: This next line should be more efficient
-        result = result.reindex(np.arange(len(cat.categories)), fill_value=0)
-        if not is_category:
-            result.index = bins[:-1]
         else:
-            result.index = cat.categories
+            values = com._ensure_object(values)
+            mask = com.isnull(values)
+            keys, counts = htable.value_count_object(values, mask)
+            if not dropna and mask.any():
+                keys = np.insert(keys, 0, np.NaN)
+                counts = np.insert(counts, 0, mask.sum())
+
+        result = Series(counts, index=com._values_from_object(keys))
+
+        if bins is not None:
+            # TODO: This next line should be more efficient
+            result = result.reindex(np.arange(len(cat.categories)), fill_value=0)
+            result.index = bins[:-1]
 
     if sort:
         result.sort()
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -876,6 +876,54 @@ def notnull(self):
         """
         return ~self.isnull()
 
+    def dropna(self):
+        """
+        Return the Categorical without null values.
+
+        Both missing values (-1 in .codes) and NA as a category are detected.
+        NA is removed from the categories if present.
+
+        Returns
+        -------
+        valid : Categorical
+        """
+        result = self[self.notnull()]
+        if isnull(result.categories).any():
+            result = result.remove_categories([np.nan])
+        return result
+
+    def value_counts(self, dropna=True):
+        """
+        Returns a Series containing counts of each category.
+
+        Every category will have an entry, even those with a count of 0.
+
+        Parameters
+        ----------
+        dropna : boolean, default True
+            Don't include counts of NaN, even if NaN is a category.
+
+        Returns
+        -------
+        counts : Series
+        """
+        import pandas.hashtable as htable
+        from pandas.core.series import Series
+
+        cat = self.dropna() if dropna else self
+        keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes))
+        result = Series(counts, index=keys)
+
+        ix = np.arange(len(cat.categories), dtype='int64')
+        if not dropna and -1 in keys:
+            ix = np.append(ix, -1)
+        result = result.reindex(ix, fill_value=0)
+        result.index = (np.append(cat.categories, np.nan)
+            if not dropna and -1 in keys
+            else cat.categories)
+
+        return result
+
     def get_values(self):
         """ Return the values.
 
@@ -1421,34 +1469,12 @@ def describe(self):
         description: `DataFrame`
             A dataframe with frequency and counts by category.
         """
-        # Hack?
-        from pandas.core.frame import DataFrame
-        counts = DataFrame({
-            'codes' : self._codes,
-            'values' : self._codes }
-                           ).groupby('codes').count()
-
+        counts = self.value_counts(dropna=False)
         freqs = counts / float(counts.sum())
 
         from pandas.tools.merge import concat
         result = concat([counts,freqs],axis=1)
         result.columns = ['counts','freqs']
-
-        # fill in the real categories
-        check = result.index == -1
-        if check.any():
-            # Sort -1 (=NaN) to the last position
-            index = np.arange(0, len(self.categories)+1, dtype='int64')
-            index[-1] = -1
-            result = result.reindex(index)
-            # build new index
-            categories = np.arange(0,len(self.categories)+1 ,dtype=object)
-            categories[:-1] = self.categories
-            categories[-1] = np.nan
-            result.index = categories.take(_ensure_platform_int(result.index))
-        else:
-            result.index = self.categories.take(_ensure_platform_int(result.index))
-            result = result.reindex(self.categories)
         result.index.name = 'categories'
 
         return result
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -254,6 +254,37 @@ def test_value_counts_nat(self):
         tm.assert_series_equal(algos.value_counts(dt), exp_dt)
         # TODO same for (timedelta)
 
+    def test_dropna(self):
+        # https://github.com/pydata/pandas/issues/9443#issuecomment-73719328
+
+        tm.assert_series_equal(
+            pd.Series([True, True, False]).value_counts(dropna=True),
+            pd.Series([2, 1], index=[True, False]))
+        tm.assert_series_equal(
+            pd.Series([True, True, False]).value_counts(dropna=False),
+            pd.Series([2, 1], index=[True, False]))
+
+        tm.assert_series_equal(
+            pd.Series([True, True, False, None]).value_counts(dropna=True),
+            pd.Series([2, 1], index=[True, False]))
+        tm.assert_series_equal(
+            pd.Series([True, True, False, None]).value_counts(dropna=False),
+            pd.Series([2, 1, 1], index=[True, False, np.nan]))
+
+        tm.assert_series_equal(
+            pd.Series([10.3, 5., 5.]).value_counts(dropna=True),
+            pd.Series([2, 1], index=[5., 10.3]))
+        tm.assert_series_equal(
+            pd.Series([10.3, 5., 5.]).value_counts(dropna=False),
+            pd.Series([2, 1], index=[5., 10.3]))
+
+        tm.assert_series_equal(
+            pd.Series([10.3, 5., 5., None]).value_counts(dropna=True),
+            pd.Series([2, 1], index=[5., 10.3]))
+        tm.assert_series_equal(
+            pd.Series([10.3, 5., 5., None]).value_counts(dropna=False),
+            pd.Series([2, 1, 1], index=[5., 10.3, np.nan]))
+
 def test_quantile():
     s = Series(np.random.randn(100))
 
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -389,8 +389,8 @@ def test_describe(self):
         cat = self.factor.copy()
         cat.set_categories(["a","b","c","d"], inplace=True)
         desc = cat.describe()
-        expected = DataFrame.from_dict(dict(counts=[3, 2, 3, np.nan],
-                                            freqs=[3/8., 2/8., 3/8., np.nan],
+        expected = DataFrame.from_dict(dict(counts=[3, 2, 3, 0],
+                                            freqs=[3/8., 2/8., 3/8., 0],
                                             categories=['a', 'b', 'c', 'd'])
                                             ).set_index('categories')
         tm.assert_frame_equal(desc, expected)
@@ -415,31 +415,20 @@ def test_describe(self):
                                             ).set_index('categories')
         tm.assert_frame_equal(desc, expected)
 
-        # having NaN as category and as "not available" should also print two NaNs in describe!
-        cat = pd.Categorical([np.nan,1, 2, 2])
-        cat.set_categories([1,2,np.nan], rename=True, inplace=True)
-        desc = cat.describe()
-        expected = DataFrame.from_dict(dict(counts=[1, 2, np.nan, 1],
-                                            freqs=[1/4., 2/4., np.nan, 1/4.],
-                                            categories=[1,2,np.nan,np.nan]
-                                            )
-                                            ).set_index('categories')
-        tm.assert_frame_equal(desc, expected)
-
-        # empty categories show up as NA
-        cat = Categorical(["a","b","b","b"], categories=['a','b','c'], ordered=True)
+        # NA as a category
+        cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan])
         result = cat.describe()
 
-        expected = DataFrame([[1,0.25],[3,0.75],[np.nan,np.nan]],
+        expected = DataFrame([[0,0],[1,0.25],[2,0.5],[1,0.25]],
                              columns=['counts','freqs'],
-                             index=Index(['a','b','c'],name='categories'))
+                             index=Index(['b','a','c',np.nan],name='categories'))
         tm.assert_frame_equal(result,expected)
 
-        # NA as a category
-        cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] )
+        # NA as an unused category
+        cat = pd.Categorical(["a","c","c"], categories=["b","a","c",np.nan])
         result = cat.describe()
 
-        expected = DataFrame([[np.nan, np.nan],[1,0.25],[2,0.5], [1,0.25]],
+        expected = DataFrame([[0,0],[1,1/3.],[2,2/3.],[0,0]],
                              columns=['counts','freqs'],
                              index=Index(['b','a','c',np.nan],name='categories'))
         tm.assert_frame_equal(result,expected)
@@ -1573,6 +1562,46 @@ def test_value_counts(self):
         exp = Series([3,2,1,0], index=["c","b","a","d"])
         tm.assert_series_equal(res, exp)
 
+    def test_value_counts_with_nan(self):
+        # https://github.com/pydata/pandas/issues/9443
+
+        s = pd.Series(["a", "b", "a"], dtype="category")
+        tm.assert_series_equal(
+            s.value_counts(dropna=True),
+            pd.Series([2, 1], index=["a", "b"]))
+        tm.assert_series_equal(
+            s.value_counts(dropna=False),
+            pd.Series([2, 1], index=["a", "b"]))
+
+        s = pd.Series(["a", "b", None, "a", None, None], dtype="category")
+        tm.assert_series_equal(
+            s.value_counts(dropna=True),
+            pd.Series([2, 1], index=["a", "b"]))
+        tm.assert_series_equal(
+            s.value_counts(dropna=False),
+            pd.Series([3, 2, 1], index=[np.nan, "a", "b"]))
+        # When we aren't sorting by counts, and np.nan isn't a
+        # category, it should be last.
+        tm.assert_series_equal(
+            s.value_counts(dropna=False, sort=False),
+            pd.Series([2, 1, 3], index=["a", "b", np.nan]))
+
+        s = pd.Series(pd.Categorical(["a", "b", "a"], categories=["a", "b", np.nan]))
+        tm.assert_series_equal(
+            s.value_counts(dropna=True),
+            pd.Series([2, 1], index=["a", "b"]))
+        tm.assert_series_equal(
+            s.value_counts(dropna=False),
+            pd.Series([2, 1, 0], index=["a", "b", np.nan]))
+
+        s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None], categories=["a", "b", np.nan]))
+        tm.assert_series_equal(
+            s.value_counts(dropna=True),
+            pd.Series([2, 1], index=["a", "b"]))
+        tm.assert_series_equal(
+            s.value_counts(dropna=False),
+            pd.Series([3, 2, 1], index=[np.nan, "a", "b"]))
+
     def test_groupby(self):
 
         cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a","b","c","d"])