Skip to content

Commit 0270484

Browse files
Kodiologist authored and jreback committed
BUG,TST,CLN: improve handling of Series.value_counts's argument 'dropna' (GH9443)
- Fixed bug in Series.value_counts with excluding NaN for categorical type Series with dropna=True.
- Series.value_counts and Series.describe for categorical type will now put NaN entries at the end.
- Series.describe for categorical type will now give counts and frequencies of 0, not NA, for unused categories.
- Added Categorical.value_counts and Categorical.dropna for internal use.
1 parent 671c4b3 commit 0270484

File tree

5 files changed

+173
-78
lines changed

5 files changed

+173
-78
lines changed

doc/source/whatsnew/v0.16.0.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,9 @@ Backwards incompatible API changes
297297
p / 0
298298
p // 0
299299

300+
- ``Series.value_counts`` and ``Series.describe`` for categorical data will now put NaN entries at the end. (:issue:`9443`)
301+
- ``Series.describe`` for categorical data will now give counts and frequencies of 0, not NA, for unused categories (:issue:`9443`)
302+
300303

301304
Indexing Changes
302305
~~~~~~~~~~~~~~~~
@@ -535,3 +538,9 @@ Bug Fixes
535538
- Fixed bug with reading CSV files from Amazon S3 on python 3 raising a TypeError (:issue:`9452`)
536539

537540
- Bug in the Google BigQuery reader where the 'jobComplete' key may be present but False in the query results (:issue:`8728`)
541+
542+
543+
544+
545+
546+
- Bug in ``Series.value_counts`` with excluding NaN for categorical type ``Series`` with ``dropna=True`` (:issue:`9443`)

pandas/core/algorithms.py

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -202,57 +202,57 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
202202
from pandas.tools.tile import cut
203203
from pandas.tseries.period import PeriodIndex
204204

205-
is_period = com.is_period_arraylike(values)
206205
values = Series(values).values
207-
is_category = com.is_categorical_dtype(values.dtype)
208206

209207
if bins is not None:
210208
try:
211209
cat, bins = cut(values, bins, retbins=True)
212210
except TypeError:
213211
raise TypeError("bins argument only works with numeric data.")
214212
values = cat.codes
215-
elif is_category:
216-
bins = values.categories
217-
cat = values
218-
values = cat.codes
219213

220-
dtype = values.dtype
214+
if com.is_categorical_dtype(values.dtype):
215+
result = values.value_counts(dropna)
221216

222-
if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)) or is_period:
223-
if is_period:
224-
values = PeriodIndex(values)
217+
else:
225218

226-
values = values.view(np.int64)
227-
keys, counts = htable.value_count_int64(values)
219+
dtype = values.dtype
220+
is_period = com.is_period_arraylike(values)
228221

229-
if dropna:
230-
from pandas.tslib import iNaT
231-
msk = keys != iNaT
232-
keys, counts = keys[msk], counts[msk]
233-
# convert the keys back to the dtype we came in
234-
keys = keys.astype(dtype)
222+
if com.is_datetime_or_timedelta_dtype(dtype) or is_period:
235223

236-
elif com.is_integer_dtype(dtype):
237-
values = com._ensure_int64(values)
238-
keys, counts = htable.value_count_int64(values)
224+
if is_period:
225+
values = PeriodIndex(values)
239226

240-
else:
241-
values = com._ensure_object(values)
242-
mask = com.isnull(values)
243-
keys, counts = htable.value_count_object(values, mask)
244-
if not dropna:
245-
keys = np.insert(keys, 0, np.NaN)
246-
counts = np.insert(counts, 0, mask.sum())
227+
values = values.view(np.int64)
228+
keys, counts = htable.value_count_int64(values)
229+
230+
if dropna:
231+
from pandas.tslib import iNaT
232+
msk = keys != iNaT
233+
keys, counts = keys[msk], counts[msk]
234+
235+
# convert the keys back to the dtype we came in
236+
keys = keys.astype(dtype)
237+
238+
elif com.is_integer_dtype(dtype):
239+
values = com._ensure_int64(values)
240+
keys, counts = htable.value_count_int64(values)
247241

248-
result = Series(counts, index=com._values_from_object(keys))
249-
if bins is not None:
250-
# TODO: This next line should be more efficient
251-
result = result.reindex(np.arange(len(cat.categories)), fill_value=0)
252-
if not is_category:
253-
result.index = bins[:-1]
254242
else:
255-
result.index = cat.categories
243+
values = com._ensure_object(values)
244+
mask = com.isnull(values)
245+
keys, counts = htable.value_count_object(values, mask)
246+
if not dropna and mask.any():
247+
keys = np.insert(keys, 0, np.NaN)
248+
counts = np.insert(counts, 0, mask.sum())
249+
250+
result = Series(counts, index=com._values_from_object(keys))
251+
252+
if bins is not None:
253+
# TODO: This next line should be more efficient
254+
result = result.reindex(np.arange(len(cat.categories)), fill_value=0)
255+
result.index = bins[:-1]
256256

257257
if sort:
258258
result.sort()

pandas/core/categorical.py

Lines changed: 49 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -876,6 +876,54 @@ def notnull(self):
876876
"""
877877
return ~self.isnull()
878878

879+
def dropna(self):
880+
"""
881+
Return the Categorical without null values.
882+
883+
Both missing values (-1 in .codes) and NA as a category are detected.
884+
NA is removed from the categories if present.
885+
886+
Returns
887+
-------
888+
valid : Categorical
889+
"""
890+
result = self[self.notnull()]
891+
if isnull(result.categories).any():
892+
result = result.remove_categories([np.nan])
893+
return result
894+
895+
def value_counts(self, dropna=True):
896+
"""
897+
Returns a Series containing counts of each category.
898+
899+
Every category will have an entry, even those with a count of 0.
900+
901+
Parameters
902+
----------
903+
dropna : boolean, default True
904+
Don't include counts of NaN, even if NaN is a category.
905+
906+
Returns
907+
-------
908+
counts : Series
909+
"""
910+
import pandas.hashtable as htable
911+
from pandas.core.series import Series
912+
913+
cat = self.dropna() if dropna else self
914+
keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes))
915+
result = Series(counts, index=keys)
916+
917+
ix = np.arange(len(cat.categories), dtype='int64')
918+
if not dropna and -1 in keys:
919+
ix = np.append(ix, -1)
920+
result = result.reindex(ix, fill_value=0)
921+
result.index = (np.append(cat.categories, np.nan)
922+
if not dropna and -1 in keys
923+
else cat.categories)
924+
925+
return result
926+
879927
def get_values(self):
880928
""" Return the values.
881929
@@ -1421,34 +1469,12 @@ def describe(self):
14211469
description: `DataFrame`
14221470
A dataframe with frequency and counts by category.
14231471
"""
1424-
# Hack?
1425-
from pandas.core.frame import DataFrame
1426-
counts = DataFrame({
1427-
'codes' : self._codes,
1428-
'values' : self._codes }
1429-
).groupby('codes').count()
1430-
1472+
counts = self.value_counts(dropna=False)
14311473
freqs = counts / float(counts.sum())
14321474

14331475
from pandas.tools.merge import concat
14341476
result = concat([counts,freqs],axis=1)
14351477
result.columns = ['counts','freqs']
1436-
1437-
# fill in the real categories
1438-
check = result.index == -1
1439-
if check.any():
1440-
# Sort -1 (=NaN) to the last position
1441-
index = np.arange(0, len(self.categories)+1, dtype='int64')
1442-
index[-1] = -1
1443-
result = result.reindex(index)
1444-
# build new index
1445-
categories = np.arange(0,len(self.categories)+1 ,dtype=object)
1446-
categories[:-1] = self.categories
1447-
categories[-1] = np.nan
1448-
result.index = categories.take(_ensure_platform_int(result.index))
1449-
else:
1450-
result.index = self.categories.take(_ensure_platform_int(result.index))
1451-
result = result.reindex(self.categories)
14521478
result.index.name = 'categories'
14531479

14541480
return result

pandas/tests/test_algos.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,37 @@ def test_value_counts_nat(self):
254254
tm.assert_series_equal(algos.value_counts(dt), exp_dt)
255255
# TODO same for (timedelta)
256256

257+
def test_dropna(self):
258+
# https://github.com/pydata/pandas/issues/9443#issuecomment-73719328
259+
260+
tm.assert_series_equal(
261+
pd.Series([True, True, False]).value_counts(dropna=True),
262+
pd.Series([2, 1], index=[True, False]))
263+
tm.assert_series_equal(
264+
pd.Series([True, True, False]).value_counts(dropna=False),
265+
pd.Series([2, 1], index=[True, False]))
266+
267+
tm.assert_series_equal(
268+
pd.Series([True, True, False, None]).value_counts(dropna=True),
269+
pd.Series([2, 1], index=[True, False]))
270+
tm.assert_series_equal(
271+
pd.Series([True, True, False, None]).value_counts(dropna=False),
272+
pd.Series([2, 1, 1], index=[True, False, np.nan]))
273+
274+
tm.assert_series_equal(
275+
pd.Series([10.3, 5., 5.]).value_counts(dropna=True),
276+
pd.Series([2, 1], index=[5., 10.3]))
277+
tm.assert_series_equal(
278+
pd.Series([10.3, 5., 5.]).value_counts(dropna=False),
279+
pd.Series([2, 1], index=[5., 10.3]))
280+
281+
tm.assert_series_equal(
282+
pd.Series([10.3, 5., 5., None]).value_counts(dropna=True),
283+
pd.Series([2, 1], index=[5., 10.3]))
284+
tm.assert_series_equal(
285+
pd.Series([10.3, 5., 5., None]).value_counts(dropna=False),
286+
pd.Series([2, 1, 1], index=[5., 10.3, np.nan]))
287+
257288
def test_quantile():
258289
s = Series(np.random.randn(100))
259290

pandas/tests/test_categorical.py

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -389,8 +389,8 @@ def test_describe(self):
389389
cat = self.factor.copy()
390390
cat.set_categories(["a","b","c","d"], inplace=True)
391391
desc = cat.describe()
392-
expected = DataFrame.from_dict(dict(counts=[3, 2, 3, np.nan],
393-
freqs=[3/8., 2/8., 3/8., np.nan],
392+
expected = DataFrame.from_dict(dict(counts=[3, 2, 3, 0],
393+
freqs=[3/8., 2/8., 3/8., 0],
394394
categories=['a', 'b', 'c', 'd'])
395395
).set_index('categories')
396396
tm.assert_frame_equal(desc, expected)
@@ -415,31 +415,20 @@ def test_describe(self):
415415
).set_index('categories')
416416
tm.assert_frame_equal(desc, expected)
417417

418-
# having NaN as category and as "not available" should also print two NaNs in describe!
419-
cat = pd.Categorical([np.nan,1, 2, 2])
420-
cat.set_categories([1,2,np.nan], rename=True, inplace=True)
421-
desc = cat.describe()
422-
expected = DataFrame.from_dict(dict(counts=[1, 2, np.nan, 1],
423-
freqs=[1/4., 2/4., np.nan, 1/4.],
424-
categories=[1,2,np.nan,np.nan]
425-
)
426-
).set_index('categories')
427-
tm.assert_frame_equal(desc, expected)
428-
429-
# empty categories show up as NA
430-
cat = Categorical(["a","b","b","b"], categories=['a','b','c'], ordered=True)
418+
# NA as a category
419+
cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan])
431420
result = cat.describe()
432421

433-
expected = DataFrame([[1,0.25],[3,0.75],[np.nan,np.nan]],
422+
expected = DataFrame([[0,0],[1,0.25],[2,0.5],[1,0.25]],
434423
columns=['counts','freqs'],
435-
index=Index(['a','b','c'],name='categories'))
424+
index=Index(['b','a','c',np.nan],name='categories'))
436425
tm.assert_frame_equal(result,expected)
437426

438-
# NA as a category
439-
cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] )
427+
# NA as an unused category
428+
cat = pd.Categorical(["a","c","c"], categories=["b","a","c",np.nan])
440429
result = cat.describe()
441430

442-
expected = DataFrame([[np.nan, np.nan],[1,0.25],[2,0.5], [1,0.25]],
431+
expected = DataFrame([[0,0],[1,1/3.],[2,2/3.],[0,0]],
443432
columns=['counts','freqs'],
444433
index=Index(['b','a','c',np.nan],name='categories'))
445434
tm.assert_frame_equal(result,expected)
@@ -1573,6 +1562,46 @@ def test_value_counts(self):
15731562
exp = Series([3,2,1,0], index=["c","b","a","d"])
15741563
tm.assert_series_equal(res, exp)
15751564

1565+
def test_value_counts_with_nan(self):
1566+
# https://github.com/pydata/pandas/issues/9443
1567+
1568+
s = pd.Series(["a", "b", "a"], dtype="category")
1569+
tm.assert_series_equal(
1570+
s.value_counts(dropna=True),
1571+
pd.Series([2, 1], index=["a", "b"]))
1572+
tm.assert_series_equal(
1573+
s.value_counts(dropna=False),
1574+
pd.Series([2, 1], index=["a", "b"]))
1575+
1576+
s = pd.Series(["a", "b", None, "a", None, None], dtype="category")
1577+
tm.assert_series_equal(
1578+
s.value_counts(dropna=True),
1579+
pd.Series([2, 1], index=["a", "b"]))
1580+
tm.assert_series_equal(
1581+
s.value_counts(dropna=False),
1582+
pd.Series([3, 2, 1], index=[np.nan, "a", "b"]))
1583+
# When we aren't sorting by counts, and np.nan isn't a
1584+
# category, it should be last.
1585+
tm.assert_series_equal(
1586+
s.value_counts(dropna=False, sort=False),
1587+
pd.Series([2, 1, 3], index=["a", "b", np.nan]))
1588+
1589+
s = pd.Series(pd.Categorical(["a", "b", "a"], categories=["a", "b", np.nan]))
1590+
tm.assert_series_equal(
1591+
s.value_counts(dropna=True),
1592+
pd.Series([2, 1], index=["a", "b"]))
1593+
tm.assert_series_equal(
1594+
s.value_counts(dropna=False),
1595+
pd.Series([2, 1, 0], index=["a", "b", np.nan]))
1596+
1597+
s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None], categories=["a", "b", np.nan]))
1598+
tm.assert_series_equal(
1599+
s.value_counts(dropna=True),
1600+
pd.Series([2, 1], index=["a", "b"]))
1601+
tm.assert_series_equal(
1602+
s.value_counts(dropna=False),
1603+
pd.Series([3, 2, 1], index=[np.nan, "a", "b"]))
1604+
15761605
def test_groupby(self):
15771606

15781607
cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a","b","c","d"])

0 commit comments

Comments
 (0)