From 56285b9c2a33547142c52dbbd9e27ad4cfb1f415 Mon Sep 17 00:00:00 2001 From: Ksenia Bobrova Date: Thu, 15 Mar 2018 22:34:26 +0100 Subject: [PATCH] remove NaN in categories checking --- pandas/core/arrays/categorical.py | 40 +++++-------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b37f88d8bfdce..c6c46956a6eaf 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1258,7 +1258,7 @@ def isna(self): """ Detect missing values - Both missing values (-1 in .codes) and NA as a category are detected. + Missing values (-1 in .codes) are detected. Returns ------- @@ -1273,13 +1273,6 @@ def isna(self): """ ret = self._codes == -1 - - # String/object and float categories can hold np.nan - if self.categories.dtype.kind in ['S', 'O', 'f']: - if np.nan in self.categories: - nan_pos = np.where(isna(self.categories))[0] - # we only have one NA in categories - ret = np.logical_or(ret, self._codes == nan_pos) return ret isnull = isna @@ -1315,16 +1308,14 @@ def dropna(self): """ Return the Categorical without null values. - Both missing values (-1 in .codes) and NA as a category are detected. - NA is removed from the categories if present. + Missing values (-1 in .codes) are detected. Returns ------- valid : Categorical """ result = self[self.notna()] - if isna(result.categories).any(): - result = result.remove_categories([np.nan]) + return result def value_counts(self, dropna=True): @@ -1336,7 +1327,7 @@ def value_counts(self, dropna=True): Parameters ---------- dropna : boolean, default True - Don't include counts of NaN, even if NaN is a category. + Don't include counts of NaN. Returns ------- @@ -1348,11 +1339,9 @@ def value_counts(self, dropna=True): """ from numpy import bincount - from pandas import isna, Series, CategoricalIndex + from pandas import Series, CategoricalIndex - obj = (self.remove_categories([np.nan]) if dropna and - isna(self.categories).any() else self) - code, cat = obj._codes, obj.categories + code, cat = self._codes, self.categories ncat, mask = len(cat), 0 <= code ix, clean = np.arange(ncat), mask.all() @@ -1627,14 +1616,6 @@ def fillna(self, value=None, method=None, limit=None): values = self._codes - # Make sure that we also get NA in categories - if self.categories.dtype.kind in ['S', 'O', 'f']: - if np.nan in self.categories: - values = values.copy() - nan_pos = np.where(isna(self.categories))[0] - # we only have one NA in categories - values[values == nan_pos] = -1 - # pad / bfill if method is not None: @@ -1888,15 +1869,6 @@ def __setitem__(self, key, value): key = np.asarray(key) lindexer = self.categories.get_indexer(rvalue) - - # FIXME: the following can be removed after GH7820 is fixed: - # https://github.com/pandas-dev/pandas/issues/7820 - # float categories do currently return -1 for np.nan, even if np.nan is - # included in the index -> "repair" this here - if isna(rvalue).any() and isna(self.categories).any(): - nan_pos = np.where(isna(self.categories))[0] - lindexer[lindexer == -1] = nan_pos - lindexer = self._maybe_coerce_indexer(lindexer) self._codes[key] = lindexer