diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 4b3c96da10efd..de987edcdc679 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -184,3 +184,5 @@ Bug Fixes - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) + +- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 44c91862227d8..ea6e9012f7e8a 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -883,8 +883,8 @@ def remove_unused_categories(self, inplace=False): if idx.size != 0 and idx[0] == -1: # na sentinel idx, inv = idx[1:], inv - 1 - cat._codes = inv cat._categories = cat.categories.take(idx) + cat._codes = _coerce_indexer_dtype(inv, self._categories) if not inplace: return cat diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 40ef5354e91bd..5a0d079efb4c2 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1022,14 +1022,14 @@ def f(): def test_remove_unused_categories(self): c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"]) - exp_categories_all = np.array(["a", "b", "c", "d", "e"]) - exp_categories_dropped = np.array(["a", "b", "c", "d"]) + exp_categories_all = Index(["a", "b", "c", "d", "e"]) + exp_categories_dropped = Index(["a", "b", "c", "d"]) self.assert_numpy_array_equal(c.categories, exp_categories_all) res = c.remove_unused_categories() - self.assert_numpy_array_equal(res.categories, exp_categories_dropped) - self.assert_numpy_array_equal(c.categories, exp_categories_all) + self.assert_index_equal(res.categories, exp_categories_dropped) + self.assert_index_equal(c.categories, exp_categories_all) res = c.remove_unused_categories(inplace=True) self.assert_numpy_array_equal(c.categories, exp_categories_dropped) @@ -1039,15 +1039,18 @@ def test_remove_unused_categories(self): c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"]) res = c.remove_unused_categories() - self.assert_numpy_array_equal(res.categories, - np.array(["a", "b", "c"])) - self.assert_numpy_array_equal(c.categories, exp_categories_all) + self.assert_index_equal(res.categories, + Index(np.array(["a", "b", "c"]))) + exp_codes = np.array([0, 1, 2, -1], dtype=np.int8) + self.assert_numpy_array_equal(res.codes, exp_codes) + self.assert_index_equal(c.categories, exp_categories_all) val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan] cat = pd.Categorical(values=val, categories=list('ABCDEFG')) out = cat.remove_unused_categories() - self.assert_numpy_array_equal(out.categories, ['B', 'D', 'F']) - self.assert_numpy_array_equal(out.codes, [2, -1, 1, 0, 1, 2, -1]) + self.assert_index_equal(out.categories, Index(['B', 'D', 'F'])) + exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8) + self.assert_numpy_array_equal(out.codes, exp_codes) self.assertEqual(out.get_values().tolist(), val) alpha = list('abcdefghijklmnopqrstuvwxyz')