From 5323cf84c0a9812596e2d151c84cca7bb604ab0a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 31 Oct 2014 16:31:56 -0400 Subject: [PATCH] BUG: bug in selecting from a Categorical with iloc (GH8623) BUG: bug in groupby-transform with a Categorical (GH8623) BUG: bug in duplicated/drop_duplicates with a Categorical (GH8623) --- doc/source/whatsnew/v0.15.1.txt | 4 +++- pandas/core/frame.py | 11 +++++----- pandas/core/internals.py | 29 ++++++++++++++++++++++--- pandas/core/series.py | 8 ++++++- pandas/tests/test_categorical.py | 37 ++++++++++++++++++++++++++++++++ 5 files changed, 79 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt index 2b7a75f29705e..e96adc2bd9559 100644 --- a/doc/source/whatsnew/v0.15.1.txt +++ b/doc/source/whatsnew/v0.15.1.txt @@ -183,7 +183,9 @@ Bug Fixes - Bug in ``cut``/``qcut`` when using ``Series`` and ``retbins=True`` (:issue:`8589`) - Bug in writing Categorical columns to an SQL database with ``to_sql`` (:issue:`8624`). 
- Bug in comparing ``Categorical`` of datetime raising when being compared to a scalar datetime (:issue:`8687`) - +- Bug in selecting from a ``Categorical`` with ``.iloc`` (:issue:`8623`) +- Bug in groupby-transform with a Categorical (:issue:`8623`) +- Bug in duplicated/drop_duplicates with a Categorical (:issue:`8623`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 524c485db218b..29aad379c8424 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2732,19 +2732,20 @@ def _m8_to_i8(x): return x.view(np.int64) return x + # if we are only duplicating on Categoricals this can be much faster if subset is None: - values = list(_m8_to_i8(self.values.T)) + values = list(_m8_to_i8(self.get_values().T)) else: if np.iterable(subset) and not isinstance(subset, compat.string_types): if isinstance(subset, tuple): if subset in self.columns: - values = [self[subset].values] + values = [self[subset].get_values()] else: - values = [_m8_to_i8(self[x].values) for x in subset] + values = [_m8_to_i8(self[x].get_values()) for x in subset] else: - values = [_m8_to_i8(self[x].values) for x in subset] + values = [_m8_to_i8(self[x].get_values()) for x in subset] else: - values = [self[subset].values] + values = [self[subset].get_values()] keys = lib.fast_zip_fillna(values) duplicated = lib.duplicated(keys, take_last=take_last) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 9b95aff465d55..f3f88583b2445 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -58,6 +58,7 @@ class Block(PandasObject): _verify_integrity = True _validate_ndim = True _ftype = 'dense' + _holder = None def __init__(self, values, placement, ndim=None, fastpath=False): if ndim is None: @@ -476,6 +477,14 @@ def to_native_types(self, slicer=None, na_rep='', **kwargs): def _concat_blocks(self, blocks, values): """ return the block concatenation """ + + # dispatch to a categorical to handle the concat + if self._holder is None: + + for b in blocks: + if 
b.is_categorical: + return b._concat_blocks(blocks,values) + return self._holder(values[0]) # block actions #### @@ -1739,10 +1748,24 @@ def _concat_blocks(self, blocks, values): return the block concatenation """ - categories = self.values.categories - for b in blocks: + # we could have object blocks and categoricals here + # if we only have a single categorical then combine everything + # else it's a non-compat categorical + + categoricals = [ b for b in blocks if b.is_categorical ] + objects = [ b for b in blocks if not b.is_categorical and b.is_object ] + + # convert everything to object and call it a day + if len(objects) + len(categoricals) != len(blocks): + raise ValueError("try to combine non-object blocks and categoricals") + + # validate the categories + categories = None + for b in categoricals: + if categories is None: + categories = b.values.categories if not categories.equals(b.values.categories): - raise ValueError("incompatible levels in categorical block merge") + raise ValueError("incompatible categories in categorical block merge") return self._holder(values[0], categories=categories) diff --git a/pandas/core/series.py b/pandas/core/series.py index f5d729b61e770..68bf4f0f022d7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -475,7 +475,13 @@ def _ixs(self, i, axis=0): value : scalar (int) or Series (slice, sequence) """ try: - return _index.get_value_at(self.values, i) + + # dispatch to the values if we need + values = self.values + if isinstance(values, np.ndarray): + return _index.get_value_at(values, i) + else: + return values[i] except IndexError: raise except: diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 3b84d4aa34756..444eb87a399e5 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1030,6 +1030,21 @@ def test_basic(self): str(df.values) str(df) + # GH8623 + x = pd.DataFrame([[1,'John P. Doe'],[2,'Jane Dove'],[1,'John P. 
Doe']], + columns=['person_id','person_name']) + x['person_name'] = pd.Categorical(x.person_name) # doing this breaks transform + + expected = x.iloc[0].person_name + result = x.person_name.iloc[0] + self.assertEqual(result,expected) + + result = x.person_name[0] + self.assertEqual(result,expected) + + result = x.person_name.loc[0] + self.assertEqual(result,expected) + def test_creation_astype(self): l = ["a","b","c","a"] s = pd.Series(l) @@ -1477,6 +1492,28 @@ def test_groupby(self): result = gb.sum() tm.assert_frame_equal(result, expected) + # GH 8623 + x=pd.DataFrame([[1,'John P. Doe'],[2,'Jane Dove'],[1,'John P. Doe']], + columns=['person_id','person_name']) + x['person_name'] = pd.Categorical(x.person_name) + + g = x.groupby(['person_id']) + result = g.transform(lambda x:x) + tm.assert_frame_equal(result, x[['person_name']]) + + result = x.drop_duplicates('person_name') + expected = x.iloc[[0,1]] + tm.assert_frame_equal(result, expected) + + def f(x): + return x.drop_duplicates('person_name').iloc[0] + + result = g.apply(f) + expected = x.iloc[[0,1]].copy() + expected.index = Index([1,2],name='person_id') + expected['person_name'] = expected['person_name'].astype('object') + tm.assert_frame_equal(result, expected) + def test_pivot_table(self): raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"])