diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index eec424f619bde..77de97e5b384b 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -1110,3 +1110,4 @@ Bug Fixes - Regression in ``NDFrame.loc`` indexing when rows/columns were converted to Float64Index if target was an empty list/ndarray (:issue:`7774`) - Bug in ``Series`` that allows it to be indexed by a ``DataFrame`` which has unexpected results. Such indexing is no longer permitted (:issue:`8444`) - Bug in item assignment of a ``DataFrame`` with multi-index columns where right-hand-side columns were not aligned (:issue:`7655`) +- Bug in unpickling of categorical series and dataframe columns (:issue:`8518`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index aa5fa29784912..dd9b5d71f508f 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -187,6 +187,8 @@ class Categorical(PandasObject): # For comparisons, so that numpy uses our implementation if the compare ops, which raise __array_priority__ = 1000 + ordered = False + name = None def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False, levels=None): @@ -718,6 +720,19 @@ def __array__(self, dtype=None): return np.asarray(ret, dtype) return ret + def __setstate__(self, state): + """Necessary for making this object picklable""" + if not isinstance(state, dict): + raise Exception('invalid pickle state') + + if 'labels' in state: + state['_codes'] = state.pop('labels') + if '_levels' in state: + state['categories'] = state.pop('_levels') + + for k, v in compat.iteritems(state): + setattr(self,k,v) + @property def T(self): return self diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c88d799a54fed..9be680d998216 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1070,16 +1070,19 @@ class NonConsolidatableMixIn(object): def __init__(self, values, placement, ndim=None, fastpath=False,): + # Placement must be converted to BlockPlacement via property setter + # before ndim logic, because placement may be a slice which doesn't + # have a length. + self.mgr_locs = placement + # kludgetastic if ndim is None: - if len(placement) != 1: + if len(self.mgr_locs) != 1: ndim = 1 else: ndim = 2 self.ndim = ndim - self.mgr_locs = placement - if not isinstance(values, self._holder): raise TypeError("values must be {0}".format(self._holder.__name__)) @@ -1852,6 +1855,7 @@ def get_values(self, dtype=None): .reshape(self.values.shape) return self.values + class SparseBlock(NonConsolidatableMixIn, Block): """ implement as a list of sparse arrays of the same dtype """ __slots__ = () @@ -1861,27 +1865,6 @@ class SparseBlock(NonConsolidatableMixIn, Block): _ftype = 'sparse' _holder = SparseArray - def __init__(self, values, placement, - ndim=None, fastpath=False,): - - # Placement must be converted to BlockPlacement via property setter - # before ndim logic, because placement may be a slice which doesn't - # have a length. - self.mgr_locs = placement - - # kludgetastic - if ndim is None: - if len(self.mgr_locs) != 1: - ndim = 1 - else: - ndim = 2 - self.ndim = ndim - - if not isinstance(values, SparseArray): - raise TypeError("values must be SparseArray") - - self.values = values - @property def shape(self): return (len(self.mgr_locs), self.sp_index.length) diff --git a/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle b/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle new file mode 100644 index 0000000000000..d7d20b06df305 Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle differ diff --git a/pandas/io/tests/generate_legacy_pickles.py b/pandas/io/tests/generate_legacy_pickles.py index b20a1e5b60b86..56ef1aa9b0f19 100644 --- a/pandas/io/tests/generate_legacy_pickles.py +++ b/pandas/io/tests/generate_legacy_pickles.py @@ -60,7 +60,7 @@ def create_data(): from pandas import (Series,TimeSeries,DataFrame,Panel, SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel, Index,MultiIndex,PeriodIndex, - date_range,period_range,bdate_range,Timestamp) + date_range,period_range,bdate_range,Timestamp,Categorical) nan = np.nan data = { @@ -85,7 +85,8 @@ def create_data(): mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2], [3,4,3,4,5]])), names=['one','two'])), - dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A'])) + dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']), + cat=Series(Categorical(['foo', 'bar', 'baz']))) frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)), int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)), @@ -95,7 +96,11 @@ def create_data(): ['one','two','one','two','three']])), names=['first','second'])), dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), - columns=['A', 'B', 'A'])) + columns=['A', 'B', 'A']), + cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))), + cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']), + B=np.arange(3))), + ) panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)), dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), items=['A', 'B', 'A'])) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index a523df4cc2461..5bc7558efb471 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -11,7 +11,7 @@ import pandas.util.testing as tm import pandas as pd from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, randn) + assert_almost_equal, assert_frame_equal, randn, assert_series_equal) from pandas.compat import zip, u @@ -363,6 +363,15 @@ def test_non_unique_pickle(self): mgr2 = self.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + def test_categorical_block_pickle(self): + mgr = create_mgr('a: category') + mgr2 = self.round_trip_pickle(mgr) + assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + + smgr = create_single_mgr('category') + smgr2 = self.round_trip_pickle(smgr) + assert_series_equal(Series(smgr), Series(smgr2)) + def test_get_scalar(self): for item in self.mgr.items: for i, index in enumerate(self.mgr.axes[1]):