BUG: fix CategoricalBlock pickling

immerrr · immerrr · commit 47a2e382c52d · 2014-10-09T16:05:26.000+04:00
* TST: add categorical frame and series to generate_legacy_pickles

* TST: generate pickle for 0.15.0
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -1071,3 +1071,4 @@ Bug Fixes
 - Regression in ``NDFrame.loc`` indexing when rows/columns were converted to Float64Index if target was an empty list/ndarray (:issue:`7774`)
 - Bug in ``Series`` that allows it to be indexed by a ``DataFrame`` which has unexpected results.  Such indexing is no longer permitted (:issue:`8444`)
 - Bug in item assignment of a ``DataFrame`` with multi-index columns where right-hand-side columns were not aligned (:issue:`7655`)
+- Bug in unpickling of categorical ``Series`` and ``DataFrame`` columns (:issue:`8518`)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -1070,16 +1070,19 @@ class NonConsolidatableMixIn(object):
     def __init__(self, values, placement,
                  ndim=None, fastpath=False,):
 
+        # Placement must be converted to BlockPlacement via property setter
+        # before ndim logic, because placement may be a slice which doesn't
+        # have a length.
+        self.mgr_locs = placement
+
         # kludgetastic
         if ndim is None:
-            if len(placement) != 1:
+            if len(self.mgr_locs) != 1:
                 ndim = 1
             else:
                 ndim = 2
         self.ndim = ndim
 
-        self.mgr_locs = placement
-
         if not isinstance(values, self._holder):
             raise TypeError("values must be {0}".format(self._holder.__name__))
 
@@ -1852,6 +1855,7 @@ def get_values(self, dtype=None):
                       .reshape(self.values.shape)
         return self.values
 
+
 class SparseBlock(NonConsolidatableMixIn, Block):
     """ implement as a list of sparse arrays of the same dtype """
     __slots__ = ()
@@ -1861,27 +1865,6 @@ class SparseBlock(NonConsolidatableMixIn, Block):
     _ftype = 'sparse'
     _holder = SparseArray
 
-    def __init__(self, values, placement,
-                 ndim=None, fastpath=False,):
-
-        # Placement must be converted to BlockPlacement via property setter
-        # before ndim logic, because placement may be a slice which doesn't
-        # have a length.
-        self.mgr_locs = placement
-
-        # kludgetastic
-        if ndim is None:
-            if len(self.mgr_locs) != 1:
-                ndim = 1
-            else:
-                ndim = 2
-        self.ndim = ndim
-
-        if not isinstance(values, SparseArray):
-            raise TypeError("values must be SparseArray")
-
-        self.values = values
-
     @property
     def shape(self):
         return (len(self.mgr_locs), self.sp_index.length)
diff --git a/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle b/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle
diff --git a/pandas/io/tests/generate_legacy_pickles.py b/pandas/io/tests/generate_legacy_pickles.py
@@ -60,7 +60,7 @@ def create_data():
     from pandas import (Series,TimeSeries,DataFrame,Panel,
                         SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel,
                         Index,MultiIndex,PeriodIndex,
-                        date_range,period_range,bdate_range,Timestamp)
+                        date_range,period_range,bdate_range,Timestamp,Categorical)
     nan = np.nan
 
     data = {
@@ -85,7 +85,8 @@ def create_data():
                   mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2],
                                                                                                     [3,4,3,4,5]])),
                                                                                            names=['one','two'])),
-                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']))
+                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']),
+                  cat=Series(Categorical(['foo', 'bar', 'baz'])))
 
     frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)),
                  int = DataFrame(dict(A = series['int']  , B = series['int']   + 1)),
@@ -95,7 +96,11 @@ def create_data():
                                                                        ['one','two','one','two','three']])),
                                                              names=['first','second'])),
                  dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
-                               columns=['A', 'B', 'A']))
+                               columns=['A', 'B', 'A']),
+                 cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))),
+                 cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']),
+                                              B=np.arange(3))),
+    )
     panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)),
                  dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                              items=['A', 'B', 'A']))
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
@@ -11,7 +11,7 @@
 import pandas.util.testing as tm
 import pandas as pd
 from pandas.util.testing import (
-    assert_almost_equal, assert_frame_equal, randn)
+    assert_almost_equal, assert_frame_equal, randn, assert_series_equal)
 from pandas.compat import zip, u
 
 
@@ -363,6 +363,15 @@ def test_non_unique_pickle(self):
         mgr2 = self.round_trip_pickle(mgr)
         assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
 
+    def test_categorical_block_pickle(self):
+        mgr = create_mgr('a: category')
+        mgr2 = self.round_trip_pickle(mgr)
+        assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
+
+        smgr = create_single_mgr('category')
+        smgr2 = self.round_trip_pickle(smgr)
+        assert_series_equal(Series(smgr), Series(smgr2))
+
     def test_get_scalar(self):
         for item in self.mgr.items:
             for i, index in enumerate(self.mgr.axes[1]):