diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 5e3f97944c243..103ac2a34a49a 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -191,6 +191,8 @@ Bug Fixes - Bug in pickles contains ``DateOffset`` may raise ``AttributeError`` when ``normalize`` attribute is reffered internally (:issue:`7748`) +- Bug in pickle deserialization of pre-0.14.1 containers with duplicate items: unpickling failed while trying to avoid + ambiguity when matching block and manager items, although a single-block container has no such ambiguity (:issue:`7794`) - Bug in ``is_superperiod`` and ``is_subperiod`` cannot handle higher frequencies than ``S`` (:issue:`7760`, :issue:`7772`, :issue:`7803`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f649baeb16278..cad7b579aa554 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2271,10 +2271,23 @@ def unpickle_block(values, mgr_locs): ax_arrays, bvalues, bitems = state[:3] self.axes = [_ensure_index(ax) for ax in ax_arrays] + + if len(bitems) == 1 and self.axes[0].equals(bitems[0]): + # This is a workaround for pre-0.14.1 pickles that didn't + # support unpickling multi-block frames/panels with non-unique + # columns/items, because given a manager with items ["a", "b", + # "a"] there's no way of knowing which block's "a" is where. + # + # Single-block case can be supported under the assumption that + # block items corresponded to manager items 1-to-1. 
+ all_mgr_locs = [slice(0, len(bitems[0]))] + else: + all_mgr_locs = [self.axes[0].get_indexer(blk_items) + for blk_items in bitems] + self.blocks = tuple( - unpickle_block(values, - self.axes[0].get_indexer(items)) - for values, items in zip(bvalues, bitems)) + unpickle_block(values, mgr_locs) + for values, mgr_locs in zip(bvalues, all_mgr_locs)) self._post_setstate() diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle b/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle new file mode 100644 index 0000000000000..3ffecb77ef8c9 Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle differ diff --git a/pandas/io/tests/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle b/pandas/io/tests/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle new file mode 100644 index 0000000000000..19cbcddc4ded8 Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle differ diff --git a/pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle b/pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle new file mode 100644 index 0000000000000..af530fcd3fb39 Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle differ diff --git a/pandas/io/tests/generate_legacy_pickles.py b/pandas/io/tests/generate_legacy_pickles.py index 3a0386c7660d4..b20a1e5b60b86 100644 --- a/pandas/io/tests/generate_legacy_pickles.py +++ b/pandas/io/tests/generate_legacy_pickles.py @@ -1,6 +1,7 @@ """ self-contained to write legacy pickle files """ from __future__ import print_function + def _create_sp_series(): import numpy as np @@ -53,6 +54,7 @@ def _create_sp_frame(): def create_data(): """ create the pickle data """ + from distutils.version import LooseVersion import numpy as np import pandas from pandas import (Series,TimeSeries,DataFrame,Panel, @@ -92,13 
+94,23 @@ def create_data(): index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'], ['one','two','one','two','three']])), names=['first','second'])), - dup = DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), - columns=['A', 'B', 'A'])) + dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), + columns=['A', 'B', 'A'])) panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)), dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), items=['A', 'B', 'A'])) + if LooseVersion(pandas.__version__) >= '0.14.1': + # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and + # panels if their columns/items were non-unique. + mixed_dup_df = DataFrame(data) + mixed_dup_df.columns = list("ABCDA") + + mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int'])) + mixed_dup_panel.items = ['ItemA', 'ItemA'] + frame['mixed_dup'] = mixed_dup_df + panel['mixed_dup'] = mixed_dup_panel return dict( series = series, frame = frame,