Skip to content

BUG: fix reading pre-0.14.1 pickles of containers with one block and dup items #7794

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ Bug Fixes

- Bug where pickles containing ``DateOffset`` may raise ``AttributeError`` when the ``normalize`` attribute is referred to internally (:issue:`7748`)

- Bug in pickle deserialization that failed for pre-0.14.1 containers with duplicate items while trying to avoid ambiguity
when matching block and manager items; when there is only one block there is no ambiguity (:issue:`7794`)


- Bug where ``is_superperiod`` and ``is_subperiod`` cannot handle frequencies higher than ``S`` (:issue:`7760`, :issue:`7772`, :issue:`7803`)
Expand Down
19 changes: 16 additions & 3 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -2271,10 +2271,23 @@ def unpickle_block(values, mgr_locs):
ax_arrays, bvalues, bitems = state[:3]

self.axes = [_ensure_index(ax) for ax in ax_arrays]

if len(bitems) == 1 and self.axes[0].equals(bitems[0]):
# This is a workaround for pre-0.14.1 pickles that didn't
# support unpickling multi-block frames/panels with non-unique
# columns/items, because given a manager with items ["a", "b",
# "a"] there's no way of knowing which block's "a" is where.
#
# Single-block case can be supported under the assumption that
# block items corresponded to manager items 1-to-1.
all_mgr_locs = [slice(0, len(bitems[0]))]
else:
all_mgr_locs = [self.axes[0].get_indexer(blk_items)
for blk_items in bitems]

self.blocks = tuple(
unpickle_block(values,
self.axes[0].get_indexer(items))
for values, items in zip(bvalues, bitems))
unpickle_block(values, mgr_locs)
for values, mgr_locs in zip(bvalues, all_mgr_locs))

self._post_setstate()

Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
16 changes: 14 additions & 2 deletions pandas/io/tests/generate_legacy_pickles.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
""" self-contained to write legacy pickle files """
from __future__ import print_function


def _create_sp_series():

import numpy as np
Expand Down Expand Up @@ -53,6 +54,7 @@ def _create_sp_frame():
def create_data():
""" create the pickle data """

from distutils.version import LooseVersion
import numpy as np
import pandas
from pandas import (Series,TimeSeries,DataFrame,Panel,
Expand Down Expand Up @@ -92,13 +94,23 @@ def create_data():
index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'],
['one','two','one','two','three']])),
names=['first','second'])),
dup = DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
columns=['A', 'B', 'A']))
dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
columns=['A', 'B', 'A']))
panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)),
dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
items=['A', 'B', 'A']))

if LooseVersion(pandas.__version__) >= '0.14.1':
# Pre-0.14.1 versions generated non-unpicklable mixed-type frames and
# panels if their columns/items were non-unique.
mixed_dup_df = DataFrame(data)
mixed_dup_df.columns = list("ABCDA")

mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int']))
mixed_dup_panel.items = ['ItemA', 'ItemA']

frame['mixed_dup'] = mixed_dup_df
panel['mixed_dup'] = mixed_dup_panel

return dict( series = series,
frame = frame,
Expand Down