From 407be1c6cabea2e4807054f957ee93b10e346344 Mon Sep 17 00:00:00 2001
From: Ka Wo Chen
Date: Mon, 25 Jan 2016 00:20:58 -0500
Subject: [PATCH] DEPR: GH10623 remove items from msgpack.encode for blocks

---
 doc/source/io.rst                                  |  18 +
 doc/source/whatsnew/v0.18.0.txt                    |  28 +-
 pandas/core/common.py                              |  26 ++
 pandas/core/internals.py                           |  12 +
 pandas/io/packers.py                               | 370 +++++++++---------
 .../0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack      | Bin 0 -> 10307 bytes
 .../0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack       | Bin 0 -> 9300 bytes
 .../io/tests/generate_legacy_storage_files.py      | 147 ++++---
 pandas/io/tests/test_packers.py                    |  15 +-
 9 files changed, 370 insertions(+), 246 deletions(-)
 create mode 100644 pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack
 create mode 100644 pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack

diff --git a/doc/source/io.rst b/doc/source/io.rst
index e2f2301beb078..459d79ec4d98c 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2539,6 +2539,24 @@ both on the writing (serialization), and reading (deserialization).
    optimizations in the io of the ``msgpack`` data. Since this is marked
    as an EXPERIMENTAL LIBRARY, the storage format may not be stable until a future release.
 
+   As a result of writing format changes and other issues:
+
+   +----------------------+------------------------+
+   | Packed with          | Can be unpacked with   |
+   +======================+========================+
+   | pre-0.17 / Python 2  | any                    |
+   +----------------------+------------------------+
+   | pre-0.17 / Python 3  | any                    |
+   +----------------------+------------------------+
+   | 0.17 / Python 2      | - 0.17 / Python 2      |
+   |                      | - >=0.18 / any Python  |
+   +----------------------+------------------------+
+   | 0.17 / Python 3      | >=0.18 / any Python    |
+   +----------------------+------------------------+
+   | 0.18                 | >= 0.18                |
+   +----------------------+------------------------+
+
+   Reading files packed by older versions is backward-compatible, except for files packed with 0.17 under Python 2, which can only be unpacked under Python 2.
+
 .. ipython:: python
 
    df = DataFrame(np.random.rand(5,2),columns=list('AB'))
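(For orientation, not a hunk of this patch: the round trip that the table above constrains looks like the following, written against the 0.18-era API. ``to_msgpack``/``read_msgpack`` were later removed from pandas, and ``compress='zlib'`` assumes zlib is importable.)

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB'))

# pack to disk; 'zlib' or 'blosc' compresses the serialized value blocks
df.to_msgpack('frame.msg', compress='zlib')

# unpack; per the table above, a file packed with 0.18 is readable by >= 0.18 only
result = pd.read_msgpack('frame.msg')
```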
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
index 8429739902927..47e78cf558a16 100644
--- a/doc/source/whatsnew/v0.18.0.txt
+++ b/doc/source/whatsnew/v0.18.0.txt
@@ -513,6 +513,33 @@ Subtraction by ``Timedelta`` in a ``Series`` by a ``Timestamp`` works (:issue:`1
   ``pd.Timestamp`` to rehydrate any timestamp like object from
   its isoformat (:issue:`12300`).
 
+Changes to msgpack
+^^^^^^^^^^^^^^^^^^
+
+Forward-incompatible changes to the ``msgpack`` writing format were made in 0.17.0 and 0.18.0; older versions of pandas cannot read files packed by newer versions (:issue:`12129`, :issue:`10527`)
+
+A bug in ``to_msgpack`` and ``read_msgpack``, introduced in 0.17.0 and fixed in 0.18.0, made files packed in Python 2 unreadable in Python 3 (:issue:`12142`)
+
+.. warning::
+
+   As a result of a number of issues:
+
+   +----------------------+------------------------+
+   | Packed with          | Can be unpacked with   |
+   +======================+========================+
+   | pre-0.17 / Python 2  | any                    |
+   +----------------------+------------------------+
+   | pre-0.17 / Python 3  | any                    |
+   +----------------------+------------------------+
+   | 0.17 / Python 2      | - 0.17 / Python 2      |
+   |                      | - >=0.18 / any Python  |
+   +----------------------+------------------------+
+   | 0.17 / Python 3      | >=0.18 / any Python    |
+   +----------------------+------------------------+
+   | 0.18                 | >= 0.18                |
+   +----------------------+------------------------+
+
+   0.18.0 is backward-compatible for reading files packed by older versions, except for files packed with 0.17 in Python 2, which can only be unpacked in Python 2.
 
 Signature change for .rank
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -806,7 +833,6 @@ assignments are valid for multi-line expressions.
 
 Other API Changes
 ^^^^^^^^^^^^^^^^^
-
 - ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`)
 
 .. ipython:: python
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 70c02c5632d80..4f3ec58910950 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -3039,3 +3039,29 @@ def _random_state(state=None):
     else:
         raise ValueError("random_state must be an integer, a numpy "
                          "RandomState, or None")
+
+
+def pandas_dtype(dtype):
+    """
+    Converts input into a pandas only dtype object or a numpy dtype object.
+
+    Parameters
+    ----------
+    dtype : object to be converted
+
+    Returns
+    -------
+    np.dtype or a pandas dtype
+    """
+    if isinstance(dtype, compat.string_types):
+        try:
+            return DatetimeTZDtype.construct_from_string(dtype)
+        except TypeError:
+            pass
+
+    try:
+        return CategoricalDtype.construct_from_string(dtype)
+    except TypeError:
+        pass
+
+    return np.dtype(dtype)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 8973ea025e611..c6b04757e201c 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -2098,6 +2098,14 @@ def __init__(self, values, placement, ndim=2, **kwargs):
 
         if not isinstance(values, self._holder):
             values = self._holder(values)
+
+        dtype = kwargs.pop('dtype', None)
+
+        if dtype is not None:
+            if isinstance(dtype, compat.string_types):
+                dtype = DatetimeTZDtype.construct_from_string(dtype)
+            values = values.tz_localize('UTC').tz_convert(dtype.tz)
+
         if values.tz is None:
             raise ValueError("cannot create a DatetimeTZBlock without a tz")
 
@@ -2428,6 +2436,10 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None,
         else:
             klass = ObjectBlock
 
+    elif klass is DatetimeTZBlock and not is_datetimetz(values):
+        return klass(values, ndim=ndim, fastpath=fastpath,
+                     placement=placement, dtype=dtype)
+
     return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
 
 # TODO: flexible with index=None and/or items=None
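(For orientation, not a hunk of this patch: ``pandas_dtype`` is what lets the msgpack code below map a stored dtype *name* back to a concrete dtype object. A sketch of the intended behavior, using the 0.18-era ``pandas.core.common`` location of the function added above:)

```python
import numpy as np
from pandas.core.common import pandas_dtype

# pandas-only dtypes are tried first
pandas_dtype('datetime64[ns, US/Eastern]')  # DatetimeTZDtype
pandas_dtype('category')                    # CategoricalDtype

# anything else falls through to numpy
pandas_dtype('int64')                       # dtype('int64')

# .base strips the tz, giving the numpy dtype the raw i8 values are stored
# under -- which is what unconvert() needs for np.frombuffer
assert pandas_dtype('datetime64[ns, US/Eastern]').base == np.dtype('M8[ns]')
```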
diff --git a/pandas/io/packers.py b/pandas/io/packers.py
index 372c8d80e5a1a..701b78d2771fb 100644
--- a/pandas/io/packers.py
+++ b/pandas/io/packers.py
@@ -44,7 +44,7 @@
 import numpy as np
 from pandas import compat
-from pandas.compat import u
+from pandas.compat import u, u_safe
 from pandas import (Timestamp, Period, Series, DataFrame,  # noqa
                     Index, MultiIndex, Float64Index, Int64Index,
                     Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT)
@@ -52,7 +52,7 @@ from pandas.sparse.api import SparseSeries, SparseDataFrame,
     SparsePanel
 from pandas.sparse.array import BlockIndex, IntIndex
 from pandas.core.generic import NDFrame
-from pandas.core.common import needs_i8_conversion
+from pandas.core.common import needs_i8_conversion, pandas_dtype
 from pandas.io.common import get_filepath_or_buffer
 from pandas.core.internals import BlockManager, make_block
 import pandas.core.internals as internals
@@ -84,6 +84,8 @@ def to_msgpack(path_or_buf, *args, **kwargs):
     """
     global compressor
     compressor = kwargs.pop('compress', None)
+    if compressor:
+        compressor = u(compressor)
     append = kwargs.pop('append', None)
     if append:
         mode = 'a+b'
@@ -180,7 +182,7 @@ def dtype_for(t):
     """ return my dtype mapping, whether number or name """
     if t in dtype_dict:
         return dtype_dict[t]
-    return np.typeDict[t]
+    return np.typeDict.get(t, t)
 
 c2f_dict = {'complex': np.float64,
             'complex128': np.float64,
@@ -248,15 +250,17 @@ def unconvert(values, dtype, compress=None):
     if dtype == np.object_:
         return np.array(values, dtype=object)
 
+    dtype = pandas_dtype(dtype).base
+
     if not as_is_ext:
         values = values.encode('latin1')
 
-    if compress == 'zlib':
+    if compress == u'zlib':
         import zlib
         values = zlib.decompress(values)
         return np.frombuffer(values, dtype=dtype)
 
-    elif compress == 'blosc':
+    elif compress == u'blosc':
         import blosc
         values = blosc.decompress(values)
         return np.frombuffer(values, dtype=dtype)
@@ -269,53 +273,52 @@ def encode(obj):
     """
     Data encoder
     """
     tobj = type(obj)
     if isinstance(obj, Index):
         if isinstance(obj, RangeIndex):
-            return {'typ': 'range_index',
-                    'klass': obj.__class__.__name__,
-                    'name': getattr(obj, 'name', None),
-                    'start': getattr(obj, '_start', None),
-                    'stop': getattr(obj, '_stop', None),
-                    'step': getattr(obj, '_step', None)}
+            return {u'typ': u'range_index',
+                    u'klass': u(obj.__class__.__name__),
+                    u'name': getattr(obj, 'name', None),
+                    u'start': getattr(obj, '_start', None),
+                    u'stop': getattr(obj, '_stop', None),
+                    u'step': getattr(obj, '_step', None)}
         elif isinstance(obj, PeriodIndex):
-            return {'typ': 'period_index',
-                    'klass': obj.__class__.__name__,
-                    'name': getattr(obj, 'name', None),
-                    'freq': getattr(obj, 'freqstr', None),
-                    'dtype': obj.dtype.name,
-                    'data': convert(obj.asi8),
-                    'compress': compressor}
+            return {u'typ': u'period_index',
+                    u'klass': u(obj.__class__.__name__),
+                    u'name': getattr(obj, 'name', None),
+                    u'freq': u_safe(getattr(obj, 'freqstr', None)),
+                    u'dtype': u(obj.dtype.name),
+                    u'data': convert(obj.asi8),
+                    u'compress': compressor}
         elif isinstance(obj, DatetimeIndex):
             tz = getattr(obj, 'tz', None)
 
             # store tz info and data as UTC
             if tz is not None:
-                tz = tz.zone
+                tz = u(tz.zone)
                 obj = obj.tz_convert('UTC')
-            return {'typ': 'datetime_index',
-                    'klass': obj.__class__.__name__,
-                    'name': getattr(obj, 'name', None),
-                    'dtype': obj.dtype.name,
-                    'data': convert(obj.asi8),
-                    'freq': getattr(obj, 'freqstr', None),
-                    'tz': tz,
-                    'compress': compressor}
+            return {u'typ': u'datetime_index',
+                    u'klass': u(obj.__class__.__name__),
+                    u'name': getattr(obj, 'name', None),
+                    u'dtype': u(obj.dtype.name),
+                    u'data': convert(obj.asi8),
+                    u'freq': u_safe(getattr(obj, 'freqstr', None)),
+                    u'tz': tz,
+                    u'compress': compressor}
         elif isinstance(obj, MultiIndex):
-            return {'typ': 'multi_index',
-                    'klass': obj.__class__.__name__,
-                    'names': getattr(obj, 'names', None),
-                    'dtype': obj.dtype.name,
-                    'data': convert(obj.values),
-                    'compress': compressor}
+            return {u'typ': u'multi_index',
+                    u'klass': u(obj.__class__.__name__),
+                    u'names': getattr(obj, 'names', None),
+                    u'dtype': u(obj.dtype.name),
+                    u'data': 
convert(obj.values), + u'compress': compressor} else: - return {'typ': 'index', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'dtype': obj.dtype.name, - 'data': convert(obj.values), - 'compress': compressor} + return {u'typ': u'index', + u'klass': u(obj.__class__.__name__), + u'name': getattr(obj, 'name', None), + u'dtype': u(obj.dtype.name), + u'data': convert(obj.values), + u'compress': compressor} elif isinstance(obj, Series): if isinstance(obj, SparseSeries): raise NotImplementedError( @@ -332,13 +335,13 @@ def encode(obj): # d[f] = getattr(obj, f, None) # return d else: - return {'typ': 'series', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'index': obj.index, - 'dtype': obj.dtype.name, - 'data': convert(obj.values), - 'compress': compressor} + return {u'typ': u'series', + u'klass': u(obj.__class__.__name__), + u'name': getattr(obj, 'name', None), + u'index': obj.index, + u'dtype': u(obj.dtype.name), + u'data': convert(obj.values), + u'compress': compressor} elif issubclass(tobj, NDFrame): if isinstance(obj, SparseDataFrame): raise NotImplementedError( @@ -371,86 +374,85 @@ def encode(obj): data = data.consolidate() # the block manager - return {'typ': 'block_manager', - 'klass': obj.__class__.__name__, - 'axes': data.axes, - 'blocks': [{'items': data.items.take(b.mgr_locs), - 'locs': b.mgr_locs.as_array, - 'values': convert(b.values), - 'shape': b.values.shape, - 'dtype': b.dtype.name, - 'klass': b.__class__.__name__, - 'compress': compressor - } for b in data.blocks]} + return {u'typ': u'block_manager', + u'klass': u(obj.__class__.__name__), + u'axes': data.axes, + u'blocks': [{u'locs': b.mgr_locs.as_array, + u'values': convert(b.values), + u'shape': b.values.shape, + u'dtype': u(b.dtype.name), + u'klass': u(b.__class__.__name__), + u'compress': compressor} for b in data.blocks] + } elif isinstance(obj, (datetime, date, np.datetime64, timedelta, np.timedelta64, NaTType)): if isinstance(obj, Timestamp): tz = obj.tzinfo if tz is not None: - tz = tz.zone + tz = u(tz.zone) offset = obj.offset if offset is not None: - offset = offset.freqstr - return {'typ': 'timestamp', - 'value': obj.value, - 'offset': offset, - 'tz': tz} + offset = u(offset.freqstr) + return {u'typ': u'timestamp', + u'value': obj.value, + u'offset': offset, + u'tz': tz} if isinstance(obj, NaTType): - return {'typ': 'nat'} + return {u'typ': u'nat'} elif isinstance(obj, np.timedelta64): - return {'typ': 'timedelta64', - 'data': obj.view('i8')} + return {u'typ': u'timedelta64', + u'data': obj.view('i8')} elif isinstance(obj, timedelta): - return {'typ': 'timedelta', - 'data': (obj.days, obj.seconds, obj.microseconds)} + return {u'typ': u'timedelta', + u'data': (obj.days, obj.seconds, obj.microseconds)} elif isinstance(obj, np.datetime64): - return {'typ': 'datetime64', - 'data': str(obj)} + return {u'typ': u'datetime64', + u'data': u(str(obj))} elif isinstance(obj, datetime): - return {'typ': 'datetime', - 'data': obj.isoformat()} + return {u'typ': u'datetime', + u'data': u(obj.isoformat())} elif isinstance(obj, date): - return {'typ': 'date', - 'data': obj.isoformat()} + return {u'typ': u'date', + u'data': u(obj.isoformat())} raise Exception("cannot encode this datetimelike object: %s" % obj) elif isinstance(obj, Period): - return {'typ': 'period', - 'ordinal': obj.ordinal, - 'freq': obj.freq} + return {u'typ': u'period', + u'ordinal': obj.ordinal, + u'freq': u(obj.freq)} elif isinstance(obj, BlockIndex): - return {'typ': 'block_index', - 'klass': 
obj.__class__.__name__, - 'blocs': obj.blocs, - 'blengths': obj.blengths, - 'length': obj.length} + return {u'typ': u'block_index', + u'klass': u(obj.__class__.__name__), + u'blocs': obj.blocs, + u'blengths': obj.blengths, + u'length': obj.length} elif isinstance(obj, IntIndex): - return {'typ': 'int_index', - 'klass': obj.__class__.__name__, - 'indices': obj.indices, - 'length': obj.length} + return {u'typ': u'int_index', + u'klass': u(obj.__class__.__name__), + u'indices': obj.indices, + u'length': obj.length} elif isinstance(obj, np.ndarray): - return {'typ': 'ndarray', - 'shape': obj.shape, - 'ndim': obj.ndim, - 'dtype': obj.dtype.name, - 'data': convert(obj), - 'compress': compressor} + return {u'typ': u'ndarray', + u'shape': obj.shape, + u'ndim': obj.ndim, + u'dtype': u(obj.dtype.name), + u'data': convert(obj), + u'compress': compressor} elif isinstance(obj, np.number): if np.iscomplexobj(obj): - return {'typ': 'np_scalar', - 'sub_typ': 'np_complex', - 'dtype': obj.dtype.name, - 'real': obj.real.__repr__(), - 'imag': obj.imag.__repr__()} + return {u'typ': u'np_scalar', + u'sub_typ': u'np_complex', + u'dtype': u(obj.dtype.name), + u'real': u(obj.real.__repr__()), + u'imag': u(obj.imag.__repr__())} else: - return {'typ': 'np_scalar', - 'dtype': obj.dtype.name, - 'data': obj.__repr__()} + return {u'typ': u'np_scalar', + u'dtype': u(obj.dtype.name), + u'data': u(obj.__repr__())} elif isinstance(obj, complex): - return {'typ': 'np_complex', - 'real': obj.real.__repr__(), - 'imag': obj.imag.__repr__()} + return {u'typ': u'np_complex', + u'real': u(obj.real.__repr__()), + u'imag': u(obj.imag.__repr__())} return obj @@ -460,83 +462,91 @@ def decode(obj): Decoder for deserializing numpy data types. """ - typ = obj.get('typ') + typ = obj.get(u'typ') if typ is None: return obj - elif typ == 'timestamp': - return Timestamp(obj['value'], tz=obj['tz'], offset=obj['offset']) - elif typ == 'nat': + elif typ == u'timestamp': + return Timestamp(obj[u'value'], tz=obj[u'tz'], offset=obj[u'offset']) + elif typ == u'nat': return NaT - elif typ == 'period': - return Period(ordinal=obj['ordinal'], freq=obj['freq']) - elif typ == 'index': - dtype = dtype_for(obj['dtype']) - data = unconvert(obj['data'], dtype, - obj.get('compress')) - return globals()[obj['klass']](data, dtype=dtype, name=obj['name']) - elif typ == 'range_index': - return globals()[obj['klass']](obj['start'], - obj['stop'], - obj['step'], - name=obj['name']) - elif typ == 'multi_index': - dtype = dtype_for(obj['dtype']) - data = unconvert(obj['data'], dtype, - obj.get('compress')) + elif typ == u'period': + return Period(ordinal=obj[u'ordinal'], freq=obj[u'freq']) + elif typ == u'index': + dtype = dtype_for(obj[u'dtype']) + data = unconvert(obj[u'data'], dtype, + obj.get(u'compress')) + return globals()[obj[u'klass']](data, dtype=dtype, name=obj[u'name']) + elif typ == u'range_index': + return globals()[obj[u'klass']](obj[u'start'], + obj[u'stop'], + obj[u'step'], + name=obj[u'name']) + elif typ == u'multi_index': + dtype = dtype_for(obj[u'dtype']) + data = unconvert(obj[u'data'], dtype, + obj.get(u'compress')) data = [tuple(x) for x in data] - return globals()[obj['klass']].from_tuples(data, names=obj['names']) - elif typ == 'period_index': - data = unconvert(obj['data'], np.int64, obj.get('compress')) - d = dict(name=obj['name'], freq=obj['freq']) - return globals()[obj['klass']](data, **d) - elif typ == 'datetime_index': - data = unconvert(obj['data'], np.int64, obj.get('compress')) - d = dict(name=obj['name'], freq=obj['freq'], 
verify_integrity=False) - result = globals()[obj['klass']](data, **d) - tz = obj['tz'] + return globals()[obj[u'klass']].from_tuples(data, names=obj[u'names']) + elif typ == u'period_index': + data = unconvert(obj[u'data'], np.int64, obj.get(u'compress')) + d = dict(name=obj[u'name'], freq=obj[u'freq']) + return globals()[obj[u'klass']](data, **d) + elif typ == u'datetime_index': + data = unconvert(obj[u'data'], np.int64, obj.get(u'compress')) + d = dict(name=obj[u'name'], freq=obj[u'freq'], verify_integrity=False) + result = globals()[obj[u'klass']](data, **d) + tz = obj[u'tz'] # reverse tz conversion if tz is not None: result = result.tz_localize('UTC').tz_convert(tz) return result - elif typ == 'series': - dtype = dtype_for(obj['dtype']) - index = obj['index'] - return globals()[obj['klass']](unconvert(obj['data'], dtype, - obj['compress']), - index=index, - dtype=dtype, - name=obj['name']) - elif typ == 'block_manager': - axes = obj['axes'] + elif typ == u'series': + dtype = dtype_for(obj[u'dtype']) + pd_dtype = pandas_dtype(dtype) + np_dtype = pandas_dtype(dtype).base + index = obj[u'index'] + result = globals()[obj[u'klass']](unconvert(obj[u'data'], dtype, + obj[u'compress']), + index=index, + dtype=np_dtype, + name=obj[u'name']) + tz = getattr(pd_dtype, 'tz', None) + if tz: + result = result.dt.tz_localize('UTC').dt.tz_convert(tz) + return result + + elif typ == u'block_manager': + axes = obj[u'axes'] def create_block(b): - values = unconvert(b['values'], dtype_for(b['dtype']), - b['compress']).reshape(b['shape']) + values = unconvert(b[u'values'], dtype_for(b[u'dtype']), + b[u'compress']).reshape(b[u'shape']) # locs handles duplicate column names, and should be used instead # of items; see GH 9618 - if 'locs' in b: - placement = b['locs'] + if u'locs' in b: + placement = b[u'locs'] else: - placement = axes[0].get_indexer(b['items']) + placement = axes[0].get_indexer(b[u'items']) return make_block(values=values, - klass=getattr(internals, b['klass']), - placement=placement) - - blocks = [create_block(b) for b in obj['blocks']] - return globals()[obj['klass']](BlockManager(blocks, axes)) - elif typ == 'datetime': - return parse(obj['data']) - elif typ == 'datetime64': - return np.datetime64(parse(obj['data'])) - elif typ == 'date': - return parse(obj['data']).date() - elif typ == 'timedelta': - return timedelta(*obj['data']) - elif typ == 'timedelta64': - return np.timedelta64(int(obj['data'])) + klass=getattr(internals, b[u'klass']), + placement=placement, + dtype=b[u'dtype']) + + blocks = [create_block(b) for b in obj[u'blocks']] + return globals()[obj[u'klass']](BlockManager(blocks, axes)) + elif typ == u'datetime': + return parse(obj[u'data']) + elif typ == u'datetime64': + return np.datetime64(parse(obj[u'data'])) + elif typ == u'date': + return parse(obj[u'data']).date() + elif typ == u'timedelta': + return timedelta(*obj[u'data']) + elif typ == u'timedelta64': + return np.timedelta64(int(obj[u'data'])) # elif typ == 'sparse_series': # dtype = dtype_for(obj['dtype']) # return globals()[obj['klass']]( @@ -554,25 +564,25 @@ def create_block(b): # obj['data'], items=obj['items'], # default_fill_value=obj['default_fill_value'], # default_kind=obj['default_kind']) - elif typ == 'block_index': - return globals()[obj['klass']](obj['length'], obj['blocs'], - obj['blengths']) - elif typ == 'int_index': - return globals()[obj['klass']](obj['length'], obj['indices']) - elif typ == 'ndarray': - return unconvert(obj['data'], np.typeDict[obj['dtype']], - 
obj.get('compress')).reshape(obj['shape']) - elif typ == 'np_scalar': - if obj.get('sub_typ') == 'np_complex': - return c2f(obj['real'], obj['imag'], obj['dtype']) + elif typ == u'block_index': + return globals()[obj[u'klass']](obj[u'length'], obj[u'blocs'], + obj[u'blengths']) + elif typ == u'int_index': + return globals()[obj[u'klass']](obj[u'length'], obj[u'indices']) + elif typ == u'ndarray': + return unconvert(obj[u'data'], np.typeDict[obj[u'dtype']], + obj.get(u'compress')).reshape(obj[u'shape']) + elif typ == u'np_scalar': + if obj.get(u'sub_typ') == u'np_complex': + return c2f(obj[u'real'], obj[u'imag'], obj[u'dtype']) else: - dtype = dtype_for(obj['dtype']) + dtype = dtype_for(obj[u'dtype']) try: - return dtype(obj['data']) + return dtype(obj[u'data']) except: - return dtype.type(obj['data']) - elif typ == 'np_complex': - return complex(obj['real'] + '+' + obj['imag'] + 'j') + return dtype.type(obj[u'data']) + elif typ == u'np_complex': + return complex(obj[u'real'] + u'+' + obj[u'imag'] + u'j') elif isinstance(obj, (dict, list, set)): return obj else: diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack b/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack new file mode 100644 index 0000000000000000000000000000000000000000..e89b5dd99150e04a0702cfb6f68b8f5f8b243c27 GIT binary patch literal 10307 zcmeHNOK%%h7#%;xc{Qks5~&LoC_+%75YaRZP+n;oRf1 zXYTzT=X*?6n4Zce&4nv0o-`cuW7a-vWpa7bwzpW+S=?g!g|uPYtb2mrby68~Qv5a3 zlg_1_$sQ4N#OUbAJ6Zc(7N5(T7g=P2#rW;w-D&v>`37TqO4GEPckd)__i9@IqV<-* zWLNx4fnR-b{V##hp{Gy%#PRiSzWq$#=G5Ci3;grf-v@U&zHBdjDDcyG-}k${ES@zo z<`#>kvd#)FXoY-t&ge+#uEajoD)>V9FQPz;62=JQggRk@u#K>tu!9hi>YNmBRAQ21 zPy%L&oS9Eq$<@FMjMIvA!USO(VLM?5VJBf1;UU6qLM*UrN-jjLkTiByY3z_%Bk8Nn zo;A`&-oDH_gpO^;$mEulRmhDMN16VLu_7+vd`iA>6~eMYWZCuh5pp6t)u{9%H(kS==Sbw}#X0mm!&^=5;z(3_i{jN}nOO zdw+@BJhx!w#|N4rLy4`-4Fr3uAI#oQg@Ji9mYmN)x?X1RbsH8r#UkS@a#}Je8<*6FIIKO~Vc`9N7`9f!)5Tehe^0p&_;;fZTa$?W((6S=K^)gG$ zq^;Qt_L@+mV`glxIs5IcK6yn9#fr@{!SY z=C%UB(OlOR&an8(R&mFuVFwO{Lp=98JwH9S1e#y}NrRw(&kE#1Si-2FA5m{H6 zy%fXE|HEQ9O}ge%sFz&Us>l0~%(AQ0#E)QF4x(^|S zaXhv{+7ns~AiFSPz+4ceUt0g45oXQmtG*aKV=OY=IiEK<>5)Ri1leG;D)M62OX?26 z>&TF+<;Ee6Wc!HgEMASnYGRe0CsuTx1@{^11l8q{F?Bsac$5&%1wwt4#vdnqicll$ zBYceT3Bti3&dBNu-9d!ive8=>EUuwl3&BH8FtKLRaiCBfLJTcS#~S_Mthfe=`Ot9) z5lrBbjf_Z$BH)%_f%%Wkzd!Hm(X^ApSAP-s$#(-o{T%n+{Q4t-PyY7mPJa*9>-M+H z@-k077(yVY@q$JQ1mQKXKsI=1-m;F#L-@c$UgSMWUmqy@Bv3~yWY=w)2c`kp8E64s%J7w^-Tt<<}wCW6mb&MJKZ zeM&eDtB|OK=%?wEyt&^5Iu2UItw&1Sr1*tVlRvl#5bgqm+W_G{Xv{XL+H0Q7P^BSs zRW5gYWQNJWs1l-k#5mCJKJpcx(opqB0=sR2L0mN~Eubqk?h}Xig?_|S`-@g>XI1|+ z;dWZ;Zt0avu?bmqC5Ec@b!|Ob;F;=!XBlah$hQ-$=P@nRZ1*m%O5@6G%DnAw?Kc6Vk1G+__( z&HsJB_rCXizxUou7geGpi8_;yk7Z_zxOO>1785C1)OvgJ^Z+j38kMDtLR59tNT%at ze45``lt5Z9OKvo*c{`pSkP1`GyVIWOZ*8h@z=B{L`zDfx98oo`d%TSkW0A%7QptE zVKYK!B)13PK`WxQAzG?C_wO~{Z54#}3Hfb?@gofdhTnX7<6nm1u4npwrTE5o-+#_< ze(0Uw82K2zUsv3Ggr= z)S{I1)AB3QSV~KlueIfK?3z_&b9yMEhZ^*7L=V$#)L=|d^}L)WdF?|v2uj2?2xhxM zc~e(sv&EtEWL_R*9J7M4(Y0EGRBThm_}VB0uQ2;&-OaUEp#jfKsMMLaCLuH_IZomb znoU`DmY_X|Y;GOH3QZXwh^aC1OUu;FpMJh32#jP)OkPw?lrlv*kx+@Y%6fG^AXC{YdVWMym@@DkDi3UD0?$k^ z!O&T?@-uA5JacJmoS$JkOjNkW7>T5!YP5Ps<#gQ0h-s3#YM8xgip!2M#Llr6ygibV zM=uUbVp=>;l%jd0W*wc*GKiwd0d;NOQ5{HlL?H$@0HT{AHpBU?Nq!$&75`vd`T1b2=C33-MJJpBK|{QBlOpR&^E9@uUXJ5unX{6z!QMofF}Wa08at-0zL=$Jm3p} zrvYCCd%3ZS(bU+Vq4=*9EUCLv0QLZI{?u=&@DP*`~tcG_BPB&&`r=i&@HexqWc_!_n^CUK8&iaN1`V2bPw6U zo`^b4+-oOGbqQueTl){mfQ^mhX%4V)vpBir!LBRGsve3~QB^Ba)F4z#=fq54YKgg` 
zr#P{crAkTT^vqY?k{&nP>lyXU)i&y*<3K~Zpb%1SaOcV6ci5(kg>Ih~tcW%3-}C={(bgger@OEH&hXP8I=b2^Zoc*H#|)qP z^R@N%79Q~ST(HlSCpX|D2SjOcbz+DbG6z?*Hw$#80z5wzNfW#?X0|R zE(-Xo8=bBccprIf3MjxhBM!9$^9pq4*|XHS)-iS~BP^M$szK7KPgq%_fohVr^L0$` zf|Nd+qj>N*_dqi^0@zBnYfW#Ck84lP4%|###UZcd+fMg#9v`@wpBdrop19c@bFQ*( z7N8krb7y=QHCmC&s6AWIb#r2pE~WEQ?Uo9$^{U!62w5` z>(1k#`mask53FZV9TiieqF%9GyJoabNrJQYZCq|e!2Xo7%M~Ov$oF{hJ*&_LQ%j*z(asd TfQJDe0XzcuDBxp&T#x<-G>Dtb literal 0 HcmV?d00001 diff --git a/pandas/io/tests/generate_legacy_storage_files.py b/pandas/io/tests/generate_legacy_storage_files.py index f556c980bb80c..bfa8ff6d30a9c 100644 --- a/pandas/io/tests/generate_legacy_storage_files.py +++ b/pandas/io/tests/generate_legacy_storage_files.py @@ -6,6 +6,7 @@ Index, MultiIndex, bdate_range, to_msgpack, date_range, period_range, Timestamp, Categorical, Period) +from pandas.compat import u import os import sys import numpy as np @@ -13,6 +14,9 @@ import platform as pl +_loose_version = LooseVersion(pandas.__version__) + + def _create_sp_series(): nan = np.nan @@ -22,7 +26,7 @@ def _create_sp_series(): arr[-1:] = nan bseries = SparseSeries(arr, kind='block') - bseries.name = 'bseries' + bseries.name = u'bseries' return bseries @@ -36,17 +40,17 @@ def _create_sp_tsseries(): date_index = bdate_range('1/1/2011', periods=len(arr)) bseries = SparseSeries(arr, index=date_index, kind='block') - bseries.name = 'btsseries' + bseries.name = u'btsseries' return bseries def _create_sp_frame(): nan = np.nan - data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10).astype(np.int64), - 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} + data = {u'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + u'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + u'C': np.arange(10).astype(np.int64), + u'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} dates = bdate_range('1/1/2011', periods=10) return SparseDataFrame(data, index=dates) @@ -56,79 +60,79 @@ def create_data(): """ create the pickle/msgpack data """ data = { - 'A': [0., 1., 2., 3., np.nan], - 'B': [0, 1, 0, 1, 0], - 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': date_range('1/1/2009', periods=5), - 'E': [0., 1, Timestamp('20100101'), 'foo', 2.] + u'A': [0., 1., 2., 3., np.nan], + u'B': [0, 1, 0, 1, 0], + u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'], + u'D': date_range('1/1/2009', periods=5), + u'E': [0., 1, Timestamp('20100101'), u'foo', 2.] 
} - scalars = dict(timestamp=Timestamp('20130101')) - if LooseVersion(pandas.__version__) >= '0.17.0': - scalars['period'] = Period('2012', 'M') + scalars = dict(timestamp=Timestamp('20130101'), + period=Period('2012', 'M')) index = dict(int=Index(np.arange(10)), date=date_range('20130101', periods=10), period=period_range('2013-01-01', freq='M', periods=10)) mi = dict(reg2=MultiIndex.from_tuples( - tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', - 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', - 'two', 'one', 'two']])), - names=['first', 'second'])) - series = dict(float=Series(data['A']), - int=Series(data['B']), - mixed=Series(data['E']), + tuple(zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo', + u'foo', u'qux', u'qux'], + [u'one', u'two', u'one', u'two', u'one', + u'two', u'one', u'two']])), + names=[u'first', u'second'])) + series = dict(float=Series(data[u'A']), + int=Series(data[u'B']), + mixed=Series(data[u'E']), ts=Series(np.arange(10).astype(np.int64), index=date_range('20130101', periods=10)), mi=Series(np.arange(5).astype(np.float64), index=MultiIndex.from_tuples( tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), - names=['one', 'two'])), + names=[u'one', u'two'])), dup=Series(np.arange(5).astype(np.float64), - index=['A', 'B', 'C', 'D', 'A']), - cat=Series(Categorical(['foo', 'bar', 'baz'])), + index=[u'A', u'B', u'C', u'D', u'A']), + cat=Series(Categorical([u'foo', u'bar', u'baz'])), dt=Series(date_range('20130101', periods=5)), dt_tz=Series(date_range('20130101', periods=5, - tz='US/Eastern'))) - if LooseVersion(pandas.__version__) >= '0.17.0': - series['period'] = Series([Period('2000Q1')] * 5) + tz='US/Eastern')), + period=Series([Period('2000Q1')] * 5)) mixed_dup_df = DataFrame(data) - mixed_dup_df.columns = list("ABCDA") - frame = dict(float=DataFrame(dict(A=series['float'], - B=series['float'] + 1)), - int=DataFrame(dict(A=series['int'], B=series['int'] + 1)), - mixed=DataFrame(dict([(k, data[k]) - for k in ['A', 'B', 'C', 'D']])), - mi=DataFrame(dict(A=np.arange(5).astype(np.float64), - B=np.arange(5).astype(np.int64)), + mixed_dup_df.columns = list(u"ABCDA") + frame = dict(float=DataFrame({u'A': series[u'float'], + u'B': series[u'float'] + 1}), + int=DataFrame({u'A': series[u'int'], + u'B': series[u'int'] + 1}), + mixed=DataFrame({k: data[k] + for k in [u'A', u'B', u'C', u'D']}), + mi=DataFrame({u'A': np.arange(5).astype(np.float64), + u'B': np.arange(5).astype(np.int64)}, index=MultiIndex.from_tuples( - tuple(zip(*[['bar', 'bar', 'baz', - 'baz', 'baz'], - ['one', 'two', 'one', - 'two', 'three']])), - names=['first', 'second'])), + tuple(zip(*[[u'bar', u'bar', u'baz', + u'baz', u'baz'], + [u'one', u'two', u'one', + u'two', u'three']])), + names=[u'first', u'second'])), dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), - columns=['A', 'B', 'A']), - cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))), - cat_and_float=DataFrame(dict( - A=Categorical(['foo', 'bar', 'baz']), - B=np.arange(3).astype(np.int64))), + columns=[u'A', u'B', u'A']), + cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}), + cat_and_float=DataFrame({ + u'A': Categorical([u'foo', u'bar', u'baz']), + u'B': np.arange(3).astype(np.int64)}), mixed_dup=mixed_dup_df, - dt_mixed_tzs=DataFrame(dict( - A=Timestamp('20130102', tz='US/Eastern'), - B=Timestamp('20130603', tz='CET')), index=range(5)), + dt_mixed_tzs=DataFrame({ + u'A': Timestamp('20130102', tz='US/Eastern'), + u'B': Timestamp('20130603', tz='CET')}, index=range(5)) ) - mixed_dup_panel = 
Panel(dict(ItemA=frame['float'], ItemB=frame['int']))
-    mixed_dup_panel.items = ['ItemA', 'ItemA']
-    panel = dict(float=Panel(dict(ItemA=frame['float'],
-                                  ItemB=frame['float'] + 1)),
+    mixed_dup_panel = Panel({u'ItemA': frame[u'float'],
+                             u'ItemB': frame[u'int']})
+    mixed_dup_panel.items = [u'ItemA', u'ItemA']
+    panel = dict(float=Panel({u'ItemA': frame[u'float'],
+                              u'ItemB': frame[u'float'] + 1}),
                  dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
-                           items=['A', 'B', 'A']),
+                           items=[u'A', u'B', u'A']),
                  mixed_dup=mixed_dup_panel)
 
     return dict(series=series,
@@ -147,26 +151,38 @@ def create_pickle_data():
 
     # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and
     # panels if their columns/items were non-unique.
-    if LooseVersion(pandas.__version__) < '0.14.1':
+    if _loose_version < '0.14.1':
         del data['frame']['mixed_dup']
         del data['panel']['mixed_dup']
+    if _loose_version < '0.17.0':
+        del data['series']['period']
+        del data['scalars']['period']
     return data
 
 
+def _u(x):
+    return {u(k): _u(x[k]) for k in x} if isinstance(x, dict) else x
+
+
 def create_msgpack_data():
     data = create_data()
-    if LooseVersion(pandas.__version__) < '0.17.0':
+    if _loose_version < '0.17.0':
         del data['frame']['mixed_dup']
         del data['panel']['mixed_dup']
         del data['frame']['dup']
         del data['panel']['dup']
+    if _loose_version < '0.18.0':
+        del data['series']['dt_tz']
+        del data['frame']['dt_mixed_tzs']
     # Not supported
     del data['sp_series']
     del data['sp_frame']
     del data['series']['cat']
+    del data['series']['period']
     del data['frame']['cat_onecol']
     del data['frame']['cat_and_float']
-    return data
+    del data['scalars']['period']
+    return _u(data)
 
 
 def platform_name():
@@ -199,7 +215,7 @@ def write_legacy_pickles(output_dir):
     print("created pickle file: %s" % pth)
 
 
-def write_legacy_msgpack(output_dir):
+def write_legacy_msgpack(output_dir, compress):
 
     version = pandas.__version__
 
@@ -208,9 +224,9 @@ def write_legacy_msgpack(output_dir):
     print("  pandas version: {0}".format(version))
     print("  output dir    : {0}".format(output_dir))
     print("  storage format: msgpack")
     pth = '{0}.msgpack'.format(platform_name())
-    to_msgpack(os.path.join(output_dir, pth), create_msgpack_data())
+    to_msgpack(os.path.join(output_dir, pth), create_msgpack_data(),
+               compress=compress)
 
     print("created msgpack file: %s" % pth)
 
@@ -219,17 +235,22 @@ def write_legacy_file():
     # force our cwd to be the first searched
     sys.path.insert(0, '.')
 
-    if len(sys.argv) != 3:
+    if not (3 <= len(sys.argv) <= 4):
         exit("Specify output directory and storage type: generate_legacy_"
-             "storage_files.py <output_dir> <storage_type>")
+             "storage_files.py <output_dir> <storage_type> "
+             "<msgpack_compress_type>")
 
     output_dir = str(sys.argv[1])
     storage_type = str(sys.argv[2])
+    try:
+        compress_type = str(sys.argv[3])
+    except IndexError:
+        compress_type = None
 
     if storage_type == 'pickle':
         write_legacy_pickles(output_dir=output_dir)
     elif storage_type == 'msgpack':
-        write_legacy_msgpack(output_dir=output_dir)
+        write_legacy_msgpack(output_dir=output_dir, compress=compress_type)
     else:
         exit("storage_type must be one of {'pickle', 'msgpack'}")
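(For orientation, not a hunk of this patch: with the optional third argument above, the script is run as ``python generate_legacy_storage_files.py <output_dir> <storage_type> <msgpack_compress_type>``. Driving the writer directly looks like this -- a sketch that assumes the module is importable from the 0.18-era test layout:)

```python
import tempfile
from pandas.io.tests.generate_legacy_storage_files import write_legacy_msgpack

out = tempfile.mkdtemp()
# compress may be None, 'zlib' or 'blosc'; it is forwarded to to_msgpack
write_legacy_msgpack(output_dir=out, compress='zlib')
```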
diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py
index d1c05069b4172..d0e7d00d79cb0 100644
--- a/pandas/io/tests/test_packers.py
+++ b/pandas/io/tests/test_packers.py
@@ -331,11 +331,16 @@ def setUp(self):
             'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
             'D': date_range('1/1/2009', periods=5),
             'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
+            'F': [Timestamp('20130102', tz='US/Eastern')] * 2 +
+                 [Timestamp('20130603', tz='CET')] * 3,
+            'G': [Timestamp('20130102', tz='US/Eastern')] * 5
         }
 
         self.d['float'] = Series(data['A'])
         self.d['int'] = Series(data['B'])
         self.d['mixed'] = Series(data['E'])
+        self.d['dt_tz_mixed'] = Series(data['F'])
+        self.d['dt_tz'] = Series(data['G'])
 
     def test_basic(self):
 
@@ -357,13 +362,14 @@ def setUp(self):
             'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
             'D': date_range('1/1/2009', periods=5),
             'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
+            'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
+            'G': [Timestamp('20130603', tz='CET')] * 5
         }
 
         self.frame = {
             'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)),
             'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)),
-            'mixed': DataFrame(dict([(k, data[k])
-                                     for k in ['A', 'B', 'C', 'D']]))}
+            'mixed': DataFrame(data)}
 
         self.panel = {
             'float': Panel(dict(ItemA=self.frame['float'],
@@ -713,6 +719,11 @@ def read_msgpacks(self, version):
         pth = tm.get_data_path('legacy_msgpack/{0}'.format(str(version)))
         n = 0
         for f in os.listdir(pth):
+            # GH12142 0.17 files packed in P2 can't be read in P3
+            if (compat.PY3 and
+                    version.startswith('0.17.') and
+                    f.split('.')[-4][-1] == '2'):
+                continue
             vf = os.path.join(pth, f)
             self.compare(vf, version)
             n += 1
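(For orientation, not a hunk of this patch: the Python-2 skip added to ``read_msgpacks`` relies on the legacy filename layout, in which the fourth-from-last dot-separated piece ends with the major version of the Python that packed the file. A worked example of that parsing:)

```python
f = '0.17.1_x86_64_linux_2.7.11.msgpack'

parts = f.split('.')
# ['0', '17', '1_x86_64_linux_2', '7', '11', 'msgpack']

# parts[-4] is '1_x86_64_linux_2'; its last character is the major
# Python version used for packing
assert parts[-4][-1] == '2'  # packed under Python 2 -> skipped on Python 3
```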