diff --git a/doc/source/io.rst b/doc/source/io.rst index e2f2301beb078..459d79ec4d98c 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2539,6 +2539,24 @@ both on the writing (serialization), and reading (deserialization). optimizations in the io of the ``msgpack`` data. Since this is marked as an EXPERIMENTAL LIBRARY, the storage format may not be stable until a future release. + As a result of writing format changes and other issues: + +----------------------+------------------------+ + | Packed with | Can be unpacked with | + +======================+========================+ + | pre-0.17 / Python 2 | any | + +----------------------+------------------------+ + | pre-0.17 / Python 3 | any | + +----------------------+------------------------+ + | 0.17 / Python 2 | - 0.17 / Python 2 | + | | - >=0.18 / any Python | + +----------------------+------------------------+ + | 0.17 / Python 3 | >=0.18 / any Python | + +----------------------+------------------------+ + | 0.18 | >= 0.18 | + +----------------------+------------------------+ + + Reading (files packed by older versions) is backward-compatible, except for files packed with 0.17 in Python 2, in which case they can only be unpacked in Python 2. + .. ipython:: python df = DataFrame(np.random.rand(5,2),columns=list('AB')) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 8429739902927..47e78cf558a16 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -513,6 +513,33 @@ Subtraction by ``Timedelta`` in a ``Series`` by a ``Timestamp`` works (:issue:`1 ``pd.Timestamp`` to rehydrate any timestamp like object from its isoformat (:issue:`12300`). 
+Changes to msgpack +^^^^^^^^^^^^^^^^^^ + +Forward incompatible changes in ``msgpack`` writing format were made over 0.17.0 and 0.18.0; older versions of pandas cannot read files packed by newer versions (:issue:`12129`, :issue:`10527`) + +Bug in ``to_msgpack`` and ``read_msgpack`` introduced in 0.17.0 and fixed in 0.18.0, caused files packed in Python 2 to be unreadable by Python 3 (:issue:`12142`) + +.. warning:: + + As a result of a number of issues: + + +----------------------+------------------------+ + | Packed with | Can be unpacked with | + +======================+========================+ + | pre-0.17 / Python 2 | any | + +----------------------+------------------------+ + | pre-0.17 / Python 3 | any | + +----------------------+------------------------+ + | 0.17 / Python 2 | - 0.17 / Python 2 | + | | - >=0.18 / any Python | + +----------------------+------------------------+ + | 0.17 / Python 3 | >=0.18 / any Python | + +----------------------+------------------------+ + | 0.18 | >= 0.18 | + +----------------------+------------------------+ + + 0.18.0 is backward-compatible for reading files packed by older versions, except for files packed with 0.17 in Python 2, in which case they can only be unpacked in Python 2. Signature change for .rank ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -806,7 +833,6 @@ assignments are valid for multi-line expressions. Other API Changes ^^^^^^^^^^^^^^^^^ - - ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`) .. 
ipython:: python diff --git a/pandas/core/common.py b/pandas/core/common.py index 70c02c5632d80..4f3ec58910950 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -3039,3 +3039,29 @@ def _random_state(state=None): else: raise ValueError("random_state must be an integer, a numpy " "RandomState, or None") + + +def pandas_dtype(dtype): + """ + Converts input into a pandas only dtype object or a numpy dtype object. + + Parameters + ---------- + dtype : object to be converted + + Returns + ------- + np.dtype or a pandas dtype + """ + if isinstance(dtype, compat.string_types): + try: + return DatetimeTZDtype.construct_from_string(dtype) + except TypeError: + pass + + try: + return CategoricalDtype.construct_from_string(dtype) + except TypeError: + pass + + return np.dtype(dtype) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 8973ea025e611..c6b04757e201c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2098,6 +2098,14 @@ def __init__(self, values, placement, ndim=2, **kwargs): if not isinstance(values, self._holder): values = self._holder(values) + + dtype = kwargs.pop('dtype', None) + + if dtype is not None: + if isinstance(dtype, compat.string_types): + dtype = DatetimeTZDtype.construct_from_string(dtype) + values = values.tz_localize('UTC').tz_convert(dtype.tz) + if values.tz is None: raise ValueError("cannot create a DatetimeTZBlock without a tz") @@ -2428,6 +2436,10 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, else: klass = ObjectBlock + elif klass is DatetimeTZBlock and not is_datetimetz(values): + return klass(values, ndim=ndim, fastpath=fastpath, + placement=placement, dtype=dtype) + return klass(values, ndim=ndim, fastpath=fastpath, placement=placement) # TODO: flexible with index=None and/or items=None diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 372c8d80e5a1a..701b78d2771fb 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -44,7 +44,7 @@ 
import numpy as np from pandas import compat -from pandas.compat import u +from pandas.compat import u, u_safe from pandas import (Timestamp, Period, Series, DataFrame, # noqa Index, MultiIndex, Float64Index, Int64Index, Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT) @@ -52,7 +52,7 @@ from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame -from pandas.core.common import needs_i8_conversion +from pandas.core.common import needs_i8_conversion, pandas_dtype from pandas.io.common import get_filepath_or_buffer from pandas.core.internals import BlockManager, make_block import pandas.core.internals as internals @@ -84,6 +84,8 @@ def to_msgpack(path_or_buf, *args, **kwargs): """ global compressor compressor = kwargs.pop('compress', None) + if compressor: + compressor = u(compressor) append = kwargs.pop('append', None) if append: mode = 'a+b' @@ -180,7 +182,7 @@ def dtype_for(t): """ return my dtype mapping, whether number or name """ if t in dtype_dict: return dtype_dict[t] - return np.typeDict[t] + return np.typeDict.get(t, t) c2f_dict = {'complex': np.float64, 'complex128': np.float64, @@ -248,15 +250,17 @@ def unconvert(values, dtype, compress=None): if dtype == np.object_: return np.array(values, dtype=object) + dtype = pandas_dtype(dtype).base + if not as_is_ext: values = values.encode('latin1') - if compress == 'zlib': + if compress == u'zlib': import zlib values = zlib.decompress(values) return np.frombuffer(values, dtype=dtype) - elif compress == 'blosc': + elif compress == u'blosc': import blosc values = blosc.decompress(values) return np.frombuffer(values, dtype=dtype) @@ -269,53 +273,52 @@ def encode(obj): """ Data encoder """ - tobj = type(obj) if isinstance(obj, Index): if isinstance(obj, RangeIndex): - return {'typ': 'range_index', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'start': getattr(obj, '_start', None), - 
'stop': getattr(obj, '_stop', None), - 'step': getattr(obj, '_step', None)} + return {u'typ': u'range_index', + u'klass': u(obj.__class__.__name__), + u'name': getattr(obj, 'name', None), + u'start': getattr(obj, '_start', None), + u'stop': getattr(obj, '_stop', None), + u'step': getattr(obj, '_step', None)} elif isinstance(obj, PeriodIndex): - return {'typ': 'period_index', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'freq': getattr(obj, 'freqstr', None), - 'dtype': obj.dtype.name, - 'data': convert(obj.asi8), - 'compress': compressor} + return {u'typ': u'period_index', + u'klass': u(obj.__class__.__name__), + u'name': getattr(obj, 'name', None), + u'freq': u_safe(getattr(obj, 'freqstr', None)), + u'dtype': u(obj.dtype.name), + u'data': convert(obj.asi8), + u'compress': compressor} elif isinstance(obj, DatetimeIndex): tz = getattr(obj, 'tz', None) # store tz info and data as UTC if tz is not None: - tz = tz.zone + tz = u(tz.zone) obj = obj.tz_convert('UTC') - return {'typ': 'datetime_index', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'dtype': obj.dtype.name, - 'data': convert(obj.asi8), - 'freq': getattr(obj, 'freqstr', None), - 'tz': tz, - 'compress': compressor} + return {u'typ': u'datetime_index', + u'klass': u(obj.__class__.__name__), + u'name': getattr(obj, 'name', None), + u'dtype': u(obj.dtype.name), + u'data': convert(obj.asi8), + u'freq': u_safe(getattr(obj, 'freqstr', None)), + u'tz': tz, + u'compress': compressor} elif isinstance(obj, MultiIndex): - return {'typ': 'multi_index', - 'klass': obj.__class__.__name__, - 'names': getattr(obj, 'names', None), - 'dtype': obj.dtype.name, - 'data': convert(obj.values), - 'compress': compressor} + return {u'typ': u'multi_index', + u'klass': u(obj.__class__.__name__), + u'names': getattr(obj, 'names', None), + u'dtype': u(obj.dtype.name), + u'data': convert(obj.values), + u'compress': compressor} else: - return {'typ': 'index', - 'klass': 
obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'dtype': obj.dtype.name, - 'data': convert(obj.values), - 'compress': compressor} + return {u'typ': u'index', + u'klass': u(obj.__class__.__name__), + u'name': getattr(obj, 'name', None), + u'dtype': u(obj.dtype.name), + u'data': convert(obj.values), + u'compress': compressor} elif isinstance(obj, Series): if isinstance(obj, SparseSeries): raise NotImplementedError( @@ -332,13 +335,13 @@ def encode(obj): # d[f] = getattr(obj, f, None) # return d else: - return {'typ': 'series', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'index': obj.index, - 'dtype': obj.dtype.name, - 'data': convert(obj.values), - 'compress': compressor} + return {u'typ': u'series', + u'klass': u(obj.__class__.__name__), + u'name': getattr(obj, 'name', None), + u'index': obj.index, + u'dtype': u(obj.dtype.name), + u'data': convert(obj.values), + u'compress': compressor} elif issubclass(tobj, NDFrame): if isinstance(obj, SparseDataFrame): raise NotImplementedError( @@ -371,86 +374,85 @@ def encode(obj): data = data.consolidate() # the block manager - return {'typ': 'block_manager', - 'klass': obj.__class__.__name__, - 'axes': data.axes, - 'blocks': [{'items': data.items.take(b.mgr_locs), - 'locs': b.mgr_locs.as_array, - 'values': convert(b.values), - 'shape': b.values.shape, - 'dtype': b.dtype.name, - 'klass': b.__class__.__name__, - 'compress': compressor - } for b in data.blocks]} + return {u'typ': u'block_manager', + u'klass': u(obj.__class__.__name__), + u'axes': data.axes, + u'blocks': [{u'locs': b.mgr_locs.as_array, + u'values': convert(b.values), + u'shape': b.values.shape, + u'dtype': u(b.dtype.name), + u'klass': u(b.__class__.__name__), + u'compress': compressor} for b in data.blocks] + } elif isinstance(obj, (datetime, date, np.datetime64, timedelta, np.timedelta64, NaTType)): if isinstance(obj, Timestamp): tz = obj.tzinfo if tz is not None: - tz = tz.zone + tz = u(tz.zone) offset = obj.offset 
if offset is not None: - offset = offset.freqstr - return {'typ': 'timestamp', - 'value': obj.value, - 'offset': offset, - 'tz': tz} + offset = u(offset.freqstr) + return {u'typ': u'timestamp', + u'value': obj.value, + u'offset': offset, + u'tz': tz} if isinstance(obj, NaTType): - return {'typ': 'nat'} + return {u'typ': u'nat'} elif isinstance(obj, np.timedelta64): - return {'typ': 'timedelta64', - 'data': obj.view('i8')} + return {u'typ': u'timedelta64', + u'data': obj.view('i8')} elif isinstance(obj, timedelta): - return {'typ': 'timedelta', - 'data': (obj.days, obj.seconds, obj.microseconds)} + return {u'typ': u'timedelta', + u'data': (obj.days, obj.seconds, obj.microseconds)} elif isinstance(obj, np.datetime64): - return {'typ': 'datetime64', - 'data': str(obj)} + return {u'typ': u'datetime64', + u'data': u(str(obj))} elif isinstance(obj, datetime): - return {'typ': 'datetime', - 'data': obj.isoformat()} + return {u'typ': u'datetime', + u'data': u(obj.isoformat())} elif isinstance(obj, date): - return {'typ': 'date', - 'data': obj.isoformat()} + return {u'typ': u'date', + u'data': u(obj.isoformat())} raise Exception("cannot encode this datetimelike object: %s" % obj) elif isinstance(obj, Period): - return {'typ': 'period', - 'ordinal': obj.ordinal, - 'freq': obj.freq} + return {u'typ': u'period', + u'ordinal': obj.ordinal, + u'freq': u(obj.freq)} elif isinstance(obj, BlockIndex): - return {'typ': 'block_index', - 'klass': obj.__class__.__name__, - 'blocs': obj.blocs, - 'blengths': obj.blengths, - 'length': obj.length} + return {u'typ': u'block_index', + u'klass': u(obj.__class__.__name__), + u'blocs': obj.blocs, + u'blengths': obj.blengths, + u'length': obj.length} elif isinstance(obj, IntIndex): - return {'typ': 'int_index', - 'klass': obj.__class__.__name__, - 'indices': obj.indices, - 'length': obj.length} + return {u'typ': u'int_index', + u'klass': u(obj.__class__.__name__), + u'indices': obj.indices, + u'length': obj.length} elif isinstance(obj, 
np.ndarray): - return {'typ': 'ndarray', - 'shape': obj.shape, - 'ndim': obj.ndim, - 'dtype': obj.dtype.name, - 'data': convert(obj), - 'compress': compressor} + return {u'typ': u'ndarray', + u'shape': obj.shape, + u'ndim': obj.ndim, + u'dtype': u(obj.dtype.name), + u'data': convert(obj), + u'compress': compressor} elif isinstance(obj, np.number): if np.iscomplexobj(obj): - return {'typ': 'np_scalar', - 'sub_typ': 'np_complex', - 'dtype': obj.dtype.name, - 'real': obj.real.__repr__(), - 'imag': obj.imag.__repr__()} + return {u'typ': u'np_scalar', + u'sub_typ': u'np_complex', + u'dtype': u(obj.dtype.name), + u'real': u(obj.real.__repr__()), + u'imag': u(obj.imag.__repr__())} else: - return {'typ': 'np_scalar', - 'dtype': obj.dtype.name, - 'data': obj.__repr__()} + return {u'typ': u'np_scalar', + u'dtype': u(obj.dtype.name), + u'data': u(obj.__repr__())} elif isinstance(obj, complex): - return {'typ': 'np_complex', - 'real': obj.real.__repr__(), - 'imag': obj.imag.__repr__()} + return {u'typ': u'np_complex', + u'real': u(obj.real.__repr__()), + u'imag': u(obj.imag.__repr__())} return obj @@ -460,83 +462,91 @@ def decode(obj): Decoder for deserializing numpy data types. 
""" - typ = obj.get('typ') + typ = obj.get(u'typ') if typ is None: return obj - elif typ == 'timestamp': - return Timestamp(obj['value'], tz=obj['tz'], offset=obj['offset']) - elif typ == 'nat': + elif typ == u'timestamp': + return Timestamp(obj[u'value'], tz=obj[u'tz'], offset=obj[u'offset']) + elif typ == u'nat': return NaT - elif typ == 'period': - return Period(ordinal=obj['ordinal'], freq=obj['freq']) - elif typ == 'index': - dtype = dtype_for(obj['dtype']) - data = unconvert(obj['data'], dtype, - obj.get('compress')) - return globals()[obj['klass']](data, dtype=dtype, name=obj['name']) - elif typ == 'range_index': - return globals()[obj['klass']](obj['start'], - obj['stop'], - obj['step'], - name=obj['name']) - elif typ == 'multi_index': - dtype = dtype_for(obj['dtype']) - data = unconvert(obj['data'], dtype, - obj.get('compress')) + elif typ == u'period': + return Period(ordinal=obj[u'ordinal'], freq=obj[u'freq']) + elif typ == u'index': + dtype = dtype_for(obj[u'dtype']) + data = unconvert(obj[u'data'], dtype, + obj.get(u'compress')) + return globals()[obj[u'klass']](data, dtype=dtype, name=obj[u'name']) + elif typ == u'range_index': + return globals()[obj[u'klass']](obj[u'start'], + obj[u'stop'], + obj[u'step'], + name=obj[u'name']) + elif typ == u'multi_index': + dtype = dtype_for(obj[u'dtype']) + data = unconvert(obj[u'data'], dtype, + obj.get(u'compress')) data = [tuple(x) for x in data] - return globals()[obj['klass']].from_tuples(data, names=obj['names']) - elif typ == 'period_index': - data = unconvert(obj['data'], np.int64, obj.get('compress')) - d = dict(name=obj['name'], freq=obj['freq']) - return globals()[obj['klass']](data, **d) - elif typ == 'datetime_index': - data = unconvert(obj['data'], np.int64, obj.get('compress')) - d = dict(name=obj['name'], freq=obj['freq'], verify_integrity=False) - result = globals()[obj['klass']](data, **d) - tz = obj['tz'] + return globals()[obj[u'klass']].from_tuples(data, names=obj[u'names']) + elif typ == 
u'period_index': + data = unconvert(obj[u'data'], np.int64, obj.get(u'compress')) + d = dict(name=obj[u'name'], freq=obj[u'freq']) + return globals()[obj[u'klass']](data, **d) + elif typ == u'datetime_index': + data = unconvert(obj[u'data'], np.int64, obj.get(u'compress')) + d = dict(name=obj[u'name'], freq=obj[u'freq'], verify_integrity=False) + result = globals()[obj[u'klass']](data, **d) + tz = obj[u'tz'] # reverse tz conversion if tz is not None: result = result.tz_localize('UTC').tz_convert(tz) return result - elif typ == 'series': - dtype = dtype_for(obj['dtype']) - index = obj['index'] - return globals()[obj['klass']](unconvert(obj['data'], dtype, - obj['compress']), - index=index, - dtype=dtype, - name=obj['name']) - elif typ == 'block_manager': - axes = obj['axes'] + elif typ == u'series': + dtype = dtype_for(obj[u'dtype']) + pd_dtype = pandas_dtype(dtype) + np_dtype = pandas_dtype(dtype).base + index = obj[u'index'] + result = globals()[obj[u'klass']](unconvert(obj[u'data'], dtype, + obj[u'compress']), + index=index, + dtype=np_dtype, + name=obj[u'name']) + tz = getattr(pd_dtype, 'tz', None) + if tz: + result = result.dt.tz_localize('UTC').dt.tz_convert(tz) + return result + + elif typ == u'block_manager': + axes = obj[u'axes'] def create_block(b): - values = unconvert(b['values'], dtype_for(b['dtype']), - b['compress']).reshape(b['shape']) + values = unconvert(b[u'values'], dtype_for(b[u'dtype']), + b[u'compress']).reshape(b[u'shape']) # locs handles duplicate column names, and should be used instead # of items; see GH 9618 - if 'locs' in b: - placement = b['locs'] + if u'locs' in b: + placement = b[u'locs'] else: - placement = axes[0].get_indexer(b['items']) + placement = axes[0].get_indexer(b[u'items']) return make_block(values=values, - klass=getattr(internals, b['klass']), - placement=placement) - - blocks = [create_block(b) for b in obj['blocks']] - return globals()[obj['klass']](BlockManager(blocks, axes)) - elif typ == 'datetime': - return 
parse(obj['data']) - elif typ == 'datetime64': - return np.datetime64(parse(obj['data'])) - elif typ == 'date': - return parse(obj['data']).date() - elif typ == 'timedelta': - return timedelta(*obj['data']) - elif typ == 'timedelta64': - return np.timedelta64(int(obj['data'])) + klass=getattr(internals, b[u'klass']), + placement=placement, + dtype=b[u'dtype']) + + blocks = [create_block(b) for b in obj[u'blocks']] + return globals()[obj[u'klass']](BlockManager(blocks, axes)) + elif typ == u'datetime': + return parse(obj[u'data']) + elif typ == u'datetime64': + return np.datetime64(parse(obj[u'data'])) + elif typ == u'date': + return parse(obj[u'data']).date() + elif typ == u'timedelta': + return timedelta(*obj[u'data']) + elif typ == u'timedelta64': + return np.timedelta64(int(obj[u'data'])) # elif typ == 'sparse_series': # dtype = dtype_for(obj['dtype']) # return globals()[obj['klass']]( @@ -554,25 +564,25 @@ def create_block(b): # obj['data'], items=obj['items'], # default_fill_value=obj['default_fill_value'], # default_kind=obj['default_kind']) - elif typ == 'block_index': - return globals()[obj['klass']](obj['length'], obj['blocs'], - obj['blengths']) - elif typ == 'int_index': - return globals()[obj['klass']](obj['length'], obj['indices']) - elif typ == 'ndarray': - return unconvert(obj['data'], np.typeDict[obj['dtype']], - obj.get('compress')).reshape(obj['shape']) - elif typ == 'np_scalar': - if obj.get('sub_typ') == 'np_complex': - return c2f(obj['real'], obj['imag'], obj['dtype']) + elif typ == u'block_index': + return globals()[obj[u'klass']](obj[u'length'], obj[u'blocs'], + obj[u'blengths']) + elif typ == u'int_index': + return globals()[obj[u'klass']](obj[u'length'], obj[u'indices']) + elif typ == u'ndarray': + return unconvert(obj[u'data'], np.typeDict[obj[u'dtype']], + obj.get(u'compress')).reshape(obj[u'shape']) + elif typ == u'np_scalar': + if obj.get(u'sub_typ') == u'np_complex': + return c2f(obj[u'real'], obj[u'imag'], obj[u'dtype']) else: - dtype 
= dtype_for(obj['dtype']) + dtype = dtype_for(obj[u'dtype']) try: - return dtype(obj['data']) + return dtype(obj[u'data']) except: - return dtype.type(obj['data']) - elif typ == 'np_complex': - return complex(obj['real'] + '+' + obj['imag'] + 'j') + return dtype.type(obj[u'data']) + elif typ == u'np_complex': + return complex(obj[u'real'] + u'+' + obj[u'imag'] + u'j') elif isinstance(obj, (dict, list, set)): return obj else: diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack b/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack new file mode 100644 index 0000000000000..e89b5dd99150e Binary files /dev/null and b/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack differ diff --git a/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack b/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack new file mode 100644 index 0000000000000..98efdabedea72 Binary files /dev/null and b/pandas/io/tests/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack differ diff --git a/pandas/io/tests/generate_legacy_storage_files.py b/pandas/io/tests/generate_legacy_storage_files.py index f556c980bb80c..bfa8ff6d30a9c 100644 --- a/pandas/io/tests/generate_legacy_storage_files.py +++ b/pandas/io/tests/generate_legacy_storage_files.py @@ -6,6 +6,7 @@ Index, MultiIndex, bdate_range, to_msgpack, date_range, period_range, Timestamp, Categorical, Period) +from pandas.compat import u import os import sys import numpy as np @@ -13,6 +14,9 @@ import platform as pl +_loose_version = LooseVersion(pandas.__version__) + + def _create_sp_series(): nan = np.nan @@ -22,7 +26,7 @@ def _create_sp_series(): arr[-1:] = nan bseries = SparseSeries(arr, kind='block') - bseries.name = 'bseries' + bseries.name = u'bseries' return bseries @@ -36,17 +40,17 @@ def _create_sp_tsseries(): date_index = bdate_range('1/1/2011', periods=len(arr)) bseries = SparseSeries(arr, 
index=date_index, kind='block') - bseries.name = 'btsseries' + bseries.name = u'btsseries' return bseries def _create_sp_frame(): nan = np.nan - data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10).astype(np.int64), - 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} + data = {u'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + u'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + u'C': np.arange(10).astype(np.int64), + u'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} dates = bdate_range('1/1/2011', periods=10) return SparseDataFrame(data, index=dates) @@ -56,79 +60,79 @@ def create_data(): """ create the pickle/msgpack data """ data = { - 'A': [0., 1., 2., 3., np.nan], - 'B': [0, 1, 0, 1, 0], - 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': date_range('1/1/2009', periods=5), - 'E': [0., 1, Timestamp('20100101'), 'foo', 2.] + u'A': [0., 1., 2., 3., np.nan], + u'B': [0, 1, 0, 1, 0], + u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'], + u'D': date_range('1/1/2009', periods=5), + u'E': [0., 1, Timestamp('20100101'), u'foo', 2.] 
} - scalars = dict(timestamp=Timestamp('20130101')) - if LooseVersion(pandas.__version__) >= '0.17.0': - scalars['period'] = Period('2012', 'M') + scalars = dict(timestamp=Timestamp('20130101'), + period=Period('2012', 'M')) index = dict(int=Index(np.arange(10)), date=date_range('20130101', periods=10), period=period_range('2013-01-01', freq='M', periods=10)) mi = dict(reg2=MultiIndex.from_tuples( - tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', - 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', - 'two', 'one', 'two']])), - names=['first', 'second'])) - series = dict(float=Series(data['A']), - int=Series(data['B']), - mixed=Series(data['E']), + tuple(zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo', + u'foo', u'qux', u'qux'], + [u'one', u'two', u'one', u'two', u'one', + u'two', u'one', u'two']])), + names=[u'first', u'second'])) + series = dict(float=Series(data[u'A']), + int=Series(data[u'B']), + mixed=Series(data[u'E']), ts=Series(np.arange(10).astype(np.int64), index=date_range('20130101', periods=10)), mi=Series(np.arange(5).astype(np.float64), index=MultiIndex.from_tuples( tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), - names=['one', 'two'])), + names=[u'one', u'two'])), dup=Series(np.arange(5).astype(np.float64), - index=['A', 'B', 'C', 'D', 'A']), - cat=Series(Categorical(['foo', 'bar', 'baz'])), + index=[u'A', u'B', u'C', u'D', u'A']), + cat=Series(Categorical([u'foo', u'bar', u'baz'])), dt=Series(date_range('20130101', periods=5)), dt_tz=Series(date_range('20130101', periods=5, - tz='US/Eastern'))) - if LooseVersion(pandas.__version__) >= '0.17.0': - series['period'] = Series([Period('2000Q1')] * 5) + tz='US/Eastern')), + period=Series([Period('2000Q1')] * 5)) mixed_dup_df = DataFrame(data) - mixed_dup_df.columns = list("ABCDA") - frame = dict(float=DataFrame(dict(A=series['float'], - B=series['float'] + 1)), - int=DataFrame(dict(A=series['int'], B=series['int'] + 1)), - mixed=DataFrame(dict([(k, data[k]) - for k in ['A', 'B', 'C', 'D']])), - 
mi=DataFrame(dict(A=np.arange(5).astype(np.float64), - B=np.arange(5).astype(np.int64)), + mixed_dup_df.columns = list(u"ABCDA") + frame = dict(float=DataFrame({u'A': series[u'float'], + u'B': series[u'float'] + 1}), + int=DataFrame({u'A': series[u'int'], + u'B': series[u'int'] + 1}), + mixed=DataFrame({k: data[k] + for k in [u'A', u'B', u'C', u'D']}), + mi=DataFrame({u'A': np.arange(5).astype(np.float64), + u'B': np.arange(5).astype(np.int64)}, index=MultiIndex.from_tuples( - tuple(zip(*[['bar', 'bar', 'baz', - 'baz', 'baz'], - ['one', 'two', 'one', - 'two', 'three']])), - names=['first', 'second'])), + tuple(zip(*[[u'bar', u'bar', u'baz', + u'baz', u'baz'], + [u'one', u'two', u'one', + u'two', u'three']])), + names=[u'first', u'second'])), dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), - columns=['A', 'B', 'A']), - cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))), - cat_and_float=DataFrame(dict( - A=Categorical(['foo', 'bar', 'baz']), - B=np.arange(3).astype(np.int64))), + columns=[u'A', u'B', u'A']), + cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}), + cat_and_float=DataFrame({ + u'A': Categorical([u'foo', u'bar', u'baz']), + u'B': np.arange(3).astype(np.int64)}), mixed_dup=mixed_dup_df, - dt_mixed_tzs=DataFrame(dict( - A=Timestamp('20130102', tz='US/Eastern'), - B=Timestamp('20130603', tz='CET')), index=range(5)), + dt_mixed_tzs=DataFrame({ + u'A': Timestamp('20130102', tz='US/Eastern'), + u'B': Timestamp('20130603', tz='CET')}, index=range(5)) ) - mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int'])) - mixed_dup_panel.items = ['ItemA', 'ItemA'] - panel = dict(float=Panel(dict(ItemA=frame['float'], - ItemB=frame['float'] + 1)), + mixed_dup_panel = Panel({u'ItemA': frame[u'float'], + u'ItemB': frame[u'int']}) + mixed_dup_panel.items = [u'ItemA', u'ItemA'] + panel = dict(float=Panel({u'ItemA': frame[u'float'], + u'ItemB': frame[u'float'] + 1}), dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), - 
items=['A', 'B', 'A']), + items=[u'A', u'B', u'A']), mixed_dup=mixed_dup_panel) return dict(series=series, @@ -147,26 +151,38 @@ def create_pickle_data(): # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and # panels if their columns/items were non-unique. - if LooseVersion(pandas.__version__) < '0.14.1': + if _loose_version < '0.14.1': del data['frame']['mixed_dup'] del data['panel']['mixed_dup'] + if _loose_version < '0.17.0': + del data['series']['period'] + del data['scalars']['period'] return data +def _u(x): + return {u(k): _u(x[k]) for k in x} if isinstance(x, dict) else x + + def create_msgpack_data(): data = create_data() - if LooseVersion(pandas.__version__) < '0.17.0': + if _loose_version < '0.17.0': del data['frame']['mixed_dup'] del data['panel']['mixed_dup'] del data['frame']['dup'] del data['panel']['dup'] + if _loose_version < '0.18.0': + del data['series']['dt_tz'] + del data['frame']['dt_mixed_tzs'] # Not supported del data['sp_series'] del data['sp_frame'] del data['series']['cat'] + del data['series']['period'] del data['frame']['cat_onecol'] del data['frame']['cat_and_float'] - return data + del data['scalars']['period'] + return _u(data) def platform_name(): @@ -199,7 +215,7 @@ def write_legacy_pickles(output_dir): print("created pickle file: %s" % pth) -def write_legacy_msgpack(output_dir): +def write_legacy_msgpack(output_dir, compress): version = pandas.__version__ @@ -208,9 +224,9 @@ def write_legacy_msgpack(output_dir): print(" pandas version: {0}".format(version)) print(" output dir : {0}".format(output_dir)) print(" storage format: msgpack") - pth = '{0}.msgpack'.format(platform_name()) - to_msgpack(os.path.join(output_dir, pth), create_msgpack_data()) + to_msgpack(os.path.join(output_dir, pth), create_msgpack_data(), + compress=compress) print("created msgpack file: %s" % pth) @@ -219,17 +235,22 @@ def write_legacy_file(): # force our cwd to be the first searched sys.path.insert(0, '.') - if len(sys.argv) != 3: + if 
not (3 <= len(sys.argv) <= 4): exit("Specify output directory and storage type: generate_legacy_" - "storage_files.py ") + "storage_files.py " + "") output_dir = str(sys.argv[1]) storage_type = str(sys.argv[2]) + try: + compress_type = str(sys.argv[3]) + except IndexError: + compress_type = None if storage_type == 'pickle': write_legacy_pickles(output_dir=output_dir) elif storage_type == 'msgpack': - write_legacy_msgpack(output_dir=output_dir) + write_legacy_msgpack(output_dir=output_dir, compress=compress_type) else: exit("storage_type must be one of {'pickle', 'msgpack'}") diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index d1c05069b4172..d0e7d00d79cb0 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -331,11 +331,16 @@ def setUp(self): 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], 'D': date_range('1/1/2009', periods=5), 'E': [0., 1, Timestamp('20100101'), 'foo', 2.], + 'F': [Timestamp('20130102', tz='US/Eastern')] * 2 + + [Timestamp('20130603', tz='CET')] * 3, + 'G': [Timestamp('20130102', tz='US/Eastern')] * 5 } self.d['float'] = Series(data['A']) self.d['int'] = Series(data['B']) self.d['mixed'] = Series(data['E']) + self.d['dt_tz_mixed'] = Series(data['F']) + self.d['dt_tz'] = Series(data['G']) def test_basic(self): @@ -357,13 +362,14 @@ def setUp(self): 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], 'D': date_range('1/1/2009', periods=5), 'E': [0., 1, Timestamp('20100101'), 'foo', 2.], + 'F': [Timestamp('20130102', tz='US/Eastern')] * 5, + 'G': [Timestamp('20130603', tz='CET')] * 5 } self.frame = { 'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)), 'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)), - 'mixed': DataFrame(dict([(k, data[k]) - for k in ['A', 'B', 'C', 'D']]))} + 'mixed': DataFrame(data)} self.panel = { 'float': Panel(dict(ItemA=self.frame['float'], @@ -713,6 +719,11 @@ def read_msgpacks(self, version): pth = 
tm.get_data_path('legacy_msgpack/{0}'.format(str(version))) n = 0 for f in os.listdir(pth): + # GH12142 0.17 files packed in P2 can't be read in P3 + if (compat.PY3 and + version.startswith('0.17.') and + f.split('.')[-4][-1] == '2'): + continue vf = os.path.join(pth, f) self.compare(vf, version) n += 1