diff --git a/doc/source/io.rst b/doc/source/io.rst index b36ae8c2ed450..1b19599177c9a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2926,6 +2926,45 @@ any pickled pandas object (or any other pickled object) from file: These methods were previously ``pd.save`` and ``pd.load``, prior to 0.12.0, and are now deprecated. +.. _io.pickle.compression: + +Read/Write compressed pickle files +'''''''''''''''''''''''''''''''''' + +.. versionadded:: 0.20.0 + +:func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` can read +and write compressed pickle files. Compression types of ``gzip``, ``bz2``, ``xz`` support +both read and write. ``zip`` file supports read only and must contain only one data file +to be read in. +Compression type can be an explicit parameter or be inferred from the file extension. +If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or +``'.xz'``, respectively. + +.. ipython:: python + + df = pd.DataFrame({ + 'A': np.random.randn(1000), + 'B': np.random.randn(1000), + 'C': np.random.randn(1000)}) + df.to_pickle("data.pkl.compress", compression="gzip") # explicit compression type + df.to_pickle("data.pkl.xz", compression="infer") # infer compression type from extension + df.to_pickle("data.pkl.gz") # default, using "infer" + df["A"].to_pickle("s1.pkl.bz2") + + df = pd.read_pickle("data.pkl.compress", compression="gzip") + df = pd.read_pickle("data.pkl.xz", compression="infer") + df = pd.read_pickle("data.pkl.gz") + s = pd.read_pickle("s1.pkl.bz2") + +.. ipython:: python + :suppress: + import os + os.remove("data.pkl.compress") + os.remove("data.pkl.xz") + os.remove("data.pkl.gz") + os.remove("s1.pkl.bz2") + .. 
_io.msgpack: msgpack diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 54df7514a882d..d5c438e8c08d1 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -97,6 +97,40 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`). df = pd.read_table(url, compression='bz2') # explicitly specify compression df.head(2) +.. _whatsnew_0200.enhancements.pickle_compression: + +Pickle file I/O now supports compression +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` +can now read from and write to compressed pickle files. Compression methods +can be an explicit parameter or be inferred from the file extension. +See :ref:`Read/Write compressed pickle files <io.pickle.compression>` + +.. ipython:: python + + df = pd.DataFrame({ + 'A': np.random.randn(1000), + 'B': np.random.randn(1000), + 'C': np.random.randn(1000)}) + df.to_pickle("data.pkl.compress", compression="gzip") # explicit compression type + df.to_pickle("data.pkl.xz", compression="infer") # infer compression type from extension + df.to_pickle("data.pkl.gz") # default, using "infer" + df["A"].to_pickle("s1.pkl.bz2") + + df = pd.read_pickle("data.pkl.compress", compression="gzip") + df = pd.read_pickle("data.pkl.xz", compression="infer") + df = pd.read_pickle("data.pkl.gz") + s = pd.read_pickle("s1.pkl.bz2") + +.. ipython:: python + :suppress: + import os + os.remove("data.pkl.compress") + os.remove("data.pkl.xz") + os.remove("data.pkl.gz") + os.remove("s1.pkl.bz2") + .. 
_whatsnew_0200.enhancements.uint64_support: UInt64 Support Improved diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 127aac970fbc1..61a1514dd997a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1278,7 +1278,7 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail', if_exists=if_exists, index=index, index_label=index_label, chunksize=chunksize, dtype=dtype) - def to_pickle(self, path): + def to_pickle(self, path, compression='infer'): """ Pickle (serialize) object to input file path. @@ -1286,9 +1286,13 @@ def to_pickle(self, path): ---------- path : string File path + compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + a string representing the compression to use in the output file + + .. versionadded:: 0.20.0 """ from pandas.io.pickle import to_pickle - return to_pickle(self, path) + return to_pickle(self, path, compression=compression) def to_clipboard(self, excel=None, sep=None, **kwargs): """ diff --git a/pandas/io/common.py b/pandas/io/common.py index 74c51b74ca18a..e42d218d7925f 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -305,7 +305,7 @@ def _infer_compression(filepath_or_buffer, compression): def _get_handle(path_or_buf, mode, encoding=None, compression=None, - memory_map=False): + memory_map=False, is_text=True): """ Get file handle for given path/buffer and mode. @@ -320,7 +320,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, Supported compression protocols are gzip, bz2, zip, and xz memory_map : boolean, default False See parsers._parser_params for more information. - + is_text : boolean, default True + whether file/buffer is in text format (csv, json, etc.), or in binary + mode (pickle, etc.) 
Returns ------- f : file-like @@ -394,13 +396,17 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, elif encoding: # Python 3 and encoding f = open(path_or_buf, mode, encoding=encoding) - else: + elif is_text: # Python 3 and no explicit encoding f = open(path_or_buf, mode, errors='replace') + else: + # Python 3 and binary mode + f = open(path_or_buf, mode) handles.append(f) # in Python 3, convert BytesIO or fileobjects passed with an encoding - if compat.PY3 and (compression or isinstance(f, need_text_wrapping)): + if compat.PY3 and is_text and\ + (compression or isinstance(f, need_text_wrapping)): from io import TextIOWrapper f = TextIOWrapper(f, encoding=encoding) handles.append(f) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 2358c296f782e..969a2a51cb15d 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -4,9 +4,10 @@ from numpy.lib.format import read_array, write_array from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3 from pandas.types.common import is_datetime64_dtype, _NS_DTYPE +from pandas.io.common import _get_handle, _infer_compression -def to_pickle(obj, path): +def to_pickle(obj, path, compression='infer'): """ Pickle (serialize) object to input file path @@ -15,12 +16,23 @@ def to_pickle(obj, path): obj : any object path : string File path + compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + a string representing the compression to use in the output file + + .. 
versionadded:: 0.20.0 """ - with open(path, 'wb') as f: + inferred_compression = _infer_compression(path, compression) + f, fh = _get_handle(path, 'wb', + compression=inferred_compression, + is_text=False) + try: pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL) + finally: + for _f in fh: + _f.close() -def read_pickle(path): +def read_pickle(path, compression='infer'): """ Load pickled pandas object (or any other pickled object) from the specified file path @@ -32,12 +44,32 @@ ---------- path : string File path + compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer', then use + gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2', '.xz', + or '.zip' respectively, and no decompression otherwise. + Set to None for no decompression. + + .. versionadded:: 0.20.0 Returns ------- unpickled : type of object stored in file """ + inferred_compression = _infer_compression(path, compression) + + def read_wrapper(func): + # wrapper file handle open/close operation + f, fh = _get_handle(path, 'rb', + compression=inferred_compression, + is_text=False) + try: + return func(f) + finally: + for _f in fh: + _f.close() + def try_read(path, encoding=None): # try with cPickle # try with current pickle, if we have a Type Error then @@ -48,19 +80,16 @@ def try_read(path, encoding=None): # cpickle # GH 6899 try: - with open(path, 'rb') as fh: - return pkl.load(fh) + return read_wrapper(lambda f: pkl.load(f)) except Exception: # reg/patched pickle try: - with open(path, 'rb') as fh: - return pc.load(fh, encoding=encoding, compat=False) - + return read_wrapper( + lambda f: pc.load(f, encoding=encoding, compat=False)) # compat pickle except: - with open(path, 'rb') as fh: - return pc.load(fh, encoding=encoding, compat=True) - + return read_wrapper( + lambda f: pc.load(f, encoding=encoding, compat=True)) try: return try_read(path) except: @@ -68,6 +97,7 @@ def try_read(path, 
encoding=None): return try_read(path, encoding='latin1') raise + # compat with sparse pickle / unpickle diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index c736ec829808a..2fffc3c39ec26 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -15,15 +15,14 @@ import pytest import os - from distutils.version import LooseVersion - import pandas as pd from pandas import Index from pandas.compat import is_platform_little_endian import pandas import pandas.util.testing as tm from pandas.tseries.offsets import Day, MonthEnd +import shutil @pytest.fixture(scope='module') @@ -302,3 +301,196 @@ def test_pickle_v0_15_2(): # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) # tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + + +# --------------------- +# test pickle compression +# --------------------- +_compression_to_extension = { + None: ".none", + 'gzip': '.gz', + 'bz2': '.bz2', + 'zip': '.zip', + 'xz': '.xz', +} + + +def get_random_path(): + return u'__%s__.pickle' % tm.rands(10) + + +def compress_file(src_path, dest_path, compression): + if compression is None: + shutil.copyfile(src_path, dest_path) + return + + if compression == 'gzip': + import gzip + f = gzip.open(dest_path, "w") + elif compression == 'bz2': + import bz2 + f = bz2.BZ2File(dest_path, "w") + elif compression == 'zip': + import zipfile + zip_file = zipfile.ZipFile(dest_path, "w", + compression=zipfile.ZIP_DEFLATED) + zip_file.write(src_path, os.path.basename(src_path)) + elif compression == 'xz': + lzma = pandas.compat.import_lzma() + f = lzma.LZMAFile(dest_path, "w") + else: + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) + + if compression != "zip": + f.write(open(src_path, "rb").read()) + f.close() + + +def decompress_file(src_path, dest_path, compression): + if compression is None: + shutil.copyfile(src_path, dest_path) + return + + if compression == 'gzip': + import gzip + f = 
gzip.open(src_path, "r") + elif compression == 'bz2': + import bz2 + f = bz2.BZ2File(src_path, "r") + elif compression == 'zip': + import zipfile + zip_file = zipfile.ZipFile(src_path) + zip_names = zip_file.namelist() + if len(zip_names) == 1: + f = zip_file.open(zip_names.pop()) + else: + raise ValueError('ZIP file {} error. Only one file per ZIP.' + .format(src_path)) + elif compression == 'xz': + lzma = pandas.compat.import_lzma() + f = lzma.LZMAFile(src_path, "r") + else: + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) + + open(dest_path, "wb").write(f.read()) + f.close() + + +@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz']) +def test_write_explicit(compression): + # issue 11666 + if compression == 'xz': + tm._skip_if_no_lzma() + + base = get_random_path() + path1 = base + ".compressed" + path2 = base + ".raw" + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + + # write to compressed file + df.to_pickle(p1, compression=compression) + + # decompress + decompress_file(p1, p2, compression=compression) + + # read decompressed file + df2 = pd.read_pickle(p2, compression=None) + + tm.assert_frame_equal(df, df2) + + +@pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z']) +def test_write_explicit_bad(compression): + with tm.assertRaisesRegexp(ValueError, + "Unrecognized compression type"): + with tm.ensure_clean(get_random_path()) as path: + df = tm.makeDataFrame() + df.to_pickle(path, compression=compression) + + +@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress']) +def test_write_infer(ext): + if ext == '.xz': + tm._skip_if_no_lzma() + + base = get_random_path() + path1 = base + ext + path2 = base + ".raw" + compression = None + for c in _compression_to_extension: + if _compression_to_extension[c] == ext: + compression = c + break + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + 
+ # write to compressed file by inferred compression method + df.to_pickle(p1) + + # decompress + decompress_file(p1, p2, compression=compression) + + # read decompressed file + df2 = pd.read_pickle(p2, compression=None) + + tm.assert_frame_equal(df, df2) + + +@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', "zip"]) +def test_read_explicit(compression): + # issue 11666 + if compression == 'xz': + tm._skip_if_no_lzma() + + base = get_random_path() + path1 = base + ".raw" + path2 = base + ".compressed" + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + + # write to uncompressed file + df.to_pickle(p1, compression=None) + + # compress + compress_file(p1, p2, compression=compression) + + # read compressed file + df2 = pd.read_pickle(p2, compression=compression) + + tm.assert_frame_equal(df, df2) + + +@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.zip', + '.no_compress']) +def test_read_infer(ext): + if ext == '.xz': + tm._skip_if_no_lzma() + + base = get_random_path() + path1 = base + ".raw" + path2 = base + ext + compression = None + for c in _compression_to_extension: + if _compression_to_extension[c] == ext: + compression = c + break + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + + # write to uncompressed file + df.to_pickle(p1, compression=None) + + # compress + compress_file(p1, p2, compression=compression) + + # read compressed file by inferred compression method + df2 = pd.read_pickle(p2) + + tm.assert_frame_equal(df, df2)