NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch; confirm the context lines against the
target tree before applying. The `index` blob ids predate the edits made to
the added lines below.

diff --git a/nibabel/openers.py b/nibabel/openers.py
index 41edcc5cfb..592419375d 100644
--- a/nibabel/openers.py
+++ b/nibabel/openers.py
@@ -42,7 +42,47 @@
     HAVE_INDEXED_GZIP = False
 
 
-def _gzip_open(filename, mode='rb', compresslevel=9, keep_open=False):
+class DeterministicGzipFile(gzip.GzipFile):
+    """ Deterministic variant of GzipFile
+
+    This writer does not add filename information to the header, and defaults
+    to a modification time (``mtime``) of 0 seconds.
+
+    Parameters
+    ----------
+    filename : str or None, optional
+        Path opened when `fileobj` is None; never written to the header.
+    mode : str or None, optional
+        File mode; ``'b'`` is appended if absent, and ``'rb'`` is used to open
+        `filename` when `mode` is empty.
+    compresslevel : int, optional
+        Passed through to ``gzip.GzipFile``.
+    fileobj : file-like or None, optional
+        Already-open file to wrap instead of opening `filename`.
+    mtime : int or float, optional
+        Header timestamp passed through to ``gzip.GzipFile``; defaults to 0.
+    """
+    def __init__(self, filename=None, mode=None, compresslevel=9, fileobj=None, mtime=0):
+        # These two guards are copied from
+        # https://github.com/python/cpython/blob/6ab65c6/Lib/gzip.py#L171-L174
+        if mode and 'b' not in mode:
+            mode += 'b'
+        opened_here = fileobj is None
+        if opened_here:
+            fileobj = self.myfileobj = open(filename, mode or 'rb')
+        try:
+            # An empty filename keeps the name out of the gzip header.
+            # ``__init__`` must return None, so the call is not returned.
+            super().__init__(filename="", mode=mode, compresslevel=compresslevel,
+                             fileobj=fileobj, mtime=mtime)
+        except BaseException:
+            # Do not leak the handle opened above when construction fails
+            if opened_here:
+                fileobj.close()
+            raise
+
+
+def _gzip_open(filename, mode='rb', compresslevel=9, mtime=0, keep_open=False):
 
     # use indexed_gzip if possible for faster read access. If keep_open ==
     # True, we tell IndexedGzipFile to keep the file handle open. Otherwise
@@ -52,7 +92,7 @@ def _gzip_open(filename, mode='rb', compresslevel=9, keep_open=False):
 
     # Fall-back to built-in GzipFile
     else:
-        gzip_file = gzip.GzipFile(filename, mode, compresslevel)
+        gzip_file = DeterministicGzipFile(filename, mode, compresslevel, mtime=mtime)
 
     return gzip_file
 
@@ -83,7 +123,7 @@ class Opener(object):
         passed to opening method when `fileish` is str. Change of defaults as
         for \*args
     """
-    gz_def = (_gzip_open, ('mode', 'compresslevel', 'keep_open'))
+    gz_def = (_gzip_open, ('mode', 'compresslevel', 'mtime', 'keep_open'))
     bz2_def = (BZ2File, ('mode', 'buffering', 'compresslevel'))
     zstd_def = (_zstd_open, ('mode', 'level_or_option', 'zstd_dict'))
     compress_ext_map = {
@@ -163,10 +203,7 @@ def name(self):
         self._name will be None if object was created with a fileobj, otherwise
         it will be the filename.
         """
-        try:
-            return self.fobj.name
-        except AttributeError:
-            return self._name
+        return self._name
 
     @property
     def mode(self):
diff --git a/nibabel/tests/test_openers.py b/nibabel/tests/test_openers.py
index e8106c1c74..21f26acccc 100644
--- a/nibabel/tests/test_openers.py
+++ b/nibabel/tests/test_openers.py
@@ -12,12 +12,15 @@
 from gzip import GzipFile
 from io import BytesIO, UnsupportedOperation
 from distutils.version import StrictVersion
+import hashlib
+import time
 
 from numpy.compat.py3k import asstr, asbytes
 from ..openers import (Opener,
                        ImageOpener,
                        HAVE_INDEXED_GZIP,
                        BZ2File,
+                       DeterministicGzipFile,
                        )
 from ..tmpdirs import InTemporaryDirectory
 from ..volumeutils import BinOpener
@@ -367,3 +370,133 @@ def test_iter():
     lobj = Opener(Lunk(''))
     with pytest.raises(TypeError):
         list(lobj)
+
+
+def md5sum(fname):
+    """ Return the hex MD5 digest of the bytes in file `fname` """
+    with open(fname, "rb") as fobj:
+        return hashlib.md5(fobj.read()).hexdigest()
+
+
+def test_DeterministicGzipFile():
+    """ Compare DeterministicGzipFile output with equivalent GzipFile calls """
+    with InTemporaryDirectory():
+        msg = b"Hello, I'd like to have an argument."
+
+        # No filename, no mtime
+        with open("ref.gz", "wb") as fobj:
+            with GzipFile(filename="", mode="wb", fileobj=fobj, mtime=0) as gzobj:
+                gzobj.write(msg)
+        anon_chksum = md5sum("ref.gz")
+
+        with DeterministicGzipFile("default.gz", "wb") as fobj:
+            internal_fobj = fobj.myfileobj
+            fobj.write(msg)
+        # Check that myfileobj is being closed by GzipFile.close()
+        # This is in case GzipFile changes its internal implementation
+        assert internal_fobj.closed
+
+        assert md5sum("default.gz") == anon_chksum
+
+        # No filename, current mtime
+        now = time.time()
+        with open("ref.gz", "wb") as fobj:
+            with GzipFile(filename="", mode="wb", fileobj=fobj, mtime=now) as gzobj:
+                gzobj.write(msg)
+        now_chksum = md5sum("ref.gz")
+
+        with DeterministicGzipFile("now.gz", "wb", mtime=now) as fobj:
+            fobj.write(msg)
+
+        assert md5sum("now.gz") == now_chksum
+
+        # Change in default behavior
+        with mock.patch("time.time") as t:
+            t.return_value = now
+
+            # GzipFile will use time.time()
+            with open("ref.gz", "wb") as fobj:
+                with GzipFile(filename="", mode="wb", fileobj=fobj) as gzobj:
+                    gzobj.write(msg)
+            assert md5sum("ref.gz") == now_chksum
+
+            # DeterministicGzipFile will use 0
+            with DeterministicGzipFile("now.gz", "wb") as fobj:
+                fobj.write(msg)
+            assert md5sum("now.gz") == anon_chksum
+
+        # GzipFile is filename dependent, DeterministicGzipFile is independent
+        with GzipFile("filenameA.gz", mode="wb", mtime=0) as gzobj:
+            gzobj.write(msg)
+        fnameA_chksum = md5sum("filenameA.gz")
+        assert fnameA_chksum != anon_chksum
+
+        with DeterministicGzipFile("filenameA.gz", "wb") as fobj:
+            fobj.write(msg)
+
+        # But the contents are the same with different filenames
+        assert md5sum("filenameA.gz") == anon_chksum
+
+
+def test_DeterministicGzipFile_fileobj():
+    """ Passing `fileobj` gives the same bytes whatever `filename` is """
+    with InTemporaryDirectory():
+        msg = b"Hello, I'd like to have an argument."
+        with open("ref.gz", "wb") as fobj:
+            with GzipFile(filename="", mode="wb", fileobj=fobj, mtime=0) as gzobj:
+                gzobj.write(msg)
+        ref_chksum = md5sum("ref.gz")
+
+        # Explicitly empty filename
+        with open("test.gz", "wb") as fobj:
+            with DeterministicGzipFile(filename="", mode="wb", fileobj=fobj) as gzobj:
+                gzobj.write(msg)
+        assert md5sum("test.gz") == ref_chksum
+
+        # Filename omitted
+        with open("test.gz", "wb") as fobj:
+            with DeterministicGzipFile(fileobj=fobj, mode="wb") as gzobj:
+                gzobj.write(msg)
+        assert md5sum("test.gz") == ref_chksum
+
+        # A non-empty filename must not end up in the header
+        with open("test.gz", "wb") as fobj:
+            with DeterministicGzipFile(filename="test.gz", mode="wb", fileobj=fobj) as gzobj:
+                gzobj.write(msg)
+        assert md5sum("test.gz") == ref_chksum
+
+
+def test_bitwise_determinism():
+    """ Opener gzip output must not vary with the clock or the file name """
+    with InTemporaryDirectory():
+        msg = b"Hello, I'd like to have an argument."
+        # Canonical reference: No filename, no mtime
+        # compresslevel=1 is assumed to be Opener's gzip default - TODO confirm
+        with open("ref.gz", "wb") as fobj:
+            with GzipFile(filename="", mode="wb",
+                          compresslevel=1, fileobj=fobj,
+                          mtime=0) as gzobj:
+                gzobj.write(msg)
+        anon_chksum = md5sum("ref.gz")
+
+        # Different times, different filenames
+        now = time.time()
+        with mock.patch("time.time") as t:
+            t.return_value = now
+            with Opener("a.gz", "wb") as fobj:
+                fobj.write(msg)
+            t.return_value = now + 1
+            with Opener("b.gz", "wb") as fobj:
+                fobj.write(msg)
+
+        assert md5sum("a.gz") == anon_chksum
+        assert md5sum("b.gz") == anon_chksum
+
+        # Users can still set mtime, but filenames will not be embedded
+        with Opener("filenameA.gz", "wb", mtime=0xCAFE10C0) as fobj:
+            fobj.write(msg)
+        with Opener("filenameB.gz", "wb", mtime=0xCAFE10C0) as fobj:
+            fobj.write(msg)
+        fnameA_chksum = md5sum("filenameA.gz")
+        fnameB_chksum = md5sum("filenameB.gz")
+        assert fnameA_chksum == fnameB_chksum != anon_chksum