Skip to content

ENH: Create gzip header deterministically by default #1024

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jun 25, 2021
28 changes: 21 additions & 7 deletions nibabel/openers.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,24 @@
HAVE_INDEXED_GZIP = False


def _gzip_open(filename, mode='rb', compresslevel=9, keep_open=False):
class DeterministicGzipFile(gzip.GzipFile):
    """ Deterministic variant of GzipFile

    This writer does not add filename information to the header, and defaults
    to a modification time (``mtime``) of 0 seconds.

    Parameters
    ----------
    filename : str or None, optional
        Path to open when ``fileobj`` is None.  It is only used to open the
        underlying file; an empty filename is always passed on to
        ``GzipFile``.
    mode : str or None, optional
        File mode.  ``'b'`` is appended when missing; ``'rb'`` is used to
        open ``filename`` when no mode is given.
    compresslevel : int, optional
        Compression level passed through to ``GzipFile``.
    fileobj : file-like or None, optional
        Already-open binary stream to wrap instead of opening ``filename``.
    mtime : int or float, optional
        Modification time passed through to ``GzipFile``.  Defaults to 0.
    """

    def __init__(self, filename=None, mode=None, compresslevel=9, fileobj=None, mtime=0):
        # These two guards are copied from
        # https://github.com/python/cpython/blob/6ab65c6/Lib/gzip.py#L171-L174
        if mode and 'b' not in mode:
            mode += 'b'
        opened_here = None
        if fileobj is None:
            # Storing the handle on ``myfileobj`` mirrors what GzipFile does
            # itself when it opens the file from a filename.
            fileobj = opened_here = self.myfileobj = open(filename, mode or 'rb')
        try:
            super().__init__(filename="", mode=mode, compresslevel=compresslevel,
                             fileobj=fileobj, mtime=mtime)
        except BaseException:
            # Do not leak a handle that we opened ourselves if the parent
            # constructor rejects its arguments; caller-supplied file objects
            # are left untouched.
            if opened_here is not None:
                opened_here.close()
            raise


def _gzip_open(filename, mode='rb', compresslevel=9, mtime=0, keep_open=False):

# use indexed_gzip if possible for faster read access. If keep_open ==
# True, we tell IndexedGzipFile to keep the file handle open. Otherwise
Expand All @@ -52,7 +69,7 @@ def _gzip_open(filename, mode='rb', compresslevel=9, keep_open=False):

# Fall-back to built-in GzipFile
else:
gzip_file = gzip.GzipFile(filename, mode, compresslevel)
gzip_file = DeterministicGzipFile(filename, mode, compresslevel, mtime=mtime)

return gzip_file

Expand Down Expand Up @@ -83,7 +100,7 @@ class Opener(object):
passed to opening method when `fileish` is str. Change of defaults as
for \*args
"""
gz_def = (_gzip_open, ('mode', 'compresslevel', 'keep_open'))
gz_def = (_gzip_open, ('mode', 'compresslevel', 'mtime', 'keep_open'))
bz2_def = (BZ2File, ('mode', 'buffering', 'compresslevel'))
zstd_def = (_zstd_open, ('mode', 'level_or_option', 'zstd_dict'))
compress_ext_map = {
Expand Down Expand Up @@ -163,10 +180,7 @@ def name(self):
self._name will be None if object was created with a fileobj, otherwise
it will be the filename.
"""
try:
return self.fobj.name
except AttributeError:
return self._name
return self._name

@property
def mode(self):
Expand Down
126 changes: 126 additions & 0 deletions nibabel/tests/test_openers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@
from gzip import GzipFile
from io import BytesIO, UnsupportedOperation
from distutils.version import StrictVersion
import hashlib
import time

from numpy.compat.py3k import asstr, asbytes
from ..openers import (Opener,
ImageOpener,
HAVE_INDEXED_GZIP,
BZ2File,
DeterministicGzipFile,
)
from ..tmpdirs import InTemporaryDirectory
from ..volumeutils import BinOpener
Expand Down Expand Up @@ -367,3 +370,126 @@ def test_iter():
lobj = Opener(Lunk(''))
with pytest.raises(TypeError):
list(lobj)


def md5sum(fname):
    """Return the hexadecimal MD5 digest of the bytes stored in file `fname`."""
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        digest.update(stream.read())
    return digest.hexdigest()


def test_DeterministicGzipFile():
    """Compare DeterministicGzipFile output with explicit GzipFile references.

    Covers the default ``mtime``, a caller-supplied ``mtime``, behavior while
    ``time.time`` is patched, and independence from the output filename.
    """
    payload = b"Hello, I'd like to have an argument."

    with InTemporaryDirectory():
        # Reference stream: empty filename, mtime of 0
        with open("ref.gz", "wb") as raw, \
                GzipFile(filename="", mode="wb", fileobj=raw, mtime=0) as ref:
            ref.write(payload)
        anon_chksum = md5sum("ref.gz")

        with DeterministicGzipFile("default.gz", "wb") as det:
            owned_handle = det.myfileobj
            det.write(payload)
        # GzipFile.close() is expected to close myfileobj; this guards
        # against a change in GzipFile's internal implementation
        assert owned_handle.closed

        assert md5sum("default.gz") == anon_chksum

        # Reference stream: empty filename, current time as mtime
        now = time.time()
        with open("ref.gz", "wb") as raw, \
                GzipFile(filename="", mode="wb", fileobj=raw, mtime=now) as ref:
            ref.write(payload)
        now_chksum = md5sum("ref.gz")

        with DeterministicGzipFile("now.gz", "wb", mtime=now) as det:
            det.write(payload)

        assert md5sum("now.gz") == now_chksum

        # Difference in default behavior once time.time() is pinned to `now`
        with mock.patch("time.time", return_value=now):
            # Plain GzipFile falls back on time.time() for the mtime
            with open("ref.gz", "wb") as raw, \
                    GzipFile(filename="", mode="wb", fileobj=raw) as ref:
                ref.write(payload)
            assert md5sum("ref.gz") == now_chksum

            # DeterministicGzipFile keeps its mtime default of 0
            with DeterministicGzipFile("now.gz", "wb") as det:
                det.write(payload)
            assert md5sum("now.gz") == anon_chksum

        # GzipFile output depends on the filename ...
        with GzipFile("filenameA.gz", mode="wb", mtime=0) as ref:
            ref.write(payload)
        fnameA_chksum = md5sum("filenameA.gz")
        assert fnameA_chksum != anon_chksum

        # ... whereas DeterministicGzipFile output does not
        with DeterministicGzipFile("filenameA.gz", "wb") as det:
            det.write(payload)

        assert md5sum("filenameA.gz") == anon_chksum


def test_DeterministicGzipFile_fileobj():
    """Check DeterministicGzipFile when wrapping an already-open file object.

    The output must match a GzipFile reference written with an empty filename
    and ``mtime=0``, whether ``filename`` is empty, omitted, or non-empty.
    """
    with InTemporaryDirectory():
        msg = b"Hello, I'd like to have an argument."
        with open("ref.gz", "wb") as fobj:
            with GzipFile(filename="", mode="wb", fileobj=fobj, mtime=0) as gzobj:
                gzobj.write(msg)
        ref_chksum = md5sum("ref.gz")

        # Empty filename passed explicitly
        with open("test.gz", "wb") as fobj:
            with DeterministicGzipFile(filename="", mode="wb", fileobj=fobj) as gzobj:
                gzobj.write(msg)
        # The comparisons below were bare expressions before, so they could
        # never fail; they must be asserted to verify anything.
        assert md5sum("test.gz") == ref_chksum

        # Filename omitted
        with open("test.gz", "wb") as fobj:
            with DeterministicGzipFile(fileobj=fobj, mode="wb") as gzobj:
                gzobj.write(msg)
        assert md5sum("test.gz") == ref_chksum

        # Non-empty filename supplied alongside the file object
        with open("test.gz", "wb") as fobj:
            with DeterministicGzipFile(filename="test.gz", mode="wb", fileobj=fobj) as gzobj:
                gzobj.write(msg)
        assert md5sum("test.gz") == ref_chksum


def test_bitwise_determinism():
    """Check that gzip files written through Opener are reproducible.

    Files written at different (patched) times under different names must be
    identical to a reference with no filename and ``mtime=0``; an explicit
    ``mtime`` changes the bytes, but the filename still does not.
    """
    payload = b"Hello, I'd like to have an argument."

    with InTemporaryDirectory():
        # Canonical reference: no filename, no mtime.
        # NOTE(review): compresslevel=1 is presumably the level Opener uses
        # by default for gzip - confirm against Opener.
        with open("ref.gz", "wb") as raw, \
                GzipFile(filename="", mode="wb",
                         compresslevel=1, fileobj=raw,
                         mtime=0) as ref:
            ref.write(payload)
        anon_chksum = md5sum("ref.gz")

        # Different times, different filenames
        now = time.time()
        with mock.patch("time.time") as clock:
            for offset, target in enumerate(("a.gz", "b.gz")):
                # "a.gz" is written at `now`, "b.gz" at `now + 1`
                clock.return_value = now + offset
                with Opener(target, "wb") as out:
                    out.write(payload)

        for target in ("a.gz", "b.gz"):
            assert md5sum(target) == anon_chksum

        # Users can still set mtime, but filenames will not be embedded
        for target in ("filenameA.gz", "filenameB.gz"):
            with Opener(target, "wb", mtime=0xCAFE10C0) as out:
                out.write(payload)
        fnameA_chksum = md5sum("filenameA.gz")
        fnameB_chksum = md5sum("filenameB.gz")
        assert fnameA_chksum == fnameB_chksum
        assert fnameB_chksum != anon_chksum