Skip to content

gh-84481: Add ZipFile.data_offset attribute #132165

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Doc/library/zipfile.rst
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,14 @@ The following data attributes are also available:
it should be no longer than 65535 bytes. Comments longer than this will be
truncated.

.. attribute:: ZipFile.data_offset

The offset to the start of ZIP data from the beginning of the file. When the
:class:`ZipFile` is opened in either mode ``'w'`` or ``'x'`` and the
underlying file does not support ``tell()``, the value will be ``None``
instead.

.. versionadded:: 3.14

.. _path-objects:

Expand Down
48 changes: 48 additions & 0 deletions Lib/test/test_zipfile/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3312,6 +3312,54 @@ def test_execute_zip64(self):
self.assertIn(b'number in executable: 5', output)


class TestDataOffsetPrependedZip(unittest.TestCase):
"""Test .data_offset on reading zip files with an executable prepended."""

def setUp(self):
self.exe_zip = findfile('exe_with_zip', subdir='archivetestdata')
self.exe_zip64 = findfile('exe_with_z64', subdir='archivetestdata')

def _test_data_offset(self, name):
with zipfile.ZipFile(name) as zipfp:
self.assertEqual(zipfp.data_offset, 713)

def test_data_offset_with_exe_prepended(self):
self._test_data_offset(self.exe_zip)

def test_data_offset_with_exe_prepended_zip64(self):
self._test_data_offset(self.exe_zip64)

class TestDataOffsetZipWrite(unittest.TestCase):
"""Test .data_offset for ZipFile opened in write mode."""

def setUp(self):
os.mkdir(TESTFNDIR)
self.addCleanup(rmtree, TESTFNDIR)
self.test_path = os.path.join(TESTFNDIR, 'testoffset.zip')

def test_data_offset_write_no_prefix(self):
with io.BytesIO() as fp:
with zipfile.ZipFile(fp, "w") as zipfp:
self.assertEqual(zipfp.data_offset, 0)

def test_data_offset_write_with_prefix(self):
with io.BytesIO() as fp:
fp.write(b"this is a prefix")
with zipfile.ZipFile(fp, "w") as zipfp:
self.assertEqual(zipfp.data_offset, 16)

def test_data_offset_write_no_tell(self):
# The initializer in ZipFile checks if tell raises AttributeError or
# OSError when creating a file in write mode when deducing the offset
# of the beginning of zip data
class NoTellBytesIO(io.BytesIO):
def tell(self):
raise OSError("Unimplemented!")
with NoTellBytesIO() as fp:
with zipfile.ZipFile(fp, "w") as zipfp:
self.assertIs(zipfp.data_offset, None)


class EncodedMetadataTests(unittest.TestCase):
file_names = ['\u4e00', '\u4e8c', '\u4e09'] # Han 'one', 'two', 'three'
file_content = [
Expand Down
12 changes: 12 additions & 0 deletions Lib/zipfile/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1413,10 +1413,12 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
self._didModify = True
try:
self.start_dir = self.fp.tell()
self._data_offset = self.start_dir
except (AttributeError, OSError):
self.fp = _Tellable(self.fp)
self.start_dir = 0
self._seekable = False
self._data_offset = None
Copy link
Member

@picnixz picnixz Apr 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It may not be set if mode == 'a' and if BadZipFile is raised. In this case, we should still have a None according to the docs (otherwise an AttributeError will be raised).

For mode 'r', it should be noted that failing _RealGetContents() means that the object will never be initialized so we don't care about the lack of attribute.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe that in this case, since mode 'a' calls tell() unconditionally, we can set the data_offset to the result of that, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Though honestly the defensive thing to do here is to initialize it to None early on then change it later when possible.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Though honestly the defensive thing to do here is to initialize it to None early on then change it later when possible.

Yes, that's what I thought.

else:
# Some file-like objects can provide tell() but not seek()
try:
Expand Down Expand Up @@ -1486,6 +1488,10 @@ def _RealGetContents(self):
# If Zip64 extension structures are present, account for them
concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)

# store the offset to the beginning of data for the
# .data_offset property
self._data_offset = concat

if self.debug > 2:
inferred = concat + offset_cd
print("given, inferred, offset", offset_cd, inferred, concat)
Expand Down Expand Up @@ -1551,6 +1557,12 @@ def _RealGetContents(self):
zinfo._end_offset = end_offset
end_offset = zinfo.header_offset

@property
def data_offset(self):
"""The offset to the start of zip data in the file or None if
unavailable."""
return self._data_offset

def namelist(self):
"""Return a list of file names in the archive."""
return [data.filename for data in self.filelist]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Add the :attr:`zipfile.ZipFile.data_offset` attribute, which stores the
offset to the beginning of ZIP data in a file when available. When the
:class:`zipfile.ZipFile` is opened in either mode ``'w'`` or ``'x'`` and the
underlying file does not support ``tell()``, the value will be ``None``
instead.
Loading