Skip to content

gh-103200: Fix performance issues with zipimport.invalidate_caches() #103208

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
45 changes: 41 additions & 4 deletions Lib/test/test_zipimport.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,10 +520,10 @@ def testInvalidateCaches(self):
z.writestr(zinfo, data)

zi = zipimport.zipimporter(TEMP_ZIP)
self.assertEqual(zi._files.keys(), files.keys())
self.assertEqual(zi._get_files().keys(), files.keys())
# Check that the file information remains accurate after reloading
zi.invalidate_caches()
self.assertEqual(zi._files.keys(), files.keys())
self.assertEqual(zi._get_files().keys(), files.keys())
# Add a new file to the ZIP archive
newfile = {"spam2" + pyc_ext: (NOW, test_pyc)}
files.update(newfile)
Expand All @@ -535,17 +535,54 @@ def testInvalidateCaches(self):
z.writestr(zinfo, data)
# Check that we can detect the new file after invalidating the cache
zi.invalidate_caches()
self.assertEqual(zi._files.keys(), files.keys())
self.assertEqual(zi._get_files().keys(), files.keys())
spec = zi.find_spec('spam2')
self.assertIsNotNone(spec)
self.assertIsInstance(spec.loader, zipimport.zipimporter)
# Check that the cached data is removed if the file is deleted
os.remove(TEMP_ZIP)
zi.invalidate_caches()
self.assertFalse(zi._files)
self.assertFalse(zi._get_files())
self.assertIsNone(zipimport._zip_directory_cache.get(zi.archive))
self.assertIsNone(zi.find_spec("name_does_not_matter"))

def testInvalidateCachesWithMultipleZipimports(self):
packdir = TESTPACK + os.sep
packdir2 = packdir + TESTPACK2 + os.sep
files = {packdir + "__init__" + pyc_ext: (NOW, test_pyc),
packdir2 + "__init__" + pyc_ext: (NOW, test_pyc),
packdir2 + TESTMOD + pyc_ext: (NOW, test_pyc),
"spam" + pyc_ext: (NOW, test_pyc)}
self.addCleanup(os_helper.unlink, TEMP_ZIP)
with ZipFile(TEMP_ZIP, "w") as z:
for name, (mtime, data) in files.items():
zinfo = ZipInfo(name, time.localtime(mtime))
zinfo.compress_type = self.compression
zinfo.comment = b"spam"
z.writestr(zinfo, data)

zi = zipimport.zipimporter(TEMP_ZIP)
self.assertEqual(zi._get_files().keys(), files.keys())
# Zipimporter for the same path.
zi2 = zipimport.zipimporter(TEMP_ZIP)
self.assertEqual(zi2._get_files().keys(), files.keys())
# Add a new file to the ZIP archive to make the cache wrong.
newfile = {"spam2" + pyc_ext: (NOW, test_pyc)}
files.update(newfile)
with ZipFile(TEMP_ZIP, "a") as z:
for name, (mtime, data) in newfile.items():
zinfo = ZipInfo(name, time.localtime(mtime))
zinfo.compress_type = self.compression
zinfo.comment = b"spam"
z.writestr(zinfo, data)
# Invalidate the cache of the first zipimporter.
zi.invalidate_caches()
# Check that the second zipimporter detects the new file and isn't using a stale cache.
self.assertEqual(zi2._get_files().keys(), files.keys())
spec = zi2.find_spec('spam2')
self.assertIsNotNone(spec)
self.assertIsInstance(spec.loader, zipimport.zipimporter)

def testZipImporterMethodsInSubDirectory(self):
packdir = TESTPACK + os.sep
packdir2 = packdir + TESTPACK2 + os.sep
Expand Down
46 changes: 25 additions & 21 deletions Lib/zipimport.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,8 @@ def __init__(self, path):
raise ZipImportError('not a Zip file', path=path)
break

try:
files = _zip_directory_cache[path]
except KeyError:
files = _read_directory(path)
_zip_directory_cache[path] = files
self._files = files
if path not in _zip_directory_cache:
_zip_directory_cache[path] = _read_directory(path)
self.archive = path
# a prefix directory following the ZIP file path.
self.prefix = _bootstrap_external._path_join(*prefix[::-1])
Expand Down Expand Up @@ -152,7 +148,7 @@ def get_data(self, pathname):
key = pathname[len(self.archive + path_sep):]

try:
toc_entry = self._files[key]
toc_entry = self._get_files()[key]
except KeyError:
raise OSError(0, '', key)
return _get_data(self.archive, toc_entry)
Expand Down Expand Up @@ -189,7 +185,7 @@ def get_source(self, fullname):
fullpath = f'{path}.py'

try:
toc_entry = self._files[fullpath]
toc_entry = self._get_files()[fullpath]
except KeyError:
# we have the module, but no source
return None
Expand Down Expand Up @@ -268,14 +264,22 @@ def get_resource_reader(self, fullname):
return ZipReader(self, fullname)


def invalidate_caches(self):
"""Reload the file data of the archive path."""
def _get_files(self):
"""Return the files within the archive path."""
try:
self._files = _read_directory(self.archive)
_zip_directory_cache[self.archive] = self._files
except ZipImportError:
_zip_directory_cache.pop(self.archive, None)
self._files = {}
files = _zip_directory_cache[self.archive]
except KeyError:
try:
files = _zip_directory_cache[self.archive] = _read_directory(self.archive)
except ZipImportError:
files = {}

return files


def invalidate_caches(self):
"""Invalidates the cache of file data of the archive path."""
_zip_directory_cache.pop(self.archive, None)


def __repr__(self):
Expand Down Expand Up @@ -305,15 +309,15 @@ def _is_dir(self, path):
# of a namespace package. We test by seeing if the name, with an
# appended path separator, exists.
dirpath = path + path_sep
# If dirpath is present in self._files, we have a directory.
return dirpath in self._files
# If dirpath is present in self._get_files(), we have a directory.
return dirpath in self._get_files()

# Return some information about a module.
def _get_module_info(self, fullname):
path = _get_module_path(self, fullname)
for suffix, isbytecode, ispackage in _zip_searchorder:
fullpath = path + suffix
if fullpath in self._files:
if fullpath in self._get_files():
return ispackage
return None

Expand Down Expand Up @@ -656,7 +660,7 @@ def _get_mtime_and_size_of_source(self, path):
# strip 'c' or 'o' from *.py[co]
assert path[-1:] in ('c', 'o')
path = path[:-1]
toc_entry = self._files[path]
toc_entry = self._get_files()[path]
# fetch the time stamp of the .py file for comparison
# with an embedded pyc time stamp
time = toc_entry[5]
Expand All @@ -676,7 +680,7 @@ def _get_pyc_source(self, path):
path = path[:-1]

try:
toc_entry = self._files[path]
toc_entry = self._get_files()[path]
except KeyError:
return None
else:
Expand All @@ -692,7 +696,7 @@ def _get_module_code(self, fullname):
fullpath = path + suffix
_bootstrap._verbose_message('trying {}{}{}', self.archive, path_sep, fullpath, verbosity=2)
try:
toc_entry = self._files[fullpath]
toc_entry = self._get_files()[fullpath]
except KeyError:
pass
else:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix cache repopulation semantics of zipimport.invalidate_caches(). The cache is now repopulated upon retrieving files with an invalid cache, not when the cache is invalidated.