Skip to content

Commit 1fb9bd2

Browse files
desmondcheongzxblurb-it[bot]brettcannon
authored
gh-103200: Fix performance issues with zipimport.invalidate_caches() (GH-103208)
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com> Co-authored-by: Brett Cannon <[email protected]>
1 parent 6e6a4cd commit 1fb9bd2

File tree

3 files changed

+67
-25
lines changed

3 files changed

+67
-25
lines changed

Lib/test/test_zipimport.py

+41-4
Original file line numberDiff line numberDiff line change
@@ -520,10 +520,10 @@ def testInvalidateCaches(self):
520520
z.writestr(zinfo, data)
521521

522522
zi = zipimport.zipimporter(TEMP_ZIP)
523-
self.assertEqual(zi._files.keys(), files.keys())
523+
self.assertEqual(zi._get_files().keys(), files.keys())
524524
# Check that the file information remains accurate after reloading
525525
zi.invalidate_caches()
526-
self.assertEqual(zi._files.keys(), files.keys())
526+
self.assertEqual(zi._get_files().keys(), files.keys())
527527
# Add a new file to the ZIP archive
528528
newfile = {"spam2" + pyc_ext: (NOW, test_pyc)}
529529
files.update(newfile)
@@ -535,17 +535,54 @@ def testInvalidateCaches(self):
535535
z.writestr(zinfo, data)
536536
# Check that we can detect the new file after invalidating the cache
537537
zi.invalidate_caches()
538-
self.assertEqual(zi._files.keys(), files.keys())
538+
self.assertEqual(zi._get_files().keys(), files.keys())
539539
spec = zi.find_spec('spam2')
540540
self.assertIsNotNone(spec)
541541
self.assertIsInstance(spec.loader, zipimport.zipimporter)
542542
# Check that the cached data is removed if the file is deleted
543543
os.remove(TEMP_ZIP)
544544
zi.invalidate_caches()
545-
self.assertFalse(zi._files)
545+
self.assertFalse(zi._get_files())
546546
self.assertIsNone(zipimport._zip_directory_cache.get(zi.archive))
547547
self.assertIsNone(zi.find_spec("name_does_not_matter"))
548548

549+
def testInvalidateCachesWithMultipleZipimports(self):
550+
packdir = TESTPACK + os.sep
551+
packdir2 = packdir + TESTPACK2 + os.sep
552+
files = {packdir + "__init__" + pyc_ext: (NOW, test_pyc),
553+
packdir2 + "__init__" + pyc_ext: (NOW, test_pyc),
554+
packdir2 + TESTMOD + pyc_ext: (NOW, test_pyc),
555+
"spam" + pyc_ext: (NOW, test_pyc)}
556+
self.addCleanup(os_helper.unlink, TEMP_ZIP)
557+
with ZipFile(TEMP_ZIP, "w") as z:
558+
for name, (mtime, data) in files.items():
559+
zinfo = ZipInfo(name, time.localtime(mtime))
560+
zinfo.compress_type = self.compression
561+
zinfo.comment = b"spam"
562+
z.writestr(zinfo, data)
563+
564+
zi = zipimport.zipimporter(TEMP_ZIP)
565+
self.assertEqual(zi._get_files().keys(), files.keys())
566+
# Zipimporter for the same path.
567+
zi2 = zipimport.zipimporter(TEMP_ZIP)
568+
self.assertEqual(zi2._get_files().keys(), files.keys())
569+
# Add a new file to the ZIP archive to make the cache wrong.
570+
newfile = {"spam2" + pyc_ext: (NOW, test_pyc)}
571+
files.update(newfile)
572+
with ZipFile(TEMP_ZIP, "a") as z:
573+
for name, (mtime, data) in newfile.items():
574+
zinfo = ZipInfo(name, time.localtime(mtime))
575+
zinfo.compress_type = self.compression
576+
zinfo.comment = b"spam"
577+
z.writestr(zinfo, data)
578+
# Invalidate the cache of the first zipimporter.
579+
zi.invalidate_caches()
580+
# Check that the second zipimporter detects the new file and isn't using a stale cache.
581+
self.assertEqual(zi2._get_files().keys(), files.keys())
582+
spec = zi2.find_spec('spam2')
583+
self.assertIsNotNone(spec)
584+
self.assertIsInstance(spec.loader, zipimport.zipimporter)
585+
549586
def testZipImporterMethodsInSubDirectory(self):
550587
packdir = TESTPACK + os.sep
551588
packdir2 = packdir + TESTPACK2 + os.sep

Lib/zipimport.py

+25-21
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,8 @@ def __init__(self, path):
8888
raise ZipImportError('not a Zip file', path=path)
8989
break
9090

91-
try:
92-
files = _zip_directory_cache[path]
93-
except KeyError:
94-
files = _read_directory(path)
95-
_zip_directory_cache[path] = files
96-
self._files = files
91+
if path not in _zip_directory_cache:
92+
_zip_directory_cache[path] = _read_directory(path)
9793
self.archive = path
9894
# a prefix directory following the ZIP file path.
9995
self.prefix = _bootstrap_external._path_join(*prefix[::-1])
@@ -152,7 +148,7 @@ def get_data(self, pathname):
152148
key = pathname[len(self.archive + path_sep):]
153149

154150
try:
155-
toc_entry = self._files[key]
151+
toc_entry = self._get_files()[key]
156152
except KeyError:
157153
raise OSError(0, '', key)
158154
return _get_data(self.archive, toc_entry)
@@ -189,7 +185,7 @@ def get_source(self, fullname):
189185
fullpath = f'{path}.py'
190186

191187
try:
192-
toc_entry = self._files[fullpath]
188+
toc_entry = self._get_files()[fullpath]
193189
except KeyError:
194190
# we have the module, but no source
195191
return None
@@ -268,14 +264,22 @@ def get_resource_reader(self, fullname):
268264
return ZipReader(self, fullname)
269265

270266

271-
def invalidate_caches(self):
272-
"""Reload the file data of the archive path."""
267+
def _get_files(self):
268+
"""Return the files within the archive path."""
273269
try:
274-
self._files = _read_directory(self.archive)
275-
_zip_directory_cache[self.archive] = self._files
276-
except ZipImportError:
277-
_zip_directory_cache.pop(self.archive, None)
278-
self._files = {}
270+
files = _zip_directory_cache[self.archive]
271+
except KeyError:
272+
try:
273+
files = _zip_directory_cache[self.archive] = _read_directory(self.archive)
274+
except ZipImportError:
275+
files = {}
276+
277+
return files
278+
279+
280+
def invalidate_caches(self):
281+
"""Invalidates the cache of file data of the archive path."""
282+
_zip_directory_cache.pop(self.archive, None)
279283

280284

281285
def __repr__(self):
@@ -305,15 +309,15 @@ def _is_dir(self, path):
305309
# of a namespace package. We test by seeing if the name, with an
306310
# appended path separator, exists.
307311
dirpath = path + path_sep
308-
# If dirpath is present in self._files, we have a directory.
309-
return dirpath in self._files
312+
# If dirpath is present in self._get_files(), we have a directory.
313+
return dirpath in self._get_files()
310314

311315
# Return some information about a module.
312316
def _get_module_info(self, fullname):
313317
path = _get_module_path(self, fullname)
314318
for suffix, isbytecode, ispackage in _zip_searchorder:
315319
fullpath = path + suffix
316-
if fullpath in self._files:
320+
if fullpath in self._get_files():
317321
return ispackage
318322
return None
319323

@@ -656,7 +660,7 @@ def _get_mtime_and_size_of_source(self, path):
656660
# strip 'c' or 'o' from *.py[co]
657661
assert path[-1:] in ('c', 'o')
658662
path = path[:-1]
659-
toc_entry = self._files[path]
663+
toc_entry = self._get_files()[path]
660664
# fetch the time stamp of the .py file for comparison
661665
# with an embedded pyc time stamp
662666
time = toc_entry[5]
@@ -676,7 +680,7 @@ def _get_pyc_source(self, path):
676680
path = path[:-1]
677681

678682
try:
679-
toc_entry = self._files[path]
683+
toc_entry = self._get_files()[path]
680684
except KeyError:
681685
return None
682686
else:
@@ -692,7 +696,7 @@ def _get_module_code(self, fullname):
692696
fullpath = path + suffix
693697
_bootstrap._verbose_message('trying {}{}{}', self.archive, path_sep, fullpath, verbosity=2)
694698
try:
695-
toc_entry = self._files[fullpath]
699+
toc_entry = self._get_files()[fullpath]
696700
except KeyError:
697701
pass
698702
else:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix cache repopulation semantics of zipimport.invalidate_caches(). The cache is now repopulated upon retrieving files with an invalid cache, not when the cache is invalidated.

0 commit comments

Comments
 (0)