Skip to content

Commit fb0b6c3

Browse files
committed
gh-121267: Improve performance of tarfile (#121267)
In its default write mode, tarfile spends much of its time resolving UIDs to user names and GIDs to group names. Caching these mappings yields a significant speedup: in my simple benchmark [1], the extra caching makes tarfile about 8x faster. [1] https://gist.github.com/jforberg/86af759c796199740c31547ae828aef2
1 parent 6343486 commit fb0b6c3

File tree

2 files changed

+22
-8
lines changed

2 files changed

+22
-8
lines changed

Lib/tarfile.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1658,6 +1658,9 @@ class TarFile(object):
16581658

16591659
extraction_filter = None # The default filter for extraction.
16601660

1661+
uname_cache = {} # Cached mappings of uid -> uname, gid -> gname
1662+
gname_cache = {}
1663+
16611664
def __init__(self, name=None, mode="r", fileobj=None, format=None,
16621665
tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
16631666
errors="surrogateescape", pax_headers=None, debug=None,
@@ -2105,16 +2108,25 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None):
21052108
tarinfo.mtime = statres.st_mtime
21062109
tarinfo.type = type
21072110
tarinfo.linkname = linkname
2111+
2112+
# Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To
2113+
# speed things up, cache the resolved usernames and group names.
21082114
if pwd:
2109-
try:
2110-
tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
2111-
except KeyError:
2112-
pass
2115+
if not tarinfo.uid in self.uname_cache:
2116+
try:
2117+
self.uname_cache[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0]
2118+
except KeyError:
2119+
pass
2120+
2121+
tarinfo.uname = self.uname_cache[tarinfo.uid]
21132122
if grp:
2114-
try:
2115-
tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
2116-
except KeyError:
2117-
pass
2123+
if not tarinfo.gid in self.gname_cache:
2124+
try:
2125+
self.gname_cache[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0]
2126+
except KeyError:
2127+
pass
2128+
2129+
tarinfo.gname = self.gname_cache[tarinfo.gid]
21182130

21192131
if type in (CHRTYPE, BLKTYPE):
21202132
if hasattr(os, "major") and hasattr(os, "minor"):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Improve the performance of tarfile when writing files, by caching user names
2+
and group names.

0 commit comments

Comments
 (0)