Skip to content

Commit f1b5603

Browse files
committed
GH-115060: Speed up pathlib.Path.glob() by skipping directory scanning.
For ordinary literal pattern segments (e.g. `foo/bar` in `foo/bar/../**`), skip calling `_scandir()` on each segment, and instead call `exists()` or `is_dir()` as necessary to exclude missing paths. This only applies when *case_sensitive* is `None` (the default); otherwise we can't guarantee case sensitivity or realness with this approach. If *follow_symlinks* is `False` we also need to exclude symlinks from intermediate segments. This restores an optimization that was removed in da1980a by some eejit. It's actually even faster because we don't `stat()` intermediate directories, and in some cases we can skip all filesystem access when expanding a literal part (e.g. when it's followed by a non-recursive wildcard segment).
1 parent 0656509 commit f1b5603

File tree

3 files changed

+79
-7
lines changed

3 files changed

+79
-7
lines changed

Lib/pathlib/_abc.py

Lines changed: 74 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ def _ignore_error(exception):
3636
getattr(exception, 'winerror', None) in _IGNORED_WINERRORS)
3737

3838

39+
def _is_wildcard_pattern(pat):
40+
"""Whether this pattern needs actual matching using fnmatch, or can be
41+
looked up directly as a file."""
42+
return "*" in pat or "?" in pat or "[" in pat
43+
44+
3945
@functools.cache
4046
def _is_case_sensitive(pathmod):
4147
return pathmod.normcase('Aa') == 'Aa'
@@ -60,12 +66,42 @@ def _compile_pattern(pat, sep, case_sensitive, recursive=True):
6066
return re.compile(regex, flags=flags).match
6167

6268

63-
def _select_special(paths, part):
64-
"""Yield special literal children of the given paths."""
69+
def _select_literal(paths, part):
70+
"""Yield literal children of the given paths."""
6571
for path in paths:
6672
yield path._make_child_relpath(part)
6773

6874

75+
def _select_directories(paths):
76+
"""Yield the given paths, filtering out non-directories."""
77+
for path in paths:
78+
try:
79+
if path.is_dir():
80+
yield path
81+
except OSError:
82+
pass
83+
84+
85+
def _deselect_missing(paths):
86+
"""Yield the given paths, filtering out missing files."""
87+
for path in paths:
88+
try:
89+
path.stat(follow_symlinks=False)
90+
yield path
91+
except OSError:
92+
pass
93+
94+
95+
def _deselect_symlinks(paths):
96+
"""Yield the given paths, filtering out symlinks."""
97+
for path in paths:
98+
try:
99+
if not path.is_symlink():
100+
yield path
101+
except OSError:
102+
pass
103+
104+
69105
def _select_children(parent_paths, dir_only, follow_symlinks, match):
70106
"""Yield direct children of given paths, filtering by name and type."""
71107
if follow_symlinks is None:
@@ -799,16 +835,26 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
799835
# TODO: evaluate case-sensitivity of each directory in _select_children().
800836
case_sensitive = _is_case_sensitive(self.pathmod)
801837

838+
# User doesn't care about case sensitivity, so for non-wildcard
839+
# patterns like "foo/bar" we can stat() once rather than scandir()
840+
# twice. Returned paths may not match real filesystem case.
841+
case_preserving = False
842+
else:
843+
# Explicit case sensitivity choice provided. We must use scandir()
844+
# to retrieve and match filenames with real filesystem case.
845+
case_preserving = True
846+
802847
stack = pattern._pattern_stack
803848
specials = ('', '.', '..')
849+
check_paths = False
804850
deduplicate_paths = False
805851
sep = self.pathmod.sep
806852
paths = iter([self] if self.is_dir() else [])
807853
while stack:
808854
part = stack.pop()
809855
if part in specials:
810856
# Join special component (e.g. '..') onto paths.
811-
paths = _select_special(paths, part)
857+
paths = _select_literal(paths, part)
812858

813859
elif part == '**':
814860
# Consume following '**' components, which have no effect.
@@ -826,6 +872,11 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
826872
# re.Pattern object based on those components.
827873
match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None
828874

875+
# Ensure directories exist.
876+
if check_paths:
877+
paths = _select_directories(paths)
878+
check_paths = False
879+
829880
# Recursively walk directories, filtering by type and regex.
830881
paths = _select_recursive(paths, bool(stack), follow_symlinks, match)
831882

@@ -837,13 +888,32 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
837888
elif '**' in part:
838889
raise ValueError("Invalid pattern: '**' can only be an entire path component")
839890

840-
else:
891+
elif case_preserving or _is_wildcard_pattern(part):
841892
# If the pattern component isn't '*', compile an re.Pattern
842893
# object based on the component.
843894
match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None
844895

845896
# Iterate over directories' children filtering by type and regex.
846897
paths = _select_children(paths, bool(stack), follow_symlinks, match)
898+
899+
# Paths are known to exist: they're directory children from _scandir()
900+
check_paths = False
901+
902+
else:
903+
# Join non-wildcard component onto paths.
904+
paths = _select_literal(paths, part)
905+
906+
# Filter out symlinks if requested.
907+
if follow_symlinks is False:
908+
paths = _deselect_symlinks(paths)
909+
910+
# Paths might not exist; mark them to be checked.
911+
check_paths = True
912+
913+
if check_paths:
914+
# Filter out paths that don't exist.
915+
paths = _deselect_missing(paths)
916+
847917
return paths
848918

849919
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):

Lib/test/test_pathlib/test_pathlib_abc.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1431,10 +1431,10 @@ def __repr__(self):
14311431
return "{}({!r})".format(self.__class__.__name__, self.as_posix())
14321432

14331433
def stat(self, *, follow_symlinks=True):
1434-
if follow_symlinks:
1435-
path = str(self.resolve())
1434+
if follow_symlinks or self.name == '..':
1435+
path = str(self.resolve(strict=True))
14361436
else:
1437-
path = str(self.parent.resolve() / self.name)
1437+
path = str(self.parent.resolve(strict=True) / self.name)
14381438
if path in self._files:
14391439
st_mode = stat.S_IFREG
14401440
elif path in self._directories:
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Speed up handling of non-wildcard pattern segments in
2+
:meth:`pathlib.Path.glob`.

0 commit comments

Comments
 (0)