Skip to content

Commit 04bc7f6

Browse files
committed
FindModuleCache: leverage BuildSourceSet
On large codebases, the time in `load_graph` is dominated by `find_module` because this operation is itself `O(n)` where `n` is the number of input files, which ends up being `O(n**2)` because it is called for every import statement in the codebase. Introduce a fast path that leverages the fact that for imports within the code being typechecked, we already have a mapping of module import path to file path in `BuildSourceSet` In a real-world codebase with ~13k files split across dozens of disjoint folders, this brings `load_graph` from ~180s down to ~48s, with profiling showing that `parse` is now taking the vast majority of the time, as expected.
1 parent 835b427 commit 04bc7f6

File tree

2 files changed

+100
-32
lines changed

2 files changed

+100
-32
lines changed

mypy/build.py

Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
from mypy import moduleinfo
4444
from mypy.fixup import fixup_module
4545
from mypy.modulefinder import (
46-
BuildSource, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult,
46+
BuildSource, BuildSourceSet, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult,
4747
ModuleNotFoundReason
4848
)
4949
from mypy.nodes import Expression
@@ -106,33 +106,6 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None:
106106
self.errors = [] # type: List[str] # Filled in by build if desired
107107

108108

109-
class BuildSourceSet:
110-
"""Efficiently test a file's membership in the set of build sources."""
111-
112-
def __init__(self, sources: List[BuildSource]) -> None:
113-
self.source_text_present = False
114-
self.source_modules = set() # type: Set[str]
115-
self.source_paths = set() # type: Set[str]
116-
117-
for source in sources:
118-
if source.text is not None:
119-
self.source_text_present = True
120-
elif source.path:
121-
self.source_paths.add(source.path)
122-
else:
123-
self.source_modules.add(source.module)
124-
125-
def is_source(self, file: MypyFile) -> bool:
126-
if file.path and file.path in self.source_paths:
127-
return True
128-
elif file._fullname in self.source_modules:
129-
return True
130-
elif self.source_text_present:
131-
return True
132-
else:
133-
return False
134-
135-
136109
def build(sources: List[BuildSource],
137110
options: Options,
138111
alt_lib_path: Optional[str] = None,
@@ -621,7 +594,8 @@ def __init__(self, data_dir: str,
621594
or options.use_fine_grained_cache)
622595
and not has_reporters)
623596
self.fscache = fscache
624-
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options)
597+
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options,
598+
source_set=self.source_set)
625599
self.metastore = create_metastore(options)
626600

627601
# a mapping from source files to their corresponding shadow files

mypy/modulefinder.py

Lines changed: 97 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from mypy.defaults import PYTHON3_VERSION_MIN
1818
from mypy.fscache import FileSystemCache
19+
from mypy.nodes import MypyFile
1920
from mypy.options import Options
2021
from mypy import sitepkgs
2122

@@ -92,6 +93,33 @@ def __repr__(self) -> str:
9293
self.base_dir)
9394

9495

96+
class BuildSourceSet:
97+
"""Efficiently test a file's membership in the set of build sources."""
98+
99+
def __init__(self, sources: List[BuildSource]) -> None:
100+
self.source_text_present = False
101+
self.source_modules = {} # type: Dict[str, str]
102+
self.source_paths = set() # type: Set[str]
103+
104+
for source in sources:
105+
if source.text is not None:
106+
self.source_text_present = True
107+
if source.path:
108+
self.source_paths.add(source.path)
109+
if source.module:
110+
self.source_modules[source.module] = source.path or ''
111+
112+
def is_source(self, file: MypyFile) -> bool:
113+
if file.path and file.path in self.source_paths:
114+
return True
115+
elif file._fullname in self.source_modules:
116+
return True
117+
elif self.source_text_present:
118+
return True
119+
else:
120+
return False
121+
122+
95123
class FindModuleCache:
96124
"""Module finder with integrated cache.
97125
@@ -107,8 +135,10 @@ def __init__(self,
107135
search_paths: SearchPaths,
108136
fscache: Optional[FileSystemCache] = None,
109137
options: Optional[Options] = None,
110-
ns_packages: Optional[List[str]] = None) -> None:
138+
ns_packages: Optional[List[str]] = None,
139+
source_set: Optional[BuildSourceSet] = None) -> None:
111140
self.search_paths = search_paths
141+
self.source_set = source_set
112142
self.fscache = fscache or FileSystemCache()
113143
# Cache for get_toplevel_possibilities:
114144
# search_paths -> (toplevel_id -> list(package_dirs))
@@ -124,6 +154,39 @@ def clear(self) -> None:
124154
self.initial_components.clear()
125155
self.ns_ancestors.clear()
126156

157+
def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]:
158+
if not self.source_set:
159+
return None
160+
p = self.source_set.source_modules.get(id, None)
161+
if p and self.fscache.isfile(p):
162+
# NB: need to make sure we still have __init__.py all the way up
163+
# otherwise we might have false positives compared to slow path
164+
d = os.path.dirname(p)
165+
for i in range(id.count('.')):
166+
if not self.fscache.isfile(os.path.join(d, '__init__.py')):
167+
return None
168+
d = os.path.dirname(d)
169+
return p
170+
171+
idx = id.rfind('.')
172+
if idx != - 1:
173+
parent = self.find_module_via_source_set(id[:idx])
174+
if (
175+
parent and isinstance(parent, str)
176+
and not parent.endswith('__init__.py')
177+
and not self.fscache.isdir(os.path.splitext(parent)[0])
178+
):
179+
# if
180+
# 1. we're looking for foo.bar.baz
181+
# 2. foo.bar.py[i] is in the source set
182+
# 3. foo.bar is not a directory
183+
# then we don't want to go spelunking in other search paths to find
184+
# another 'bar' module, because it's a waste of time and even in the
185+
# unlikely event that we did find one that matched, it probably would
186+
# be completely unrelated and undesirable
187+
return ModuleNotFoundReason.NOT_FOUND
188+
return None
189+
127190
def find_lib_path_dirs(self, id: str, lib_path: Tuple[str, ...]) -> PackageDirs:
128191
"""Find which elements of a lib_path have the directory a module needs to exist.
129192
@@ -209,8 +272,8 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
209272
"""
210273
working_dir = os.getcwd()
211274
parent_search = FindModuleCache(SearchPaths((), (), (), ()))
212-
while any(file.endswith(("__init__.py", "__init__.pyi"))
213-
for file in os.listdir(working_dir)):
275+
while any(os.path.exists(os.path.join(working_dir, f))
276+
for f in ["__init__.py", "__init__.pyi"]):
214277
working_dir = os.path.dirname(working_dir)
215278
parent_search.search_paths = SearchPaths((working_dir,), (), (), ())
216279
if not isinstance(parent_search._find_module(id), ModuleNotFoundReason):
@@ -220,6 +283,37 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
220283
def _find_module(self, id: str) -> ModuleSearchResult:
221284
fscache = self.fscache
222285

286+
# fast path for any modules in the current source set
287+
# this is particularly important when there are a large number of search
288+
# paths which share the first (few) component(s) due to the use of namespace
289+
# packages, for instance
290+
# foo/
291+
# company/
292+
# __init__.py
293+
# foo/
294+
# bar/
295+
# company/
296+
# __init__.py
297+
# bar/
298+
# baz/
299+
# company/
300+
# __init__.py
301+
# baz/
302+
#
303+
# mypy gets [foo/company/foo, foo/company/bar, foo/company/baz, ...] as input
304+
# and computes [foo, bar, baz, ...] as the module search path
305+
#
306+
# This would result in O(n) search for every import of company.* and since,
307+
# leading to O(n**2) behavior in load_graph as such imports are unsurprisingly
308+
# present at least once, and usually many more times than that, in each and
309+
# every file being parsed
310+
#
311+
# Thankfully, such cases are efficiently handled by looking up the module path
312+
# via BuildSourceSet
313+
p = self.find_module_via_source_set(id)
314+
if p:
315+
return p
316+
223317
# If we're looking for a module like 'foo.bar.baz', it's likely that most of the
224318
# many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover
225319
# that only once and cache it for when we look for modules like 'foo.bar.blah'

0 commit comments

Comments
 (0)