Skip to content

Commit 4ab2035

Browse files
committed
FindModuleCache: optionally leverage BuildSourceSet
Gated behind a command line flag to assuage concerns about subtle issues in module lookup being introduced by this fast path.
1 parent 0cec4f7 commit 4ab2035

File tree

4 files changed

+119
-32
lines changed

4 files changed

+119
-32
lines changed

mypy/build.py

+4-30
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
from mypy.report import Reports # Avoid unconditional slow import
4343
from mypy.fixup import fixup_module
4444
from mypy.modulefinder import (
45-
BuildSource, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult,
46-
ModuleNotFoundReason
45+
BuildSource, BuildSourceSet, compute_search_paths, FindModuleCache, SearchPaths,
46+
ModuleSearchResult, ModuleNotFoundReason
4747
)
4848
from mypy.nodes import Expression
4949
from mypy.options import Options
@@ -106,33 +106,6 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None:
106106
self.errors: List[str] = [] # Filled in by build if desired
107107

108108

109-
class BuildSourceSet:
110-
"""Efficiently test a file's membership in the set of build sources."""
111-
112-
def __init__(self, sources: List[BuildSource]) -> None:
113-
self.source_text_present = False
114-
self.source_modules: Set[str] = set()
115-
self.source_paths: Set[str] = set()
116-
117-
for source in sources:
118-
if source.text is not None:
119-
self.source_text_present = True
120-
elif source.path:
121-
self.source_paths.add(source.path)
122-
else:
123-
self.source_modules.add(source.module)
124-
125-
def is_source(self, file: MypyFile) -> bool:
126-
if file.path and file.path in self.source_paths:
127-
return True
128-
elif file._fullname in self.source_modules:
129-
return True
130-
elif self.source_text_present:
131-
return True
132-
else:
133-
return False
134-
135-
136109
def build(sources: List[BuildSource],
137110
options: Options,
138111
alt_lib_path: Optional[str] = None,
@@ -627,7 +600,8 @@ def __init__(self, data_dir: str,
627600
or options.use_fine_grained_cache)
628601
and not has_reporters)
629602
self.fscache = fscache
630-
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options)
603+
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options,
604+
source_set=self.source_set)
631605
self.metastore = create_metastore(options)
632606

633607
# a mapping from source files to their corresponding shadow files

mypy/main.py

+4
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,10 @@ def add_invertible_flag(flag: str,
870870
'--explicit-package-bases', default=False,
871871
help="Use current directory and MYPYPATH to determine module names of files passed",
872872
group=code_group)
873+
add_invertible_flag(
874+
'--fast-module-lookup', default=False,
875+
help="Enable fast path for finding modules within input sources",
876+
group=code_group)
873877
code_group.add_argument(
874878
"--exclude",
875879
action="append",

mypy/modulefinder.py

+109-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from typing_extensions import Final, TypeAlias as _TypeAlias
1717

1818
from mypy.fscache import FileSystemCache
19+
from mypy.nodes import MypyFile
1920
from mypy.options import Options
2021
from mypy.stubinfo import is_legacy_bundled_package
2122
from mypy import pyinfo
@@ -115,6 +116,33 @@ def __repr__(self) -> str:
115116
self.base_dir)
116117

117118

119+
class BuildSourceSet:
120+
"""Helper to efficiently test a file's membership in a set of build sources."""
121+
122+
def __init__(self, sources: List[BuildSource]) -> None:
123+
self.source_text_present = False
124+
self.source_modules = {} # type: Dict[str, str]
125+
self.source_paths = set() # type: Set[str]
126+
127+
for source in sources:
128+
if source.text is not None:
129+
self.source_text_present = True
130+
if source.path:
131+
self.source_paths.add(source.path)
132+
if source.module:
133+
self.source_modules[source.module] = source.path or ''
134+
135+
def is_source(self, file: MypyFile) -> bool:
136+
if file.path and file.path in self.source_paths:
137+
return True
138+
elif file._fullname in self.source_modules:
139+
return True
140+
elif self.source_text_present:
141+
return True
142+
else:
143+
return False
144+
145+
118146
class FindModuleCache:
119147
"""Module finder with integrated cache.
120148
@@ -130,8 +158,10 @@ def __init__(self,
130158
search_paths: SearchPaths,
131159
fscache: Optional[FileSystemCache],
132160
options: Optional[Options],
133-
stdlib_py_versions: Optional[StdlibVersions] = None) -> None:
161+
stdlib_py_versions: Optional[StdlibVersions] = None,
162+
source_set: Optional[BuildSourceSet] = None) -> None:
134163
self.search_paths = search_paths
164+
self.source_set = source_set
135165
self.fscache = fscache or FileSystemCache()
136166
# Cache for get_toplevel_possibilities:
137167
# search_paths -> (toplevel_id -> list(package_dirs))
@@ -153,6 +183,50 @@ def clear(self) -> None:
153183
self.initial_components.clear()
154184
self.ns_ancestors.clear()
155185

186+
def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]:
187+
if not self.source_set:
188+
return None
189+
190+
p = self.source_set.source_modules.get(id, None)
191+
if p and self.fscache.isfile(p):
192+
# We need to make sure we still have __init__.py all the way up
193+
# otherwise we might have false positives compared to the slow path
194+
# in case of deletion of init files, which is covered by some tests.
195+
# TODO: is there some combination of flags in which this check should be skipped?
196+
d = os.path.dirname(p)
197+
for _ in range(id.count('.')):
198+
if not any(self.fscache.isfile(os.path.join(d, '__init__' + x))
199+
for x in PYTHON_EXTENSIONS):
200+
return None
201+
d = os.path.dirname(d)
202+
return p
203+
204+
idx = id.rfind('.')
205+
if idx != -1:
206+
# When we're looking for foo.bar.baz and can't find a matching module
207+
# in the source set, look for a foo.bar module.
208+
parent = self.find_module_via_source_set(id[:idx])
209+
if parent is None or not isinstance(parent, str):
210+
return None
211+
212+
basename, ext = os.path.splitext(parent)
213+
if (not any(parent.endswith('__init__' + x) for x in PYTHON_EXTENSIONS)
214+
and (ext in PYTHON_EXTENSIONS and not self.fscache.isdir(basename))):
215+
# If we do find such a *module* (and crucially, we don't want a package,
216+
# hence the filtering out of __init__ files, and checking for the presence
217+
# of a folder with a matching name), then we can be pretty confident that
218+
# 'baz' will either be a top-level variable in foo.bar, or will not exist.
219+
#
220+
# Either way, spelunking in other search paths for another 'foo.bar.baz'
221+
# module should be avoided because:
222+
# 1. in the unlikely event that one were found, it's highly likely that
223+
# it would be unrelated to the source being typechecked and therefore
224+
# more likely to lead to erroneous results
225+
# 2. as described in _find_module, in some cases the search itself could
226+
# potentially waste significant amounts of time
227+
return ModuleNotFoundReason.NOT_FOUND
228+
return None
229+
156230
def find_lib_path_dirs(self, id: str, lib_path: Tuple[str, ...]) -> PackageDirs:
157231
"""Find which elements of a lib_path have the directory a module needs to exist.
158232
@@ -218,7 +292,7 @@ def find_module(self, id: str, *, fast_path: bool = False) -> ModuleSearchResult
218292
elif top_level in self.stdlib_py_versions:
219293
use_typeshed = self._typeshed_has_version(top_level)
220294
self.results[id] = self._find_module(id, use_typeshed)
221-
if (not fast_path
295+
if (not (fast_path or (self.options is not None and self.options.fast_module_lookup))
222296
and self.results[id] is ModuleNotFoundReason.NOT_FOUND
223297
and self._can_find_module_in_parent_dir(id)):
224298
self.results[id] = ModuleNotFoundReason.WRONG_WORKING_DIRECTORY
@@ -284,6 +358,39 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
284358
def _find_module(self, id: str, use_typeshed: bool) -> ModuleSearchResult:
285359
fscache = self.fscache
286360

361+
# Fast path for any modules in the current source set.
362+
# This is particularly important when there are a large number of search
363+
# paths which share the first (few) component(s) due to the use of namespace
364+
# packages, for instance:
365+
# foo/
366+
# company/
367+
# __init__.py
368+
# foo/
369+
# bar/
370+
# company/
371+
# __init__.py
372+
# bar/
373+
# baz/
374+
# company/
375+
# __init__.py
376+
# baz/
377+
#
378+
# mypy gets [foo/company/foo, bar/company/bar, baz/company/baz, ...] as input
379+
# and computes [foo, bar, baz, ...] as the module search path.
380+
#
381+
# This would result in O(n) search for every import of company.*, leading to
382+
# O(n**2) behavior in load_graph as such imports are unsurprisingly present
383+
# at least once, and usually many more times than that, in each and every file
384+
# being parsed.
385+
#
386+
# Thankfully, such cases are efficiently handled by looking up the module path
387+
# via BuildSourceSet.
388+
p = (self.find_module_via_source_set(id)
389+
if (self.options is not None and self.options.fast_module_lookup)
390+
else None)
391+
if p:
392+
return p
393+
287394
# If we're looking for a module like 'foo.bar.baz', it's likely that most of the
288395
# many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover
289396
# that only once and cache it for when we look for modules like 'foo.bar.blah'

mypy/options.py

+2
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,8 @@ def __init__(self) -> None:
287287
self.cache_map: Dict[str, Tuple[str, str]] = {}
288288
# Don't properly free objects on exit, just kill the current process.
289289
self.fast_exit = True
290+
# Fast path for finding modules from the source set.
291+
self.fast_module_lookup = False
290292
# Used to transform source code before parsing if not None
291293
# TODO: Make the type precise (AnyStr -> AnyStr)
292294
self.transform_source: Optional[Callable[[Any], Any]] = None

0 commit comments

Comments
 (0)