Commit 9374acf

Improve performance of deletions in initial fine-grained cache load (#4701)
To do this we just fully ditch process_graph for doing initial cache loads, since we don't want to do anything but load the graph and trees from the cache. This has the side effect of disabling any writes to the cache while doing a successful cache load, so for consistency we also disable cache writes when a fine-grained cache load fails and we give up on it.
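
As a rough illustration of the gating described above, here is a minimal, self-contained sketch; the Options and Manager classes below are simplified stand-ins, not mypy's real classes, and the actual changes are in the mypy/build.py diff that follows.

# A minimal sketch of the cache gating this commit introduces, using
# simplified stand-in classes (not mypy's real Options/BuildManager).
from dataclasses import dataclass


@dataclass
class Options:
    incremental: bool = True
    fine_grained_incremental: bool = False
    use_fine_grained_cache: bool = False


class Manager:
    def __init__(self, options: Options) -> None:
        self.options = options
        # Decided once from options, then flipped off after a successful
        # initial fine-grained load, or when such a load fails midway.
        self.cache_enabled = options.incremental and (
            not options.fine_grained_incremental or options.use_fine_grained_cache)

    def use_fine_grained_cache(self) -> bool:
        # Cache reads for the fine-grained load happen only while the
        # cache is still enabled.
        return self.cache_enabled and self.options.use_fine_grained_cache

    def write_cache(self) -> None:
        if not self.cache_enabled:
            return  # a fine-grained load (successful or abandoned) never writes
        ...  # serialize trees and metadata here


manager = Manager(Options(fine_grained_incremental=True,
                          use_fine_grained_cache=True))
assert manager.use_fine_grained_cache()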
1 parent 049925c commit 9374acf

File tree

10 files changed: +229 -65 lines changed

mypy/build.py

Lines changed: 54 additions & 23 deletions
@@ -80,6 +80,7 @@ class BuildResult:
       manager: The build manager.
       files: Dictionary from module name to related AST node.
       types: Dictionary from parse tree node to its inferred type.
+      used_cache: Whether the build took advantage of a cache
       errors: List of error messages.
     """

@@ -88,6 +89,7 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None:
         self.graph = graph
         self.files = manager.modules
         self.types = manager.all_types  # Non-empty for tests only or if dumping deps
+        self.used_cache = manager.cache_enabled
         self.errors = []  # type: List[str]  # Filled in by build if desired

@@ -569,6 +571,9 @@ class BuildManager:
       flush_errors: A function for processing errors after each SCC
       saved_cache: Dict with saved cache state for coarse-grained dmypy
                    (read-write!)
+      cache_enabled: Whether cache usage is enabled. This is set based on options,
+                     but is disabled if fine-grained cache loading fails
+                     and after an initial fine-grained load.
       stats: Dict with various instrumentation numbers
     """

@@ -588,7 +593,6 @@ def __init__(self, data_dir: str,
         self.data_dir = data_dir
         self.errors = errors
         self.errors.set_ignore_prefix(ignore_prefix)
-        self.only_load_from_cache = options.use_fine_grained_cache
         self.lib_path = tuple(lib_path)
         self.source_set = source_set
         self.reports = reports

@@ -607,9 +611,14 @@ def __init__(self, data_dir: str,
         self.rechecked_modules = set()  # type: Set[str]
         self.plugin = plugin
         self.flush_errors = flush_errors
+        self.cache_enabled = options.incremental and (
+            not options.fine_grained_incremental or options.use_fine_grained_cache)
         self.saved_cache = saved_cache if saved_cache is not None else {}  # type: SavedCache
         self.stats = {}  # type: Dict[str, Any]  # Values are ints or floats

+    def use_fine_grained_cache(self) -> bool:
+        return self.cache_enabled and self.options.use_fine_grained_cache
+
     def maybe_swap_for_shadow_path(self, path: str) -> str:
         if (self.options.shadow_file and
                 os.path.samefile(self.options.shadow_file[0], path)):

@@ -1157,7 +1166,7 @@ def validate_meta(meta: Optional[CacheMeta], id: str, path: Optional[str],
     # changed since the cache was generated. We *don't* want to do a
     # coarse-grained incremental rebuild, so we accept the cache
     # metadata even if it doesn't match the source file.
-    if manager.options.use_fine_grained_cache:
+    if manager.use_fine_grained_cache():
         manager.log('Using potentially stale metadata for {}'.format(id))
         return meta

@@ -1655,7 +1664,7 @@ def __init__(self,
         self.path = path
         self.xpath = path or '<string>'
         self.source = source
-        if path and source is None and self.options.incremental:
+        if path and source is None and self.manager.cache_enabled:
             self.meta = find_cache_meta(self.id, path, manager)
             # TODO: Get mtime if not cached.
             if self.meta is not None:

@@ -1675,10 +1684,10 @@
                              for id, line in zip(self.meta.dependencies, self.meta.dep_lines)}
             self.child_modules = set(self.meta.child_modules)
         else:
-            # In fine-grained cache mode, pretend we only know about modules that
-            # have cache information and defer handling new modules until the
-            # fine-grained update.
-            if manager.only_load_from_cache:
+            # When doing a fine-grained cache load, pretend we only
+            # know about modules that have cache information and defer
+            # handling new modules until the fine-grained update.
+            if manager.use_fine_grained_cache():
                 manager.log("Deferring module to fine-grained update %s (%s)" % (path, id))
                 raise ModuleNotFound

@@ -1795,13 +1804,15 @@ def load_tree(self) -> None:

     def fix_cross_refs(self) -> None:
         assert self.tree is not None, "Internal error: method must be called on parsed file only"
+        # We need to set quick_and_dirty when doing a fine grained
+        # cache load because we need to gracefully handle missing modules.
         fixup_module_pass_one(self.tree, self.manager.modules,
-                              self.manager.options.quick_and_dirty)
+                              self.manager.options.quick_and_dirty or
+                              self.manager.use_fine_grained_cache())

     def calculate_mros(self) -> None:
         assert self.tree is not None, "Internal error: method must be called on parsed file only"
-        fixup_module_pass_two(self.tree, self.manager.modules,
-                              self.manager.options.quick_and_dirty)
+        fixup_module_pass_two(self.tree, self.manager.modules)

     def patch_dependency_parents(self) -> None:
         """

@@ -2058,7 +2069,7 @@ def valid_references(self) -> Set[str]:

     def write_cache(self) -> None:
         assert self.tree is not None, "Internal error: method must be called on parsed file only"
-        if not self.path or self.options.cache_dir == os.devnull:
+        if not self.path or not self.manager.cache_enabled:
             return
         if self.manager.options.quick_and_dirty:
             is_errors = self.manager.errors.is_errors_for_file(self.path)

@@ -2105,9 +2116,12 @@ def dispatch(sources: List[BuildSource], manager: BuildManager) -> Graph:
     # This is a kind of unfortunate hack to work around some of fine-grained's
     # fragility: if we have loaded less than 50% of the specified files from
     # cache in fine-grained cache mode, load the graph again honestly.
-    if manager.options.use_fine_grained_cache and len(graph) < 0.50 * len(sources):
-        manager.log("Redoing load_graph because too much was missing")
-        manager.only_load_from_cache = False
+    # In this case, we just turn the cache off entirely, so we don't need
+    # to worry about some files being loaded and some from cache and so
+    # that fine-grained mode never *writes* to the cache.
+    if manager.use_fine_grained_cache() and len(graph) < 0.50 * len(sources):
+        manager.log("Redoing load_graph without cache because too much was missing")
+        manager.cache_enabled = False
         graph = load_graph(sources, manager)

     t1 = time.time()

@@ -2128,7 +2142,13 @@ def dispatch(sources: List[BuildSource], manager: BuildManager) -> Graph:
     if manager.options.dump_graph:
         dump_graph(graph)
         return graph
-    process_graph(graph, manager)
+    # If we are loading a fine-grained incremental mode cache, we
+    # don't want to do a real incremental reprocess of the graph---we
+    # just want to load in all of the cache information.
+    if manager.use_fine_grained_cache():
+        process_fine_grained_cache_graph(graph, manager)
+    else:
+        process_graph(graph, manager)
     updated = preserve_cache(graph)
     set_updated = set(updated)
     manager.saved_cache.clear()

@@ -2437,14 +2457,6 @@ def process_graph(graph: Graph, manager: BuildManager) -> None:
             manager.log("Processing SCC of size %d (%s) as %s" % (size, scc_str, fresh_msg))
             process_stale_scc(graph, scc, manager)

-        # If we are running in fine-grained incremental mode with caching,
-        # we always process fresh SCCs so that we have all of the symbol
-        # tables and fine-grained dependencies available.
-        if manager.options.use_fine_grained_cache:
-            for prev_scc in fresh_scc_queue:
-                process_fresh_scc(graph, prev_scc, manager)
-            fresh_scc_queue = []
-
     sccs_left = len(fresh_scc_queue)
     nodes_left = sum(len(scc) for scc in fresh_scc_queue)
     manager.add_stats(sccs_left=sccs_left, nodes_left=nodes_left)

@@ -2456,6 +2468,25 @@ def process_graph(graph: Graph, manager: BuildManager) -> None:
         manager.log("No fresh SCCs left in queue")


+def process_fine_grained_cache_graph(graph: Graph, manager: BuildManager) -> None:
+    """Finish loading everything for use in the fine-grained incremental cache"""
+
+    # If we are running in fine-grained incremental mode with caching,
+    # we process all SCCs as fresh SCCs so that we have all of the symbol
+    # tables and fine-grained dependencies available.
+    # We fail the loading of any SCC that we can't load a meta for, so we
+    # don't have anything *but* fresh SCCs.
+    sccs = sorted_components(graph)
+    manager.log("Found %d SCCs; largest has %d nodes" %
+                (len(sccs), max(len(scc) for scc in sccs)))
+
+    for ascc in sccs:
+        # Order the SCC's nodes using a heuristic.
+        # Note that ascc is a set, and scc is a list.
+        scc = order_ascc(graph, ascc)
+        process_fresh_scc(graph, scc, manager)
+
+
 def order_ascc(graph: Graph, ascc: AbstractSet[str], pri_max: int = PRI_ALL) -> List[str]:
     """Come up with the ideal processing order within an SCC.

mypy/dmypy_server.py

Lines changed: 13 additions & 6 deletions
@@ -286,9 +286,6 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict
         self.fscache = FileSystemCache(self.options.python_version)
         self.fswatcher = FileSystemWatcher(self.fscache)
         self.update_sources(sources)
-        if not self.options.use_fine_grained_cache:
-            # Stores the initial state of sources as a side effect.
-            self.fswatcher.find_changed()
         try:
             result = mypy.build.build(sources=sources,
                                       options=self.options,

@@ -305,12 +302,11 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict
         graph = result.graph
         self.fine_grained_manager = FineGrainedBuildManager(manager, graph)
         self.previous_sources = sources
-        self.fscache.flush()

         # If we are using the fine-grained cache, build hasn't actually done
         # the typechecking on the updated files yet.
         # Run a fine-grained update starting from the cached data
-        if self.options.use_fine_grained_cache:
+        if result.used_cache:
             # Pull times and hashes out of the saved_cache and stick them into
             # the fswatcher, so we pick up the changes.
             for state in self.fine_grained_manager.graph.values():

@@ -323,9 +319,20 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict

             # Run an update
             changed = self.find_changed(sources)
+
+            # Find anything that has had its dependency list change
+            for state in self.fine_grained_manager.graph.values():
+                if not state.is_fresh():
+                    assert state.path is not None
+                    changed.append((state.id, state.path))
+
             if changed:
                 messages = self.fine_grained_manager.update(changed)
-            self.fscache.flush()
+        else:
+            # Stores the initial state of sources as a side effect.
+            self.fswatcher.find_changed()
+
+        self.fscache.flush()

         status = 1 if messages else 0
         self.previous_messages = messages[:]

mypy/fixup.py

Lines changed: 1 addition & 2 deletions
@@ -22,8 +22,7 @@ def fixup_module_pass_one(tree: MypyFile, modules: Dict[str, MypyFile],
     node_fixer.visit_symbol_table(tree.names)


-def fixup_module_pass_two(tree: MypyFile, modules: Dict[str, MypyFile],
-                          quick_and_dirty: bool) -> None:
+def fixup_module_pass_two(tree: MypyFile, modules: Dict[str, MypyFile]) -> None:
     compute_all_mros(tree.names, modules)

mypy/server/astdiff.py

Lines changed: 3 additions & 1 deletion
@@ -136,7 +136,9 @@ def snapshot_symbol_table(name_prefix: str, table: SymbolTable) -> Dict[str, Sna
         common = (fullname, symbol.kind, symbol.module_public)
         if symbol.kind == MODULE_REF:
             # This is a cross-reference to another module.
-            assert isinstance(node, MypyFile)
+            # If the reference is busted because the other module is missing,
+            # the node will be a "stale_info" TypeInfo produced by fixup,
+            # but that doesn't really matter to us here.
             result[name] = ('Moduleref', common)
         elif symbol.kind == TVAR:
             assert isinstance(node, TypeVarExpr)

mypy/server/update.py

Lines changed: 12 additions & 4 deletions
@@ -171,13 +171,17 @@ def __init__(self,
         # Module that we haven't processed yet but that are known to be stale.
         self.stale = []  # type: List[Tuple[str, str]]
         # Disable the cache so that load_graph doesn't try going back to disk
-        # for the cache. This is kind of a hack and it might be better to have
-        # this directly reflected in load_graph's interface.
-        self.options.cache_dir = os.devnull
+        # for the cache.
+        self.manager.cache_enabled = False
         manager.saved_cache = {}
-        manager.only_load_from_cache = False
+
+        # Some hints to the test suite about what is going on:
         # Active triggers during the last update
         self.triggered = []  # type: List[str]
+        # Modules passed to update during the last update
+        self.changed_modules = []  # type: List[Tuple[str, str]]
+        # Modules processed during the last update
+        self.updated_modules = []  # type: List[str]

@@ -198,10 +202,13 @@ def update(self, changed_modules: List[Tuple[str, str]]) -> List[str]:
         """
         assert changed_modules, 'No changed modules'

+        self.changed_modules = changed_modules
+
         # Reset global caches for the new build.
         find_module_clear_caches()

         self.triggered = []
+        self.updated_modules = []
         changed_modules = dedupe_modules(changed_modules + self.stale)
         initial_set = {id for id, _ in changed_modules}
         self.manager.log_fine_grained('==== update %s ====' % ', '.join(

@@ -258,6 +265,7 @@ def update_single(self, module: str, path: str) -> Tuple[List[str],
         - Whether there was a blocking error in the module
         """
         self.manager.log_fine_grained('--- update single %r ---' % module)
+        self.updated_modules.append(module)

         # TODO: If new module brings in other modules, we parse some files multiple times.
         manager = self.manager

mypy/test/helpers.py

Lines changed: 16 additions & 1 deletion
@@ -5,7 +5,7 @@
 import time
 import shutil

-from typing import List, Dict, Tuple, Callable, Any, Optional
+from typing import List, Iterable, Dict, Tuple, Callable, Any, Optional

 from mypy import defaults
 from mypy.test.config import test_temp_dir

@@ -98,6 +98,21 @@ def assert_string_arrays_equal(expected: List[str], actual: List[str],
         raise AssertionError(msg)


+def assert_module_equivalence(name: str,
+                              expected: Optional[Iterable[str]], actual: Iterable[str]) -> None:
+    if expected is not None:
+        expected_normalized = sorted(expected)
+        actual_normalized = sorted(set(actual).difference({"__main__"}))
+        assert_string_arrays_equal(
+            expected_normalized,
+            actual_normalized,
+            ('Actual modules ({}) do not match expected modules ({}) '
+             'for "[{} ...]"').format(
+                ', '.join(actual_normalized),
+                ', '.join(expected_normalized),
+                name))
+
+
 def update_testcase_output(testcase: DataDrivenTestCase, output: List[str]) -> None:
     assert testcase.old_cwd is not None, "test was not properly set up"
     testcase_path = os.path.join(testcase.old_cwd, testcase.file)

mypy/test/testcheck.py

Lines changed: 3 additions & 17 deletions
@@ -11,7 +11,7 @@
 from mypy.test.config import test_temp_dir
 from mypy.test.data import DataDrivenTestCase, DataSuite
 from mypy.test.helpers import (
-    assert_string_arrays_equal, normalize_error_messages,
+    assert_string_arrays_equal, normalize_error_messages, assert_module_equivalence,
     retry_on_error, update_testcase_output, parse_options,
     copy_and_fudge_mtime
 )

@@ -190,29 +190,15 @@ def run_case_once(self, testcase: DataDrivenTestCase, incremental_step: int = 0)
             self.verify_cache(module_data, a, res.manager)
             if incremental_step > 1:
                 suffix = '' if incremental_step == 2 else str(incremental_step - 1)
-                self.check_module_equivalence(
+                assert_module_equivalence(
                     'rechecked' + suffix,
                     testcase.expected_rechecked_modules.get(incremental_step - 1),
                     res.manager.rechecked_modules)
-                self.check_module_equivalence(
+                assert_module_equivalence(
                     'stale' + suffix,
                     testcase.expected_stale_modules.get(incremental_step - 1),
                     res.manager.stale_modules)

-    def check_module_equivalence(self, name: str,
-                                 expected: Optional[Set[str]], actual: Set[str]) -> None:
-        if expected is not None:
-            expected_normalized = sorted(expected)
-            actual_normalized = sorted(actual.difference({"__main__"}))
-            assert_string_arrays_equal(
-                expected_normalized,
-                actual_normalized,
-                ('Actual modules ({}) do not match expected modules ({}) '
-                 'for "[{} ...]"').format(
-                    ', '.join(actual_normalized),
-                    ', '.join(expected_normalized),
-                    name))
-
     def verify_cache(self, module_data: List[Tuple[str, str, str]], a: List[str],
                      manager: build.BuildManager) -> None:
         # There should be valid cache metadata for each module except
