Skip to content

Commit 95a0186

Browse files
Michael0x2a authored and gvanrossum committed
Make incremental not propagate staleness if the public interface of a module is unchanged (v2) (#2014)
* Add tests for interface-based incremental checks This commit modifies the incremental tests so that changes that don't change the "public interface" of the file will not trigger a recheck in any dependees. * Add "rechecked" keyword to incremental tests This commit makes the incremental tests distinguish between "stale" and "rechecked" files. Stale files are ones where the "public interface" of that module is changed; rechecked files are ones that must be re-analyzed (perhaps because one of their dependencies changed?) but did not necessarily have a changed public interface. This functionality is currently unused, but paves a path for upcoming incremental-mode changes. * Make incremental tests use 'rechecked' keyword This commit updates the incremental tests to use the 'rechecked' keyword and recalibrates a few tests to work against a more precise interface change detector. * Make incremental checks handle remote errors Previously, the incremental checks did not correctly handle the case where... 1. Module "A" imports and uses module "B" 2. Incremental mode is successfully run 3. Module "B" is changed such that module "B" typechecks, but module "A" now throws an error 4. Incremental mode is run again What happens in this case is that the cache files for "A" and "B" are produced in the first run, and only the cache file for "B" is updated in the second run. Previously, the checks made the assumption that if a file has an error, it was due to a change within that file. This is not actually always the case.
1 parent c3fa852 commit 95a0186

File tree

6 files changed

+707
-45
lines changed

6 files changed

+707
-45
lines changed

mypy/build.py

Lines changed: 120 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import binascii
1414
import collections
1515
import contextlib
16+
import hashlib
1617
import json
1718
import os
1819
import os.path
@@ -290,6 +291,7 @@ def default_lib_path(data_dir: str, pyversion: Tuple[int, int]) -> List[str]:
290291
('child_modules', List[str]), # all submodules of the given module
291292
('options', Optional[Dict[str, bool]]), # build options
292293
('dep_prios', List[int]),
294+
('interface_hash', str), # hash representing the public interface
293295
('version_id', str), # mypy version for cache invalidation
294296
])
295297
# NOTE: dependencies + suppressed == all reachable imports;
@@ -351,6 +353,7 @@ def __init__(self, data_dir: str,
351353
self.type_checker = TypeChecker(self.errors, self.modules, options=options)
352354
self.missing_modules = set() # type: Set[str]
353355
self.stale_modules = set() # type: Set[str]
356+
self.rechecked_modules = set() # type: Set[str]
354357

355358
def all_imported_modules_in_file(self,
356359
file: MypyFile) -> List[Tuple[int, str, int]]:
@@ -728,6 +731,7 @@ def find_cache_meta(id: str, path: str, manager: BuildManager) -> Optional[Cache
728731
meta.get('child_modules', []),
729732
meta.get('options'),
730733
meta.get('dep_prios', []),
734+
meta.get('interface_hash', ''),
731735
meta.get('version_id'),
732736
)
733737
if (m.id != id or m.path != path or
@@ -750,20 +754,27 @@ def find_cache_meta(id: str, path: str, manager: BuildManager) -> Optional[Cache
750754
manager.trace('Metadata abandoned for {}: options differ'.format(id))
751755
return None
752756

757+
return m
758+
759+
760+
def is_meta_fresh(meta: CacheMeta, id: str, path: str, manager: BuildManager) -> bool:
761+
if meta is None:
762+
return False
763+
753764
# TODO: Share stat() outcome with find_module()
754765
st = os.stat(path) # TODO: Errors
755-
if st.st_mtime != m.mtime or st.st_size != m.size:
766+
if st.st_mtime != meta.mtime or st.st_size != meta.size:
756767
manager.log('Metadata abandoned for {}: file {} is modified'.format(id, path))
757768
return None
758769

759770
# It's a match on (id, path, mtime, size).
760771
# Check data_json; assume if its mtime matches it's good.
761772
# TODO: stat() errors
762-
if os.path.getmtime(data_json) != m.data_mtime:
773+
if os.path.getmtime(meta.data_json) != meta.data_mtime:
763774
manager.log('Metadata abandoned for {}: data cache is modified'.format(id))
764-
return None
765-
manager.log('Found {} {} (metadata is fresh)'.format(id, meta_json))
766-
return m
775+
return False
776+
manager.log('Found {} {} (metadata is fresh)'.format(id, meta.data_json))
777+
return True
767778

768779

769780
def select_options_affecting_cache(options: Options) -> Mapping[str, bool]:
@@ -783,10 +794,17 @@ def random_string() -> str:
783794
return binascii.hexlify(os.urandom(8)).decode('ascii')
784795

785796

797+
def compute_hash(text: str) -> str:
798+
# We use md5 instead of the builtin hash(...) function because the output of hash(...)
799+
# can differ between runs due to hash randomization (enabled by default in Python 3.3).
800+
# See the note in https://docs.python.org/3/reference/datamodel.html#object.__hash__.
801+
return hashlib.md5(text.encode('utf-8')).hexdigest()
802+
803+
786804
def write_cache(id: str, path: str, tree: MypyFile,
787805
dependencies: List[str], suppressed: List[str],
788806
child_modules: List[str], dep_prios: List[int],
789-
manager: BuildManager) -> None:
807+
old_interface_hash: str, manager: BuildManager) -> str:
790808
"""Write cache files for a module.
791809
792810
Args:
@@ -796,28 +814,52 @@ def write_cache(id: str, path: str, tree: MypyFile,
796814
dependencies: module IDs on which this module depends
797815
suppressed: module IDs which were suppressed as dependencies
798816
dep_prios: priorities (parallel array to dependencies)
817+
old_interface_hash: the hash from the previous version of the data cache file
799818
manager: the build manager (for pyversion, log/trace)
819+
820+
Return:
821+
The new interface hash based on the serialized tree
800822
"""
823+
# Obtain file paths
801824
path = os.path.abspath(path)
802-
manager.trace('Dumping {} {}'.format(id, path))
803-
st = os.stat(path) # TODO: Errors
804-
mtime = st.st_mtime
805-
size = st.st_size
806825
meta_json, data_json = get_cache_names(
807826
id, path, manager.options.cache_dir, manager.options.python_version)
808-
manager.log('Writing {} {} {}'.format(id, meta_json, data_json))
809-
data = tree.serialize()
827+
manager.log('Writing {} {} {} {}'.format(id, path, meta_json, data_json))
828+
829+
# Make sure directory for cache files exists
810830
parent = os.path.dirname(data_json)
811831
if not os.path.isdir(parent):
812832
os.makedirs(parent)
813833
assert os.path.dirname(meta_json) == parent
834+
835+
# Construct temp file names
814836
nonce = '.' + random_string()
815837
data_json_tmp = data_json + nonce
816838
meta_json_tmp = meta_json + nonce
817-
with open(data_json_tmp, 'w') as f:
818-
json.dump(data, f, indent=2, sort_keys=True)
819-
f.write('\n')
820-
data_mtime = os.path.getmtime(data_json_tmp)
839+
840+
# Serialize data and analyze interface
841+
data = tree.serialize()
842+
data_str = json.dumps(data, indent=2, sort_keys=True)
843+
interface_hash = compute_hash(data_str)
844+
845+
# Write data cache file, if applicable
846+
if old_interface_hash == interface_hash:
847+
# If the interface is unchanged, the cached data is guaranteed
848+
# to be equivalent, and we only need to update the metadata.
849+
data_mtime = os.path.getmtime(data_json)
850+
manager.trace("Interface for {} is unchanged".format(id))
851+
else:
852+
with open(data_json_tmp, 'w') as f:
853+
f.write(data_str)
854+
f.write('\n')
855+
data_mtime = os.path.getmtime(data_json_tmp)
856+
os.replace(data_json_tmp, data_json)
857+
manager.trace("Interface for {} has changed".format(id))
858+
859+
# Obtain and set up metadata
860+
st = os.stat(path) # TODO: Handle errors
861+
mtime = st.st_mtime
862+
size = st.st_size
821863
meta = {'id': id,
822864
'path': path,
823865
'mtime': mtime,
@@ -828,14 +870,18 @@ def write_cache(id: str, path: str, tree: MypyFile,
828870
'child_modules': child_modules,
829871
'options': select_options_affecting_cache(manager.options),
830872
'dep_prios': dep_prios,
873+
'interface_hash': interface_hash,
831874
'version_id': manager.version_id,
832875
}
876+
877+
# Write meta cache file
833878
with open(meta_json_tmp, 'w') as f:
834879
json.dump(meta, f, sort_keys=True)
835880
f.write('\n')
836-
os.replace(data_json_tmp, data_json)
837881
os.replace(meta_json_tmp, meta_json)
838882

883+
return interface_hash
884+
839885

840886
"""Dependency manager.
841887
@@ -1021,6 +1067,12 @@ class State:
10211067
# If caller_state is set, the line number in the caller where the import occurred
10221068
caller_line = 0
10231069

1070+
# If True, indicate that the public interface of this module is unchanged
1071+
externally_same = True
1072+
1073+
# Contains a hash of the public interface in incremental mode
1074+
interface_hash = "" # type: str
1075+
10241076
def __init__(self,
10251077
id: Optional[str],
10261078
path: Optional[str],
@@ -1100,8 +1152,10 @@ def __init__(self,
11001152
if path and source is None and manager.options.incremental:
11011153
self.meta = find_cache_meta(self.id, self.path, manager)
11021154
# TODO: Get mtime if not cached.
1155+
if self.meta is not None:
1156+
self.interface_hash = self.meta.interface_hash
11031157
self.add_ancestors()
1104-
if self.meta:
1158+
if is_meta_fresh(self.meta, self.id, self.path, manager):
11051159
# Make copies, since we may modify these and want to
11061160
# compare them to the originals later.
11071161
self.dependencies = list(self.meta.dependencies)
@@ -1113,6 +1167,7 @@ def __init__(self,
11131167
self.dep_line_map = {}
11141168
else:
11151169
# Parse the file (and then some) to get the dependencies.
1170+
self.meta = None
11161171
self.parse_file()
11171172
self.suppressed = []
11181173
self.child_modules = set()
@@ -1163,16 +1218,25 @@ def is_fresh(self) -> bool:
11631218
# suppression by --silent-imports. However when a suppressed
11641219
# dependency is added back we find out later in the process.
11651220
return (self.meta is not None
1221+
and self.is_interface_fresh()
11661222
and self.dependencies == self.meta.dependencies
11671223
and self.child_modules == set(self.meta.child_modules))
11681224

1225+
def is_interface_fresh(self) -> bool:
1226+
return self.externally_same
1227+
11691228
def has_new_submodules(self) -> bool:
11701229
"""Return if this module has new submodules after being loaded from a warm cache."""
11711230
return self.meta is not None and self.child_modules != set(self.meta.child_modules)
11721231

1173-
def mark_stale(self) -> None:
1174-
"""Throw away the cache data for this file, marking it as stale."""
1232+
def mark_as_rechecked(self) -> None:
1233+
"""Marks this module as having been fully re-analyzed by the type-checker."""
1234+
self.manager.rechecked_modules.add(self.id)
1235+
1236+
def mark_interface_stale(self) -> None:
1237+
"""Marks this module as having a stale public interface, and discards the cache data."""
11751238
self.meta = None
1239+
self.externally_same = False
11761240
self.manager.stale_modules.add(self.id)
11771241

11781242
def check_blockers(self) -> None:
@@ -1362,10 +1426,17 @@ def type_check(self) -> None:
13621426
def write_cache(self) -> None:
13631427
if self.path and self.manager.options.incremental and not self.manager.errors.is_errors():
13641428
dep_prios = [self.priorities.get(dep, PRI_HIGH) for dep in self.dependencies]
1365-
write_cache(self.id, self.path, self.tree,
1366-
list(self.dependencies), list(self.suppressed), list(self.child_modules),
1367-
dep_prios,
1368-
self.manager)
1429+
new_interface_hash = write_cache(
1430+
self.id, self.path, self.tree,
1431+
list(self.dependencies), list(self.suppressed), list(self.child_modules),
1432+
dep_prios, self.interface_hash,
1433+
self.manager)
1434+
if new_interface_hash == self.interface_hash:
1435+
self.manager.log("Cached module {} has same interface".format(self.id))
1436+
else:
1437+
self.manager.log("Cached module {} has changed interface".format(self.id))
1438+
self.mark_interface_stale()
1439+
self.interface_hash = new_interface_hash
13691440

13701441

13711442
def dispatch(sources: List[BuildSource], manager: BuildManager) -> None:
@@ -1434,6 +1505,7 @@ def load_graph(sources: List[BuildSource], manager: BuildManager) -> Graph:
14341505
for id, g in graph.items():
14351506
if g.has_new_submodules():
14361507
g.parse_file()
1508+
g.mark_interface_stale()
14371509
return graph
14381510

14391511

@@ -1472,7 +1544,7 @@ def process_graph(graph: Graph, manager: BuildManager) -> None:
14721544
for id in scc:
14731545
deps.update(graph[id].dependencies)
14741546
deps -= ascc
1475-
stale_deps = {id for id in deps if not graph[id].is_fresh()}
1547+
stale_deps = {id for id in deps if not graph[id].is_interface_fresh()}
14761548
fresh = fresh and not stale_deps
14771549
undeps = set()
14781550
if fresh:
@@ -1488,9 +1560,10 @@ def process_graph(graph: Graph, manager: BuildManager) -> None:
14881560
# All cache files are fresh. Check that no dependency's
14891561
# cache file is newer than any scc node's cache file.
14901562
oldest_in_scc = min(graph[id].meta.data_mtime for id in scc)
1491-
newest_in_deps = 0 if not deps else max(graph[dep].meta.data_mtime for dep in deps)
1563+
viable = {id for id in deps if not graph[id].is_interface_fresh()}
1564+
newest_in_deps = 0 if not viable else max(graph[dep].meta.data_mtime for dep in viable)
14921565
if manager.options.verbosity >= 3: # Dump all mtimes for extreme debugging.
1493-
all_ids = sorted(ascc | deps, key=lambda id: graph[id].meta.data_mtime)
1566+
all_ids = sorted(ascc | viable, key=lambda id: graph[id].meta.data_mtime)
14941567
for id in all_ids:
14951568
if id in scc:
14961569
if graph[id].meta.data_mtime < newest_in_deps:
@@ -1528,6 +1601,25 @@ def process_graph(graph: Graph, manager: BuildManager) -> None:
15281601
else:
15291602
process_stale_scc(graph, scc)
15301603

1604+
# TODO: This is a workaround to get around the "chaining imports" problem
1605+
# with the interface checks.
1606+
#
1607+
# That is, if we have a file named `module_a.py` which does:
1608+
#
1609+
# import module_b
1610+
# module_b.module_c.foo(3)
1611+
#
1612+
# ...and if the type signature of `module_c.foo(...)` were to change,
1613+
# module_a_ would not be rechecked since the interface of `module_b`
1614+
# would not be considered changed.
1615+
#
1616+
# As a workaround, this check will force a module's interface to be
1617+
# considered stale if anything it imports has a stale interface,
1618+
# which ensures these changes are caught and propagated.
1619+
if len(stale_deps) > 0:
1620+
for id in scc:
1621+
graph[id].mark_interface_stale()
1622+
15311623

15321624
def order_ascc(graph: Graph, ascc: AbstractSet[str], pri_max: int = PRI_ALL) -> List[str]:
15331625
"""Come up with the ideal processing order within an SCC.
@@ -1590,8 +1682,6 @@ def process_fresh_scc(graph: Graph, scc: List[str]) -> None:
15901682

15911683
def process_stale_scc(graph: Graph, scc: List[str]) -> None:
15921684
"""Process the modules in one SCC from source code."""
1593-
for id in scc:
1594-
graph[id].mark_stale()
15951685
for id in scc:
15961686
# We may already have parsed the module, or not.
15971687
# If the former, parse_file() is a no-op.
@@ -1606,6 +1696,7 @@ def process_stale_scc(graph: Graph, scc: List[str]) -> None:
16061696
for id in scc:
16071697
graph[id].type_check()
16081698
graph[id].write_cache()
1699+
graph[id].mark_as_rechecked()
16091700

16101701

16111702
def sorted_components(graph: Graph,

mypy/test/data.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def parse_test_cases(
4747

4848
files = [] # type: List[Tuple[str, str]] # path and contents
4949
stale_modules = None # type: Optional[Set[str]] # module names
50+
rechecked_modules = None # type: Optional[Set[str]] # module names
5051
while i < len(p) and p[i].id not in ['out', 'case']:
5152
if p[i].id == 'file':
5253
# Record an extra file needed for the test case.
@@ -68,12 +69,25 @@ def parse_test_cases(
6869
stale_modules = set()
6970
else:
7071
stale_modules = {item.strip() for item in p[i].arg.split(',')}
72+
elif p[i].id == 'rechecked':
73+
if p[i].arg is None:
74+
rechecked_modules = set()
75+
else:
76+
rechecked_modules = {item.strip() for item in p[i].arg.split(',')}
7177
else:
7278
raise ValueError(
7379
'Invalid section header {} in {} at line {}'.format(
7480
p[i].id, path, p[i].line))
7581
i += 1
7682

83+
if rechecked_modules is None:
84+
# If the set of rechecked modules isn't specified, make it the same as the set of
85+
# modules with a stale public interface.
86+
rechecked_modules = stale_modules
87+
if stale_modules is not None and not stale_modules.issubset(rechecked_modules):
88+
raise ValueError(
89+
'Stale modules must be a subset of rechecked modules ({})'.format(path))
90+
7791
tcout = [] # type: List[str]
7892
if i < len(p) and p[i].id == 'out':
7993
tcout = p[i].data
@@ -90,7 +104,7 @@ def parse_test_cases(
90104
lastline = p[i].line if i < len(p) else p[i - 1].line + 9999
91105
tc = DataDrivenTestCase(p[i0].arg, input, tcout, path,
92106
p[i0].line, lastline, perform,
93-
files, stale_modules)
107+
files, stale_modules, rechecked_modules)
94108
out.append(tc)
95109
if not ok:
96110
raise ValueError(
@@ -116,7 +130,7 @@ class DataDrivenTestCase(TestCase):
116130
clean_up = None # type: List[Tuple[bool, str]]
117131

118132
def __init__(self, name, input, output, file, line, lastline,
119-
perform, files, expected_stale_modules):
133+
perform, files, expected_stale_modules, expected_rechecked_modules):
120134
super().__init__(name)
121135
self.input = input
122136
self.output = output
@@ -126,6 +140,7 @@ def __init__(self, name, input, output, file, line, lastline,
126140
self.perform = perform
127141
self.files = files
128142
self.expected_stale_modules = expected_stale_modules
143+
self.expected_rechecked_modules = expected_rechecked_modules
129144

130145
def set_up(self) -> None:
131146
super().set_up()

0 commit comments

Comments (0)