Skip to content

Commit b78b621

Browse files
authored
Add several undocumented flags (--bazel, --package-root and --cache-map) in support of Bazel (#4759)
Addresses the mypy part of #3912. (The Skylark code I wrote for that is not yet open source.)

The `--bazel` flag makes cache files hermetic by using relative paths and setting mtime to zero; effectively, the presence of a cache file prevents reading of the source file (though the source file must still exist).

The `--cache-map` flag specifies a mapping from source files to cache files that overrides the usual way of finding the locations for cache files; e.g.

```
mypy --cache-map foo.py foo.meta.json foo.data.json bar.py bar.meta.json bar.data.json -- foo.py bar.py
```

The `--package-root` flag specifies a directory below which all subdirectories are considered packages (this accounts for Bazel's habit of implying empty `__init__.py` files everywhere).
1 parent 8f1bd58 commit b78b621

File tree

7 files changed

+314
-32
lines changed

7 files changed

+314
-32
lines changed

mypy/build.py

Lines changed: 63 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,6 @@
7474
Graph = Dict[str, 'State']
7575

7676

77-
def getmtime(name: str) -> int:
78-
return int(os.path.getmtime(name))
79-
80-
8177
# TODO: Get rid of BuildResult. We might as well return a BuildManager.
8278
class BuildResult:
8379
"""The result of a successful build.
@@ -230,7 +226,12 @@ def compute_lib_path(sources: List[BuildSource],
230226
# to the lib_path
231227
# TODO: Don't do this in some cases; for motivation see
232228
# https://github.com/python/mypy/issues/4195#issuecomment-341915031
233-
lib_path.appendleft(os.getcwd())
229+
if options.bazel:
230+
dir = '.'
231+
else:
232+
dir = os.getcwd()
233+
if dir not in lib_path:
234+
lib_path.appendleft(dir)
234235

235236
# Prepend a config-defined mypy path.
236237
lib_path.extendleft(options.mypy_path)
@@ -687,6 +688,31 @@ def maybe_swap_for_shadow_path(self, path: str) -> str:
687688
def get_stat(self, path: str) -> os.stat_result:
688689
return self.fscache.stat(self.maybe_swap_for_shadow_path(path))
689690

691+
def getmtime(self, path: str) -> int:
692+
"""Return a file's mtime; but 0 in bazel mode.
693+
694+
(Bazel's distributed cache doesn't like filesystem metadata to
695+
end up in output files.)
696+
"""
697+
if self.options.bazel:
698+
return 0
699+
else:
700+
return int(os.path.getmtime(path))
701+
702+
def normpath(self, path: str) -> str:
703+
"""Convert path to absolute; but to relative in bazel mode.
704+
705+
(Bazel's distributed cache doesn't like filesystem metadata to
706+
end up in output files.)
707+
"""
708+
# TODO: Could we always use relpath? (A worry in non-bazel
709+
# mode would be that a moved file may change its full module
710+
# name without changing its size, mtime or hash.)
711+
if self.options.bazel:
712+
return os.path.relpath(path)
713+
else:
714+
return os.path.abspath(path)
715+
690716
def all_imported_modules_in_file(self,
691717
file: MypyFile) -> List[Tuple[int, str, int]]:
692718
"""Find all reachable import statements in a file.
@@ -1094,14 +1120,17 @@ def get_cache_names(id: str, path: str, manager: BuildManager) -> Tuple[str, str
10941120
10951121
Args:
10961122
id: module ID
1097-
path: module path (used to recognize packages)
1123+
path: module path
10981124
cache_dir: cache directory
10991125
pyversion: Python version (major, minor)
11001126
11011127
Returns:
11021128
A tuple with the file names to be used for the meta JSON, the
11031129
data JSON, and the fine-grained deps JSON, respectively.
11041130
"""
1131+
pair = manager.options.cache_map.get(path)
1132+
if pair is not None:
1133+
return (pair[0], pair[1], None)
11051134
prefix = _cache_dir_prefix(manager, id)
11061135
is_package = os.path.basename(path).startswith('__init__.py')
11071136
if is_package:
@@ -1232,22 +1261,23 @@ def validate_meta(meta: Optional[CacheMeta], id: str, path: Optional[str],
12321261
manager.log('Metadata abandoned for {}: errors were previously ignored'.format(id))
12331262
return None
12341263

1264+
bazel = manager.options.bazel
12351265
assert path is not None, "Internal error: meta was provided without a path"
12361266
# Check data_json; assume if its mtime matches it's good.
12371267
# TODO: stat() errors
1238-
data_mtime = getmtime(meta.data_json)
1268+
data_mtime = manager.getmtime(meta.data_json)
12391269
if data_mtime != meta.data_mtime:
12401270
manager.log('Metadata abandoned for {}: data cache is modified'.format(id))
12411271
return None
12421272
deps_mtime = None
12431273
if manager.options.cache_fine_grained:
12441274
assert meta.deps_json
1245-
deps_mtime = getmtime(meta.deps_json)
1275+
deps_mtime = manager.getmtime(meta.deps_json)
12461276
if deps_mtime != meta.deps_mtime:
12471277
manager.log('Metadata abandoned for {}: deps cache is modified'.format(id))
12481278
return None
12491279

1250-
path = os.path.abspath(path)
1280+
path = manager.normpath(path)
12511281
try:
12521282
st = manager.get_stat(path)
12531283
except OSError:
@@ -1272,12 +1302,14 @@ def validate_meta(meta: Optional[CacheMeta], id: str, path: Optional[str],
12721302
fine_grained_cache = manager.use_fine_grained_cache()
12731303

12741304
size = st.st_size
1275-
if size != meta.size and not fine_grained_cache:
1305+
# Bazel ensures the cache is valid.
1306+
if size != meta.size and not bazel and not fine_grained_cache:
12761307
manager.log('Metadata abandoned for {}: file {} has different size'.format(id, path))
12771308
return None
12781309

1279-
mtime = int(st.st_mtime)
1280-
if mtime != meta.mtime or path != meta.path:
1310+
# Bazel ensures the cache is valid.
1311+
mtime = 0 if bazel else int(st.st_mtime)
1312+
if not bazel and (mtime != meta.mtime or path != meta.path):
12811313
try:
12821314
source_hash = manager.fscache.md5(path)
12831315
except (OSError, UnicodeDecodeError, DecodeError):
@@ -1317,7 +1349,7 @@ def validate_meta(meta: Optional[CacheMeta], id: str, path: Optional[str],
13171349
meta_str = json.dumps(meta_dict, indent=2, sort_keys=True)
13181350
else:
13191351
meta_str = json.dumps(meta_dict)
1320-
meta_json, _, _2 = get_cache_names(id, path, manager)
1352+
meta_json, _, _ = get_cache_names(id, path, manager)
13211353
manager.log('Updating mtime for {}: file {}, meta {}, mtime {}'
13221354
.format(id, path, meta_json, meta.mtime))
13231355
atomic_write(meta_json, meta_str, '\n') # Ignore errors, it's just an optimization.
@@ -1373,12 +1405,20 @@ def write_cache(id: str, path: str, tree: MypyFile,
13731405
corresponding to the metadata that was written (the latter may
13741406
be None if the cache could not be written).
13751407
"""
1376-
# Obtain file paths
1377-
path = os.path.abspath(path)
1408+
# For Bazel we use relative paths and zero mtimes.
1409+
bazel = manager.options.bazel
1410+
1411+
# Obtain file paths.
1412+
path = manager.normpath(path)
13781413
meta_json, data_json, deps_json = get_cache_names(id, path, manager)
13791414
manager.log('Writing {} {} {} {} {}'.format(
13801415
id, path, meta_json, data_json, deps_json))
13811416

1417+
# Update tree.path so that in bazel mode it's made relative (since
1418+
# sometimes paths leak out).
1419+
if bazel:
1420+
tree.path = path
1421+
13821422
# Make sure directory for cache files exists
13831423
parent = os.path.dirname(data_json)
13841424
assert os.path.dirname(meta_json) == parent
@@ -1390,7 +1430,8 @@ def write_cache(id: str, path: str, tree: MypyFile,
13901430

13911431
# Obtain and set up metadata
13921432
try:
1393-
os.makedirs(parent, exist_ok=True)
1433+
if parent:
1434+
os.makedirs(parent, exist_ok=True)
13941435
st = manager.get_stat(path)
13951436
except OSError as err:
13961437
manager.log("Cannot get stat for {}: {}".format(path, err))
@@ -1405,10 +1446,11 @@ def write_cache(id: str, path: str, tree: MypyFile,
14051446
return interface_hash, None
14061447

14071448
# Write data cache file, if applicable
1449+
# Note that for Bazel we don't record the data file's mtime.
14081450
if old_interface_hash == interface_hash:
14091451
# If the interface is unchanged, the cached data is guaranteed
14101452
# to be equivalent, and we only need to update the metadata.
1411-
data_mtime = getmtime(data_json)
1453+
data_mtime = manager.getmtime(data_json)
14121454
manager.trace("Interface for {} is unchanged".format(id))
14131455
else:
14141456
manager.trace("Interface for {} has changed".format(id))
@@ -1425,17 +1467,17 @@ def write_cache(id: str, path: str, tree: MypyFile,
14251467
# Both have the effect of slowing down the next run a
14261468
# little bit due to an out-of-date cache file.
14271469
return interface_hash, None
1428-
data_mtime = getmtime(data_json)
1470+
data_mtime = manager.getmtime(data_json)
14291471

14301472
deps_mtime = None
14311473
if deps_json:
14321474
deps_str = json_dumps(serialized_fine_grained_deps, manager.options.debug_cache)
14331475
if not atomic_write(deps_json, deps_str, '\n'):
14341476
manager.log("Error writing deps JSON file {}".format(deps_json))
14351477
return interface_hash, None
1436-
deps_mtime = getmtime(deps_json)
1478+
deps_mtime = manager.getmtime(deps_json)
14371479

1438-
mtime = int(st.st_mtime)
1480+
mtime = 0 if bazel else int(st.st_mtime)
14391481
size = st.st_size
14401482
options = manager.options.clone_for_module(id)
14411483
assert source_hash is not None
@@ -1475,7 +1517,7 @@ def delete_cache(id: str, path: str, manager: BuildManager) -> None:
14751517
This avoids inconsistent states with cache files from different mypy runs,
14761518
see #4043 for an example.
14771519
"""
1478-
path = os.path.abspath(path)
1520+
path = manager.normpath(path)
14791521
cache_paths = get_cache_names(id, path, manager)
14801522
manager.log('Deleting {} {} {}'.format(id, path, " ".join(x for x in cache_paths if x)))
14811523

mypy/find_sources.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ def get_init_file(self, dir: str) -> Optional[str]:
143143
f = os.path.join(dir, '__init__' + ext)
144144
if self.fscache.isfile(f):
145145
return f
146+
if ext == '.py' and self.fscache.init_under_package_root(f):
147+
return f
146148
return None
147149

148150

mypy/fscache.py

Lines changed: 109 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,19 @@
3232
import hashlib
3333
import os
3434
import stat
35-
from typing import Dict, List, Tuple
35+
from typing import Dict, List, Optional, Set, Tuple
3636

3737

3838
class FileSystemCache:
3939
def __init__(self) -> None:
40+
# The package root is not flushed with the caches.
41+
# It is set by set_package_root() below.
42+
self.package_root = [] # type: List[str]
4043
self.flush()
4144

45+
def set_package_root(self, package_root: List[str]) -> None:
46+
self.package_root = package_root
47+
4248
def flush(self) -> None:
4349
"""Start another transaction and empty all caches."""
4450
self.stat_cache = {} # type: Dict[str, os.stat_result]
@@ -49,6 +55,7 @@ def flush(self) -> None:
4955
self.read_cache = {} # type: Dict[str, bytes]
5056
self.read_error_cache = {} # type: Dict[str, Exception]
5157
self.hash_cache = {} # type: Dict[str, str]
58+
self.fake_package_cache = set() # type: Set[str]
5259

5360
def stat(self, path: str) -> os.stat_result:
5461
if path in self.stat_cache:
@@ -58,16 +65,100 @@ def stat(self, path: str) -> os.stat_result:
5865
try:
5966
st = os.stat(path)
6067
except OSError as err:
68+
if self.init_under_package_root(path):
69+
try:
70+
return self._fake_init(path)
71+
except OSError:
72+
pass
6173
# Take a copy to get rid of associated traceback and frame objects.
6274
# Just assigning to __traceback__ doesn't free them.
6375
self.stat_error_cache[path] = copy_os_error(err)
6476
raise err
6577
self.stat_cache[path] = st
6678
return st
6779

80+
def init_under_package_root(self, path: str) -> bool:
81+
"""Is this path an __init__.py under a package root?
82+
83+
This is used to detect packages that don't contain __init__.py
84+
files, which is needed to support Bazel. The function should
85+
only be called for non-existing files.
86+
87+
It will return True if it refers to a __init__.py file that
88+
Bazel would create, so that at runtime Python would think the
89+
directory containing it is a package. For this to work you
90+
must pass one or more package roots using the --package-root
91+
flag.
92+
93+
As an exceptional case, any directory that is a package root
94+
itself will not be considered to contain a __init__.py file.
95+
This is different from the rules Bazel itself applies, but is
96+
necessary for mypy to properly distinguish packages from other
97+
directories.
98+
99+
See https://docs.bazel.build/versions/master/be/python.html,
100+
where this behavior is described under legacy_create_init.
101+
"""
102+
if not self.package_root:
103+
return False
104+
dirname, basename = os.path.split(path)
105+
if basename != '__init__.py':
106+
return False
107+
try:
108+
st = self.stat(dirname)
109+
except OSError:
110+
return False
111+
else:
112+
if not stat.S_ISDIR(st.st_mode):
113+
return False
114+
ok = False
115+
drive, path = os.path.splitdrive(path) # Ignore Windows drive name
116+
path = os.path.normpath(path)
117+
for root in self.package_root:
118+
if path.startswith(root):
119+
if path == root + basename:
120+
# A package root itself is never a package.
121+
ok = False
122+
break
123+
else:
124+
ok = True
125+
return ok
126+
127+
def _fake_init(self, path: str) -> os.stat_result:
128+
"""Prime the cache with a fake __init__.py file.
129+
130+
This makes code that looks for path believe an empty file by
131+
that name exists. Should only be called after
132+
init_under_package_root() returns True.
133+
"""
134+
dirname, basename = os.path.split(path)
135+
assert basename == '__init__.py', path
136+
assert not os.path.exists(path), path # Not cached!
137+
dirname = os.path.normpath(dirname)
138+
st = self.stat(dirname) # May raise OSError
139+
# Get stat result as a sequence so we can modify it.
140+
# (Alas, typeshed's os.stat_result is not a sequence yet.)
141+
tpl = tuple(st) # type: ignore
142+
seq = list(tpl) # type: List[float]
143+
seq[stat.ST_MODE] = stat.S_IFREG | 0o444
144+
seq[stat.ST_INO] = 1
145+
seq[stat.ST_NLINK] = 1
146+
seq[stat.ST_SIZE] = 0
147+
tpl = tuple(seq)
148+
st = os.stat_result(tpl)
149+
self.stat_cache[path] = st
150+
# Make listdir() and read() also pretend this file exists.
151+
self.fake_package_cache.add(dirname)
152+
return st
153+
68154
def listdir(self, path: str) -> List[str]:
155+
path = os.path.normpath(path)
69156
if path in self.listdir_cache:
70-
return self.listdir_cache[path]
157+
res = self.listdir_cache[path]
158+
# Check the fake cache.
159+
if path in self.fake_package_cache and '__init__.py' not in res:
160+
res.append('__init__.py') # Updates the result as well as the cache
161+
return res
71162
if path in self.listdir_error_cache:
72163
raise copy_os_error(self.listdir_error_cache[path])
73164
try:
@@ -77,6 +168,9 @@ def listdir(self, path: str) -> List[str]:
77168
self.listdir_error_cache[path] = copy_os_error(err)
78169
raise err
79170
self.listdir_cache[path] = results
171+
# Check the fake cache.
172+
if path in self.fake_package_cache and '__init__.py' not in results:
173+
results.append('__init__.py')
80174
return results
81175

82176
def isfile(self, path: str) -> bool:
@@ -133,12 +227,19 @@ def read(self, path: str) -> bytes:
133227
# earlier instant than the mtime reported by self.stat().
134228
self.stat(path)
135229

136-
try:
137-
with open(path, 'rb') as f:
138-
data = f.read()
139-
except Exception as err:
140-
self.read_error_cache[path] = err
141-
raise
230+
dirname, basename = os.path.split(path)
231+
dirname = os.path.normpath(dirname)
232+
# Check the fake cache.
233+
if basename == '__init__.py' and dirname in self.fake_package_cache:
234+
data = b''
235+
else:
236+
try:
237+
with open(path, 'rb') as f:
238+
data = f.read()
239+
except OSError as err:
240+
self.read_error_cache[path] = err
241+
raise
242+
142243
md5hash = hashlib.md5(data).hexdigest()
143244
self.read_cache[path] = data
144245
self.hash_cache[path] = md5hash

0 commit comments

Comments
 (0)