Skip to content

Commit 32e02f8

Browse files
authored
Hash files instead of relying on modification timestamps. (#469)
1 parent 4da749b commit 32e02f8

File tree

16 files changed

+570
-67
lines changed

16 files changed

+570
-67
lines changed

docs/source/changes.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ releases are available on [PyPI](https://pypi.org/project/pytask) and
1919
- {pull}`463` raises an error when a task function is not defined inside the loop body.
2020
- {pull}`464` improves pinned dependencies.
2121
- {pull}`465` adds test to ensure internal tracebacks are removed by reports.
22+
- {pull}`469` implements hashing for files instead of modification timestamps.
2223
- {pull}`472` adds `is_product` to {meth}`PNode.load`.
2324

2425
## 0.4.1 - 2023-10-11

docs/source/reference_guides/api.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,13 @@ outcome.
343343
.. autofunction:: pytask.count_outcomes
344344
```
345345

346+
## Path utilities
347+
348+
```{eval-rst}
349+
.. autofunction:: pytask.path.import_path
350+
.. autofunction:: pytask.path.hash_path
351+
```
352+
346353
## Programmatic Interfaces
347354

348355
```{eval-rst}
@@ -360,6 +367,17 @@ There are some classes to handle different kinds of reports.
360367
.. autoclass:: pytask.DagReport
361368
```
362369

370+
## Tree utilities
371+
372+
```{eval-rst}
373+
.. autofunction:: pytask.tree_util.PyTree
374+
.. autofunction:: pytask.tree_util.tree_flatten_with_path
375+
.. autofunction:: pytask.tree_util.tree_leaves
376+
.. autofunction:: pytask.tree_util.tree_map
377+
.. autofunction:: pytask.tree_util.tree_map_with_path
378+
.. autofunction:: pytask.tree_util.tree_structure
379+
```
380+
363381
## Typing
364382

365383
```{eval-rst}

src/_pytask/_hashlib.py

Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
from __future__ import annotations
2+
3+
import hashlib
4+
import sys
5+
from contextlib import suppress
6+
from pathlib import Path
7+
from typing import Any
8+
9+
10+
if sys.version_info >= (3, 11):
11+
from hashlib import file_digest
12+
else:
13+
# Keep this tuple in sync with __get_builtin_constructor() whenever a new
# always-available algorithm is introduced.
__always_supported = (
    "md5",
    "sha1",
    "sha224",
    "sha256",
    "sha384",
    "sha512",
    "blake2b",
    "blake2s",
    "sha3_224",
    "sha3_256",
    "sha3_384",
    "sha3_512",
    "shake_128",
    "shake_256",
)

# Both sets start out with the always-supported names.
algorithms_guaranteed = {*__always_supported}
algorithms_available = {*__always_supported}

__all__ = (
    *__always_supported,
    "new",
    "algorithms_guaranteed",
    "algorithms_available",
    "file_digest",
)

# Maps an algorithm name to its constructor once it has been resolved.
__builtin_constructor_cache = {}

# Algorithms that must always be served by the builtin implementation.
# OpenSSL 1.1.0 only ships a limited blake2b/s: no keyed blake2 (blake2 MAC) and
# no salt, personalization, or tree hashing. The hash-only OpenSSL variants are
# still reachable as 'blake2b512' and 'blake2s256'.
__block_openssl_constructor = {"blake2b", "blake2s"}
53+
54+
def __get_builtin_constructor(name):
    """Return the builtin (non-OpenSSL) hash constructor for ``name``.

    The constructor is looked up in ``__builtin_constructor_cache`` first. On a
    miss, the private extension module providing the algorithm family is imported
    and every constructor of that family is stored in the cache.

    Raises ``ValueError`` if ``name`` is unknown or if the extension module that
    implements it cannot be imported.

    """
    cache = __builtin_constructor_cache
    constructor = cache.get(name)
    if constructor is not None:
        return constructor

    # A missing extension module is not reported here; the lookup below then
    # fails and the ValueError at the end of the function is raised instead.
    with suppress(ImportError):
        if name in {"SHA1", "sha1"}:
            import _sha1

            cache["SHA1"] = cache["sha1"] = _sha1.sha1
        elif name in {"MD5", "md5"}:
            import _md5

            cache["MD5"] = cache["md5"] = _md5.md5
        elif name in {"SHA256", "sha256", "SHA224", "sha224"}:
            try:
                import _sha2
            except ImportError:
                import _sha256 as _sha2
            cache["SHA224"] = cache["sha224"] = _sha2.sha224
            cache["SHA256"] = cache["sha256"] = _sha2.sha256
        elif name in {"SHA512", "sha512", "SHA384", "sha384"}:
            try:
                import _sha2
            except ImportError:
                # Interpreters without the combined _sha2 module provide
                # SHA-384/512 in _sha512; _sha256 only has SHA-224/256, so
                # falling back to it would fail with an AttributeError.
                import _sha512 as _sha2
            cache["SHA384"] = cache["sha384"] = _sha2.sha384
            cache["SHA512"] = cache["sha512"] = _sha2.sha512
        elif name in {"blake2b", "blake2s"}:
            import _blake2

            cache["blake2b"] = _blake2.blake2b
            cache["blake2s"] = _blake2.blake2s
        elif name in {"sha3_224", "sha3_256", "sha3_384", "sha3_512"}:
            import _sha3

            cache["sha3_224"] = _sha3.sha3_224
            cache["sha3_256"] = _sha3.sha3_256
            cache["sha3_384"] = _sha3.sha3_384
            cache["sha3_512"] = _sha3.sha3_512
        elif name in {"shake_128", "shake_256"}:
            import _sha3

            cache["shake_128"] = _sha3.shake_128
            cache["shake_256"] = _sha3.shake_256

    constructor = cache.get(name)
    if constructor is not None:
        return constructor

    raise ValueError("unsupported hash type " + name)
106+
107+
def __get_openssl_constructor(name):
    """Return the OpenSSL constructor for ``name`` or the builtin one as fallback."""
    if name in __block_openssl_constructor:
        # These algorithms are always served by the builtin implementation.
        return __get_builtin_constructor(name)

    try:
        # MD5, SHA1, and SHA2 are in all supported OpenSSL versions,
        # SHA3/shake are available in OpenSSL 1.1.1+.
        openssl_constructor = getattr(_hashlib, "openssl_" + name)
        # Probe the constructor once: the C module may define the function and
        # still raise ValueError because the digest is not actually available.
        openssl_constructor(usedforsecurity=False)
    except (AttributeError, ValueError):
        return __get_builtin_constructor(name)
    else:
        # Hand out the C function directly (very fast).
        return openssl_constructor
123+
124+
def __py_new(name, data=b"", **kwargs):
    """new(name, data=b'', **kwargs) - Create a hashing object for the named
    algorithm with the builtin implementation; *data* (a bytes-like object)
    optionally initializes it.
    """
    constructor = __get_builtin_constructor(name)
    return constructor(data, **kwargs)
130+
131+
def __hash_new(name, data=b"", **kwargs):
    """new(name, data=b'') - Create a hashing object for the named algorithm;
    *data* (a bytes-like object) optionally initializes it.
    """
    if name not in __block_openssl_constructor:
        try:
            return _hashlib.new(name, data, **kwargs)
        except ValueError:
            # The _hashlib module (OpenSSL) does not support the named hash,
            # so try the builtin implementations instead. This keeps
            # SHA224/256 and SHA384/512 usable even though OpenSSL prior to
            # 0.9.8 doesn't provide them.
            return __get_builtin_constructor(name)(data)

    # Blocked for OpenSSL: always use the builtin blake2 implementation.
    return __get_builtin_constructor(name)(data, **kwargs)
146+
147+
# Select the backend: OpenSSL via _hashlib when it can be imported, otherwise the
# builtin constructors only.
try:
    import _hashlib
except ImportError:
    _hashlib = None
    new = __py_new
    __get_hash = __get_builtin_constructor
else:
    new = __hash_new
    __get_hash = __get_openssl_constructor
    algorithms_available = algorithms_available.union(
        _hashlib.openssl_md_meth_names
    )

with suppress(ImportError):
    # OpenSSL's PKCS5_PBKDF2_HMAC requires OpenSSL 1.0+ with HMAC and SHA
    from _hashlib import pbkdf2_hmac

    # Only advertise the function when the import above succeeded.
    __all__ = (*__all__, "pbkdf2_hmac")

with suppress(ImportError):
    # OpenSSL's scrypt requires OpenSSL 1.1+
    from _hashlib import scrypt
169+
170+
def file_digest(fileobj, digest, /, *, _bufsize=2**18):
    """Hash the contents of a file-like object. Returns a digest object.

    *fileobj* must be a file-like object opened for reading in binary mode.
    It accepts file objects from open(), io.BytesIO(), and SocketIO objects.
    The function may bypass Python's I/O and use the file descriptor *fileno*
    directly.

    *digest* must either be a hash algorithm name as a *str*, a hash
    constructor, or a callable that returns a hash object.

    Raises ``ValueError`` if *fileobj* is not readable in binary mode and
    ``BlockingIOError`` if *fileobj* is non-blocking and has no data available.
    """
    # On Linux we could use AF_ALG sockets and sendfile() to achieve zero-copy
    # hashing with hardware acceleration.
    if isinstance(digest, str):
        digestobj = new(digest)
    else:
        digestobj = digest()

    if hasattr(fileobj, "getbuffer"):
        # io.BytesIO object, use zero-copy buffer
        digestobj.update(fileobj.getbuffer())
        return digestobj

    # Only binary files implement readinto().
    if not (
        hasattr(fileobj, "readinto")
        and hasattr(fileobj, "readable")
        and fileobj.readable()
    ):
        raise ValueError(
            f"'{fileobj!r}' is not a file-like object in binary reading mode."
        )

    # binary file, socket.SocketIO object
    # Note: socket I/O uses different syscalls than file I/O.
    buf = bytearray(_bufsize)  # Reusable buffer to reduce allocations.
    view = memoryview(buf)
    while True:
        size = fileobj.readinto(buf)
        if size is None:
            # A non-blocking stream without data returns None. Slicing with
            # None would hash the whole stale buffer and never reach EOF.
            raise BlockingIOError("I/O operation would block.")
        if size == 0:
            break  # EOF
        digestobj.update(view[:size])

    return digestobj
214+
215+
216+
def hash_value(value: Any) -> int | str:
    """Return a hash for ``value``.

    Paths, strings, and bytes are hashed with SHA-256 and the hex digest is
    returned. Any other value is passed to the builtin :func:`hash`, whose results
    may be salted.

    """
    if isinstance(value, Path):
        payload = str(value).encode()
    elif isinstance(value, str):
        payload = value.encode()
    elif isinstance(value, bytes):
        payload = value
    else:
        return hash(value)
    return hashlib.sha256(payload).hexdigest()

src/_pytask/build.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Implement the build command."""
22
from __future__ import annotations
33

4+
import json
45
import sys
56
from pathlib import Path
67
from typing import Any
@@ -21,6 +22,7 @@
2122
from _pytask.exceptions import ExecutionError
2223
from _pytask.exceptions import ResolvingDependenciesError
2324
from _pytask.outcomes import ExitCode
25+
from _pytask.path import HashPathCache
2426
from _pytask.pluginmanager import get_plugin_manager
2527
from _pytask.session import Session
2628
from _pytask.shared import parse_paths
@@ -40,6 +42,27 @@ def pytask_extend_command_line_interface(cli: click.Group) -> None:
4042
cli.add_command(build_command)
4143

4244

45+
@hookimpl
def pytask_post_parse(config: dict[str, Any]) -> None:
    """Fill cache of file hashes with stored hashes.

    The hashes are read from ``.pytask/file_hashes.json`` below ``config["root"]``.
    Loading is best effort: if the file cannot be read or parsed, or does not hold
    a JSON object, nothing is added to the cache.

    """
    try:
        path = config["root"] / ".pytask" / "file_hashes.json"
        cache = json.loads(path.read_text())
    except Exception:  # noqa: BLE001
        cache = {}

    # json.loads also succeeds for lists, strings, numbers, ... which have no
    # ``.items()``. Treat such a file like an unreadable one instead of crashing.
    if not isinstance(cache, dict):
        cache = {}

    for key, value in cache.items():
        HashPathCache.add(key, value)
56+
57+
58+
@hookimpl
def pytask_unconfigure(session: Session) -> None:
    """Write the file hashes held by ``HashPathCache`` to disk as JSON."""
    cache_dir = session.config["root"] / ".pytask"
    # Create the directory (and missing parents) if it does not exist yet.
    cache_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(HashPathCache._cache)
    (cache_dir / "file_hashes.json").write_text(serialized)
64+
65+
4366
def build( # noqa: C901, PLR0912, PLR0913
4467
*,
4568
capture: Literal["fd", "no", "sys", "tee-sys"] | CaptureMethod = CaptureMethod.NO,

0 commit comments

Comments
 (0)