
Commit c5c6466

cache Link parsing and interpreter compatibility checking

1 parent: 3a8589a
3 files changed (+236, -8 lines)

src/pip/_internal/cache.py (25 additions, 1 deletion)

@@ -7,7 +7,7 @@
 import logging
 import os
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Type

 from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
 from pip._vendor.packaging.utils import canonicalize_name
@@ -95,6 +95,17 @@ def get_path_for_link(self, link: Link) -> str:
         return os.path.join(self.cache_dir, "link-metadata", *parts)


+class SerializableEntry(abc.ABC):
+    @classmethod
+    @abc.abstractmethod
+    def suffix(cls) -> str:
+        ...
+
+    @abc.abstractmethod
+    def serialize(self) -> Dict[str, Any]:
+        ...
+
+
 class FetchResolveCache(Cache):
     def get_path_for_link(self, link: Link) -> str:
         # We are reading index links to extract other links from, not executing any
@@ -103,6 +114,19 @@ def get_path_for_link(self, link: Link) -> str:
         assert self.cache_dir
         return os.path.join(self.cache_dir, "fetch-resolve", *parts)

+    def hashed_entry_path(self, link: Link, entry: SerializableEntry) -> Path:
+        hashed = _hash_dict(entry.serialize())
+        return self.cache_path(link) / f"{hashed}{entry.suffix()}"
+
+    def clear_hashed_entries(
+        self, link: Link, entry_type: Type[SerializableEntry]
+    ) -> None:
+        for hashed_entry in self.cache_path(link).glob(f"*{entry_type.suffix()}"):
+            logger.debug(
+                "unlinking invalidated hashed link eval cache entry %s", hashed_entry
+            )
+            hashed_entry.unlink()
+

 class WheelCacheBase(Cache):
     """Specializations to the cache concept for wheels."""

src/pip/_internal/index/package_finder.py (153 additions, 7 deletions)

@@ -3,13 +3,15 @@
 import enum
 import functools
 import itertools
+import json
 import logging
 import os
 import re
 from hashlib import sha256
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
+    Any,
     Dict,
     FrozenSet,
     Iterable,
@@ -26,7 +28,7 @@
 from pip._vendor.packaging.version import _BaseVersion
 from pip._vendor.packaging.version import parse as parse_version

-from pip._internal.cache import FetchResolveCache
+from pip._internal.cache import FetchResolveCache, SerializableEntry
 from pip._internal.exceptions import (
     BestVersionAlreadyInstalled,
     DistributionNotFound,
@@ -36,7 +38,7 @@
 from pip._internal.index.collector import IndexContent, LinkCollector, parse_links
 from pip._internal.models.candidate import InstallationCandidate
 from pip._internal.models.format_control import FormatControl
-from pip._internal.models.link import Link
+from pip._internal.models.link import Link, PersistentLinkCacheArgs
 from pip._internal.models.search_scope import SearchScope
 from pip._internal.models.selection_prefs import SelectionPreferences
 from pip._internal.models.target_python import TargetPython
@@ -119,14 +121,41 @@ class LinkType(enum.Enum):
     requires_python_mismatch = enum.auto()


-class LinkEvaluator:
+class LinkEvaluator(SerializableEntry):

     """
     Responsible for evaluating links for a particular project.
     """

+    @classmethod
+    def suffix(cls) -> str:
+        return ".evaluation"
+
     _py_version_re = re.compile(r"-py([123]\.?[0-9]?)$")

+    def serialize(self) -> Dict[str, Any]:
+        return dict(
+            project_name=self.project_name,
+            canonical_name=self._canonical_name,
+            # Sort these for determinism.
+            formats=sorted(self._formats),
+            target_python=self._target_python.format_given(),
+            allow_yanked=self._allow_yanked,
+            ignore_requires_python=self._ignore_requires_python,
+        )
+
+    def to_json(self) -> str:
+        return json.dumps(self.serialize(), sort_keys=True)
+
+    def __eq__(self, other: Any) -> bool:
+        return isinstance(other, type(self)) and self.to_json() == other.to_json()
+
+    def __ne__(self, other: Any) -> bool:
+        return not self == other
+
+    def __hash__(self) -> int:
+        return hash(self.to_json())
+
     # Don't include an allow_yanked default value to make sure each call
     # site considers whether yanked releases are allowed. This also causes
     # that decision to be made explicit in the calling code, which helps
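
Aside, not part of the diff: serialize() above feeds both the hashed cache filename (via hashed_entry_path) and the __eq__/__hash__ overrides. A toy illustration, with invented field values, of why json.dumps(..., sort_keys=True) together with the sorted formats list yields one canonical cache key per configuration:

import json
from typing import Any, Dict


def to_json(fields: Dict[str, Any]) -> str:
    # Mirrors LinkEvaluator.to_json(): canonical JSON via sorted keys.
    return json.dumps(fields, sort_keys=True)


a = dict(project_name="requests", formats=sorted({"binary", "source"}),
         allow_yanked=False, ignore_requires_python=False)
b = dict(ignore_requires_python=False, allow_yanked=False,
         formats=sorted({"source", "binary"}), project_name="requests")

# Neither dict key order nor set iteration order affects the result.
assert to_json(a) == to_json(b)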
@@ -900,6 +929,91 @@ def _write_http_cache_info(

         return (new_etag, new_date, new_checksum, page_unmodified)

+    @staticmethod
+    def _try_load_parsed_links_cache(parsed_links_path: Path) -> Optional[List[Link]]:
+        page_links: Optional[List[Link]] = None
+        try:
+            with parsed_links_path.open("r") as f:
+                logger.debug("reading page links from cache %s", parsed_links_path)
+                cached_links = json.load(f)
+                page_links = []
+                for cache_info in cached_links:
+                    link = Link.from_cache_args(
+                        PersistentLinkCacheArgs.from_json(cache_info)
+                    )
+                    assert link is not None
+                    page_links.append(link)
+        except (OSError, json.decoder.JSONDecodeError, KeyError) as e:
+            logger.debug(
+                "could not read page links from cache file %s %s(%s)",
+                parsed_links_path,
+                e.__class__.__name__,
+                str(e),
+            )
+        return page_links
+
+    @staticmethod
+    def _write_parsed_links_cache(
+        parsed_links_path: Path, links: Iterable[Link]
+    ) -> List[Link]:
+        cacheable_links: List[Dict[str, Any]] = []
+        page_links: List[Link] = []
+        for link in links:
+            cache_info = link.cache_args()
+            assert cache_info is not None
+            cacheable_links.append(cache_info.to_json())
+            page_links.append(link)
+
+        logger.debug("writing page links to %s", parsed_links_path)
+        with parsed_links_path.open("w") as f:
+            json.dump(cacheable_links, f)
+
+        return page_links
+
+    @staticmethod
+    def _try_load_installation_candidate_cache(
+        cached_candidates_path: Path,
+    ) -> Optional[List[InstallationCandidate]]:
+        try:
+            with cached_candidates_path.open("r") as f:
+                serialized_candidates = json.load(f)
+            logger.debug("read serialized candidates from %s", cached_candidates_path)
+            package_links: List[InstallationCandidate] = []
+            for cand in serialized_candidates:
+                link_cache_args = PersistentLinkCacheArgs.from_json(cand["link"])
+                link = Link.from_cache_args(link_cache_args)
+                package_links.append(
+                    InstallationCandidate(cand["name"], cand["version"], link)
+                )
+            return package_links
+        except (OSError, json.decoder.JSONDecodeError, KeyError) as e:
+            logger.debug(
+                "could not read cached candidates at %s %s(%s)",
+                cached_candidates_path,
+                e.__class__.__name__,
+                str(e),
+            )
+        return None
+
+    @staticmethod
+    def _write_installation_candidate_cache(
+        cached_candidates_path: Path,
+        candidates: Iterable[InstallationCandidate],
+    ) -> List[InstallationCandidate]:
+        candidates = list(candidates)
+        serialized_candidates = [
+            dict(
+                name=candidate.name,
+                version=str(candidate.version),
+                link=candidate.link.cache_args().to_json(),
+            )
+            for candidate in candidates
+        ]
+        with cached_candidates_path.open("w") as f:
+            logger.debug("writing serialized candidates to %s", f.name)
+            json.dump(serialized_candidates, f)
+        return candidates
+
     def _process_project_url_uncached(
         self, project_url: Link, link_evaluator: LinkEvaluator
     ) -> List[InstallationCandidate]:
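
Aside, not part of the diff: a sketch of the on-disk "parsed-links" payload the two parsed-links helpers above exchange, namely a JSON list of PersistentLinkCacheArgs.to_json() dicts (see the models/link.py hunks below). All field values are invented for illustration:

import json
import tempfile
from pathlib import Path

cacheable_links = [
    {
        "url": "https://example.org/requests-2.31.0-py3-none-any.whl",
        "comes_from": "https://pypi.org/simple/requests/",
        "requires_python": ">=3.7",
        "yanked_reason": None,
        "metadata_file_data": None,
        "hashes": {"sha256": "0" * 64},
    }
]

parsed_links_path = Path(tempfile.mkdtemp()) / "parsed-links"
# What _write_parsed_links_cache persists:
parsed_links_path.write_text(json.dumps(cacheable_links))
# What _try_load_parsed_links_cache reads back before rebuilding Link objects
# via PersistentLinkCacheArgs.from_json / Link.from_cache_args:
assert json.loads(parsed_links_path.read_text()) == cacheable_links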
@@ -926,6 +1040,10 @@ def process_project_url(
         etag_path = cached_path / "etag"
         date_path = cached_path / "modified-since-date"
         checksum_path = cached_path / "checksum"
+        parsed_links_path = cached_path / "parsed-links"
+        cached_candidates_path = self._fetch_resolve_cache.hashed_entry_path(
+            project_url, link_evaluator
+        )

         headers: Dict[str, str] = {}
         # NB: mutates headers!
@@ -962,12 +1080,40 @@
             prev_checksum=prev_checksum,
         )

-        page_links = parse_links(index_response)
+        page_links: Optional[List[Link]] = None
+        # Only try our persistent link parsing and evaluation caches if we know the page
+        # was unmodified via checksum.
+        if page_unmodified:
+            cached_candidates = self._try_load_installation_candidate_cache(
+                cached_candidates_path
+            )
+            if cached_candidates is not None:
+                return cached_candidates
+
+            page_links = self._try_load_parsed_links_cache(parsed_links_path)
+        else:
+            try:
+                parsed_links_path.unlink()
+            except OSError:
+                pass
+            self._fetch_resolve_cache.clear_hashed_entries(project_url, LinkEvaluator)
+
+        if page_links is None:
+            logger.debug(
+                "extracting new parsed links from index response %s", index_response
+            )
+            page_links = self._write_parsed_links_cache(
+                parsed_links_path,
+                parse_links(index_response),
+            )

         with indent_log():
-            package_links = self.evaluate_links(
-                link_evaluator,
-                links=page_links,
+            package_links = self._write_installation_candidate_cache(
+                cached_candidates_path,
+                self.evaluate_links(
+                    link_evaluator,
+                    links=page_links,
+                ),
             )

         return package_links
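
Aside, not part of the diff: the rewritten body above amounts to a read-through cache gated on page_unmodified, consulting the more-processed candidate cache before the parsed-links cache and invalidating both when the page changed. A condensed runnable sketch; every helper here is a hypothetical stub standing in for the methods above:

from typing import List, Optional


def try_load_candidate_cache() -> Optional[List[str]]:
    return None  # simulate a candidate-cache miss


def try_load_parsed_links_cache() -> Optional[List[str]]:
    return ["link-a", "link-b"]  # simulate a parsed-links hit


def parse_links() -> List[str]:
    return ["link-a", "link-b"]


def evaluate_links(links: List[str]) -> List[str]:
    return [link for link in links if link.endswith("-a")]


def process(page_unmodified: bool) -> List[str]:
    page_links: Optional[List[str]] = None
    if page_unmodified:
        # Most-processed cache first: a hit skips parsing and evaluation.
        cached = try_load_candidate_cache()
        if cached is not None:
            return cached
        page_links = try_load_parsed_links_cache()
    # On a modified page the real code unlinks both caches here instead.
    if page_links is None:
        page_links = parse_links()
    return evaluate_links(page_links)


assert process(page_unmodified=True) == ["link-a"]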

src/pip/_internal/models/link.py (58 additions, 0 deletions)

@@ -179,6 +179,43 @@ def _ensure_quoted_url(url: str) -> str:
     return urllib.parse.urlunparse(result._replace(path=path))


+@dataclass(frozen=True)
+class PersistentLinkCacheArgs:
+    url: str
+    comes_from: Optional[str] = None
+    requires_python: Optional[str] = None
+    yanked_reason: Optional[str] = None
+    metadata_file_data: Optional[MetadataFile] = None
+    hashes: Optional[Mapping[str, str]] = None
+
+    def to_json(self) -> Dict[str, Any]:
+        return dict(
+            url=self.url,
+            comes_from=self.comes_from,
+            requires_python=self.requires_python,
+            yanked_reason=self.yanked_reason,
+            metadata_file_data=(
+                self.metadata_file_data.hashes if self.metadata_file_data else None
+            ),
+            hashes=self.hashes,
+        )
+
+    @classmethod
+    def from_json(cls, cache_info: Dict[str, Any]) -> "PersistentLinkCacheArgs":
+        return cls(
+            url=cache_info["url"],
+            comes_from=cache_info["comes_from"],
+            requires_python=cache_info["requires_python"],
+            yanked_reason=cache_info["yanked_reason"],
+            metadata_file_data=(
+                MetadataFile(hashes=cache_info["metadata_file_data"])
+                if cache_info["metadata_file_data"]
+                else None
+            ),
+            hashes=cache_info["hashes"],
+        )
+
+
 class Link(KeyBasedCompareMixin):
     """Represents a parsed link from a Package Index's simple URL"""

@@ -305,6 +342,27 @@ def from_json(
             metadata_file_data=metadata_file_data,
         )

+    def cache_args(self) -> PersistentLinkCacheArgs:
+        return PersistentLinkCacheArgs(
+            url=self.url,
+            comes_from=(str(self.comes_from) if self.comes_from else None),
+            requires_python=self.requires_python,
+            yanked_reason=self.yanked_reason,
+            metadata_file_data=self.metadata_file_data,
+            hashes=self._hashes,
+        )
+
+    @classmethod
+    def from_cache_args(cls, args: PersistentLinkCacheArgs) -> "Link":
+        return cls(
+            args.url,
+            comes_from=args.comes_from,
+            requires_python=args.requires_python,
+            yanked_reason=args.yanked_reason,
+            metadata_file_data=args.metadata_file_data,
+            hashes=args.hashes,
+        )
+
     @classmethod
     def from_element(
         cls,
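
Aside, not part of the diff: an end-to-end round trip of the serialization contract above, with invented values and with MetadataFile handling elided so the snippet runs without pip internals:

import json
from dataclasses import dataclass
from typing import Any, Dict, Mapping, Optional


@dataclass(frozen=True)
class PersistentLinkCacheArgs:
    url: str
    comes_from: Optional[str] = None
    requires_python: Optional[str] = None
    yanked_reason: Optional[str] = None
    hashes: Optional[Mapping[str, str]] = None

    def to_json(self) -> Dict[str, Any]:
        return dict(
            url=self.url,
            comes_from=self.comes_from,
            requires_python=self.requires_python,
            yanked_reason=self.yanked_reason,
            hashes=self.hashes,
        )

    @classmethod
    def from_json(cls, cache_info: Dict[str, Any]) -> "PersistentLinkCacheArgs":
        return cls(**cache_info)


args = PersistentLinkCacheArgs(
    url="https://example.org/requests-2.31.0-py3-none-any.whl",
    comes_from="https://pypi.org/simple/requests/",
    requires_python=">=3.7",
)
# Serialize to JSON text and back; the frozen dataclass compares by value,
# mirroring Link.cache_args() -> ... -> Link.from_cache_args().
round_tripped = PersistentLinkCacheArgs.from_json(json.loads(json.dumps(args.to_json())))
assert round_tripped == args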
