Commit f007a62

cache Link parsing and interpreter compatibility checking
- also compress the link parsing
1 parent dbacf99 commit f007a62

File tree

5 files changed: +252, -59 lines changed

src/pip/_internal/cache.py

Lines changed: 23 additions & 1 deletion
@@ -8,7 +8,7 @@
 import os
 import re
 from pathlib import Path
-from typing import Dict, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Type

 from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
 from pip._vendor.packaging.utils import canonicalize_name
@@ -146,6 +146,15 @@ def get_path_for_link(self, link: Link) -> str:
         return os.path.join(self.cache_dir, "link-metadata", *parts)


+class SerializableEntry(abc.ABC):
+    @classmethod
+    @abc.abstractmethod
+    def suffix(cls) -> str: ...
+
+    @abc.abstractmethod
+    def serialize(self) -> Dict[str, Any]: ...
+
+
 class FetchResolveCache(Cache):
     def get_path_for_link(self, link: Link) -> str:
         # We are reading index links to extract other links from, not executing any
@@ -154,6 +163,19 @@ def get_path_for_link(self, link: Link) -> str:
         assert self.cache_dir
         return os.path.join(self.cache_dir, "fetch-resolve", *parts)

+    def hashed_entry_path(self, link: Link, entry: SerializableEntry) -> Path:
+        hashed = _hash_dict(entry.serialize())
+        return self.cache_path(link) / f"{hashed}{entry.suffix()}"
+
+    def clear_hashed_entries(
+        self, link: Link, entry_type: Type[SerializableEntry]
+    ) -> None:
+        for hashed_entry in self.cache_path(link).glob(f"*{entry_type.suffix()}"):
+            logger.debug(
+                "unlinking invalidated hashed link eval cache entry %s", hashed_entry
+            )
+            hashed_entry.unlink()
+

 class WheelCacheBase(Cache):
     """Specializations to the cache concept for wheels."""

src/pip/_internal/index/package_finder.py

Lines changed: 157 additions & 8 deletions
@@ -1,10 +1,12 @@
 """Routines related to PyPI, indexes"""

 import binascii
+import bz2
 import datetime
 import enum
 import functools
 import itertools
+import json
 import logging
 import os
 import re
@@ -14,6 +16,8 @@
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
+    Any,
+    Callable,
     Dict,
     FrozenSet,
     Iterable,
@@ -30,7 +34,7 @@
 from pip._vendor.packaging.version import InvalidVersion, _BaseVersion
 from pip._vendor.packaging.version import parse as parse_version

-from pip._internal.cache import FetchResolveCache
+from pip._internal.cache import FetchResolveCache, SerializableEntry
 from pip._internal.exceptions import (
     BestVersionAlreadyInstalled,
     DistributionNotFound,
@@ -40,7 +44,7 @@
 from pip._internal.index.collector import IndexContent, LinkCollector, parse_links
 from pip._internal.models.candidate import InstallationCandidate
 from pip._internal.models.format_control import FormatControl
-from pip._internal.models.link import Link
+from pip._internal.models.link import Link, PersistentLinkCacheArgs
 from pip._internal.models.search_scope import SearchScope
 from pip._internal.models.selection_prefs import SelectionPreferences
 from pip._internal.models.target_python import TargetPython
@@ -123,13 +127,28 @@ class LinkType(enum.Enum):
     requires_python_mismatch = enum.auto()


-class LinkEvaluator:
+class LinkEvaluator(SerializableEntry):
     """
     Responsible for evaluating links for a particular project.
     """

+    @classmethod
+    def suffix(cls) -> str:
+        return ".evaluation"
+
     _py_version_re = re.compile(r"-py([123]\.?[0-9]?)$")

+    def serialize(self) -> Dict[str, Any]:
+        return {
+            "project_name": self.project_name,
+            "canonical_name": self._canonical_name,
+            # Sort these for determinism.
+            "formats": sorted(self._formats),
+            "target_python": self._target_python.format_given(),
+            "allow_yanked": self._allow_yanked,
+            "ignore_requires_python": self._ignore_requires_python,
+        }
+
     # Don't include an allow_yanked default value to make sure each call
     # site considers whether yanked releases are allowed. This also causes
     # that decision to be made explicit in the calling code, which helps
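
An aside on the "Sort these for determinism" comment: the dict returned by serialize() is hashed to name the on-disk ".evaluation" cache entry (via hashed_entry_path in cache.py above), and Python set iteration order is not stable across processes, so self._formats must be sorted before serialization. A minimal demonstration:

import json

# Two insertion orders of the same formats set; sorting makes the JSON text,
# and therefore any cache-entry hash derived from it, identical across runs.
a = {"formats": sorted({"binary", "source"})}
b = {"formats": sorted({"source", "binary"})}
assert json.dumps(a, sort_keys=True) == json.dumps(b, sort_keys=True)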
@@ -594,6 +613,19 @@ def compute_best_candidate(
     )


+_FindCandidates = Callable[["PackageFinder", str], List[InstallationCandidate]]
+
+
+def _canonicalize_arg(func: _FindCandidates) -> _FindCandidates:
+    @functools.wraps(func)
+    def wrapper(
+        self: "PackageFinder", project_name: str
+    ) -> List[InstallationCandidate]:
+        return func(self, canonicalize_name(project_name))
+
+    return wrapper
+
+
 class PackageFinder:
     """This finds packages.

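
_canonicalize_arg normalizes the project name before it reaches the functools.lru_cache on find_all_candidates (decorated further down in this diff), so differently spelled requests for the same project share one cache slot. A minimal sketch of the effect, using the vendored canonicalize_name this module already imports; the Finder class here is illustrative, not pip's:

import functools

from pip._vendor.packaging.utils import canonicalize_name

class Finder:
    @functools.lru_cache(maxsize=None)
    def _find(self, name: str) -> str:
        print(f"cache miss for {name!r}")
        return name

    # Canonicalize before the cached call, mirroring the decorator above.
    def find(self, name: str) -> str:
        return self._find(canonicalize_name(name))

f = Finder()
f.find("Pip.Name")  # cache miss for 'pip-name'
f.find("pip_name")  # cache hit: both spellings canonicalize to "pip-name"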
@@ -954,6 +986,91 @@ def _write_http_cache_info(

         return (new_etag, new_date, new_checksum, page_unmodified)

+    @staticmethod
+    def _try_load_parsed_links_cache(parsed_links_path: Path) -> Optional[List[Link]]:
+        page_links: Optional[List[Link]] = None
+        try:
+            with bz2.open(parsed_links_path, mode="rt", encoding="utf-8") as f:
+                logger.debug("reading page links from cache %s", parsed_links_path)
+                cached_links = json.load(f)
+            page_links = []
+            for cache_info in cached_links:
+                link = Link.from_cache_args(
+                    PersistentLinkCacheArgs.from_json(cache_info)
+                )
+                assert link is not None
+                page_links.append(link)
+        except (OSError, json.decoder.JSONDecodeError, KeyError) as e:
+            logger.debug(
+                "could not read page links from cache file %s %s(%s)",
+                parsed_links_path,
+                e.__class__.__name__,
+                str(e),
+            )
+        return page_links
+
+    @staticmethod
+    def _write_parsed_links_cache(
+        parsed_links_path: Path, links: Iterable[Link]
+    ) -> List[Link]:
+        cacheable_links: List[Dict[str, Any]] = []
+        page_links: List[Link] = []
+        for link in links:
+            cache_info = link.cache_args()
+            assert cache_info is not None
+            cacheable_links.append(cache_info.to_json())
+            page_links.append(link)
+
+        logger.debug("writing page links to %s", parsed_links_path)
+        with bz2.open(parsed_links_path, mode="wt", encoding="utf-8") as f:
+            json.dump(cacheable_links, f)
+
+        return page_links
+
+    @staticmethod
+    def _try_load_installation_candidate_cache(
+        cached_candidates_path: Path,
+    ) -> Optional[List[InstallationCandidate]]:
+        try:
+            with bz2.open(cached_candidates_path, mode="rt", encoding="utf-8") as f:
+                serialized_candidates = json.load(f)
+            logger.debug("read serialized candidates from %s", cached_candidates_path)
+            package_links: List[InstallationCandidate] = []
+            for cand in serialized_candidates:
+                link_cache_args = PersistentLinkCacheArgs.from_json(cand["link"])
+                link = Link.from_cache_args(link_cache_args)
+                package_links.append(
+                    InstallationCandidate(cand["name"], cand["version"], link)
+                )
+            return package_links
+        except (OSError, json.decoder.JSONDecodeError, KeyError) as e:
+            logger.debug(
+                "could not read cached candidates at %s %s(%s)",
+                cached_candidates_path,
+                e.__class__.__name__,
+                str(e),
+            )
+        return None
+
+    @staticmethod
+    def _write_installation_candidate_cache(
+        cached_candidates_path: Path,
+        candidates: Iterable[InstallationCandidate],
+    ) -> List[InstallationCandidate]:
+        candidates = list(candidates)
+        serialized_candidates = [
+            {
+                "name": candidate.name,
+                "version": str(candidate.version),
+                "link": candidate.link.cache_args().to_json(),
+            }
+            for candidate in candidates
+        ]
+        with bz2.open(cached_candidates_path, mode="wt", encoding="utf-8") as f:
+            logger.debug("writing serialized candidates to %s", cached_candidates_path)
+            json.dump(serialized_candidates, f)
+        return candidates
+
     def _process_project_url_uncached(
         self, project_url: Link, link_evaluator: LinkEvaluator
     ) -> List[InstallationCandidate]:
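
These four helpers all store their payloads as bz2-compressed JSON, which is the "also compress the link parsing" half of the commit message. A self-contained round-trip of that storage format; the file name and record shape here are illustrative, while pip writes a "parsed-links" file and hashed ".evaluation" entries under the per-URL cache directory:

import bz2
import json
from pathlib import Path

cache_file = Path("parsed-links.demo")

# Stand-in for the serialized PersistentLinkCacheArgs records.
links = [{"url": "https://example.com/pkg-1.0.tar.gz", "requires_python": ">=3.8"}]

# Write compressed JSON, as _write_parsed_links_cache does.
with bz2.open(cache_file, mode="wt", encoding="utf-8") as f:
    json.dump(links, f)

# Read it back, as _try_load_parsed_links_cache does.
with bz2.open(cache_file, mode="rt", encoding="utf-8") as f:
    assert json.load(f) == links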
@@ -972,7 +1089,6 @@ def _process_project_url_uncached(
         package_links = self.evaluate_links(link_evaluator, links=page_links)
         return package_links

-    @functools.lru_cache(maxsize=None)
     def process_project_url(
         self, project_url: Link, link_evaluator: LinkEvaluator
     ) -> List[InstallationCandidate]:
@@ -985,6 +1101,10 @@ def process_project_url(
         etag_path = cached_path / "etag"
         date_path = cached_path / "modified-since-date"
         checksum_path = cached_path / "checksum"
+        parsed_links_path = cached_path / "parsed-links"
+        cached_candidates_path = self._fetch_resolve_cache.hashed_entry_path(
+            project_url, link_evaluator
+        )

         headers: Dict[str, str] = {}
         # NB: mutates headers!
@@ -1021,16 +1141,45 @@ def process_project_url(
             prev_checksum=prev_checksum,
         )

-        page_links = parse_links(index_response)
+        page_links: Optional[List[Link]] = None
+        # Only try our persistent link parsing and evaluation caches if we know the page
+        # was unmodified via checksum.
+        if page_unmodified:
+            cached_candidates = self._try_load_installation_candidate_cache(
+                cached_candidates_path
+            )
+            if cached_candidates is not None:
+                return cached_candidates
+
+            page_links = self._try_load_parsed_links_cache(parsed_links_path)
+        else:
+            try:
+                parsed_links_path.unlink()
+            except OSError:
+                pass
+            self._fetch_resolve_cache.clear_hashed_entries(project_url, LinkEvaluator)
+
+        if page_links is None:
+            logger.debug(
+                "extracting new parsed links from index response %s", index_response
+            )
+            page_links = self._write_parsed_links_cache(
+                parsed_links_path,
+                parse_links(index_response),
+            )

         with indent_log():
-            package_links = self.evaluate_links(
-                link_evaluator,
-                links=page_links,
+            package_links = self._write_installation_candidate_cache(
+                cached_candidates_path,
+                self.evaluate_links(
+                    link_evaluator,
+                    links=page_links,
+                ),
             )

         return package_links

+    @_canonicalize_arg
     @functools.lru_cache(maxsize=None)
     def find_all_candidates(self, project_name: str) -> List[InstallationCandidate]:
         """Find all available InstallationCandidate for project_name

src/pip/_internal/models/link.py

Lines changed: 58 additions & 0 deletions
@@ -178,6 +178,43 @@ def _ensure_quoted_url(url: str) -> str:
     return urllib.parse.urlunparse(result._replace(path=path))


+@dataclass(frozen=True)
+class PersistentLinkCacheArgs:
+    url: str
+    comes_from: Optional[str] = None
+    requires_python: Optional[str] = None
+    yanked_reason: Optional[str] = None
+    metadata_file_data: Optional[MetadataFile] = None
+    hashes: Optional[Mapping[str, str]] = None
+
+    def to_json(self) -> Dict[str, Any]:
+        return {
+            "url": self.url,
+            "comes_from": self.comes_from,
+            "requires_python": self.requires_python,
+            "yanked_reason": self.yanked_reason,
+            "metadata_file_data": (
+                self.metadata_file_data.hashes if self.metadata_file_data else None
+            ),
+            "hashes": self.hashes,
+        }
+
+    @classmethod
+    def from_json(cls, cache_info: Dict[str, Any]) -> "PersistentLinkCacheArgs":
+        return cls(
+            url=cache_info["url"],
+            comes_from=cache_info["comes_from"],
+            requires_python=cache_info["requires_python"],
+            yanked_reason=cache_info["yanked_reason"],
+            metadata_file_data=(
+                MetadataFile(hashes=cache_info["metadata_file_data"])
+                if cache_info["metadata_file_data"]
+                else None
+            ),
+            hashes=cache_info["hashes"],
+        )
+
+
 @functools.total_ordering
 class Link:
     """Represents a parsed link from a Package Index's simple URL"""
@@ -303,6 +340,27 @@ def from_json(
             metadata_file_data=metadata_file_data,
         )

+    def cache_args(self) -> PersistentLinkCacheArgs:
+        return PersistentLinkCacheArgs(
+            url=self.url,
+            comes_from=(str(self.comes_from) if self.comes_from else None),
+            requires_python=self.requires_python,
+            yanked_reason=self.yanked_reason,
+            metadata_file_data=self.metadata_file_data,
+            hashes=self._hashes,
+        )
+
+    @classmethod
+    def from_cache_args(cls, args: PersistentLinkCacheArgs) -> "Link":
+        return cls(
+            args.url,
+            comes_from=args.comes_from,
+            requires_python=args.requires_python,
+            yanked_reason=args.yanked_reason,
+            metadata_file_data=args.metadata_file_data,
+            hashes=args.hashes,
+        )
+
     @classmethod
     def from_element(
         cls,
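
A round-trip through the new Link cache API added here, assuming a pip checkout containing this commit is importable; metadata_file_data and hashes are left as None to keep the example minimal:

from pip._internal.models.link import Link, PersistentLinkCacheArgs

link = Link(
    "https://files.pythonhosted.org/packages/demo-1.0-py3-none-any.whl",
    requires_python=">=3.8",
)

# Serialize to plain JSON-compatible data and back.
args = link.cache_args()
restored = Link.from_cache_args(PersistentLinkCacheArgs.from_json(args.to_json()))

assert restored.url == link.url
assert restored.requires_python == link.requires_python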
