Commit beac52a

avoid caching parse_links() when the url is an index url
1 parent d2bc147

3 files changed: +40 -22 lines

src/pip/_internal/index/collector.py

Lines changed: 17 additions & 9 deletions
```diff
@@ -18,7 +18,7 @@
 
 from pip._internal.models.link import Link
 from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
-from pip._internal.utils.misc import redact_auth_from_url
+from pip._internal.utils.misc import pairwise, redact_auth_from_url
 from pip._internal.utils.typing import MYPY_CHECK_RUNNING
 from pip._internal.utils.urls import path_to_url, url_to_path
 from pip._internal.vcs import is_url, vcs
@@ -341,6 +341,9 @@ def wrapper(cacheable_page):
 
     def wrapper_wrapper(page):
         # type: (HTMLPage) -> List[Link]
+        if page.is_index_url:
+            # Avoid caching when requesting pypi indices.
+            return list(fn(page))
         return wrapper(CacheablePageContent(page))
 
     return wrapper_wrapper
@@ -376,9 +379,10 @@ class HTMLPage(object):
 
     def __init__(
         self,
-        content,  # type: bytes
-        encoding,  # type: Optional[str]
-        url,  # type: str
+        content,  # type: bytes
+        encoding,  # type: Optional[str]
+        url,  # type: str
+        is_index_url=False,  # type: bool
     ):
         # type: (...) -> None
         """
@@ -388,6 +392,7 @@ def __init__(
         self.content = content
         self.encoding = encoding
         self.url = url
+        self.is_index_url = is_index_url
 
     def __str__(self):
         # type: () -> str
@@ -405,10 +410,13 @@ def _handle_get_page_fail(
     meth("Could not fetch URL %s: %s - skipping", link, reason)
 
 
-def _make_html_page(response):
-    # type: (Response) -> HTMLPage
+def _make_html_page(response, is_index_url=False):
+    # type: (Response, bool) -> HTMLPage
     encoding = _get_encoding_from_headers(response.headers)
-    return HTMLPage(response.content, encoding=encoding, url=response.url)
+    return HTMLPage(
+        response.content,
+        encoding=encoding,
+        url=response.url, is_index_url=is_index_url)
 
 
 def _get_html_page(link, session=None):
@@ -461,7 +469,7 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out")
     else:
-        return _make_html_page(resp)
+        return _make_html_page(resp, is_index_url=link.is_index_url)
     return None
@@ -624,7 +632,7 @@ def collect_links(self, project_name):
         # We want to filter out anything that does not have a secure origin.
         url_locations = [
             link for link in itertools.chain(
-                (Link(url) for url in index_url_loc),
+                (Link(url, is_index_url=True) for url in index_url_loc),
                 (Link(url) for url in fl_url_loc),
            )
            if self.session.is_secure_origin(link)
```
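Taken together, the collector.py hunks carve an early exit out of the parse_links() cache: with_cached_html_pages memoizes parse results per page URL via the CacheablePageContent wrapper, and the new branch skips that machinery whenever the page was fetched from an index URL. The pattern can be condensed into a self-contained sketch (names follow the diff, but this is a toy stand-in; pip's real implementation parses with html5lib and returns List[Link]):

```python
import functools


class Page(object):
    """Hypothetical stand-in for collector.HTMLPage."""

    def __init__(self, content, url, is_index_url=False):
        self.content = content
        self.url = url
        self.is_index_url = is_index_url


class CacheablePageContent(object):
    """Hashable wrapper so lru_cache can key parse results on the page url."""

    def __init__(self, page):
        self.page = page

    def __eq__(self, other):
        return isinstance(other, type(self)) and self.page.url == other.page.url

    def __hash__(self):
        return hash(self.page.url)


def with_cached_pages(fn):
    """Decorator mimicking with_cached_html_pages in collector.py."""

    @functools.lru_cache(maxsize=None)
    def wrapper(cacheable_page):
        return list(fn(cacheable_page.page))

    @functools.wraps(fn)
    def wrapper_wrapper(page):
        if page.is_index_url:
            # The commit's new early return: index pages are never cached.
            return list(fn(page))
        return wrapper(CacheablePageContent(page))

    return wrapper_wrapper


@with_cached_pages
def parse_links(page):
    # Toy parser: one "link" per whitespace-separated token.
    return page.content.split()
```

With this in place, two ordinary pages that share a URL hit the cache even when their content differs, while an index page is always re-parsed:

```python
a = parse_links(Page("pkg1", "https://example.com/simple/"))
b = parse_links(Page("pkg2", "https://example.com/simple/"))        # cache hit
c = parse_links(Page("pkg2", "https://example.com/simple/", True))  # re-parsed
assert a == b == ["pkg1"]
assert c == ["pkg2"]
```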

src/pip/_internal/models/link.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -30,6 +30,7 @@ def __init__(
         comes_from=None,  # type: Optional[Union[str, HTMLPage]]
         requires_python=None,  # type: Optional[str]
         yanked_reason=None,  # type: Optional[Text]
+        is_index_url=False,  # type: bool
     ):
         # type: (...) -> None
         """
@@ -63,6 +64,8 @@ def __init__(
 
         super(Link, self).__init__(key=url, defining_class=Link)
 
+        self.is_index_url = is_index_url
+
     def __str__(self):
         # type: () -> str
         if self.requires_python:
```
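The Link change is where the flag enters the system: collect_links() constructs index-URL links with is_index_url=True, and the flag then travels with the fetched response to the page object. A dependency-free sketch of that plumbing (hypothetical stand-in classes; in pip the hop happens inside _get_html_page and _make_html_page):

```python
class Link(object):
    def __init__(self, url, is_index_url=False):
        self.url = url
        self.is_index_url = is_index_url


class HTMLPage(object):
    def __init__(self, content, url, is_index_url=False):
        self.content = content
        self.url = url
        self.is_index_url = is_index_url


def make_html_page(content, link):
    # Mirrors _make_html_page(resp, is_index_url=link.is_index_url).
    return HTMLPage(content, url=link.url, is_index_url=link.is_index_url)


index_link = Link("https://pypi.org/simple/pip/", is_index_url=True)
page = make_html_page(b"<html>...</html>", index_link)
assert page.is_index_url  # parse_links() will bypass its cache for this page
```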

tests/unit/test_collector.py

Lines changed: 20 additions & 13 deletions
```diff
@@ -1,5 +1,6 @@
 import logging
 import os.path
+import re
 import uuid
 from textwrap import dedent
 
@@ -387,21 +388,27 @@ def test_parse_links_caches_same_page_by_url():
         encoding=None,
         url='https://example.com/simple/',
     )
+    # Make a third page which represents an index url, which should not be
+    # cached, even for the same url. We modify the page content slightly to
+    # ensure that the result is not cached.
+    page_3 = HTMLPage(
+        re.sub(b'pkg1', b'pkg2', html_bytes),
+        encoding=None,
+        url='https://example.com/simple/',
+        is_index_url=True,
+    )
 
-    mock_parse = mock.patch("pip._internal.index.collector.html5lib.parse")
-    with mock_parse as mock_parse:
-        mock_parse.return_value = html5lib.parse(
-            page_1.content,
-            transport_encoding=page_1.encoding,
-            namespaceHTMLElements=False,
-        )
-        parsed_links_1 = list(parse_links(page_1))
-        mock_parse.assert_called()
+    parsed_links_1 = list(parse_links(page_1))
+    assert len(parsed_links_1) == 1
+    assert 'pkg1' in parsed_links_1[0].url
+
+    parsed_links_2 = list(parse_links(page_2))
+    assert parsed_links_2 == parsed_links_1
 
-    with mock_parse as mock_parse:
-        parsed_links_2 = list(parse_links(page_2))
-        assert parsed_links_2 == parsed_links_1
-        mock_parse.assert_not_called()
+    parsed_links_3 = list(parse_links(page_3))
+    assert len(parsed_links_3) == 1
+    assert parsed_links_3 != parsed_links_1
+    assert 'pkg2' in parsed_links_3[0].url
 
 
 def test_request_http_error(caplog):
```
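A detail worth spelling out in the rewritten test: the cache is keyed by the page's URL (per the test's name), so page_3's content is deliberately mutated from pkg1 to pkg2 to make a cache bypass observable. With identical content, a fresh parse and a cached result would yield the same links and the test could not tell them apart. The staleness hazard the commit guards against reduces to this minimal sketch (hypothetical code, not pip's; any URL-keyed cache behaves the same way):

```python
import functools

contents = {"https://example.com/simple/": "pkg1"}


def fetch(url):
    # Stand-in for an HTTP request: returns whatever the server holds *now*.
    return contents[url]


@functools.lru_cache(maxsize=None)
def parse(url):
    # Keyed on the url alone, like pip's cached parse_links().
    return fetch(url)


assert parse("https://example.com/simple/") == "pkg1"
contents["https://example.com/simple/"] = "pkg2"        # the index changes
assert parse("https://example.com/simple/") == "pkg1"   # stale cached result
```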
