Skip to content

Cache final file from resume retry process #13441

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/13441.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Resumed downloads are saved to the HTTP cache like any other download.
19 changes: 16 additions & 3 deletions src/pip/_internal/network/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
from __future__ import annotations

import os
import shutil
from collections.abc import Generator
from contextlib import contextmanager
from datetime import datetime
from typing import BinaryIO
from typing import Any, BinaryIO, Callable

from pip._vendor.cachecontrol.cache import SeparateBodyBaseCache
from pip._vendor.cachecontrol.caches import SeparateBodyFileCache
Expand Down Expand Up @@ -72,12 +73,13 @@ def get(self, key: str) -> bytes | None:
with open(metadata_path, "rb") as f:
return f.read()

def _write(self, path: str, data: bytes) -> None:
def _write_to_file(self, path: str, writer_func: Callable[[BinaryIO], Any]) -> None:
"""Common file writing logic with proper permissions and atomic replacement."""
with suppressed_cache_errors():
ensure_dir(os.path.dirname(path))

with adjacent_tmp_file(path) as f:
f.write(data)
writer_func(f)
# Inherit the read/write permissions of the cache directory
# to enable multi-user cache use-cases.
mode = (
Expand All @@ -93,6 +95,12 @@ def _write(self, path: str, data: bytes) -> None:

replace(f.name, path)

def _write(self, path: str, data: bytes) -> None:
    """Atomically write *data* to *path* using the shared file-writing helper."""

    def dump(handle: BinaryIO) -> None:
        handle.write(data)

    self._write_to_file(path, dump)

def _write_from_io(self, path: str, source_file: BinaryIO) -> None:
    """Atomically write the contents of *source_file* to *path*, streaming
    in chunks rather than loading the whole file into memory."""

    def dump(handle: BinaryIO) -> None:
        shutil.copyfileobj(source_file, handle)

    self._write_to_file(path, dump)

def set(
self, key: str, value: bytes, expires: int | datetime | None = None
) -> None:
Expand All @@ -118,3 +126,8 @@ def get_body(self, key: str) -> BinaryIO | None:
def set_body(self, key: str, body: bytes) -> None:
    """Store *body* as the cached response body for *key*."""
    self._write(self._get_cache_path(key) + ".body", body)

def set_body_from_io(self, key: str, body_file: BinaryIO) -> None:
    """Set the body of the cache entry for *key* by streaming from an
    already-open binary file object."""
    body_path = self._get_cache_path(key) + ".body"
    self._write_from_io(body_path, body_file)
68 changes: 66 additions & 2 deletions src/pip/_internal/network/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,18 @@
from http import HTTPStatus
from typing import BinaryIO

from pip._vendor.requests import PreparedRequest
from pip._vendor.requests.models import Response
from pip._vendor.urllib3 import HTTPResponse as URLlib3Response
from pip._vendor.urllib3._collections import HTTPHeaderDict
from pip._vendor.urllib3.exceptions import ReadTimeoutError

from pip._internal.cli.progress_bars import get_download_progress_renderer
from pip._internal.exceptions import IncompleteDownloadError, NetworkConnectionError
from pip._internal.models.index import PyPI
from pip._internal.models.link import Link
from pip._internal.network.cache import is_from_cache
from pip._internal.network.session import PipSession
from pip._internal.network.cache import SafeFileCache, is_from_cache
from pip._internal.network.session import CacheControlAdapter, PipSession
from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks
from pip._internal.utils.misc import format_size, redact_auth_from_url, splitext

Expand Down Expand Up @@ -250,6 +253,67 @@ def _attempt_resumes_or_redownloads(
os.remove(download.output_file.name)
raise IncompleteDownloadError(download)

# If we successfully completed the download via resume, manually cache it
# as a complete response to enable future caching
if download.reattempts > 0:
self._cache_resumed_download(download, first_resp)

def _cache_resumed_download(
    self, download: _FileDownload, original_response: Response
) -> None:
    """
    Manually cache a file that was successfully downloaded via resume retries.

    cachecontrol doesn't cache 206 (Partial Content) responses, since they
    are not complete files. This method manually adds the final file to the
    cache as though it was downloaded in a single request, so that future
    requests can use the cache.

    :param download: the completed download whose bytes are on disk in
        ``download.output_file``.
    :param original_response: the first (200) response of the download,
        whose headers seed the synthetic cached response.
    """
    url = download.link.url_without_fragment
    adapter = self._session.get_adapter(url)

    # Check if the adapter is the CacheControlAdapter (i.e. caching is enabled)
    if not isinstance(adapter, CacheControlAdapter):
        logger.debug(
            "Skipping resume download caching: no cache controller for %s", url
        )
        return

    # The body must be streamed to disk, which requires pip's
    # SafeFileCache (a separate-body cache) rather than a generic cache.
    assert isinstance(
        adapter.cache, SafeFileCache
    ), "separate body cache not in use!"

    # Synthesize the plain GET request this cache entry should answer.
    synthetic_request = PreparedRequest()
    synthetic_request.prepare(method="GET", url=url, headers={})

    # Copy the original headers, dropping the range-specific ones: the
    # cached entry represents the complete file, so content-length is
    # rewritten to the full size and content-range must not appear.
    synthetic_response_headers = HTTPHeaderDict()
    for key, value in original_response.headers.items():
        if key.lower() not in ("content-range", "content-length"):
            synthetic_response_headers[key] = value
    synthetic_response_headers["content-length"] = str(download.size)

    # An empty-bodied 200 response; the real body is streamed into the
    # cache separately below. HTTP bodies are binary, so the placeholder
    # must be bytes (b""), matching the b"" passed to serializer.dumps.
    synthetic_response = URLlib3Response(
        body=b"",
        headers=synthetic_response_headers,
        status=200,
        preload_content=False,
    )

    # Save metadata and then stream the file contents to cache.
    cache_url = adapter.controller.cache_url(url)
    metadata_blob = adapter.controller.serializer.dumps(
        synthetic_request, synthetic_response, b""
    )
    adapter.cache.set(cache_url, metadata_blob)
    # Flush buffered writes so reopening the file sees every byte.
    download.output_file.flush()
    with open(download.output_file.name, "rb") as f:
        adapter.cache.set_body_from_io(cache_url, f)

    logger.debug(
        "Cached resumed download as complete response for future use: %s", url
    )

def _http_get_resume(
self, download: _FileDownload, should_match: Response
) -> Response:
Expand Down
37 changes: 37 additions & 0 deletions tests/unit/test_network_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,3 +350,40 @@ def test_downloader(

# Make sure that the downloader makes additional requests for resumption
_http_get_mock.assert_has_calls(calls)


def test_resumed_download_caching(tmpdir: Path) -> None:
    """Test that resumed downloads are cached properly for future use."""
    cache_dir = tmpdir / "cache"
    session = PipSession(cache=str(cache_dir))
    link = Link("https://example.com/foo.tgz")
    downloader = Downloader(session, "on", resume_retries=5)

    # First response stops short of content-length; the retry (206) then
    # delivers the remaining bytes.
    first_chunk = b"0cfa7e9d-1868-4dd7-9fb3-"
    second_chunk = b"f2561d5dfd89"

    truncated_resp = MockResponse(first_chunk)
    truncated_resp.headers = {"content-length": "36"}
    truncated_resp.status_code = 200

    resumed_resp = MockResponse(second_chunk)
    resumed_resp.headers = {"content-length": "12"}
    resumed_resp.status_code = 206

    _http_get_mock = MagicMock(side_effect=[truncated_resp, resumed_resp])

    with patch.object(Downloader, "_http_get", _http_get_mock):
        # Perform the download (incomplete, then resumed).
        filepath, _ = downloader(link, str(tmpdir))

        # The reassembled file must contain both chunks, in order.
        with open(filepath, "rb") as downloaded_file:
            assert downloaded_file.read() == first_chunk + second_chunk

        # The completed download should also have been written back to the
        # HTTP cache for future use: one metadata file plus one body file.
        assert cache_dir.exists()
        cached_files = [f for f in cache_dir.rglob("*") if f.is_file()]
        assert len(cached_files) == 2