diff --git a/news/13441.bugfix.rst b/news/13441.bugfix.rst new file mode 100644 index 00000000000..519f38ea906 --- /dev/null +++ b/news/13441.bugfix.rst @@ -0,0 +1 @@ +Resumed downloads are saved to the HTTP cache like any other normal download. diff --git a/src/pip/_internal/network/cache.py b/src/pip/_internal/network/cache.py index 479a1bb98b8..0c5961c45b4 100644 --- a/src/pip/_internal/network/cache.py +++ b/src/pip/_internal/network/cache.py @@ -3,10 +3,11 @@ from __future__ import annotations import os +import shutil from collections.abc import Generator from contextlib import contextmanager from datetime import datetime -from typing import BinaryIO +from typing import Any, BinaryIO, Callable from pip._vendor.cachecontrol.cache import SeparateBodyBaseCache from pip._vendor.cachecontrol.caches import SeparateBodyFileCache @@ -72,12 +73,13 @@ def get(self, key: str) -> bytes | None: with open(metadata_path, "rb") as f: return f.read() - def _write(self, path: str, data: bytes) -> None: + def _write_to_file(self, path: str, writer_func: Callable[[BinaryIO], Any]) -> None: + """Common file writing logic with proper permissions and atomic replacement.""" with suppressed_cache_errors(): ensure_dir(os.path.dirname(path)) with adjacent_tmp_file(path) as f: - f.write(data) + writer_func(f) # Inherit the read/write permissions of the cache directory # to enable multi-user cache use-cases. mode = ( @@ -93,6 +95,12 @@ def _write(self, path: str, data: bytes) -> None: replace(f.name, path) + def _write(self, path: str, data: bytes) -> None: + self._write_to_file(path, lambda f: f.write(data)) + + def _write_from_io(self, path: str, source_file: BinaryIO) -> None: + self._write_to_file(path, lambda f: shutil.copyfileobj(source_file, f)) + def set( self, key: str, value: bytes, expires: int | datetime | None = None ) -> None: @@ -118,3 +126,8 @@ def get_body(self, key: str) -> BinaryIO | None: def set_body(self, key: str, body: bytes) -> None: path = self._get_cache_path(key) + ".body" self._write(path, body) + + def set_body_from_io(self, key: str, body_file: BinaryIO) -> None: + """Set the body of the cache entry from a file object.""" + path = self._get_cache_path(key) + ".body" + self._write_from_io(path, body_file) diff --git a/src/pip/_internal/network/download.py b/src/pip/_internal/network/download.py index 1a00a5f246e..a655c19c366 100644 --- a/src/pip/_internal/network/download.py +++ b/src/pip/_internal/network/download.py @@ -11,15 +11,18 @@ from http import HTTPStatus from typing import BinaryIO +from pip._vendor.requests import PreparedRequest from pip._vendor.requests.models import Response +from pip._vendor.urllib3 import HTTPResponse as URLlib3Response +from pip._vendor.urllib3._collections import HTTPHeaderDict from pip._vendor.urllib3.exceptions import ReadTimeoutError from pip._internal.cli.progress_bars import get_download_progress_renderer from pip._internal.exceptions import IncompleteDownloadError, NetworkConnectionError from pip._internal.models.index import PyPI from pip._internal.models.link import Link -from pip._internal.network.cache import is_from_cache -from pip._internal.network.session import PipSession +from pip._internal.network.cache import SafeFileCache, is_from_cache +from pip._internal.network.session import CacheControlAdapter, PipSession from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks from pip._internal.utils.misc import format_size, redact_auth_from_url, splitext @@ -250,6 +253,67 @@ def _attempt_resumes_or_redownloads( os.remove(download.output_file.name) raise IncompleteDownloadError(download) + # If we successfully completed the download via resume, manually cache it + # as a complete response to enable future caching + if download.reattempts > 0: + self._cache_resumed_download(download, first_resp) + + def _cache_resumed_download( + self, download: _FileDownload, original_response: Response + ) -> None: + """ + Manually cache a file that was successfully downloaded via resume retries. + + cachecontrol doesn't cache 206 (Partial Content) responses, since they + are not complete files. This method manually adds the final file to the + cache as though it was downloaded in a single request, so that future + requests can use the cache. + """ + url = download.link.url_without_fragment + adapter = self._session.get_adapter(url) + + # Check if the adapter is the CacheControlAdapter (i.e. caching is enabled) + if not isinstance(adapter, CacheControlAdapter): + logger.debug( + "Skipping resume download caching: no cache controller for %s", url + ) + return + + # Check SafeFileCache is being used + assert isinstance( + adapter.cache, SafeFileCache + ), "separate body cache not in use!" + + synthetic_request = PreparedRequest() + synthetic_request.prepare(method="GET", url=url, headers={}) + + synthetic_response_headers = HTTPHeaderDict() + for key, value in original_response.headers.items(): + if key.lower() not in ["content-range", "content-length"]: + synthetic_response_headers[key] = value + synthetic_response_headers["content-length"] = str(download.size) + + synthetic_response = URLlib3Response( + body="", + headers=synthetic_response_headers, + status=200, + preload_content=False, + ) + + # Save metadata and then stream the file contents to cache. + cache_url = adapter.controller.cache_url(url) + metadata_blob = adapter.controller.serializer.dumps( + synthetic_request, synthetic_response, b"" + ) + adapter.cache.set(cache_url, metadata_blob) + download.output_file.flush() + with open(download.output_file.name, "rb") as f: + adapter.cache.set_body_from_io(cache_url, f) + + logger.debug( + "Cached resumed download as complete response for future use: %s", url + ) + def _http_get_resume( self, download: _FileDownload, should_match: Response ) -> Response: diff --git a/tests/unit/test_network_download.py b/tests/unit/test_network_download.py index 1aa8c9267fa..7fb6b7fe64b 100644 --- a/tests/unit/test_network_download.py +++ b/tests/unit/test_network_download.py @@ -350,3 +350,40 @@ def test_downloader( # Make sure that the downloader makes additional requests for resumption _http_get_mock.assert_has_calls(calls) + + +def test_resumed_download_caching(tmpdir: Path) -> None: + """Test that resumed downloads are cached properly for future use.""" + cache_dir = tmpdir / "cache" + session = PipSession(cache=str(cache_dir)) + link = Link("https://example.com/foo.tgz") + downloader = Downloader(session, "on", resume_retries=5) + + # Mock an incomplete download followed by a successful resume + incomplete_resp = MockResponse(b"0cfa7e9d-1868-4dd7-9fb3-") + incomplete_resp.headers = {"content-length": "36"} + incomplete_resp.status_code = 200 + + resume_resp = MockResponse(b"f2561d5dfd89") + resume_resp.headers = {"content-length": "12"} + resume_resp.status_code = 206 + + responses = [incomplete_resp, resume_resp] + _http_get_mock = MagicMock(side_effect=responses) + + with patch.object(Downloader, "_http_get", _http_get_mock): + # Perform the download (incomplete then resumed) + filepath, _ = downloader(link, str(tmpdir)) + + # Verify the file was downloaded correctly + with open(filepath, "rb") as downloaded_file: + downloaded_bytes = downloaded_file.read() + expected_bytes = b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89" + assert downloaded_bytes == expected_bytes + + # Verify that the cache directory was created and contains cache files + # The resumed download should have been cached for future use + assert cache_dir.exists() + cache_files = list(cache_dir.rglob("*")) + # Should have cache files (both metadata and body files) + assert len([f for f in cache_files if f.is_file()]) == 2