From 67849af5e326cb3a31bf27ddb4907e5acef369b4 Mon Sep 17 00:00:00 2001 From: Steven Silvester Date: Wed, 15 Jan 2025 14:46:41 -0600 Subject: [PATCH 1/6] Use caching for the etag --- flask_pymongo/__init__.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/flask_pymongo/__init__.py b/flask_pymongo/__init__.py index a27abb5..ea5918b 100644 --- a/flask_pymongo/__init__.py +++ b/flask_pymongo/__init__.py @@ -183,15 +183,26 @@ def get_upload(filename): response.headers["Content-Disposition"] = f"attachment; filename={filename}" response.content_length = fileobj.length response.last_modified = fileobj.upload_date - # Compute the sha1 sum of the file for the etag. - pos = fileobj.tell() - raw_data = fileobj.read() - fileobj.seek(pos) - digest = hashlib.sha1(raw_data).hexdigest() - response.set_etag(digest) response.cache_control.max_age = cache_for response.cache_control.public = True response.make_conditional(request) + + # GridFS does not manage its own hash, so we manage our own using its + # metadata storage, to be used for the etag. + sha1_sum = fileobj.metadata.get("sha1_sum", None) + sha1_update_date = fileobj.metadata.get("sha1_update_date", None) + if sha1_update_date and sha1_update_date != fileobj.upload_date: + sha1_sum = None + if sha1_sum is None: + # Compute the sha1 sum of the file for the etag. + pos = fileobj.tell() + raw_data = fileobj.read() + fileobj.seek(pos) + sha1_sum = hashlib.sha1(raw_data).hexdigest() + fileobj.metadata["sha1_sum"] = sha1_sum + fileobj.metadata["sha1_update_date"] = fileobj.upload_date + response.set_etag(sha1_sum) + return response def save_file( From 17e7424fe9f849f16d6ddbc41d4c0e7a65452f47 Mon Sep 17 00:00:00 2001 From: Steven Silvester Date: Wed, 15 Jan 2025 15:03:57 -0600 Subject: [PATCH 2/6] Fix handling of sha sum --- flask_pymongo/__init__.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/flask_pymongo/__init__.py b/flask_pymongo/__init__.py index ea5918b..e756ece 100644 --- a/flask_pymongo/__init__.py +++ b/flask_pymongo/__init__.py @@ -187,20 +187,10 @@ def get_upload(filename): response.cache_control.public = True response.make_conditional(request) - # GridFS does not manage its own hash, so we manage our own using its - # metadata storage, to be used for the etag. - sha1_sum = fileobj.metadata.get("sha1_sum", None) - sha1_update_date = fileobj.metadata.get("sha1_update_date", None) - if sha1_update_date and sha1_update_date != fileobj.upload_date: - sha1_sum = None - if sha1_sum is None: - # Compute the sha1 sum of the file for the etag. - pos = fileobj.tell() - raw_data = fileobj.read() - fileobj.seek(pos) - sha1_sum = hashlib.sha1(raw_data).hexdigest() - fileobj.metadata["sha1_sum"] = sha1_sum - fileobj.metadata["sha1_update_date"] = fileobj.upload_date + # Get or compute the sha1 sum for the etag. + metadata = fileobj.metadata + sha1_sum = metadata and metadata.get("sha1_sum") + sha1_sum = sha1_sum or self._compute_sha(fileobj) response.set_etag(sha1_sum) return response @@ -248,5 +238,19 @@ def save_upload(filename): db_obj = self.db assert db_obj is not None, "Please initialize the app before calling save_file!" storage = GridFS(db_obj, base) - id = storage.put(fileobj, filename=filename, content_type=content_type, **kwargs) + + # GridFS does not manage its own hash, so we manage our own using its + # metadata storage, to be used for the etag. + sha1_sum = self._compute_sha(fileobj) + metadata = dict(sha1_sum=sha1_sum) + id = storage.put( + fileobj, filename=filename, content_type=content_type, metadata=metadata, **kwargs + ) return id + + def _compute_sha(self, fileobj: Any) -> str: + """Compute the sha sum of a file object.""" + pos = fileobj.tell() + raw_data = fileobj.read() + fileobj.seek(pos) + return hashlib.sha1(raw_data).hexdigest() From 794c9d46f72bce1f013d9a887ebf34022fa63fe0 Mon Sep 17 00:00:00 2001 From: Steven Silvester Date: Wed, 15 Jan 2025 15:10:45 -0600 Subject: [PATCH 3/6] fix test --- flask_pymongo/__init__.py | 6 +++--- tests/util.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/flask_pymongo/__init__.py b/flask_pymongo/__init__.py index e756ece..2c7eb7a 100644 --- a/flask_pymongo/__init__.py +++ b/flask_pymongo/__init__.py @@ -183,9 +183,6 @@ def get_upload(filename): response.headers["Content-Disposition"] = f"attachment; filename={filename}" response.content_length = fileobj.length response.last_modified = fileobj.upload_date - response.cache_control.max_age = cache_for - response.cache_control.public = True - response.make_conditional(request) # Get or compute the sha1 sum for the etag. metadata = fileobj.metadata @@ -193,6 +190,9 @@ def get_upload(filename): sha1_sum = sha1_sum or self._compute_sha(fileobj) response.set_etag(sha1_sum) + response.cache_control.max_age = cache_for + response.cache_control.public = True + response.make_conditional(request) return response def save_file( diff --git a/tests/util.py b/tests/util.py index 9f2ab83..892b5f3 100644 --- a/tests/util.py +++ b/tests/util.py @@ -33,5 +33,5 @@ def setUp(self): def tearDown(self): assert self.mongo.cx is not None self.mongo.cx.drop_database(self.dbname) - + self.mongo.cx.close() super().tearDown() From 00a02fb2e3aa6869369c3c76c1ce0567b6e8ae6b Mon Sep 17 00:00:00 2001 From: Steven Silvester Date: Wed, 15 Jan 2025 19:43:14 -0600 Subject: [PATCH 4/6] Use a fixed size cache to store sha sums --- flask_pymongo/__init__.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/flask_pymongo/__init__.py b/flask_pymongo/__init__.py index 2c7eb7a..ff10f70 100644 --- a/flask_pymongo/__init__.py +++ b/flask_pymongo/__init__.py @@ -27,6 +27,7 @@ __all__ = ("PyMongo", "ASCENDING", "DESCENDING", "BSONObjectIdConverter", "BSONProvider") import hashlib +from collections import OrderedDict from mimetypes import guess_type from typing import Any @@ -66,6 +67,8 @@ def __init__( ) -> None: self.cx: MongoClient | None = None self.db: Database | None = None + self._hash_cache = OrderedDict() + self._hash_limit = 128 if app is not None: self.init_app(app, uri, *args, **kwargs) @@ -184,10 +187,18 @@ def get_upload(filename): response.content_length = fileobj.length response.last_modified = fileobj.upload_date - # Get or compute the sha1 sum for the etag. - metadata = fileobj.metadata - sha1_sum = metadata and metadata.get("sha1_sum") - sha1_sum = sha1_sum or self._compute_sha(fileobj) + # GridFS does not manage its own checksum, so we manage our own using its + # metadata storage, to be used for the etag. + sha1_sum = self._hash_cache.get(str(fileobj._id)) + if sha1_sum is None: + # Compute the checksum of the file for the etag. + pos = fileobj.tell() + raw_data = fileobj.read() + fileobj.seek(pos) + sha1_sum = hashlib.sha1(raw_data).hexdigest() + while len(self._hash_cache) >= self._hash_limit: + self._hash_cache.popitem() + self._hash_cache[str(fileobj._id)] = sha1_sum response.set_etag(sha1_sum) response.cache_control.max_age = cache_for @@ -238,19 +249,5 @@ def save_upload(filename): db_obj = self.db assert db_obj is not None, "Please initialize the app before calling save_file!" storage = GridFS(db_obj, base) - - # GridFS does not manage its own hash, so we manage our own using its - # metadata storage, to be used for the etag. - sha1_sum = self._compute_sha(fileobj) - metadata = dict(sha1_sum=sha1_sum) - id = storage.put( - fileobj, filename=filename, content_type=content_type, metadata=metadata, **kwargs - ) + id = storage.put(fileobj, filename=filename, content_type=content_type, **kwargs) return id - - def _compute_sha(self, fileobj: Any) -> str: - """Compute the sha sum of a file object.""" - pos = fileobj.tell() - raw_data = fileobj.read() - fileobj.seek(pos) - return hashlib.sha1(raw_data).hexdigest() From 006e7bc53b5f496dbecfc35df240a7031be84b2f Mon Sep 17 00:00:00 2001 From: Steven Silvester Date: Fri, 17 Jan 2025 07:45:25 -0600 Subject: [PATCH 5/6] address review --- flask_pymongo/__init__.py | 52 +++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/flask_pymongo/__init__.py b/flask_pymongo/__init__.py index ff10f70..de2fc2e 100644 --- a/flask_pymongo/__init__.py +++ b/flask_pymongo/__init__.py @@ -27,7 +27,6 @@ __all__ = ("PyMongo", "ASCENDING", "DESCENDING", "BSONObjectIdConverter", "BSONProvider") import hashlib -from collections import OrderedDict from mimetypes import guess_type from typing import Any @@ -67,8 +66,6 @@ def __init__( ) -> None: self.cx: MongoClient | None = None self.db: Database | None = None - self._hash_cache = OrderedDict() - self._hash_limit = 128 if app is not None: self.init_app(app, uri, *args, **kwargs) @@ -187,19 +184,20 @@ def get_upload(filename): response.content_length = fileobj.length response.last_modified = fileobj.upload_date - # GridFS does not manage its own checksum, so we manage our own using its - # metadata storage, to be used for the etag. - sha1_sum = self._hash_cache.get(str(fileobj._id)) - if sha1_sum is None: - # Compute the checksum of the file for the etag. - pos = fileobj.tell() - raw_data = fileobj.read() - fileobj.seek(pos) - sha1_sum = hashlib.sha1(raw_data).hexdigest() - while len(self._hash_cache) >= self._hash_limit: - self._hash_cache.popitem() - self._hash_cache[str(fileobj._id)] = sha1_sum - response.set_etag(sha1_sum) + # GridFS does not manage its own checksum. + # Try to use a sha1 sum that we have added during a save_file. + # Fall back to a legacy md5 sum if it exists. + # Otherwise, compute the sha1 sum directly. + try: + etag = fileobj.sha1 + except AttributeError: + etag = fileobj.md5 + if etag is None: + pos = fileobj.tell() + raw_data = fileobj.read() + fileobj.seek(pos) + etag = hashlib.sha1(raw_data).hexdigest() + response.set_etag(etag) response.cache_control.max_age = cache_for response.cache_control.public = True @@ -249,5 +247,23 @@ def save_upload(filename): db_obj = self.db assert db_obj is not None, "Please initialize the app before calling save_file!" storage = GridFS(db_obj, base) - id = storage.put(fileobj, filename=filename, content_type=content_type, **kwargs) - return id + + # GridFS does not manage its own checksum, so we attach a sha1 to the file + # for use as an etag. + hashingfile = _Wrapper(fileobj) + with storage.new_file(filename=filename, content_type=content_type, **kwargs) as grid_file: + grid_file.write(hashingfile) + grid_file.sha1 = hashingfile.hash.hexdigest() + return grid_file._id + + +class _Wrapper: + def __init__(self, file): + self.file = file + self.hash = hashlib.sha1() + + def read(self, n): + data = self.file.read(n) + if data: + self.hash.update(data) + return data From ed5237535abe7fd27578282bcc39f8da78124fd4 Mon Sep 17 00:00:00 2001 From: Steven Silvester Date: Tue, 21 Jan 2025 14:09:23 -0600 Subject: [PATCH 6/6] add md5 test --- flask_pymongo/__init__.py | 5 ++++- tests/test_gridfs.py | 23 ++++++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/flask_pymongo/__init__.py b/flask_pymongo/__init__.py index de2fc2e..1737505 100644 --- a/flask_pymongo/__init__.py +++ b/flask_pymongo/__init__.py @@ -27,6 +27,7 @@ __all__ = ("PyMongo", "ASCENDING", "DESCENDING", "BSONObjectIdConverter", "BSONProvider") import hashlib +import warnings from mimetypes import guess_type from typing import Any @@ -191,7 +192,9 @@ def get_upload(filename): try: etag = fileobj.sha1 except AttributeError: - etag = fileobj.md5 + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + etag = fileobj.md5 if etag is None: pos = fileobj.tell() raw_data = fileobj.read() diff --git a/tests/test_gridfs.py b/tests/test_gridfs.py index ee6b6e3..8e8bf10 100644 --- a/tests/test_gridfs.py +++ b/tests/test_gridfs.py @@ -1,6 +1,7 @@ from __future__ import annotations -from hashlib import sha1 +import warnings +from hashlib import md5, sha1 from io import BytesIO import pytest @@ -95,6 +96,26 @@ def test_it_sets_supports_conditional_gets(self): resp = self.mongo.send_file("myfile.txt") assert resp.status_code == 304 + def test_it_sets_supports_conditional_gets_md5(self): + # a basic conditional GET + md5_hash = md5(self.myfile.getvalue()).hexdigest() + environ_args = { + "method": "GET", + "headers": { + "If-None-Match": md5_hash, + }, + } + storage = storage = GridFS(self.mongo.db) + with storage.new_file(filename="myfile.txt") as grid_file: + grid_file.write(self.myfile.getvalue()) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + grid_file.set("md5", md5_hash) + + with self.app.test_request_context(**environ_args): + resp = self.mongo.send_file("myfile.txt") + assert resp.status_code == 304 + def test_it_sets_cache_headers(self): resp = self.mongo.send_file("myfile.txt", cache_for=60) assert resp.cache_control.max_age == 60