Skip to content

Commit 69274bf

Browse files
authored
embedding content updates and removals (#2217)
* adding method to remove qdrant points by filter params * adding safety check * adding method to remove points by db id * ensuring ocw contentfiles have a checksum * adding checksum to contentfile serializer * ensuring we only generate embeddings if embedding context has changed * enforce checksum on contentfile save * adding checksum to param map * reverting changes to pipeline * adding separate archive checksum field * upserting points and contentfile diffing * adding missing migration * update specs * test fixes * fixing test * fixing tests * addition of updating payloads * adding more tests and task to remove embeddings from index * adding plugin hooks for embeddings * reverting additions to plugins * adding plugin hooks * fixing tests * adding setting to toggle embedding tasks in plugin methods * fix test * add docstring * setting archve_checksum from checksum value * reorganizing call order * removing archive_checksum from serializer * fix tests * separate methods for updating payloads * regenerate spec * fixing test to adjust for separate methods * separate _embedding_context methods for learning resource and content file * splitting methods for embedding contentfiles and learning resources
1 parent 67968be commit 69274bf

File tree

19 files changed

+652
-90
lines changed

19 files changed

+652
-90
lines changed

frontends/api/src/generated/v0/api.ts

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

frontends/api/src/generated/v1/api.ts

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

learning_resources/etl/edx_shared_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def test_sync_edx_course_files_matching_checksum(
124124
"learning_resources_search.plugins.tasks.index_run_content_files"
125125
)
126126
mock_deindex = mocker.patch(
127-
"learning_resources_search.plugins.tasks.deindex_run_content_files"
127+
"learning_resources_search.plugins.tasks.deindex_run_content_files.si"
128128
)
129129
mock_log = mocker.patch("learning_resources.etl.edx_shared.log.info")
130130
mock_load = mocker.patch("learning_resources.etl.edx_shared.load_content_files")
@@ -142,7 +142,7 @@ def test_sync_edx_course_files_matching_checksum(
142142
)
143143
sync_edx_course_files("mitxonline", [run.learning_resource.id], [key])
144144
mock_log.assert_any_call("Checksums match for %s, skipping load", key)
145-
mock_deindex.assert_called_once_with(other_run.id, False) # noqa: FBT003
145+
mock_deindex.assert_called_once_with(other_run.id, unpublished_only=False)
146146
mock_load.assert_not_called()
147147
mock_index.assert_not_called()
148148

learning_resources/etl/loaders_test.py

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -132,8 +132,11 @@ def mock_upsert_tasks(mocker):
132132
deindex_learning_resource=mocker.patch(
133133
"learning_resources_search.tasks.deindex_document"
134134
),
135+
deindex_learning_resource_immutable_signature=mocker.patch(
136+
"learning_resources_search.tasks.deindex_document.si"
137+
),
135138
batch_deindex_resources=mocker.patch(
136-
"learning_resources_search.tasks.bulk_deindex_learning_resources"
139+
"learning_resources_search.tasks.bulk_deindex_learning_resources.si"
137140
),
138141
)
139142

@@ -260,13 +263,15 @@ def test_load_program( # noqa: PLR0913
260263
assert relationship.child.readable_id == data.learning_resource.readable_id
261264

262265
if program_exists and not is_published:
263-
mock_upsert_tasks.deindex_learning_resource.assert_called_with(
266+
mock_upsert_tasks.deindex_learning_resource_immutable_signature.assert_called_with(
264267
result.id, result.resource_type
265268
)
266269
elif is_published:
267-
mock_upsert_tasks.upsert_learning_resource.assert_called_with(result.id)
270+
mock_upsert_tasks.upsert_learning_resource_immutable_signature.assert_called_with(
271+
result.id
272+
)
268273
else:
269-
mock_upsert_tasks.upsert_learning_resource.assert_not_called()
274+
mock_upsert_tasks.upsert_learning_resource_immutable_signature.assert_not_called()
270275

271276

272277
def test_load_program_bad_platform(mocker):
@@ -414,14 +419,16 @@ def test_load_course( # noqa: PLR0913,PLR0912,PLR0915
414419
)
415420

416421
if course_exists and ((not is_published or not is_run_published) or blocklisted):
417-
mock_upsert_tasks.deindex_learning_resource.assert_called_with(
422+
mock_upsert_tasks.deindex_learning_resource_immutable_signature.assert_called_with(
418423
result.id, result.resource_type
419424
)
420425
elif is_published and is_run_published and not blocklisted:
421-
mock_upsert_tasks.upsert_learning_resource.assert_called_with(result.id)
426+
mock_upsert_tasks.upsert_learning_resource_immutable_signature.assert_called_with(
427+
result.id
428+
)
422429
else:
423-
mock_upsert_tasks.deindex_learning_resource.assert_not_called()
424-
mock_upsert_tasks.upsert_learning_resource.assert_not_called()
430+
mock_upsert_tasks.deindex_learning_resource_immutable_signature.assert_not_called()
431+
mock_upsert_tasks.upsert_learning_resource_immutable_signature.assert_not_called()
425432

426433
if course_exists and is_published and not blocklisted:
427434
course.refresh_from_db()
@@ -545,11 +552,11 @@ def test_load_duplicate_course(
545552
result = load_course(props, [], duplicates)
546553

547554
if course_id_is_duplicate and duplicate_course_exists:
548-
mock_upsert_tasks.deindex_learning_resource.assert_called()
555+
mock_upsert_tasks.deindex_learning_resource_immutable_signature.assert_called()
549556
else:
550-
mock_upsert_tasks.deindex_learning_resource.assert_not_called()
557+
mock_upsert_tasks.deindex_learning_resource_immutable_signature.assert_not_called()
551558
if course.learning_resource.id:
552-
mock_upsert_tasks.upsert_learning_resource.assert_called_with(
559+
mock_upsert_tasks.upsert_learning_resource_immutable_signature.assert_called_with(
553560
course.learning_resource.id
554561
)
555562

@@ -925,10 +932,10 @@ def test_load_content_files(mocker, is_published, extra_run, calc_score):
925932
autospec=True,
926933
)
927934
mock_bulk_index = mocker.patch(
928-
"learning_resources_search.plugins.tasks.index_run_content_files",
935+
"learning_resources_search.plugins.tasks.index_run_content_files.si",
929936
)
930937
mock_bulk_delete = mocker.patch(
931-
"learning_resources_search.plugins.tasks.deindex_run_content_files",
938+
"learning_resources_search.plugins.tasks.deindex_run_content_files.si",
932939
autospec=True,
933940
)
934941
mock_calc_score = mocker.patch(
@@ -1108,13 +1115,15 @@ def test_load_podcast_episode(
11081115
assert getattr(result, key) == value, f"Property {key} should equal {value}"
11091116

11101117
if podcast_episode_exists and not is_published:
1111-
mock_upsert_tasks.deindex_learning_resource.assert_called_with(
1118+
mock_upsert_tasks.deindex_learning_resource_immutable_signature.assert_called_with(
11121119
result.id, result.resource_type
11131120
)
11141121
elif is_published:
1115-
mock_upsert_tasks.upsert_learning_resource.assert_called_with(result.id)
1122+
mock_upsert_tasks.upsert_learning_resource_immutable_signature.assert_called_with(
1123+
result.id
1124+
)
11161125
else:
1117-
mock_upsert_tasks.upsert_learning_resource.assert_not_called()
1126+
mock_upsert_tasks.upsert_learning_resource_immutable_signature.assert_not_called()
11181127
mock_upsert_tasks.deindex_learning_resource.assert_not_called()
11191128

11201129

@@ -1183,14 +1192,16 @@ def test_load_podcast(
11831192
assert new_podcast.resources.count() == 0
11841193

11851194
if podcast_exists and not is_published:
1186-
mock_upsert_tasks.deindex_learning_resource.assert_called_with(
1195+
mock_upsert_tasks.deindex_learning_resource_immutable_signature.assert_called_with(
11871196
result.id, result.resource_type
11881197
)
11891198
elif is_published:
1190-
mock_upsert_tasks.upsert_learning_resource.assert_called_with(result.id)
1199+
mock_upsert_tasks.upsert_learning_resource_immutable_signature.assert_called_with(
1200+
result.id
1201+
)
11911202
else:
1192-
mock_upsert_tasks.deindex_learning_resource.assert_not_called()
1193-
mock_upsert_tasks.upsert_learning_resource.assert_not_called()
1203+
mock_upsert_tasks.deindex_learning_resource_immutable_signature.assert_not_called()
1204+
mock_upsert_tasks.upsert_learning_resource_immutable_signature.assert_not_called()
11941205

11951206

11961207
@pytest.mark.parametrize("video_exists", [True, False])
@@ -1322,7 +1333,7 @@ def test_load_playlist(mocker):
13221333

13231334
def test_load_playlists_unpublish(mocker):
13241335
"""Test load_playlists when a video/playlist gets unpublished"""
1325-
mocker.patch("learning_resources_search.tasks.bulk_deindex_learning_resources")
1336+
mocker.patch("learning_resources_search.tasks.bulk_deindex_learning_resources.si")
13261337
channel = VideoChannelFactory.create()
13271338

13281339
playlists = sorted(
@@ -1498,7 +1509,9 @@ def test_load_course_percolation(
14981509

14991510
blocklist = [learning_resource.readable_id] if blocklisted else []
15001511
result = load_course(props, blocklist, [], config=CourseLoaderConfig(prune=True))
1501-
mock_upsert_tasks.upsert_learning_resource.assert_called_with(result.id)
1512+
mock_upsert_tasks.upsert_learning_resource_immutable_signature.assert_called_with(
1513+
result.id
1514+
)
15021515

15031516

15041517
@pytest.mark.parametrize("certification", [True, False])

learning_resources/etl/utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -331,14 +331,14 @@ def documents_from_olx(
331331
filebytes = f.read()
332332

333333
mimetype = mimetypes.types_map.get(extension_lower)
334-
checksum = md5(filebytes).hexdigest() # noqa: S324
334+
archive_checksum = md5(filebytes).hexdigest() # noqa: S324
335335

336336
yield (
337337
filebytes,
338338
{
339339
"content_type": CONTENT_TYPE_FILE,
340340
"mime_type": mimetype,
341-
"checksum": checksum,
341+
"archive_checksum": archive_checksum,
342342
"file_extension": extension_lower,
343343
"source_path": f"{path}/{filename}",
344344
},
@@ -434,7 +434,7 @@ def transform_content_files(
434434
existing_content = ContentFile.objects.filter(key=key, run=run).first()
435435
if (
436436
not existing_content
437-
or existing_content.checksum != metadata.get("checksum")
437+
or existing_content.archive_checksum != metadata.get("archive_checksum")
438438
) or overwrite:
439439
if settings.SKIP_TIKA and settings.ENVIRONMENT != "production":
440440
content_dict = {
@@ -478,7 +478,7 @@ def transform_content_files(
478478
"key": key,
479479
"published": True,
480480
"content_type": content_type,
481-
"checksum": metadata.get("checksum"),
481+
"archive_checksum": metadata.get("archive_checksum"),
482482
"file_extension": file_extension,
483483
"source_path": source_path,
484484
"edx_module_id": edx_module_id,

learning_resources/etl/utils_test.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def test_transform_content_files( # noqa: PLR0913
182182
document = "some text in the document"
183183
file_extension = ".pdf" if folder == "static" else ".html"
184184
content_type = "course"
185-
checksum = "7s35721d1647f962d59b8120a52210a7"
185+
archive_checksum = "7s35721d1647f962d59b8120a52210a7"
186186
metadata = {"title": "the title of the course"} if has_metadata else None
187187
tika_output = {"content": tika_content, "metadata": metadata}
188188

@@ -202,7 +202,7 @@ def test_transform_content_files( # noqa: PLR0913
202202
content_type=content_type,
203203
published=True,
204204
run=run,
205-
checksum=checksum,
205+
archive_checksum=archive_checksum,
206206
key=edx_module_id,
207207
)
208208

@@ -213,7 +213,7 @@ def test_transform_content_files( # noqa: PLR0913
213213
document,
214214
{
215215
"content_type": content_type,
216-
"checksum": checksum,
216+
"archive_checksum": archive_checksum,
217217
"file_extension": file_extension,
218218
"source_path": f"root/{folder}/uuid{file_extension}",
219219
},
@@ -244,7 +244,7 @@ def test_transform_content_files( # noqa: PLR0913
244244
"published": True,
245245
"content_title": metadata["title"] if has_metadata else "",
246246
"content_type": content_type,
247-
"checksum": checksum,
247+
"archive_checksum": archive_checksum,
248248
"file_extension": file_extension,
249249
"source_path": f"root/{folder}/uuid{file_extension}",
250250
"edx_module_id": edx_module_id,
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Generated by Django 4.2.20 on 2025-04-25 19:34
2+
3+
from django.db import migrations, models
4+
5+
6+
def populate_archive_checksum(apps, schema_editor):
7+
for content_file in apps.get_model(
8+
"learning_resources", "ContentFile"
9+
).objects.all():
10+
if content_file.archive_checksum is None:
11+
content_file.archive_checksum = content_file.checksum
12+
content_file.save()
13+
14+
15+
class Migration(migrations.Migration):
16+
dependencies = [
17+
("learning_resources", "0088_add_content_summarizer_config"),
18+
]
19+
20+
operations = [
21+
migrations.AddField(
22+
model_name="contentfile",
23+
name="archive_checksum",
24+
field=models.CharField(blank=True, max_length=32, null=True),
25+
),
26+
migrations.RunPython(populate_archive_checksum, migrations.RunPython.noop),
27+
]

learning_resources/models.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import uuid
44
from abc import abstractmethod
55
from functools import cached_property
6+
from hashlib import md5
67
from typing import TYPE_CHECKING, Optional
78

89
from django.conf import settings
@@ -904,13 +905,25 @@ class ContentFile(TimestampedModel):
904905
)
905906
content_tags = models.ManyToManyField(LearningResourceContentTag)
906907
published = models.BooleanField(default=True)
908+
archive_checksum = models.CharField(max_length=32, null=True, blank=True) # noqa: DJ001
907909
checksum = models.CharField(max_length=32, null=True, blank=True) # noqa: DJ001
908910
source_path = models.CharField(max_length=1024, null=True, blank=True) # noqa: DJ001
909911
file_extension = models.CharField(max_length=32, null=True, blank=True) # noqa: DJ001
910912
edx_module_id = models.CharField(max_length=1024, null=True, blank=True) # noqa: DJ001
911913
summary = models.TextField(blank=True, default="")
912914
flashcards = models.JSONField(blank=True, default=list)
913915

916+
def content_checksum(self):
917+
hasher = md5() # noqa: S324
918+
if self.content:
919+
hasher.update(self.content.encode("utf-8"))
920+
return hasher.hexdigest()
921+
return None
922+
923+
def save(self, **kwargs):
924+
self.checksum = self.content_checksum()
925+
super().save(**kwargs)
926+
914927
class Meta:
915928
unique_together = (("key", "run"),)
916929
verbose_name = "contentfile"

learning_resources/serializers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1179,6 +1179,7 @@ class ContentFileSerializer(serializers.ModelSerializer):
11791179
run_title = serializers.CharField(source="run.title", required=False)
11801180
run_slug = serializers.CharField(source="run.slug", required=False)
11811181
semester = serializers.CharField(source="run.semester", required=False)
1182+
checksum = serializers.CharField(required=False)
11821183
year = serializers.IntegerField(source="run.year", required=False)
11831184
topics = serializers.SerializerMethodField()
11841185
resource_id = serializers.SerializerMethodField()
@@ -1293,6 +1294,7 @@ class Meta:
12931294
"content_title",
12941295
"content_author",
12951296
"content_language",
1297+
"checksum",
12961298
"image_src",
12971299
"resource_id",
12981300
"resource_readable_id",

learning_resources/serializers_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,7 @@ def test_content_file_serializer(settings, expected_types, has_channels):
570570
"description": content_file.description,
571571
"file_type": content_file.file_type,
572572
"content_type": content_file.content_type,
573+
"checksum": content_file.checksum,
573574
"url": content_file.url,
574575
"content": content_kwargs["content"],
575576
"content_title": content_kwargs["content_title"],

0 commit comments

Comments
 (0)