Skip to content

Commit 4ad71c6

Browse files
shanbadymbertrand
authored and committed
Consistent qdrant point ids (#1839)
* adding util method for generating point id
* moving point id generation outside of model and adding to embed command
* fixing vector similarity endpoint
* adding test
* sorting ids in test
* updating hash key for contentfiles
1 parent 41117c3 commit 4ad71c6

File tree

4 files changed

+53
-9
lines changed

4 files changed

+53
-9
lines changed

learning_resources_search/api.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -922,19 +922,17 @@ def _qdrant_similar_results(doc, num_resources):
922922
list of dict:
923923
list of serialized resources
924924
"""
925-
from learning_resources_search.indexing_api import qdrant_client
925+
from learning_resources_search.indexing_api import qdrant_client, vector_point_id
926926

927927
client = qdrant_client()
928928
return [
929-
hit.metadata
930-
for hit in client.query(
929+
hit.payload
930+
for hit in client.query_points(
931931
collection_name=f"{settings.QDRANT_BASE_COLLECTION_NAME}.resources",
932-
query_text=(
933-
f'{doc.get("title")} {doc.get("description")} '
934-
f'{doc.get("full_description")} {doc.get("content")}'
935-
),
932+
query=vector_point_id(doc["readable_id"]),
936933
limit=num_resources,
937-
)
934+
using=settings.QDRANT_SEARCH_VECTOR_NAME,
935+
).points
938936
]
939937

940938

learning_resources_search/indexing_api.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
import json
66
import logging
7+
import uuid
78
from math import ceil
89

910
from django.conf import settings
@@ -143,6 +144,10 @@ def create_qdrand_collections(force_recreate):
143144
)
144145

145146

147+
def vector_point_id(readable_id):
148+
return str(uuid.uuid5(uuid.NAMESPACE_DNS, readable_id))
149+
150+
146151
def embed_learning_resources(ids, resource_type):
147152
# update embeddings
148153
client = qdrant_client()
@@ -168,7 +173,13 @@ def embed_learning_resources(ids, resource_type):
168173
f'{doc.get("full_description")} {doc.get("content")}'
169174
)
170175
metadata.append(doc)
171-
ids.append(doc["id"])
176+
if resource_type != CONTENT_FILE_TYPE:
177+
vector_point_key = doc["readable_id"]
178+
else:
179+
vector_point_key = (
180+
f"{doc['key']}.{doc['run_readable_id']}.{doc['resource_readable_id']}"
181+
)
182+
ids.append(vector_point_id(vector_point_key))
172183
client.add(
173184
collection_name=collection_name,
174185
ids=ids,

learning_resources_search/indexing_api_test.py

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
from learning_resources.factories import (
1414
ContentFileFactory,
1515
CourseFactory,
16+
LearningResourceFactory,
1617
LearningResourceRunFactory,
1718
)
1819
from learning_resources.models import ContentFile
@@ -37,6 +38,7 @@
3738
deindex_percolators,
3839
deindex_run_content_files,
3940
delete_orphaned_indexes,
41+
embed_learning_resources,
4042
get_reindexing_alias_name,
4143
index_content_files,
4244
index_course_content_files,
@@ -45,8 +47,10 @@
4547
index_run_content_files,
4648
switch_indices,
4749
update_document_with_partial,
50+
vector_point_id,
4851
)
4952
from learning_resources_search.models import PercolateQuery
53+
from learning_resources_search.serializers import serialize_bulk_content_files
5054
from learning_resources_search.utils import remove_child_queries
5155
from main.utils import chunks
5256

@@ -896,3 +900,31 @@ def test_clear_featured_rank(mocked_es, mocker, clear_all_greater_than):
896900
"query": query,
897901
},
898902
)
903+
904+
905+
@pytest.mark.parametrize("content_type", ["learning_resource", "content_file"])
906+
def test_vector_point_id_used_for_embed(mocker, content_type):
907+
# test the vector ids we generate for embedding resources and files
908+
if content_type == "learning_resource":
909+
resources = LearningResourceFactory.create_batch(5)
910+
else:
911+
resources = ContentFileFactory.create_batch(5)
912+
mock_qdrant = mocker.patch("qdrant_client.QdrantClient")
913+
mock_qdrant.query.return_value = []
914+
mocker.patch(
915+
"learning_resources_search.indexing_api.qdrant_client",
916+
return_value=mock_qdrant,
917+
)
918+
919+
embed_learning_resources([resource.id for resource in resources], content_type)
920+
921+
if content_type == "learning_resource":
922+
point_ids = [vector_point_id(resource.readable_id) for resource in resources]
923+
else:
924+
point_ids = [
925+
vector_point_id(
926+
f"{resource['key']}.{resource['run_readable_id']}.{resource['resource_readable_id']}"
927+
)
928+
for resource in serialize_bulk_content_files([r.id for r in resources])
929+
]
930+
assert sorted(mock_qdrant.add.mock_calls[0].kwargs["ids"]) == sorted(point_ids)

main/settings.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -799,6 +799,9 @@ def get_all_config_keys():
799799
name="QDRANT_COLLECTION_NAME", default="resource_embeddings"
800800
)
801801

802+
QDRANT_SEARCH_VECTOR_NAME = get_string(
803+
name="QDRANT_SEARCH_VECTOR_NAME", default="fast-bge-small-en"
804+
)
802805

803806
QDRANT_DENSE_MODEL = get_string(
804807
name="QDRANT_DENSE_MODEL", default="sentence-transformers/all-MiniLM-L6-v2"

0 commit comments

Comments
 (0)