Skip to content

Commit aa02630

Browse files
Add all Contentfile metadata to chunk responses (#2075)
* serialize contentfiles like we do with learning resources
* fixing contentfile serialization
* optimize loop and data fetch

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* fixing n+1 queries
* adding block id to embedded metadata
* adding block id as filter parameter
* regenerate spec
* fixing test
* some consolidation

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 1001aee commit aa02630

File tree

6 files changed

+78
-19
lines changed

6 files changed

+78
-19
lines changed

frontends/api/src/generated/v0/api.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10916,6 +10916,7 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function (
1091610916
* @param {string} [collection_name] Manually specify the name of the Qdrant collection to query
1091710917
* @param {Array<string>} [content_feature_type] The feature type of the content file. Possible options are at api/v1/course_features/
1091810918
* @param {Array<string>} [course_number] Course number of the content file
10919+
* @param {Array<string>} [edx_block_id] The edx_block_id of the content file
1091910920
* @param {Array<string>} [file_extension] The extension of the content file.
1092010921
* @param {Array<string>} [key] The filename of the content file
1092110922
* @param {number} [limit] Number of results to return per page
@@ -10933,6 +10934,7 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function (
1093310934
collection_name?: string,
1093410935
content_feature_type?: Array<string>,
1093510936
course_number?: Array<string>,
10937+
edx_block_id?: Array<string>,
1093610938
file_extension?: Array<string>,
1093710939
key?: Array<string>,
1093810940
limit?: number,
@@ -10973,6 +10975,10 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function (
1097310975
localVarQueryParameter["course_number"] = course_number
1097410976
}
1097510977

10978+
if (edx_block_id) {
10979+
localVarQueryParameter["edx_block_id"] = edx_block_id
10980+
}
10981+
1097610982
if (file_extension) {
1097710983
localVarQueryParameter["file_extension"] = file_extension
1097810984
}
@@ -11046,6 +11052,7 @@ export const VectorContentFilesSearchApiFp = function (
1104611052
* @param {string} [collection_name] Manually specify the name of the Qdrant collection to query
1104711053
* @param {Array<string>} [content_feature_type] The feature type of the content file. Possible options are at api/v1/course_features/
1104811054
* @param {Array<string>} [course_number] Course number of the content file
11055+
* @param {Array<string>} [edx_block_id] The edx_block_id of the content file
1104911056
* @param {Array<string>} [file_extension] The extension of the content file.
1105011057
* @param {Array<string>} [key] The filename of the content file
1105111058
* @param {number} [limit] Number of results to return per page
@@ -11063,6 +11070,7 @@ export const VectorContentFilesSearchApiFp = function (
1106311070
collection_name?: string,
1106411071
content_feature_type?: Array<string>,
1106511072
course_number?: Array<string>,
11073+
edx_block_id?: Array<string>,
1106611074
file_extension?: Array<string>,
1106711075
key?: Array<string>,
1106811076
limit?: number,
@@ -11085,6 +11093,7 @@ export const VectorContentFilesSearchApiFp = function (
1108511093
collection_name,
1108611094
content_feature_type,
1108711095
course_number,
11096+
edx_block_id,
1108811097
file_extension,
1108911098
key,
1109011099
limit,
@@ -11140,6 +11149,7 @@ export const VectorContentFilesSearchApiFactory = function (
1114011149
requestParameters.collection_name,
1114111150
requestParameters.content_feature_type,
1114211151
requestParameters.course_number,
11152+
requestParameters.edx_block_id,
1114311153
requestParameters.file_extension,
1114411154
requestParameters.key,
1114511155
requestParameters.limit,
@@ -11184,6 +11194,13 @@ export interface VectorContentFilesSearchApiVectorContentFilesSearchRetrieveRequ
1118411194
*/
1118511195
readonly course_number?: Array<string>
1118611196

11197+
/**
11198+
* The edx_block_id of the content file
11199+
* @type {Array<string>}
11200+
* @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve
11201+
*/
11202+
readonly edx_block_id?: Array<string>
11203+
1118711204
/**
1118811205
* The extension of the content file.
1118911206
* @type {Array<string>}
@@ -11279,6 +11296,7 @@ export class VectorContentFilesSearchApi extends BaseAPI {
1127911296
requestParameters.collection_name,
1128011297
requestParameters.content_feature_type,
1128111298
requestParameters.course_number,
11299+
requestParameters.edx_block_id,
1128211300
requestParameters.file_extension,
1128311301
requestParameters.key,
1128411302
requestParameters.limit,

learning_resources/models.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -838,13 +838,21 @@ def for_serialization(self):
838838
return self.select_related("run").prefetch_related(
839839
"content_tags",
840840
"run__learning_resource",
841+
"run__learning_resource__course",
842+
"run__learning_resource__platform",
841843
Prefetch(
842844
"run__learning_resource__topics",
843845
queryset=LearningResourceTopic.objects.for_serialization(),
844846
),
847+
Prefetch(
848+
"run__learning_resource__offered_by",
849+
queryset=LearningResourceOfferor.objects.for_serialization(),
850+
),
845851
Prefetch(
846852
"run__learning_resource__departments",
847-
queryset=LearningResourceDepartment.objects.for_serialization(),
853+
queryset=LearningResourceDepartment.objects.for_serialization(
854+
prefetch_school=True
855+
).select_related("school"),
848856
),
849857
)
850858

openapi/specs/v0.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -854,6 +854,14 @@ paths:
854854
type: string
855855
minLength: 1
856856
description: Course number of the content file
857+
- in: query
858+
name: edx_block_id
859+
schema:
860+
type: array
861+
items:
862+
type: string
863+
minLength: 1
864+
description: The edx_block_id of the content file
857865
- in: query
858866
name: file_extension
859867
schema:

vector_search/constants.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@
1414
"run_readable_id": "run_readable_id",
1515
"resource_readable_id": "resource_readable_id",
1616
"run_title": "run_title",
17+
"edx_block_id": "edx_block_id",
18+
"content_type": "content_type",
19+
"description": "description",
20+
"url": "url",
21+
"file_type": "file_type",
1722
}
1823

1924
QDRANT_RESOURCE_PARAM_MAP = {
@@ -66,4 +71,5 @@
6671
"run_readable_id": models.PayloadSchemaType.INTEGER,
6772
"resource_readable_id": models.PayloadSchemaType.KEYWORD,
6873
"run_title": models.PayloadSchemaType.KEYWORD,
74+
"edx_block_id": models.PayloadSchemaType.KEYWORD,
6975
}

vector_search/serializers.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,11 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer):
229229
"The readable_id value of the parent learning resource for the content file"
230230
),
231231
)
232+
edx_block_id = serializers.ListField(
233+
required=False,
234+
child=serializers.CharField(),
235+
help_text="The edx_block_id of the content file",
236+
)
232237
collection_name = serializers.CharField(
233238
required=False,
234239
help_text=("Manually specify the name of the Qdrant collection to query"),

vector_search/utils.py

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66
from langchain_experimental.text_splitter import SemanticChunker
77
from qdrant_client import QdrantClient, models
88

9-
from learning_resources.models import LearningResource
10-
from learning_resources.serializers import LearningResourceSerializer
9+
from learning_resources.models import ContentFile, LearningResource
10+
from learning_resources.serializers import (
11+
ContentFileSerializer,
12+
LearningResourceSerializer,
13+
)
1114
from learning_resources_search.constants import CONTENT_FILE_TYPE
1215
from learning_resources_search.serializers import (
1316
serialize_bulk_content_files,
@@ -235,21 +238,8 @@ def _process_content_embeddings(serialized_content):
235238
"chunk_content": d.page_content,
236239
**{
237240
key: d.metadata[key]
238-
for key in [
239-
"run_title",
240-
"platform",
241-
"offered_by",
242-
"run_readable_id",
243-
"resource_readable_id",
244-
"content_type",
245-
"file_extension",
246-
"content_feature_type",
247-
"course_number",
248-
"file_type",
249-
"description",
250-
"key",
251-
"url",
252-
]
241+
for key in QDRANT_CONTENT_FILE_PARAM_MAP
242+
if key in d.metadata
253243
},
254244
}
255245
for chunk_id, d in enumerate(split_docs)
@@ -368,7 +358,31 @@ def _resource_vector_hits(search_result):
368358

369359

370360
def _content_file_vector_hits(search_result):
371-
return [hit.payload for hit in search_result]
361+
run_readable_ids = [hit.payload["run_readable_id"] for hit in search_result]
362+
keys = [hit.payload["key"] for hit in search_result]
363+
364+
serialized_content_files = ContentFileSerializer(
365+
ContentFile.objects.for_serialization().filter(
366+
run__run_id__in=run_readable_ids, key__in=keys
367+
),
368+
many=True,
369+
).data
370+
results = []
371+
contentfiles_dict = {}
372+
[
373+
contentfiles_dict.update({(cf["run_readable_id"], cf["key"]): cf})
374+
for cf in serialized_content_files
375+
]
376+
results = []
377+
for hit in search_result:
378+
payload = hit.payload
379+
serialized = contentfiles_dict.get((payload["run_readable_id"], payload["key"]))
380+
if serialized:
381+
if "content" in serialized:
382+
serialized.pop("content")
383+
payload.update(serialized)
384+
results.append(payload)
385+
return results
372386

373387

374388
def vector_search(

0 commit comments

Comments
 (0)