
Commit 302bf6c

Contentfile chunk embeddings (#1905)
* updating deps and adding method for getting token count
* working contentfile chunk embeds
* storing chunks and update initial resource record with embeddings from contentfile chunk
* adding management command flag to generate embeds by id
* fixing test
* ensuring we stay under token size
* removing full content from points
* moving splitter to separate function
* adding test for text splitter
* adding more tests
* fixing test
* changing chunk key name
* fix test setting
* fixing test
1 parent 78f745f commit 302bf6c

File tree

9 files changed: +884 −324 lines


poetry.lock

Lines changed: 592 additions & 279 deletions
Generated lock file; diff not rendered.

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -84,6 +84,8 @@ qdrant-client = {extras = ["fastembed"], version = "^1.12.0"}
 onnxruntime = "1.20.1"
 openai = "^1.55.3"
 litellm = "^1.53.5"
+langchain = "^0.3.11"
+tiktoken = "^0.8.0"


 [tool.poetry.group.dev.dependencies]
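
The two new dependencies back the chunking work described in the commit message: langchain supplies a text splitter and tiktoken supplies token counting so chunks stay under the embedding model's token limit. A minimal sketch of how the two are commonly combined (the helper name, chunk size, and overlap below are illustrative, not taken from this repo):

import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Illustrative helper: split text into chunks that stay under a token budget,
# using a tiktoken-aware splitter. Chunk size and overlap are example values.
def split_by_tokens(text: str, max_tokens: int = 512, overlap: int = 50) -> list[str]:
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base",
        chunk_size=max_tokens,
        chunk_overlap=overlap,
    )
    return splitter.split_text(text)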

vector_search/conftest.py

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ def encode_batch(self, texts: list[str]) -> list[list[float]]:
 @pytest.fixture(autouse=True)
 def _use_dummy_encoder(settings):
     settings.QDRANT_ENCODER = "vector_search.conftest.DummyEmbedEncoder"
+    settings.QDRANT_DENSE_MODEL = None


 @pytest.fixture(autouse=True)

vector_search/encoders/litellm.py

Lines changed: 12 additions & 0 deletions
@@ -1,15 +1,27 @@
+import logging
+
+import tiktoken
 from litellm import embedding

 from vector_search.encoders.base import BaseEncoder

+log = logging.getLogger()
+

 class LiteLLMEncoder(BaseEncoder):
     """
     LiteLLM encoder
     """

+    token_encoding_name = "cl100k_base"  # noqa: S105
+
     def __init__(self, model_name="text-embedding-3-small"):
         self.model_name = model_name
+        try:
+            self.token_encoding_name = tiktoken.encoding_name_for_model(model_name)
+        except KeyError:
+            msg = f"Model {model_name} not found in tiktoken. defaulting to cl100k_base"
+            log.warning(msg)

     def encode_batch(self, texts: list[str]) -> list[list[float]]:
         return [
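
The encoder now resolves a tiktoken encoding per embedding model and falls back to cl100k_base for models tiktoken does not recognize. The commit message also mentions a method for getting token counts; a minimal sketch of that pattern (the helper below is illustrative, not the repo's actual method):

import tiktoken

# Illustrative token-count helper using the same fallback pattern as the diff above.
def token_count(text: str, model_name: str = "text-embedding-3-small") -> int:
    try:
        encoding_name = tiktoken.encoding_name_for_model(model_name)
    except KeyError:
        encoding_name = "cl100k_base"  # unknown model: fall back to a safe default
    return len(tiktoken.get_encoding(encoding_name).encode(text))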

vector_search/management/commands/generate_embeddings.py

Lines changed: 20 additions & 5 deletions
@@ -4,7 +4,7 @@

 from learning_resources_search.constants import LEARNING_RESOURCE_TYPES
 from main.utils import clear_search_cache, now_in_utc
-from vector_search.tasks import start_embed_resources
+from vector_search.tasks import embed_learning_resources_by_id, start_embed_resources
 from vector_search.utils import (
     create_qdrand_collections,
 )
@@ -30,6 +30,12 @@ def add_arguments(self, parser):
             help="Embed all resource types (including content files)",
         )

+        parser.add_argument(
+            "--resource-ids",
+            dest="resource-ids",
+            help="Embed a specific set of resources (overrides the --all flag)",
+        )
+
         parser.add_argument(
             "--skip-contentfiles",
             dest="skip_content_files",
@@ -49,7 +55,7 @@ def add_arguments(self, parser):
     def handle(self, *args, **options):  # noqa: ARG002
         """Embed all LEARNING_RESOURCE_TYPES"""

-        if options["all"]:
+        if options["all"] or options["resource-ids"]:
             indexes_to_update = list(LEARNING_RESOURCE_TYPES)
         else:
             indexes_to_update = list(
@@ -66,9 +72,18 @@ def handle(self, *args, **options):  # noqa: ARG002
             return
         if options["recreate_collections"]:
            create_qdrand_collections(force_recreate=True)
-        task = start_embed_resources.delay(
-            indexes_to_update, skip_content_files=options["skip_content_files"]
-        )
+        if options["resource-ids"]:
+            task = embed_learning_resources_by_id.delay(
+                [
+                    int(resource_id)
+                    for resource_id in options["resource-ids"].split(",")
+                ],
+                skip_content_files=options["skip_content_files"],
+            )
+        else:
+            task = start_embed_resources.delay(
+                indexes_to_update, skip_content_files=options["skip_content_files"]
+            )
         self.stdout.write(
             f"Started celery task {task} to index content for the following"
             f" Types to embed: {indexes_to_update}"

vector_search/tasks.py

Lines changed: 81 additions & 10 deletions
@@ -16,6 +16,7 @@
     CONTENT_FILE_TYPE,
     COURSE_TYPE,
     LEARNING_PATH_TYPE,
+    LEARNING_RESOURCE_TYPES,
     PODCAST_EPISODE_TYPE,
     PODCAST_TYPE,
     PROGRAM_TYPE,
@@ -67,7 +68,7 @@ def generate_embeddings(ids, resource_type):
 @app.task(bind=True)
 def start_embed_resources(self, indexes, skip_content_files):
     """
-    Celery task to embed learning resources
+    Celery task to embed all learning resources for given indexes

     Args:
         indexes (list of str): resource types to embed
@@ -152,6 +153,73 @@ def start_embed_resources(self, indexes, skip_content_files):
     return self.replace(celery.chain(*index_tasks))


+@app.task(bind=True)
+def embed_learning_resources_by_id(self, ids, skip_content_files):
+    """
+    Celery task to embed specific resources
+
+    Args:
+        ids (list of int): list of resource ids to embed
+        skip_content_files (bool): whether to skip embedding content files
+    """
+    index_tasks = []
+    if not all([settings.QDRANT_HOST, settings.QDRANT_BASE_COLLECTION_NAME]):
+        log.warning(
+            "skipping. start_embed_resources called without setting "
+            "QDRANT_HOST and QDRANT_BASE_COLLECTION_NAME"
+        )
+        return None
+    resources = LearningResource.objects.filter(
+        id__in=ids,
+        published=True,
+    )
+    try:
+        for resource_type in LEARNING_RESOURCE_TYPES:
+            resources = resources.filter(resource_type=resource_type)
+
+            [
+                index_tasks.append(
+                    generate_embeddings.si(
+                        chunk_ids,
+                        resource_type,
+                    )
+                )
+                for chunk_ids in chunks(
+                    resources.order_by("id").values_list("id", flat=True),
+                    chunk_size=settings.OPENSEARCH_INDEXING_CHUNK_SIZE,
+                )
+            ]
+            if not skip_content_files and resource_type == COURSE_TYPE:
+                for course in resources.filter(
+                    etl_source__in=RESOURCE_FILE_ETL_SOURCES
+                ).order_by("id"):
+                    index_tasks = index_tasks + [
+                        generate_embeddings.si(
+                            content_ids,
+                            CONTENT_FILE_TYPE,
+                        )
+                        for content_ids in chunks(
+                            ContentFile.objects.filter(
+                                run__learning_resource_id=course.id,
+                                published=True,
+                                run__published=True,
+                            )
+                            .order_by("id")
+                            .values_list("id", flat=True),
+                            chunk_size=settings.OPENSEARCH_DOCUMENT_INDEXING_CHUNK_SIZE,
+                        )
+                    ]
+    except:  # noqa: E722
+        error = "start_embed_resources threw an error"
+        log.exception(error)
+        return error
+
+    # Use self.replace so that code waiting on this task will also wait on the embedding
+    # and finish tasks
+
+    return self.replace(celery.chain(*index_tasks))
+
+
 @app.task(bind=True)
 def embed_new_learning_resources(self):
     """
@@ -165,13 +233,16 @@ def embed_new_learning_resources(self):
         created_on__gt=since,
     ).exclude(resource_type=CONTENT_FILE_TYPE)
     filtered_resources = filter_existing_qdrant_points(new_learning_resources)
-    embed_tasks = celery.group(
-        [
-            generate_embeddings.si(ids, COURSE_TYPE)
-            for ids in chunks(
-                filtered_resources.order_by("id").values_list("id", flat=True),
-                chunk_size=settings.OPENSEARCH_INDEXING_CHUNK_SIZE,
-            )
-        ]
-    )
+    for resource_type in LEARNING_RESOURCE_TYPES:
+        embed_tasks = celery.group(
+            [
+                generate_embeddings.si(ids, resource_type)
+                for ids in chunks(
+                    filtered_resources.filter(resource_type=resource_type).values_list(
+                        "id", flat=True
+                    ),
+                    chunk_size=settings.OPENSEARCH_INDEXING_CHUNK_SIZE,
+                )
+            ]
+        )
     return self.replace(embed_tasks)
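
The new task can also be queued directly, and for courses from resource-file ETL sources it chains content-file embedding tasks after the resource embeddings. A minimal sketch of queuing it from a Django shell (the ids are placeholders; Celery workers and the QDRANT_HOST / QDRANT_BASE_COLLECTION_NAME settings are assumed to be configured):

from vector_search.tasks import embed_learning_resources_by_id

# Queue embeddings for two resources by primary key; ids here are placeholders.
result = embed_learning_resources_by_id.delay([101, 102], skip_content_files=False)
print(result.id)  # Celery task id; the chained embedding subtasks run asynchronously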

vector_search/tasks_test.py

Lines changed: 42 additions & 2 deletions
@@ -3,19 +3,24 @@
 import pytest
 from django.conf import settings

-from learning_resources.etl.constants import ETLSource
+from learning_resources.etl.constants import RESOURCE_FILE_ETL_SOURCES, ETLSource
 from learning_resources.factories import (
     ContentFileFactory,
     CourseFactory,
     LearningResourceFactory,
+    LearningResourceRunFactory,
     ProgramFactory,
 )
 from learning_resources.models import LearningResource
 from learning_resources_search.constants import (
     COURSE_TYPE,
 )
 from main.utils import now_in_utc
-from vector_search.tasks import embed_new_learning_resources, start_embed_resources
+from vector_search.tasks import (
+    embed_learning_resources_by_id,
+    embed_new_learning_resources,
+    start_embed_resources,
+)

 pytestmark = pytest.mark.django_db

@@ -138,3 +143,38 @@ def test_embed_new_learning_resources(mocker, mocked_celery):

     embedded_ids = generate_embeddings_mock.si.mock_calls[0].args[0]
     assert sorted(daily_resource_ids) == sorted(embedded_ids)
+
+
+def test_embed_learning_resources_by_id(mocker, mocked_celery):
+    """
+    embed_learning_resources_by_id should generate embeddings for resources
+    based on the ids passed as well as associated contentfiles
+    """
+    mocker.patch("vector_search.tasks.load_course_blocklist", return_value=[])
+
+    resources = LearningResourceFactory.create_batch(
+        4,
+        resource_type=COURSE_TYPE,
+        etl_source=RESOURCE_FILE_ETL_SOURCES[0],
+        published=True,
+    )
+
+    resource_ids = [resource.id for resource in resources]
+
+    generate_embeddings_mock = mocker.patch(
+        "vector_search.tasks.generate_embeddings", autospec=True
+    )
+    content_ids = []
+    for resource in resources:
+        cf = ContentFileFactory.create(
+            run=LearningResourceRunFactory.create(learning_resource=resource)
+        )
+        content_ids.append(cf.id)
+
+    with pytest.raises(mocked_celery.replace_exception_class):
+        embed_learning_resources_by_id.delay(resource_ids, skip_content_files=False)
+    for mock_call in generate_embeddings_mock.si.mock_calls[1:]:
+        assert mock_call.args[0][0] in content_ids
+        assert mock_call.args[1] == "content_file"
+    embedded_resource_ids = generate_embeddings_mock.si.mock_calls[0].args[0]
+    assert sorted(resource_ids) == sorted(embedded_resource_ids)
