Flatten nested values when omitting field names from embeddings; add unit test

Frankie Bromage · Frankie Bromage · commit 9313f6df990c · 2024-11-26T15:46:42.000-08:00
diff --git a/airbyte_cdk/destinations/vector_db_based/document_processor.py b/airbyte_cdk/destinations/vector_db_based/document_processor.py
@@ -166,12 +166,18 @@ def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]
         if self.omit_field_names_from_embeddings == False:
             text = stringify_dict(relevant_fields)
         else:
-            text = ""
-            for key, value in relevant_fields.items():
-                text += f"{value}\n"
+            text = self._extract_values_from_dict(relevant_fields)
         metadata = self._extract_metadata(record)
         return Document(page_content=text, metadata=metadata)
 
+    def _extract_values_from_dict(self, data: Any) -> str:
+        """Return every leaf value nested in `data` (dicts/lists), stringified, one per line."""
+        if isinstance(data, dict):
+            data = list(data.values())  # keys are dropped; only the values get embedded
+        if not isinstance(data, list):
+            return str(data)  # scalar leaf (including None) is stringified as-is
+        return "\n".join(self._extract_values_from_dict(item) for item in data)
+
     def _extract_relevant_fields(
         self, record: AirbyteRecordMessage, fields: Optional[List[str]]
     ) -> Dict[str, Any]:
diff --git a/unit_tests/destinations/vector_db_based/config_test.py b/unit_tests/destinations/vector_db_based/config_test.py
@@ -242,6 +242,13 @@ def test_json_schema_generation():
                         "type": "array",
                         "items": {"type": "string"},
                     },
+                    "omit_field_names_from_embeddings": {
+                        "title": "Omit field names from embeddings",
+                        "description": "Do not include the field names in the text that gets embedded. By default field names are embedded e.g 'user: name, user.email: email@email.com'. If set to true, only the values are embedded e.g. 'name, email@email.com'.",
+                        "default": False,
+                        "always_show": True,
+                        "type": "boolean",
+                    },
                     "metadata_fields": {
                         "title": "Fields to store as metadata",
                         "description": "List of fields in the record that should be stored as metadata. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered metadata fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array. When specifying nested paths, all matching values are flattened into an array set to a field named by the path.",
diff --git a/unit_tests/destinations/vector_db_based/document_processor_test.py b/unit_tests/destinations/vector_db_based/document_processor_test.py
@@ -193,6 +193,7 @@ def test_complex_text_fields():
         "non.*.existing",
     ]
     processor.metadata_fields = ["non_text", "non_text_2", "id"]
+    processor.omit_field_names_from_embeddings = False
 
     chunks, _ = processor.process(record)
 
@@ -212,6 +213,54 @@ def test_complex_text_fields():
         "_ab_stream": "namespace1_stream1",
     }
 
+def test_complex_text_fields_omit_field_names():
+    """With field names omitted, the chunk text holds only the selected fields' values."""
+    processor = initialize_processor()
+    payload = {
+        "id": 1,
+        "nested": {"texts": [{"text": "This is the text"}, {"text": "And another"}]},
+        "non_text": "a",
+        "non_text_2": 1,
+        "text": "This is the regular text",
+        "other_nested": {"non_text": {"a": "xyz", "b": "abc"}},
+    }
+    message = AirbyteRecordMessage(
+        stream="stream1",
+        namespace="namespace1",
+        data=payload,
+        emitted_at=1234,
+    )
+
+    processor.text_fields = [
+        "nested.texts.*.text",
+        "text",
+        "other_nested.non_text",
+        "non.*.existing",  # the record has no "non" key
+    ]
+    processor.metadata_fields = ["non_text", "non_text_2", "id"]
+    processor.omit_field_names_from_embeddings = True
+
+    documents, _ = processor.process(message)
+
+    # Expected text: the bare values, one per line, with no field names.
+    expected_text = "\n".join(
+        [
+            "This is the text",
+            "And another",
+            "This is the regular text",
+            "xyz",
+            "abc",
+        ]
+    )
+    expected_metadata = {
+        "id": 1,
+        "non_text": "a",
+        "non_text_2": 1,
+        "_ab_stream": "namespace1_stream1",
+    }
+    assert len(documents) == 1
+    assert documents[0].page_content == expected_text
+    assert documents[0].metadata == expected_metadata
 
 def test_no_text_fields():
     processor = initialize_processor()