Skip to content

Commit 9313f6d

Browse files
Frankie Bromage
authored and committed
add unit test
1 parent 0f33b05 commit 9313f6d

File tree

3 files changed

+65
-3
lines changed

3 files changed

+65
-3
lines changed

airbyte_cdk/destinations/vector_db_based/document_processor.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -166,12 +166,18 @@ def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]
166166
if self.omit_field_names_from_embeddings == False:
167167
text = stringify_dict(relevant_fields)
168168
else:
169-
text = ""
170-
for key, value in relevant_fields.items():
171-
text += f"{value}\n"
169+
text = self._extract_values_from_dict(relevant_fields)
172170
metadata = self._extract_metadata(record)
173171
return Document(page_content=text, metadata=metadata)
174172

173+
def _extract_values_from_dict(self, data):
    """Render the values held in ``data`` as newline-separated text.

    A dict contributes its values (keys are dropped) and a list contributes
    its items, each processed recursively in iteration order. Any other
    value is converted with ``str()``. An empty dict or list yields an
    empty string.
    """
    if isinstance(data, dict):
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        # Leaf value: nothing to descend into.
        return str(data)
    return "\n".join(map(self._extract_values_from_dict, children))
180+
175181
def _extract_relevant_fields(
176182
self, record: AirbyteRecordMessage, fields: Optional[List[str]]
177183
) -> Dict[str, Any]:

unit_tests/destinations/vector_db_based/config_test.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,13 @@ def test_json_schema_generation():
242242
"type": "array",
243243
"items": {"type": "string"},
244244
},
245+
"omit_field_names_from_embeddings": {
246+
"title": "Omit field names from embeddings",
247+
"description": "Do not include the field names in the text that gets embedded. By default field names are embedded e.g 'user: name, user.email: [email protected]'. If set to true, only the values are embedded e.g. 'name, [email protected]'.",
248+
"default": False,
249+
"always_show": True,
250+
"type": "boolean",
251+
},
245252
"metadata_fields": {
246253
"title": "Fields to store as metadata",
247254
"description": "List of fields in the record that should be stored as metadata. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered metadata fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array. When specifying nested paths, all matching values are flattened into an array set to a field named by the path.",

unit_tests/destinations/vector_db_based/document_processor_test.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ def test_complex_text_fields():
193193
"non.*.existing",
194194
]
195195
processor.metadata_fields = ["non_text", "non_text_2", "id"]
196+
processor.omit_field_names_from_embeddings = False
196197

197198
chunks, _ = processor.process(record)
198199

@@ -212,6 +213,54 @@ def test_complex_text_fields():
212213
"_ab_stream": "namespace1_stream1",
213214
}
214215

216+
def test_complex_text_fields_omit_field_names():
    """Check the embedded text when field names are omitted.

    The single resulting chunk must hold only the selected text values, one
    per line, while the metadata carries the configured metadata fields plus
    the stream identifier.
    """
    processor = initialize_processor()
    processor.text_fields = [
        "nested.texts.*.text",
        "text",
        "other_nested.non_text",
        "non.*.existing",
    ]
    processor.metadata_fields = ["non_text", "non_text_2", "id"]
    processor.omit_field_names_from_embeddings = True

    payload = {
        "id": 1,
        "nested": {
            "texts": [
                {"text": "This is the text"},
                {"text": "And another"},
            ]
        },
        "non_text": "a",
        "non_text_2": 1,
        "text": "This is the regular text",
        "other_nested": {"non_text": {"a": "xyz", "b": "abc"}},
    }
    record = AirbyteRecordMessage(
        stream="stream1",
        namespace="namespace1",
        data=payload,
        emitted_at=1234,
    )

    chunks, _ = processor.process(record)

    # Values only, in text_fields order, with no trailing newline.
    expected_text = (
        "This is the text\n"
        "And another\n"
        "This is the regular text\n"
        "xyz\n"
        "abc"
    )
    expected_metadata = {
        "id": 1,
        "non_text": "a",
        "non_text_2": 1,
        "_ab_stream": "namespace1_stream1",
    }

    assert len(chunks) == 1
    assert chunks[0].page_content == expected_text
    assert chunks[0].metadata == expected_metadata
215264

216265
def test_no_text_fields():
217266
processor = initialize_processor()

0 commit comments

Comments
 (0)