Skip to content

Commit 0f33b05

Browse files
Frankie Bromage authored and committed
add option to not embed field names for vector db destination
1 parent a8b1b2b commit 0f33b05

File tree

2 files changed

+13
-1
lines changed

2 files changed

+13
-1
lines changed

airbyte_cdk/destinations/vector_db_based/config.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,12 @@ class ProcessingConfigModel(BaseModel):
108108
always_show=True,
109109
examples=["text", "user.name", "users.*.name"],
110110
)
111+
omit_field_names_from_embeddings: bool = Field(
112+
default=False,
113+
title="Omit field names from embeddings",
114+
description="Do not include the field names in the text that gets embedded. By default field names are embedded e.g 'user: name, user.email: [email protected]'. If set to true, only the values are embedded e.g. 'name, [email protected]'.",
115+
always_show=True,
116+
)
111117
metadata_fields: Optional[List[str]] = Field(
112118
default=[],
113119
title="Fields to store as metadata",

airbyte_cdk/destinations/vector_db_based/document_processor.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ def __init__(self, config: ProcessingConfigModel, catalog: ConfiguredAirbyteCata
125125
self.text_fields = config.text_fields
126126
self.metadata_fields = config.metadata_fields
127127
self.field_name_mappings = config.field_name_mappings
128+
self.omit_field_names_from_embeddings = config.omit_field_names_from_embeddings
128129
self.logger = logging.getLogger("airbyte.document_processor")
129130

130131
def process(self, record: AirbyteRecordMessage) -> Tuple[List[Chunk], Optional[str]]:
@@ -162,7 +163,12 @@ def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]
162163
relevant_fields = self._extract_relevant_fields(record, self.text_fields)
163164
if len(relevant_fields) == 0:
164165
return None
165-
text = stringify_dict(relevant_fields)
166+
if self.omit_field_names_from_embeddings == False:
167+
text = stringify_dict(relevant_fields)
168+
else:
169+
text = ""
170+
for key, value in relevant_fields.items():
171+
text += f"{value}\n"
166172
metadata = self._extract_metadata(record)
167173
return Document(page_content=text, metadata=metadata)
168174

0 commit comments

Comments
 (0)