
Commit c7fc2e4

fix split id assignment
1 parent f842fdb commit c7fc2e4

2 files changed: 37 additions, 19 deletions


haystack/components/preprocessors/markdown_header_splitter.py

Lines changed: 14 additions & 9 deletions
@@ -169,6 +169,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document]:
         Ensures page counting is maintained across splits.
         """
         result_docs = []
+        current_split_id = 0  # track split_id across all secondary splits from the same parent

         for doc in documents:
             if doc.content is None:
@@ -186,8 +187,11 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document]:
             # track page from meta
             current_page = doc.meta.get("page_number", 1)

+            # create a clean meta dict without split_id for secondary splitting
+            clean_meta = {k: v for k, v in doc.meta.items() if k != "split_id"}
+
             secondary_splits = self.secondary_splitter.run(
-                documents=[Document(content=content_for_splitting, meta=doc.meta)]
+                documents=[Document(content=content_for_splitting, meta=clean_meta)]
             )["documents"]

             # split processing
@@ -196,8 +200,13 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document]:
                 if i > 0 and secondary_splits[i - 1].content:
                     current_page = self._update_page_number_with_breaks(secondary_splits[i - 1].content, current_page)

-                # set page number to meta
+                # set page number and split_id to meta
                 split.meta["page_number"] = current_page
+                split.meta["split_id"] = current_split_id
+                # ensure source_id is preserved from the original document
+                if "source_id" in doc.meta:
+                    split.meta["source_id"] = doc.meta["source_id"]
+                current_split_id += 1

                 # preserve header metadata if we're not keeping headers in content
                 if not self.keep_headers:
@@ -255,11 +264,11 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> list[Document]:
                 current_page=current_page,
                 total_pages=total_pages,
             )
-            for split in splits:
+            for split_idx, split in enumerate(splits):
                 meta = {}
                 if doc.meta:
                     meta = doc.meta.copy()
-                meta.update({"source_id": doc.id, "page_number": current_page})
+                meta.update({"source_id": doc.id, "page_number": current_page, "split_id": split_idx})
                 if split.get("meta"):
                     meta.update(split["meta"])
                 current_page = self._update_page_number_with_breaks(split["content"], current_page)
@@ -284,7 +293,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
             - `documents`: List of documents with the split texts. Each document includes:
                 - A metadata field `source_id` to track the original document.
                 - A metadata field `page_number` to track the original page number.
-                - A metadata field `split_id` to uniquely identify each split chunk.
+                - A metadata field `split_id` to identify the split chunk index within its parent document.
                 - All other metadata copied from the original document.
         """
         # validate input documents
@@ -325,8 +334,4 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:

             final_docs.extend(doc_splits)

-        # assign split_id to all output documents
-        for idx, doc in enumerate(final_docs):
-            doc.meta["split_id"] = idx
-
         return {"documents": final_docs}

test/components/preprocessors/test_markdown_header_splitter.py

Lines changed: 23 additions & 10 deletions
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0

+from collections import defaultdict
 from unittest.mock import ANY

 import pytest
@@ -181,10 +182,14 @@ def test_split_multiple_documents(sample_text):
     headers = {doc.meta["header"] for doc in split_docs}
     assert {"Another Header", "H1", "H2"}.issubset(headers)

-    # Verify that all documents have a split_id and they're sequential
-    split_ids = [doc.meta.get("split_id") for doc in split_docs]
-    assert all(split_id is not None for split_id in split_ids)
-    assert split_ids == list(range(len(split_ids)))
+    # Verify that split_ids are per-parent-document
+    splits_by_source = defaultdict(list)
+    for doc in split_docs:
+        splits_by_source[doc.meta["source_id"]].append(doc.meta["split_id"])
+
+    # Each parent document should have split_ids starting from 0
+    for source_id, split_ids in splits_by_source.items():
+        assert split_ids == list(range(len(split_ids))), f"Split IDs for {source_id} should be sequential from 0"


 def test_split_only_headers():
@@ -268,7 +273,7 @@ def test_empty_content_handling():


 def test_split_id_sequentiality_primary_and_secondary(sample_text):
-    # Test primary splitting
+    # Test primary splitting with single document
     splitter = MarkdownHeaderSplitter()
     docs = [Document(content=sample_text)]
     result = splitter.run(documents=docs)
@@ -277,11 +282,11 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text):
     # Test number of documents
     assert len(split_docs) == 5

-    # Check that split_ids are sequential
+    # Check that split_ids are sequential from 0 for this single parent document
     split_ids = [doc.meta["split_id"] for doc in split_docs]
     assert split_ids == list(range(len(split_ids)))

-    # Test secondary splitting
+    # Test secondary splitting with single document
     splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3)
     docs = [Document(content=sample_text)]
     result = splitter.run(documents=docs)
@@ -290,19 +295,27 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text):
     # Test number of documents
     assert len(split_docs) == 12

+    # Check that split_ids are sequential from 0 for this single parent document
     split_ids = [doc.meta["split_id"] for doc in split_docs]
     assert split_ids == list(range(len(split_ids)))

-    # Test with multiple input documents
+    # Test with multiple input documents - each should have its own split_id sequence
+    splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3)  # Use fresh instance
     docs = [Document(content=sample_text), Document(content="# Another Header\nSome more content here.")]
     result = splitter.run(documents=docs)
     split_docs = result["documents"]

     # Test number of documents
     assert len(split_docs) == 14

-    split_ids = [doc.meta["split_id"] for doc in split_docs]
-    assert split_ids == list(range(len(split_ids)))
+    # Verify split_ids are per-parent-document
+    splits_by_source = defaultdict(list)
+    for doc in split_docs:
+        splits_by_source[doc.meta["source_id"]].append(doc.meta["split_id"])
+
+    # Each parent document should have split_ids starting from 0
+    for source_id, split_ids in splits_by_source.items():
+        assert split_ids == list(range(len(split_ids))), f"Split IDs for {source_id} should be sequential from 0"


 def test_secondary_split_with_overlap():
