
Commit c7fc2e4

fix split id assignment
1 parent f842fdb commit c7fc2e4

2 files changed: 37 additions, 19 deletions


haystack/components/preprocessors/markdown_header_splitter.py

Lines changed: 14 additions & 9 deletions
@@ -169,6 +169,7 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document]:
         Ensures page counting is maintained across splits.
         """
         result_docs = []
+        current_split_id = 0  # track split_id across all secondary splits from the same parent

         for doc in documents:
             if doc.content is None:
@@ -186,8 +187,11 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document]:
             # track page from meta
             current_page = doc.meta.get("page_number", 1)

+            # create a clean meta dict without split_id for secondary splitting
+            clean_meta = {k: v for k, v in doc.meta.items() if k != "split_id"}
+
             secondary_splits = self.secondary_splitter.run(
-                documents=[Document(content=content_for_splitting, meta=doc.meta)]
+                documents=[Document(content=content_for_splitting, meta=clean_meta)]
             )["documents"]

             # split processing
@@ -196,8 +200,13 @@ def _apply_secondary_splitting(self, documents: list[Document]) -> list[Document]:
                 if i > 0 and secondary_splits[i - 1].content:
                     current_page = self._update_page_number_with_breaks(secondary_splits[i - 1].content, current_page)

-                # set page number to meta
+                # set page number and split_id to meta
                 split.meta["page_number"] = current_page
+                split.meta["split_id"] = current_split_id
+                # ensure source_id is preserved from the original document
+                if "source_id" in doc.meta:
+                    split.meta["source_id"] = doc.meta["source_id"]
+                current_split_id += 1

                 # preserve header metadata if we're not keeping headers in content
                 if not self.keep_headers:
@@ -255,11 +264,11 @@ def _split_documents_by_markdown_headers(self, documents: list[Document]) -> list[Document]:
                 current_page=current_page,
                 total_pages=total_pages,
             )
-            for split in splits:
+            for split_idx, split in enumerate(splits):
                 meta = {}
                 if doc.meta:
                     meta = doc.meta.copy()
-                meta.update({"source_id": doc.id, "page_number": current_page})
+                meta.update({"source_id": doc.id, "page_number": current_page, "split_id": split_idx})
                 if split.get("meta"):
                     meta.update(split["meta"])
                 current_page = self._update_page_number_with_breaks(split["content"], current_page)
@@ -284,7 +293,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
             - `documents`: List of documents with the split texts. Each document includes:
                 - A metadata field `source_id` to track the original document.
                 - A metadata field `page_number` to track the original page number.
-                - A metadata field `split_id` to uniquely identify each split chunk.
+                - A metadata field `split_id` to identify the split chunk index within its parent document.
                 - All other metadata copied from the original document.
         """
         # validate input documents
@@ -325,8 +334,4 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:

             final_docs.extend(doc_splits)

-        # assign split_id to all output documents
-        for idx, doc in enumerate(final_docs):
-            doc.meta["split_id"] = idx
-
         return {"documents": final_docs}

test/components/preprocessors/test_markdown_header_splitter.py

Lines changed: 23 additions & 10 deletions
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0

+from collections import defaultdict
 from unittest.mock import ANY

 import pytest
@@ -181,10 +182,14 @@ def test_split_multiple_documents(sample_text):
     headers = {doc.meta["header"] for doc in split_docs}
     assert {"Another Header", "H1", "H2"}.issubset(headers)

-    # Verify that all documents have a split_id and they're sequential
-    split_ids = [doc.meta.get("split_id") for doc in split_docs]
-    assert all(split_id is not None for split_id in split_ids)
-    assert split_ids == list(range(len(split_ids)))
+    # Verify that split_ids are per-parent-document
+    splits_by_source = defaultdict(list)
+    for doc in split_docs:
+        splits_by_source[doc.meta["source_id"]].append(doc.meta["split_id"])
+
+    # Each parent document should have split_ids starting from 0
+    for source_id, split_ids in splits_by_source.items():
+        assert split_ids == list(range(len(split_ids))), f"Split IDs for {source_id} should be sequential from 0"


 def test_split_only_headers():
@@ -268,7 +273,7 @@ def test_empty_content_handling():


 def test_split_id_sequentiality_primary_and_secondary(sample_text):
-    # Test primary splitting
+    # Test primary splitting with single document
     splitter = MarkdownHeaderSplitter()
     docs = [Document(content=sample_text)]
     result = splitter.run(documents=docs)
@@ -277,11 +282,11 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text):
     # Test number of documents
     assert len(split_docs) == 5

-    # Check that split_ids are sequential
+    # Check that split_ids are sequential from 0 for this single parent document
     split_ids = [doc.meta["split_id"] for doc in split_docs]
     assert split_ids == list(range(len(split_ids)))

-    # Test secondary splitting
+    # Test secondary splitting with single document
     splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3)
     docs = [Document(content=sample_text)]
     result = splitter.run(documents=docs)
@@ -290,19 +295,27 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text):
     # Test number of documents
     assert len(split_docs) == 12

+    # Check that split_ids are sequential from 0 for this single parent document
     split_ids = [doc.meta["split_id"] for doc in split_docs]
     assert split_ids == list(range(len(split_ids)))

-    # Test with multiple input documents
+    # Test with multiple input documents - each should have its own split_id sequence
+    splitter = MarkdownHeaderSplitter(secondary_split="word", split_length=3)  # Use fresh instance
     docs = [Document(content=sample_text), Document(content="# Another Header\nSome more content here.")]
     result = splitter.run(documents=docs)
     split_docs = result["documents"]

     # Test number of documents
     assert len(split_docs) == 14

-    split_ids = [doc.meta["split_id"] for doc in split_docs]
-    assert split_ids == list(range(len(split_ids)))
+    # Verify split_ids are per-parent-document
+    splits_by_source = defaultdict(list)
+    for doc in split_docs:
+        splits_by_source[doc.meta["source_id"]].append(doc.meta["split_id"])
+
+    # Each parent document should have split_ids starting from 0
+    for source_id, split_ids in splits_by_source.items():
+        assert split_ids == list(range(len(split_ids))), f"Split IDs for {source_id} should be sequential from 0"


 def test_secondary_split_with_overlap():
