Skip to content

Commit 169cb06

Browse files
committed
cleanup
1 parent 970ec90 commit 169cb06

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

haystack/components/preprocessors/markdown_header_splitter.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import re
2-
from typing import Any, Literal, Optional
2+
from typing import Literal, Optional
33

4-
from haystack import Document, component, default_from_dict, default_to_dict, logging
4+
from haystack import Document, component, logging
55
from haystack.components.preprocessors import DocumentSplitter
66

77
logger = logging.getLogger(__name__)
@@ -73,8 +73,8 @@ def _split_by_function(self, doc: Document) -> list[Document]:
7373
if doc.meta:
7474
meta = self._flatten_dict(doc.meta)
7575

76-
# add standard metadata
77-
meta.update({"source_id": doc.id, "split_id": i, "total_pages": total_pages, "page_number": current_page})
76+
# add standard metadata (no split_id here)
77+
meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page})
7878

7979
# get page number based on page breaks
8080
page_breaks = self._process_split_content(split["content"], i)
@@ -365,6 +365,10 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N
365365
else:
366366
final_docs = header_split_docs
367367

368+
# assign unique, sequential split_id to all final chunks
369+
for idx, doc in enumerate(final_docs):
370+
doc.meta["split_id"] = idx
371+
368372
return {"documents": final_docs}
369373

370374

0 commit comments

Comments
 (0)