 import re
-from typing import Any, Literal, Optional
+from typing import Literal, Optional

-from haystack import Document, component, default_from_dict, default_to_dict, logging
+from haystack import Document, component, logging
 from haystack.components.preprocessors import DocumentSplitter

 logger = logging.getLogger(__name__)
@@ -73,8 +73,8 @@ def _split_by_function(self, doc: Document) -> list[Document]:
            if doc.meta:
                meta = self._flatten_dict(doc.meta)

-            # add standard metadata
-            meta.update({"source_id": doc.id, "split_id": i, "total_pages": total_pages, "page_number": current_page})
+            # add standard metadata (no split_id here)
+            meta.update({"source_id": doc.id, "total_pages": total_pages, "page_number": current_page})

            # get page number based on page breaks
            page_breaks = self._process_split_content(split["content"], i)
@@ -365,6 +365,10 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N
        else:
            final_docs = header_split_docs

+        # assign unique, sequential split_id to all final chunks
+        for idx, doc in enumerate(final_docs):
+            doc.meta["split_id"] = idx
+
        return {"documents": final_docs}


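For reference, below is a minimal, self-contained sketch of what the new loop does, assuming plain Haystack Document objects; the splitter class itself and its other metadata fields are omitted, and the chunk contents and source_id values are made up for illustration:

from haystack import Document

# hypothetical stand-ins for final_docs as the splitter would return them
final_docs = [
    Document(content="chunk A", meta={"source_id": "doc-1", "page_number": 1}),
    Document(content="chunk B", meta={"source_id": "doc-1", "page_number": 1}),
    Document(content="chunk C", meta={"source_id": "doc-1", "page_number": 2}),
]

# same pattern as the added loop: one unique, sequential split_id per final chunk
for idx, doc in enumerate(final_docs):
    doc.meta["split_id"] = idx

assert [d.meta["split_id"] for d in final_docs] == [0, 1, 2]

Because the ids are assigned in a single pass over the final list, they stay unique and sequential regardless of how many header- or page-level splits produced the chunks.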