|
1 | 1 | import re |
2 | | -from typing import Literal, Optional |
| 2 | +from typing import Callable, Literal, Optional |
3 | 3 |
|
4 | 4 | from haystack import Document, component, logging |
5 | 5 | from haystack.components.preprocessors import DocumentSplitter |
6 | 6 |
|
7 | 7 | logger = logging.getLogger(__name__) |
8 | 8 |
|
9 | 9 |
|
10 | | -class CustomDocumentSplitter(DocumentSplitter): |
| 10 | +class _CustomDocumentSplitter(DocumentSplitter): |
11 | 11 | """ |
12 | | - Custom DocumentSplitter that supports splitting functions returning dicts with 'content' and 'meta'. |
| 12 | + Internal helper class that extends DocumentSplitter to support dict-returning splitting functions. |
| 13 | +
|
| 14 | + This class handles splitting functions that return dictionaries with 'content' and 'meta' |
| 15 | + keys instead of just strings. For internal use only within the MarkdownHeaderSplitter. |
13 | 16 | """ |
14 | 17 |
|
15 | | - def __init__(self, split_by="function", splitting_function=None, page_break_character="\\f"): |
| 18 | + def __init__( |
| 19 | + self, |
| 20 | + split_by: str = "function", |
| 21 | + splitting_function: Optional[Callable] = None, |
| 22 | + page_break_character: str = "\\f", |
| 23 | + ): |
16 | 24 | """ |
17 | | - Initialize the CustomDocumentSplitter. |
| 25 | + Initialize the _CustomDocumentSplitter. |
18 | 26 |
|
19 | 27 | :param split_by: The method to split by. Must be "function" for custom splitting functions. |
20 | | - :param splitting_function: A custom function that takes a string and returns a list of dicts with 'content' and optional 'meta'. |
21 | | - :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f"). |
| 28 | + :param splitting_function: A custom function that takes a string and returns a list of dicts |
| 29 | + with 'content' and optional 'meta'. |
| 30 | + :param page_break_character: Character used to identify page breaks. |
| 31 | + Defaults to form feed ("\\f"). |
22 | 32 | """ |
23 | 33 | super().__init__(split_by=split_by, splitting_function=splitting_function) |
24 | 34 | self.page_break_character = page_break_character |
@@ -119,7 +129,8 @@ def __init__( |
119 | 129 | :param secondary_split: Optional secondary split condition after header splitting. |
120 | 130 | Options are "none", "word", "passage", "period", "line". Defaults to "none". |
121 | 131 | :param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200. |
122 | | - :param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0. |
| 132 | + :param split_overlap: The number of overlapping units for each split when using secondary splitting. |
| 133 | + Defaults to 0. |
123 | 134 | :param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0. |
124 | 135 | """ |
125 | 136 | self.infer_header_levels = infer_header_levels |
@@ -349,7 +360,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N |
349 | 360 | processed_documents.append(doc) |
350 | 361 |
|
351 | 362 | # split by markdown headers |
352 | | - header_splitter = CustomDocumentSplitter( |
| 363 | + header_splitter = _CustomDocumentSplitter( |
353 | 364 | split_by="function", |
354 | 365 | splitting_function=lambda text: self._split_by_markdown_headers(text), |
355 | 366 | page_break_character=self.page_break_character, |
|
0 commit comments