Skip to content

Commit 3dc0504

Browse files
committed
cleanup
1 parent 169cb06 commit 3dc0504

File tree

1 file changed

+20
-9
lines changed

1 file changed

+20
-9
lines changed

haystack/components/preprocessors/markdown_header_splitter.py

Lines changed: 20 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,34 @@
11
import re
2-
from typing import Literal, Optional
2+
from typing import Callable, Literal, Optional
33

44
from haystack import Document, component, logging
55
from haystack.components.preprocessors import DocumentSplitter
66

77
logger = logging.getLogger(__name__)
88

99

10-
class CustomDocumentSplitter(DocumentSplitter):
10+
class _CustomDocumentSplitter(DocumentSplitter):
1111
"""
12-
Custom DocumentSplitter that supports splitting functions returning dicts with 'content' and 'meta'.
12+
Internal helper class that extends DocumentSplitter to support splitting functions.
13+
14+
This class handles splitting functions that return dictionaries with 'content' and 'meta'
15+
keys instead of just strings. For internal use only within the MarkdownHeaderSplitter.
1316
"""
1417

15-
def __init__(self, split_by="function", splitting_function=None, page_break_character="\\f"):
18+
def __init__(
19+
self,
20+
split_by: str = "function",
21+
splitting_function: Optional[Callable] = None,
22+
page_break_character: str = "\\f",
23+
):
1624
"""
17-
Initialize the CustomDocumentSplitter.
25+
Initialize the _CustomDocumentSplitter.
1826
1927
:param split_by: The method to split by. Must be "function" for custom splitting functions.
20-
:param splitting_function: A custom function that takes a string and returns a list of dicts with 'content' and optional 'meta'.
21-
:param page_break_character: Character used to identify page breaks. Defaults to form feed ("\\f").
28+
:param splitting_function: A custom function that takes a string and returns a list of dicts
29+
with 'content' and optional 'meta'.
30+
:param page_break_character: Character used to identify page breaks.
31+
Defaults to form feed ("\\f").
2232
"""
2333
super().__init__(split_by=split_by, splitting_function=splitting_function)
2434
self.page_break_character = page_break_character
@@ -119,7 +129,8 @@ def __init__(
119129
:param secondary_split: Optional secondary split condition after header splitting.
120130
Options are "none", "word", "passage", "period", "line". Defaults to "none".
121131
:param split_length: The maximum number of units in each split when using secondary splitting. Defaults to 200.
122-
:param split_overlap: The number of overlapping units for each split when using secondary splitting. Defaults to 0.
132+
:param split_overlap: The number of overlapping units for each split when using secondary splitting.
133+
Defaults to 0.
123134
:param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0.
124135
"""
125136
self.infer_header_levels = infer_header_levels
@@ -349,7 +360,7 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N
349360
processed_documents.append(doc)
350361

351362
# split by markdown headers
352-
header_splitter = CustomDocumentSplitter(
363+
header_splitter = _CustomDocumentSplitter(
353364
split_by="function",
354365
splitting_function=lambda text: self._split_by_markdown_headers(text),
355366
page_break_character=self.page_break_character,

0 commit comments

Comments
 (0)