22#
33# SPDX-License-Identifier: Apache-2.0
44
5+ from collections import defaultdict
56from unittest .mock import ANY
67
78import pytest
@@ -181,10 +182,14 @@ def test_split_multiple_documents(sample_text):
181182 headers = {doc .meta ["header" ] for doc in split_docs }
182183 assert {"Another Header" , "H1" , "H2" }.issubset (headers )
183184
184- # Verify that all documents have a split_id and they're sequential
185- split_ids = [doc .meta .get ("split_id" ) for doc in split_docs ]
186- assert all (split_id is not None for split_id in split_ids )
187- assert split_ids == list (range (len (split_ids )))
185+ # Verify that split_ids are per-parent-document
186+ splits_by_source = defaultdict (list )
187+ for doc in split_docs :
188+ splits_by_source [doc .meta ["source_id" ]].append (doc .meta ["split_id" ])
189+
190+ # Each parent document should have split_ids starting from 0
191+ for source_id , split_ids in splits_by_source .items ():
192+ assert split_ids == list (range (len (split_ids ))), f"Split IDs for { source_id } should be sequential from 0"
188193
189194
190195def test_split_only_headers ():
@@ -268,7 +273,7 @@ def test_empty_content_handling():
268273
269274
270275def test_split_id_sequentiality_primary_and_secondary (sample_text ):
271- # Test primary splitting
276+ # Test primary splitting with single document
272277 splitter = MarkdownHeaderSplitter ()
273278 docs = [Document (content = sample_text )]
274279 result = splitter .run (documents = docs )
@@ -277,11 +282,11 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text):
277282 # Test number of documents
278283 assert len (split_docs ) == 5
279284
280- # Check that split_ids are sequential
285+ # Check that split_ids are sequential from 0 for this single parent document
281286 split_ids = [doc .meta ["split_id" ] for doc in split_docs ]
282287 assert split_ids == list (range (len (split_ids )))
283288
284- # Test secondary splitting
289+ # Test secondary splitting with single document
285290 splitter = MarkdownHeaderSplitter (secondary_split = "word" , split_length = 3 )
286291 docs = [Document (content = sample_text )]
287292 result = splitter .run (documents = docs )
@@ -290,19 +295,27 @@ def test_split_id_sequentiality_primary_and_secondary(sample_text):
290295 # Test number of documents
291296 assert len (split_docs ) == 12
292297
298+ # Check that split_ids are sequential from 0 for this single parent document
293299 split_ids = [doc .meta ["split_id" ] for doc in split_docs ]
294300 assert split_ids == list (range (len (split_ids )))
295301
296- # Test with multiple input documents
302+ # Test with multiple input documents - each should have its own split_id sequence
303+ splitter = MarkdownHeaderSplitter (secondary_split = "word" , split_length = 3 ) # Use fresh instance
297304 docs = [Document (content = sample_text ), Document (content = "# Another Header\n Some more content here." )]
298305 result = splitter .run (documents = docs )
299306 split_docs = result ["documents" ]
300307
301308 # Test number of documents
302309 assert len (split_docs ) == 14
303310
304- split_ids = [doc .meta ["split_id" ] for doc in split_docs ]
305- assert split_ids == list (range (len (split_ids )))
311+ # Verify split_ids are per-parent-document
312+ splits_by_source = defaultdict (list )
313+ for doc in split_docs :
314+ splits_by_source [doc .meta ["source_id" ]].append (doc .meta ["split_id" ])
315+
316+ # Each parent document should have split_ids starting from 0
317+ for source_id , split_ids in splits_by_source .items ():
318+ assert split_ids == list (range (len (split_ids ))), f"Split IDs for { source_id } should be sequential from 0"
306319
307320
308321def test_secondary_split_with_overlap ():
0 commit comments