1111import yaml
1212
1313from guidellm .dataset .synthetic import (
14+ PrefixBucketConfig ,
1415 SyntheticDatasetConfig ,
1516 SyntheticDatasetCreator ,
1617 SyntheticTextItemsGenerator ,
@@ -29,8 +30,12 @@ def test_config_creation_with_all_params(self):
2930
3031 ### WRITTEN BY AI ###
3132 """
33+ prefix_bucket = PrefixBucketConfig (
34+ bucket_weight = 100 , prefix_count = 1 , prefix_tokens = 5
35+ )
36+
3237 config = SyntheticDatasetConfig (
33- prefix_tokens = 5 ,
38+ prefix_buckets = [ prefix_bucket ] ,
3439 prompt_tokens = 100 ,
3540 prompt_tokens_stdev = 10 ,
3641 prompt_tokens_min = 50 ,
@@ -43,7 +48,7 @@ def test_config_creation_with_all_params(self):
4348 source = "custom_text.txt" ,
4449 )
4550
46- assert config .prefix_tokens == 5
51+ assert config .prefix_buckets [ 0 ]. prefix_tokens == 5
4752 assert config .prompt_tokens == 100
4853 assert config .prompt_tokens_stdev == 10
4954 assert config .prompt_tokens_min == 50
@@ -67,7 +72,9 @@ def test_parse_json_string(self):
6772 "output_tokens" : 25 ,
6873 "samples" : 200 ,
6974 "source" : "test.txt" ,
70- "prefix_tokens" : 10 ,
75+ "prefix_buckets" : [
76+ {"bucket_weight" : 100 , "prefix_count" : 1 , "prefix_tokens" : 10 }
77+ ],
7178 }
7279 )
7380
@@ -77,23 +84,23 @@ def test_parse_json_string(self):
7784 assert config .output_tokens == 25
7885 assert config .samples == 200
7986 assert config .source == "test.txt"
80- assert config .prefix_tokens == 10
87+ assert config .prefix_buckets [ 0 ]. prefix_tokens == 10
8188
8289 @pytest .mark .regression
8390 def test_parse_key_value_pairs (self ):
8491 """Test parsing key-value pairs configuration.
8592
8693 ### WRITTEN BY AI ###
8794 """
88- kv_str = "prompt_tokens=80,output_tokens=30,samples=300,source=data.txt,prefix_tokens=5" # noqa: E501
95+ kv_str = "prompt_tokens=80,output_tokens=30,samples=300,source=data.txt"
8996
9097 config = SyntheticDatasetConfig .parse_str (kv_str )
9198
9299 assert config .prompt_tokens == 80
93100 assert config .output_tokens == 30
94101 assert config .samples == 300
95102 assert config .source == "data.txt"
96- assert config .prefix_tokens == 5
103+ assert config .prefix_buckets is None
97104
98105 @pytest .mark .sanity
99106 def test_parse_yaml_file (self ):
@@ -106,7 +113,9 @@ def test_parse_yaml_file(self):
106113 "output_tokens" : 15 ,
107114 "samples" : 100 ,
108115 "source" : "yaml_test.txt" ,
109- "prefix_tokens" : 3 ,
116+ "prefix_buckets" : [
117+ {"bucket_weight" : 100 , "prefix_count" : 1 , "prefix_tokens" : 3 }
118+ ],
110119 }
111120
112121 with tempfile .NamedTemporaryFile (mode = "w" , suffix = ".yaml" , delete = False ) as f :
@@ -120,7 +129,7 @@ def test_parse_yaml_file(self):
120129 assert config .output_tokens == 15
121130 assert config .samples == 100
122131 assert config .source == "yaml_test.txt"
123- assert config .prefix_tokens == 3
132+ assert config .prefix_buckets [ 0 ]. prefix_tokens == 3
124133 finally :
125134 Path (yaml_path ).unlink ()
126135
@@ -134,7 +143,9 @@ def test_parse_config_file(self):
134143 "prompt_tokens" : 90 ,
135144 "output_tokens" : 35 ,
136145 "samples" : 150 ,
137- "prefix_tokens" : 2 ,
146+ "prefix_buckets" : [
147+ {"bucket_weight" : 100 , "prefix_count" : 1 , "prefix_tokens" : 2 }
148+ ],
138149 }
139150
140151 with tempfile .NamedTemporaryFile (mode = "w" , suffix = ".config" , delete = False ) as f :
@@ -147,7 +158,7 @@ def test_parse_config_file(self):
147158 assert config .prompt_tokens == 90
148159 assert config .output_tokens == 35
149160 assert config .samples == 150
150- assert config .prefix_tokens == 2
161+ assert config .prefix_buckets [ 0 ]. prefix_tokens == 2
151162 finally :
152163 Path (config_path ).unlink ()
153164
@@ -194,8 +205,9 @@ def test_validation_positive_values(self):
194205 with pytest .raises (ValueError ):
195206 SyntheticDatasetConfig (prompt_tokens = 20 , output_tokens = 10 , samples = 0 )
196207
208+ # Test negative prefix tokens via PrefixBucketConfig validation
197209 with pytest .raises (ValueError ):
198- SyntheticDatasetConfig ( prompt_tokens = 20 , output_tokens = 10 , prefix_tokens = - 1 )
210+ PrefixBucketConfig ( prefix_tokens = - 1 )
199211
200212 @pytest .mark .regression
201213 def test_validation_optional_positive_values (self ):
@@ -279,7 +291,7 @@ def mock_tokenizer(self):
279291 """
280292 tokenizer = Mock ()
281293 tokenizer .get_vocab .return_value = {f"token_{ i } " : i for i in range (1000 )}
282- tokenizer .encode .side_effect = lambda text : [ 1 , 2 , 3 ] * ( len (text ) // 10 + 1 )
294+ tokenizer .encode .side_effect = lambda text : list ( range ( len (text . split ())) )
283295 tokenizer .decode .side_effect = (
284296 lambda tokens , skip_special_tokens = False : " " .join (
285297 f"token_{ t } " for t in tokens [:5 ]
@@ -306,8 +318,12 @@ def config_with_prefix(self):
306318
307319 ### WRITTEN BY AI ###
308320 """
321+ prefix_bucket = PrefixBucketConfig (
322+ bucket_weight = 100 , prefix_count = 1 , prefix_tokens = 3
323+ )
324+
309325 return SyntheticDatasetConfig (
310- prefix_tokens = 3 ,
326+ prefix_buckets = [ prefix_bucket ] ,
311327 prompt_tokens = 15 ,
312328 output_tokens = 10 ,
313329 samples = 5 ,
@@ -352,20 +368,14 @@ def test_generator_initialization(
352368 mock_text_creator .assert_called_once_with (data = simple_config .source )
353369
354370 @pytest .mark .smoke
355- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
356371 @patch ("guidellm.dataset.synthetic.IntegerRangeSampler" )
357372 def test_basic_iteration (
358- self , mock_sampler , mock_text_creator , simple_config , mock_tokenizer
373+ self ,
374+ mock_sampler ,
375+ simple_config ,
376+ mock_tokenizer ,
359377 ):
360- """Test basic iteration functionality.
361-
362- ### WRITTEN BY AI ###
363- """
364- # Setup mocks
365- mock_text_creator_instance = Mock ()
366- mock_text_creator_instance .words = ["word1" , "word2" , "word3" ] * 100
367- mock_text_creator_instance .create_text .return_value = "sample text"
368- mock_text_creator .return_value = mock_text_creator_instance
378+ """Test basic iteration functionality."""
369379
370380 # Mock IntegerRangeSampler to return iterators
371381 def mock_sampler_side_effect (* args , ** kwargs ):
@@ -394,59 +404,34 @@ def mock_sampler_side_effect(*args, **kwargs):
394404 assert isinstance (item ["output_tokens_count" ], int )
395405
396406 @pytest .mark .sanity
397- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
398- def test_create_prompt_method (
399- self , mock_text_creator , simple_config , mock_tokenizer
400- ):
407+ def test_create_prompt_method (self , simple_config , mock_tokenizer ):
401408 """Test _create_prompt method.
402409
403410 ### WRITTEN BY AI ###
404411 """
405- mock_text_creator_instance = Mock ()
406- mock_text_creator_instance .words = ["word" ] * 100
407- mock_text_creator_instance .create_text .return_value = "test text"
408- mock_text_creator .return_value = mock_text_creator_instance
409-
410- mock_tokenizer .encode .return_value = [1 , 2 , 3 ]
411-
412412 generator = SyntheticTextItemsGenerator (
413413 simple_config , mock_tokenizer , random_seed = 42
414414 )
415415
416416 # Test normal case
417417 result = generator ._create_prompt (5 , 0 , 42 )
418- assert result == [42 , 1 , 2 , 3 ]
418+ assert result [0 ] == 42 # Unique prefix token
419+ assert len (result ) == 5
419420
420421 # Test zero tokens
421422 result = generator ._create_prompt (0 , 0 , 42 )
422423 assert result == []
423424
424425 # Test without unique prefix
425426 result = generator ._create_prompt (3 , 0 )
426- assert result == [ 1 , 2 , 3 ]
427+ assert len ( result ) == 3
427428
428429 @pytest .mark .regression
429- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
430- def test_create_prompt_binary_search (
431- self , mock_text_creator , simple_config , mock_tokenizer
432- ):
430+ def test_create_prompt_binary_search (self , simple_config , mock_tokenizer ):
433431 """Test binary search logic in _create_prompt.
434432
435433 ### WRITTEN BY AI ###
436434 """
437- mock_text_creator_instance = Mock ()
438- mock_text_creator_instance .words = ["word" ] * 1000
439- mock_text_creator_instance .create_text .side_effect = lambda start , length : (
440- "text " * max (1 , length // 4 )
441- ).strip ()
442- mock_text_creator .return_value = mock_text_creator_instance
443-
444- # Mock tokenizer to return different lengths based on input
445- def mock_encode (text ):
446- return [1 ] * len (text .split ())
447-
448- mock_tokenizer .encode .side_effect = mock_encode
449-
450435 generator = SyntheticTextItemsGenerator (
451436 simple_config , mock_tokenizer , random_seed = 42
452437 )
@@ -456,21 +441,14 @@ def mock_encode(text):
456441 assert len (result ) >= 4 # Should include prefix + some tokens
457442
458443 @pytest .mark .sanity
459- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
460444 @patch ("guidellm.dataset.synthetic.IntegerRangeSampler" )
461445 def test_prefix_tokens_integration (
462- self , mock_sampler , mock_text_creator , config_with_prefix , mock_tokenizer
446+ self , mock_sampler , config_with_prefix , mock_tokenizer
463447 ):
464448 """Test integration with prefix tokens.
465449
466450 ### WRITTEN BY AI ###
467451 """
468- # Setup mocks
469- mock_text_creator_instance = Mock ()
470- mock_text_creator_instance .words = ["word" ] * 100
471- mock_text_creator_instance .create_text .return_value = "sample text"
472- mock_text_creator .return_value = mock_text_creator_instance
473-
474452 mock_sampler_instance = Mock ()
475453 mock_sampler_instance .__iter__ = Mock (return_value = iter ([15 , 15 , 15 , 15 , 15 ]))
476454 mock_sampler .return_value = mock_sampler_instance
@@ -483,24 +461,20 @@ def test_prefix_tokens_integration(
483461
484462 # Verify prompt_tokens_count includes prefix
485463 for item in items :
486- assert item ["prompt_tokens_count" ] == config_with_prefix .prefix_tokens + 15
464+ assert (
465+ item ["prompt_tokens_count" ]
466+ == config_with_prefix .prefix_buckets [0 ].prefix_tokens + 15
467+ )
487468
488469 @pytest .mark .regression
489- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
490470 @patch ("guidellm.dataset.synthetic.IntegerRangeSampler" )
491471 def test_random_seeding_consistency (
492- self , mock_sampler , mock_text_creator , simple_config , mock_tokenizer
472+ self , mock_sampler , simple_config , mock_tokenizer
493473 ):
494474 """Test that same seed produces consistent results.
495475
496476 ### WRITTEN BY AI ###
497477 """
498- # Setup mocks
499- mock_text_creator_instance = Mock ()
500- mock_text_creator_instance .words = ["word" ] * 100
501- mock_text_creator_instance .create_text .return_value = "sample text"
502- mock_text_creator .return_value = mock_text_creator_instance
503-
504478 # Create consistent mock sampler behavior
505479 call_count = 0
506480
@@ -536,25 +510,12 @@ def mock_sampler_side_effect(*args, **kwargs):
536510 assert item1 ["output_tokens_count" ] == item2 ["output_tokens_count" ]
537511
538512 @pytest .mark .regression
539- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
540513 @patch ("guidellm.dataset.synthetic.IntegerRangeSampler" )
541- def test_variance_configuration (
542- self , mock_sampler , mock_text_creator , complex_config , mock_tokenizer
543- ):
514+ def test_variance_configuration (self , mock_sampler , complex_config , mock_tokenizer ):
544515 """Test that variance configuration is properly used.
545516
546517 ### WRITTEN BY AI ###
547518 """
548- # Setup mocks
549- mock_text_creator_instance = Mock ()
550- mock_text_creator_instance .words = ["word" ] * 100
551- mock_text_creator_instance .create_text .return_value = "sample text"
552- mock_text_creator .return_value = mock_text_creator_instance
553-
554- # Fix tokenizer mock to handle the create_text return properly
555- mock_tokenizer .encode .side_effect = (
556- lambda text : [1 , 2 , 3 ] if isinstance (text , str ) else [1 , 2 , 3 ]
557- )
558519
559520 # Setup mock sampler to track calls
560521 def mock_sampler_side_effect (* args , ** kwargs ):
@@ -592,19 +553,11 @@ def mock_sampler_side_effect(*args, **kwargs):
592553 assert output_call [1 ]["random_seed" ] == 43 # 42 + 1
593554
594555 @pytest .mark .regression
595- @patch ("guidellm.dataset.synthetic.EndlessTextCreator" )
596- def test_unique_prefix_generation (
597- self , mock_text_creator , simple_config , mock_tokenizer
598- ):
556+ def test_unique_prefix_generation (self , simple_config , mock_tokenizer ):
599557 """Test that unique prefixes are generated for each request.
600558
601559 ### WRITTEN BY AI ###
602560 """
603- mock_text_creator_instance = Mock ()
604- mock_text_creator_instance .words = ["word" ] * 100
605- mock_text_creator_instance .create_text .return_value = "sample text"
606- mock_text_creator .return_value = mock_text_creator_instance
607-
608561 # Mock the cycle to return predictable values
609562 with patch ("guidellm.dataset.synthetic.cycle" ) as mock_cycle :
610563 mock_cycle .return_value = iter ([100 , 101 , 102 , 103 , 104 ])
0 commit comments