From c791acc8749f77c6f53789d1a3c49ca1b817227d Mon Sep 17 00:00:00 2001
From: Sourashis Roy
Date: Tue, 24 Sep 2024 00:13:48 +0000
Subject: [PATCH 1/7] Fixing tests which fail with BlockManager V2

---
 tests/core/test_chunked_prefill_scheduler.py | 224 ++++++++++++-------
 1 file changed, 143 insertions(+), 81 deletions(-)

diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index 2f6ea632a5d9..8c14334a31e1 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -27,16 +27,19 @@ def schedule_and_update_computed_tokens(scheduler):
     return metas, out
 
 
-def test_simple():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_simple(use_v2_block_manager: bool):
     """Verify basic scheduling works."""
     block_size = 4
     num_seq_group = 4
     max_model_len = 16
     max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
-                                       num_seq_group,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens,
+        num_seq_group,
+        max_model_len,
+        enable_chunked_prefill=True,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8
@@ -45,7 +48,9 @@ def test_simple():
 
     # Add seq groups to scheduler.
     for i in range(num_seq_group):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=block_size,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
         running.append(seq_group)
 
@@ -69,30 +74,36 @@ def test_simple():
     assert len(seq_group_meta) == num_seq_group
 
 
-def test_chunk():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_chunk(use_v2_block_manager: bool):
     """Verify prefills are chunked properly."""
     block_size = 4
     max_seqs = 60
     max_model_len = 80
     max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens,
+        max_seqs,
+        max_model_len,
+        enable_chunked_prefill=True,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
+    cache_config.num_cpu_blocks = 32
+    cache_config.num_gpu_blocks = 32
     scheduler = Scheduler(scheduler_config, cache_config, None)
     running: List[SequenceGroup] = []
 
     # Add seq groups to scheduler.
     for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
         running.append(seq_group)
 
     # Verify the second request is chunked.
     seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    print()
     assert set(get_sequence_groups(out)) == set(running)
     assert seq_group_meta[0].token_chunk_size == 60
     # Verify it is chunked.
@@ -113,24 +124,29 @@ def test_chunk():
     assert out.num_batched_tokens == 57
 
 
-def test_complex():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_complex(use_v2_block_manager: bool):
     block_size = 4
     max_seqs = 60
     max_model_len = 80
     max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens,
+        max_seqs,
+        max_model_len,
+        enable_chunked_prefill=True,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
+    cache_config.num_cpu_blocks = 64
+    cache_config.num_gpu_blocks = 64
     scheduler = Scheduler(scheduler_config, cache_config, None)
     running: List[SequenceGroup] = []
 
     # Add seq groups to scheduler.
     for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
         running.append(seq_group)
         assert seq_group.is_prefill()
@@ -151,7 +167,9 @@ def test_complex():
 
     # Add 2 more requests.
     for i in range(2, 4):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
         running.append(seq_group)
 
@@ -176,16 +194,19 @@ def test_complex():
     assert running[2].is_prefill()
 
 
-def test_maximal_decoding():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_maximal_decoding(use_v2_block_manager: bool):
     """Verify decoding requests are prioritized."""
     block_size = 4
     max_seqs = 2
     max_model_len = 8
     max_num_batched_tokens = 2
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens,
+        max_seqs,
+        max_model_len,
+        enable_chunked_prefill=True,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8
@@ -194,7 +215,9 @@ def test_maximal_decoding():
 
     # Add seq groups to scheduler.
     for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=2)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=2,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
         running.append(seq_group)
         assert seq_group.is_prefill()
@@ -211,7 +234,9 @@ def test_maximal_decoding():
     append_new_token(running[0], 1)
 
     # Create one more seq_group.
-    _, seq_group = create_dummy_prompt("3", prompt_length=2)
+    _, seq_group = create_dummy_prompt("3",
+                                       prompt_length=2,
+                                       block_size=block_size)
     scheduler.add_seq_group(seq_group)
     running.append(seq_group)
     assert seq_group.is_prefill()
@@ -263,23 +288,28 @@ def test_maximal_decoding():
     assert out.num_batched_tokens == 2
 
 
-def test_prompt_limit():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prompt_limit(use_v2_block_manager: bool):
     """Verify max_num_batched_tokens < max_model_len is possible."""
     block_size = 4
     max_seqs = 32
     max_model_len = 64
     max_num_batched_tokens = 32
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens,
+        max_seqs,
+        max_model_len,
+        enable_chunked_prefill=True,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
+    cache_config.num_cpu_blocks = 16
+    cache_config.num_gpu_blocks = 16
     scheduler = Scheduler(scheduler_config, cache_config, None)
     running: List[SequenceGroup] = []
 
-    _, seq_group = create_dummy_prompt("1", prompt_length=48)
+    _, seq_group = create_dummy_prompt("1",
+                                       prompt_length=48,
+                                       block_size=block_size)
     scheduler.add_seq_group(seq_group)
     running.append(seq_group)
     assert seq_group.is_prefill()
@@ -293,7 +323,8 @@ def test_prompt_limit():
     assert out.num_batched_tokens == 32
 
 
-def test_prompt_limit_exceed():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prompt_limit_exceed(use_v2_block_manager: bool):
     block_size = 4
     max_seqs = 64
     max_model_len = 32
@@ -303,12 +334,14 @@ def test_prompt_limit_exceed():
                                        max_model_len,
                                        enable_chunked_prefill=True)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
+    cache_config.num_cpu_blocks = 16
+    cache_config.num_gpu_blocks = 16
     scheduler = Scheduler(scheduler_config, cache_config, None)
     running: List[SequenceGroup] = []
 
-    _, seq_group = create_dummy_prompt("2", prompt_length=48)
+    _, seq_group = create_dummy_prompt("2",
+                                       prompt_length=48,
+                                       block_size=block_size)
     scheduler.add_seq_group(seq_group)
     running.append(seq_group)
     assert seq_group.is_prefill()
@@ -317,22 +350,28 @@ def test_prompt_limit_exceed():
     assert out.ignored_seq_groups[0] == seq_group
 
 
-def test_swap():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_swap(use_v2_block_manager: bool):
     """Verify swapping works with chunked prefill requests"""
     block_size = 4
     max_seqs = 30
    max_model_len = 200
     max_num_batched_tokens = 30
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens,
+        max_seqs,
+        max_model_len,
+        enable_chunked_prefill=True,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
+    cache_config.num_cpu_blocks = 16
+    cache_config.num_gpu_blocks = 16
     scheduler = Scheduler(scheduler_config, cache_config, None)
 
-    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+    _, seq_group = create_dummy_prompt("1",
+                                       prompt_length=60,
+                                       best_of=2,
+                                       block_size=block_size)
     scheduler.add_seq_group(seq_group)
     _, out = schedule_and_update_computed_tokens(scheduler)
     # The request is chunked.
@@ -369,21 +408,27 @@ def cannot_append_second_group(seq_group, num_lookahead_slots):
     assert out.blocks_to_swap_out == []
 
 
-def test_running_prefill_prioritized_over_swap():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
     block_size = 4
     max_seqs = 30
     max_model_len = 200
     max_num_batched_tokens = 30
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens,
+        max_seqs,
+        max_model_len,
+        enable_chunked_prefill=True,
+        use_v2_block_manager=use_v2_block_manager)
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
+    cache_config.num_cpu_blocks = 32
+    cache_config.num_gpu_blocks = 32
     scheduler = Scheduler(scheduler_config, cache_config, None)
 
-    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+    _, seq_group = create_dummy_prompt("1",
+                                       prompt_length=60,
+                                       best_of=2,
+                                       block_size=block_size)
     scheduler.add_seq_group(seq_group)
     _, out = schedule_and_update_computed_tokens(scheduler)
     # The request is chunked.
@@ -413,7 +458,9 @@ def cannot_append_second_group(seq_group, num_lookahead_slots):
     scheduler.block_manager.can_swap_in = MagicMock()
     scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
 
-    _, seq_group2 = create_dummy_prompt("2", prompt_length=60)
+    _, seq_group2 = create_dummy_prompt("2",
+                                        prompt_length=60,
+                                        block_size=block_size)
     scheduler.add_seq_group(seq_group2)
     _, out = schedule_and_update_computed_tokens(scheduler)
     assert len(out.scheduled_seq_groups) == 1
@@ -455,22 +502,27 @@ def cannot_append_second_group(seq_group, num_lookahead_slots):
     assert out.blocks_to_swap_out == []
 
 
-def test_chunked_prefill_preempt():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_chunked_prefill_preempt(use_v2_block_manager: bool):
     """Verify preempt works with chunked prefill requests"""
     block_size = 4
     max_seqs = 30
     max_model_len = 200
     max_num_batched_tokens = 30
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens,
+        max_seqs,
+        max_model_len,
+        enable_chunked_prefill=True,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
+    cache_config.num_cpu_blocks = 16
+    cache_config.num_gpu_blocks = 16
     scheduler = Scheduler(scheduler_config, cache_config, None)
 
-    _, seq_group = create_dummy_prompt("1", prompt_length=60)
+    _, seq_group = create_dummy_prompt("1",
+                                       prompt_length=60,
+                                       block_size=block_size)
     scheduler.add_seq_group(seq_group)
     _, out = schedule_and_update_computed_tokens(scheduler)
     # The request is chunked.
@@ -517,22 +569,27 @@ def cannot_append_second_group2(seq_group, num_lookahead_slots):
     assert out.num_batched_tokens == max_num_batched_tokens
 
 
-def test_chunked_prefill_max_seqs():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
     block_size = 4
     max_seqs = 2
     max_model_len = 80
     max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens,
+        max_seqs,
+        max_model_len,
+        enable_chunked_prefill=True,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
+    cache_config.num_cpu_blocks = 128
+    cache_config.num_gpu_blocks = 128
     scheduler = Scheduler(scheduler_config, cache_config, None)
     running: List[SequenceGroup] = []
 
-    _, seq_group = create_dummy_prompt("1", prompt_length=65)
+    _, seq_group = create_dummy_prompt("1",
+                                       prompt_length=65,
+                                       block_size=block_size)
     scheduler.add_seq_group(seq_group)
     running.append(seq_group)
     # The first prefill is chunked.
@@ -542,7 +599,9 @@ def test_chunked_prefill_max_seqs():
 
     # Add new requests.
     for i in range(4):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=65)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=65,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
         running.append(seq_group)
 
@@ -564,16 +623,19 @@ def test_chunked_prefill_max_seqs():
     assert not running[1].is_prefill()
 
 
-def test_perfix_caching():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_perfix_caching(use_v2_block_manager: bool):
     """Verify allocating full blocks when prefix caching is enabled."""
     block_size = 4
     max_seqs = 10
     max_model_len = 80
     max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens,
+        max_seqs,
+        max_model_len,
+        enable_chunked_prefill=True,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size,
                                1.0,
                                1,

From 86e27361b0f4d29e562227dddfb20e07b9ddbff5 Mon Sep 17 00:00:00 2001
From: Sourashis Roy
Date: Tue, 24 Sep 2024 02:38:46 +0000
Subject: [PATCH 2/7] Dummy commut

---
 tests/core/test_chunked_prefill_scheduler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index 8c14334a31e1..fa9388fa6222 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -334,6 +334,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
                                        max_model_len,
                                        enable_chunked_prefill=True)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+
     cache_config.num_cpu_blocks = 16
     cache_config.num_gpu_blocks = 16
     scheduler = Scheduler(scheduler_config, cache_config, None)

From 7802b94c2c09a95362812e61f1f49a67d278f871 Mon Sep 17 00:00:00 2001
From: Sourashis Roy
Date: Tue, 24 Sep 2024 02:39:34 +0000
Subject: [PATCH 3/7] Dummy commit

---
 tests/core/test_chunked_prefill_scheduler.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index fa9388fa6222..e1e29680a82b 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -333,8 +333,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
                                        max_seqs,
                                        max_model_len,
                                        enable_chunked_prefill=True)
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 16
     cache_config.num_gpu_blocks = 16
     scheduler = Scheduler(scheduler_config, cache_config, None)

From 6b5cca5aedb6841ed9bf138cadd0de02f5eefdd7 Mon Sep 17 00:00:00 2001
From: Sourashis Roy
Date: Tue, 24 Sep 2024 02:42:30 +0000
Subject: [PATCH 4/7] Fix format

---
 tests/core/test_chunked_prefill_scheduler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index e1e29680a82b..8c14334a31e1 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -333,7 +333,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
                                        max_seqs,
                                        max_model_len,
                                        enable_chunked_prefill=True)
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 16
     cache_config.num_gpu_blocks = 16
     scheduler = Scheduler(scheduler_config, cache_config, None)

From 0db1e39ffebf3015c8919b386601ccfeedebfdcf Mon Sep 17 00:00:00 2001
From: Sourashis Roy
Date: Tue, 24 Sep 2024 15:56:52 +0000
Subject: [PATCH 5/7] Dummy

---
 tests/core/test_chunked_prefill_scheduler.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index 8c14334a31e1..9dddd751c785 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -338,7 +338,6 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
     cache_config.num_gpu_blocks = 16
     scheduler = Scheduler(scheduler_config, cache_config, None)
     running: List[SequenceGroup] = []
-
     _, seq_group = create_dummy_prompt("2",
                                        prompt_length=48,
                                        block_size=block_size)

From 1a43b888cfff89b27bca9111baa66a1fb0fdb835 Mon Sep 17 00:00:00 2001
From: Sourashis Roy
Date: Tue, 24 Sep 2024 21:04:30 +0000
Subject: [PATCH 6/7] Dummy commit

---
 tests/core/test_chunked_prefill_scheduler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index 9dddd751c785..91be809f9f53 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -334,6 +334,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
                                        max_model_len,
                                        enable_chunked_prefill=True)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+
     cache_config.num_cpu_blocks = 16
     cache_config.num_gpu_blocks = 16
     scheduler = Scheduler(scheduler_config, cache_config, None)

From 173b80ed2f8b3b98339d88b77f26b379f4baa5b1 Mon Sep 17 00:00:00 2001
From: Sourashis Roy
Date: Tue, 24 Sep 2024 21:05:43 +0000
Subject: [PATCH 7/7] Dummy commit

---
 tests/core/test_chunked_prefill_scheduler.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index 91be809f9f53..9dddd751c785 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -334,7 +334,6 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
                                        max_model_len,
                                        enable_chunked_prefill=True)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-
     cache_config.num_cpu_blocks = 16
     cache_config.num_gpu_blocks = 16
     scheduler = Scheduler(scheduler_config, cache_config, None)