|
10 | 10 | from vllm.core.interfaces import AllocStatus
|
11 | 11 | from vllm.core.scheduler import Scheduler, SchedulingBudget
|
12 | 12 | from vllm.lora.request import LoRARequest
|
13 |
| -from vllm.sequence import SequenceGroup, SequenceStatus |
| 13 | +from vllm.sequence import SequenceGroup |
14 | 14 |
|
15 | 15 | from .utils import (append_new_token, append_new_token_seq_group,
|
16 | 16 | create_dummy_prompt, get_sequence_groups,
|
@@ -296,55 +296,6 @@ def test_scheduler_delay_factor():
|
296 | 296 | append_new_token(out, 1)
|
297 | 297 |
|
298 | 298 |
|
299 |
| -def test_swapped_out_prioritized(): |
300 |
| - block_size = 4 |
301 |
| - scheduler = initialize_scheduler(max_num_seqs=6, |
302 |
| - block_size=block_size, |
303 |
| - num_cpu_blocks=64, |
304 |
| - num_gpu_blocks=64) |
305 |
| - # best_of=2 * 3 == 6 sequences. |
306 |
| - for i in range(3): |
307 |
| - _, seq_group = create_dummy_prompt(str(i), |
308 |
| - prompt_length=60, |
309 |
| - best_of=2, |
310 |
| - block_size=block_size) |
311 |
| - scheduler.add_seq_group(seq_group) |
312 |
| - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) |
313 |
| - # prefill scheduled now. |
314 |
| - assert len(out.scheduled_seq_groups) == 3 |
315 |
| - append_new_token(out, 1) |
316 |
| - |
317 |
| - # The last request should be swapped out. |
318 |
| - scheduler.block_manager.can_append_slots = MagicMock() |
319 |
| - |
320 |
| - def cannot_append_second_group(seq_group, num_lookahead_slots): |
321 |
| - return seq_group.request_id != "2" |
322 |
| - |
323 |
| - scheduler.block_manager.can_append_slots.side_effect = ( |
324 |
| - cannot_append_second_group) |
325 |
| - |
326 |
| - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) |
327 |
| - assert len(out.scheduled_seq_groups) == 2 |
328 |
| - assert out.num_batched_tokens == 2 |
329 |
| - assert out.blocks_to_swap_out != [] |
330 |
| - assert out.blocks_to_swap_in == [] |
331 |
| - append_new_token(out, 1) |
332 |
| - |
333 |
| - # Add 1 more task. Swap should be prioritized over prefill. |
334 |
| - _, seq_group = create_dummy_prompt(str(i), |
335 |
| - prompt_length=60, |
336 |
| - best_of=2, |
337 |
| - block_size=block_size) |
338 |
| - scheduler.add_seq_group(seq_group) |
339 |
| - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) |
340 |
| - append_new_token(out, 1) |
341 |
| - assert len(out.scheduled_seq_groups) == 3 |
342 |
| - # 3 decodes. It is swapped in. |
343 |
| - assert out.num_batched_tokens == 3 |
344 |
| - assert out.blocks_to_swap_in != [] |
345 |
| - assert out.blocks_to_swap_out == [] |
346 |
| - |
347 |
| - |
348 | 299 | def initialize_scheduler(
|
349 | 300 | *,
|
350 | 301 | max_num_seqs=1000,
|
@@ -646,60 +597,6 @@ def cannot_append_second_group(seq_group, num_lookahead_slots):
|
646 | 597 | assert output.blocks_to_copy == []
|
647 | 598 |
|
648 | 599 |
|
649 |
| -def test_decode_swap_beam_search(): |
650 |
| - """ |
651 |
| - Test best_of > 1 swap out blocks |
652 |
| - """ |
653 |
| - block_size = 4 |
654 |
| - scheduler = initialize_scheduler(block_size=block_size, |
655 |
| - num_gpu_blocks=64, |
656 |
| - num_cpu_blocks=64) |
657 |
| - curr_loras = None |
658 |
| - budget = create_token_budget() |
659 |
| - for i in range(3): |
660 |
| - _, seq_group = create_dummy_prompt(str(i), |
661 |
| - prompt_length=60, |
662 |
| - best_of=2, |
663 |
| - block_size=block_size) |
664 |
| - scheduler._allocate_and_set_running(seq_group) |
665 |
| - scheduler._add_seq_group_to_running(seq_group) |
666 |
| - append_new_token_seq_group(60, seq_group, 1) |
667 |
| - budget.add_num_seqs(seq_group.request_id, |
668 |
| - seq_group.get_max_num_running_seqs()) |
669 |
| - budget.add_num_batched_tokens( |
670 |
| - seq_group.request_id, seq_group.num_seqs(SequenceStatus.RUNNING)) |
671 |
| - |
672 |
| - # The last request should be swapped out. |
673 |
| - scheduler.block_manager.can_append_slots = MagicMock() |
674 |
| - |
675 |
| - def cannot_append_second_group(seq_group, num_lookahead_slots): |
676 |
| - return seq_group.request_id != "2" |
677 |
| - |
678 |
| - scheduler.block_manager.can_append_slots.side_effect = ( |
679 |
| - cannot_append_second_group) |
680 |
| - scheduler.block_manager.swap_out = MagicMock() |
681 |
| - expected_swap_mapping = [("5", "7")] |
682 |
| - scheduler.block_manager.swap_out.return_value = expected_swap_mapping |
683 |
| - |
684 |
| - output = scheduler._schedule_running(budget, curr_loras) |
685 |
| - remainig_running = scheduler.running |
686 |
| - assert len(remainig_running) == 0 |
687 |
| - assert len(output.decode_seq_groups) == 2 |
688 |
| - assert len(output.prefill_seq_groups) == 0 |
689 |
| - assert output.decode_seq_groups[0].seq_group.request_id == "0" |
690 |
| - assert output.decode_seq_groups[1].seq_group.request_id == "1" |
691 |
| - assert len(output.preempted) == 0 |
692 |
| - assert len(output.swapped_out) == 1 |
693 |
| - # Budget should reflect preempted requests. |
694 |
| - assert budget.num_batched_tokens == 2 |
695 |
| - # since there are 2 sequences, 2 should be subtracted. |
696 |
| - assert budget.num_curr_seqs == 4 |
697 |
| - # The request should be swapped out, not preempted. |
698 |
| - assert output.blocks_to_swap_out == expected_swap_mapping |
699 |
| - # Nothing is copied. |
700 |
| - assert output.blocks_to_copy == [] |
701 |
| - |
702 |
| - |
703 | 600 | def test_schedule_decode_blocks_to_copy_update():
|
704 | 601 | """
|
705 | 602 | Verify blocks_to_copy is updated.
|
@@ -736,105 +633,6 @@ def test_schedule_decode_blocks_to_copy_update():
|
736 | 633 | assert output.blocks_to_copy == [(2, 3)]
|
737 | 634 |
|
738 | 635 |
|
739 |
| -def test_schedule_swapped_simple(): |
740 |
| - block_size = 4 |
741 |
| - scheduler = initialize_scheduler(block_size=block_size) |
742 |
| - curr_loras = None |
743 |
| - blocks_to_swap_out: List[Tuple[int, int]] = [] |
744 |
| - _, seq_group = create_dummy_prompt("1", |
745 |
| - prompt_length=4, |
746 |
| - best_of=2, |
747 |
| - block_size=block_size) |
748 |
| - scheduler._allocate_and_set_running(seq_group) |
749 |
| - append_new_token_seq_group(4, seq_group, 1) |
750 |
| - scheduler._swap_out(seq_group, blocks_to_swap_out) |
751 |
| - scheduler._add_seq_group_to_swapped(seq_group) |
752 |
| - |
753 |
| - budget = create_token_budget() |
754 |
| - output = scheduler._schedule_swapped(budget, curr_loras) |
755 |
| - remaining_swapped = scheduler.swapped |
756 |
| - assert len(remaining_swapped) == 0 |
757 |
| - assert budget.num_batched_tokens == 1 |
758 |
| - assert budget.num_curr_seqs == 2 |
759 |
| - assert len(output.decode_seq_groups) == 1 |
760 |
| - assert len(output.prefill_seq_groups) == 0 |
761 |
| - # swap in is the reverse of swap out |
762 |
| - blocks_to_swap_in_reverse = [] |
763 |
| - for swapin, swapout in output.blocks_to_swap_in: |
764 |
| - blocks_to_swap_in_reverse.append((swapout, swapin)) |
765 |
| - assert blocks_to_swap_out == blocks_to_swap_in_reverse |
766 |
| - |
767 |
| - |
768 |
| -def test_schedule_swapped_max_token_budget(): |
769 |
| - block_size = 4 |
770 |
| - scheduler = initialize_scheduler(block_size=block_size, |
771 |
| - num_cpu_blocks=32, |
772 |
| - num_gpu_blocks=32) |
773 |
| - curr_loras = None |
774 |
| - blocks_to_swap_out: List[Tuple[int, int]] = [] |
775 |
| - for i in range(2): |
776 |
| - _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2) |
777 |
| - scheduler._allocate_and_set_running(seq_group) |
778 |
| - append_new_token_seq_group(60, seq_group, 1) |
779 |
| - scheduler._swap_out(seq_group, blocks_to_swap_out) |
780 |
| - scheduler._add_seq_group_to_swapped(seq_group) |
781 |
| - |
782 |
| - budget = create_token_budget(token_budget=1) |
783 |
| - output = scheduler._schedule_swapped(budget, curr_loras) |
784 |
| - remaining_swapped = scheduler.swapped |
785 |
| - assert len(remaining_swapped) == 1 |
786 |
| - assert budget.num_batched_tokens == 1 |
787 |
| - assert budget.num_curr_seqs == 2 |
788 |
| - assert len(output.decode_seq_groups) == 1 |
789 |
| - assert len(output.prefill_seq_groups) == 0 |
790 |
| - |
791 |
| - # Verify num_batched_tokens are respected. |
792 |
| - budget = create_token_budget(token_budget=1) |
793 |
| - add_token_budget(budget, 1, 0) |
794 |
| - output = scheduler._schedule_swapped(budget, curr_loras) |
795 |
| - remaining_swapped = scheduler.swapped |
796 |
| - assert len(remaining_swapped) == 1 |
797 |
| - assert budget.num_batched_tokens == 1 |
798 |
| - assert budget.num_curr_seqs == 0 |
799 |
| - assert len(output.decode_seq_groups) == 0 |
800 |
| - assert len(output.prefill_seq_groups) == 0 |
801 |
| - |
802 |
| - |
803 |
| -def test_schedule_swapped_max_seqs(): |
804 |
| - block_size = 4 |
805 |
| - scheduler = initialize_scheduler(block_size=block_size, |
806 |
| - num_cpu_blocks=64, |
807 |
| - num_gpu_blocks=64) |
808 |
| - curr_loras = None |
809 |
| - blocks_to_swap_out: List[Tuple[int, int]] = [] |
810 |
| - for i in range(4): |
811 |
| - _, seq_group = create_dummy_prompt(str(i), |
812 |
| - prompt_length=60, |
813 |
| - block_size=4) |
814 |
| - scheduler._allocate_and_set_running(seq_group) |
815 |
| - append_new_token_seq_group(60, seq_group, 1) |
816 |
| - scheduler._swap_out(seq_group, blocks_to_swap_out) |
817 |
| - scheduler._add_seq_group_to_swapped(seq_group) |
818 |
| - |
819 |
| - budget = create_token_budget(max_num_seqs=2) |
820 |
| - output = scheduler._schedule_swapped(budget, curr_loras) |
821 |
| - remaining_swapped = scheduler.swapped |
822 |
| - assert len(remaining_swapped) == 2 |
823 |
| - assert budget.num_batched_tokens == 2 |
824 |
| - assert budget.num_curr_seqs == 2 |
825 |
| - assert len(output.decode_seq_groups) == 2 |
826 |
| - assert len(output.prefill_seq_groups) == 0 |
827 |
| - |
828 |
| - # Verify num_curr_seqs are respected. |
829 |
| - output = scheduler._schedule_swapped(budget, curr_loras) |
830 |
| - remaining_swapped = scheduler.swapped |
831 |
| - assert len(remaining_swapped) == 2 |
832 |
| - assert budget.num_batched_tokens == 2 |
833 |
| - assert budget.num_curr_seqs == 2 |
834 |
| - assert len(output.decode_seq_groups) == 0 |
835 |
| - assert len(output.prefill_seq_groups) == 0 |
836 |
| - |
837 |
| - |
838 | 636 | def test_schedule_swapped_max_loras():
|
839 | 637 | block_size = 4
|
840 | 638 | lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
|
|
0 commit comments