 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
                                                 compute_encoder_budget)
-from vllm.v1.core.kv_cache_manager import KVCacheManager
+from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager
 from vllm.v1.core.sched.interface import SchedulerInterface
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                        SchedulerOutput)
@@ -185,7 +185,7 @@ def schedule(self) -> SchedulerOutput:
         # uses structured decoding.
         structured_output_request_ids: dict[str, int] = {}
 
-        req_to_new_block_ids: dict[str, tuple[list[int], ...]] = {}
+        req_to_new_blocks: dict[str, KVCacheBlocks] = {}
         num_scheduled_tokens: dict[str, int] = {}
         token_budget = self.max_num_scheduled_tokens
         # Encoder-related.
@@ -288,8 +288,7 @@ def schedule(self) -> SchedulerOutput:
                 # Therefore, we might introduce some additional
                 # cycle to fill in the bitmask, which could be a big no-op.
                 structured_output_request_ids[request.request_id] = req_index
-            req_to_new_block_ids[request.request_id] = (
-                new_blocks.get_block_ids())
+            req_to_new_blocks[request.request_id] = new_blocks
             num_scheduled_tokens[request.request_id] = num_new_tokens
             token_budget -= num_new_tokens
             req_index += 1
@@ -496,8 +495,8 @@ def schedule(self) -> SchedulerOutput:
 
             if self.lora_config and request.lora_request:
                 scheduled_loras.add(request.lora_request.lora_int_id)
-            req_to_new_block_ids[request.request_id] = (
-                self.kv_cache_manager.get_block_ids(request.request_id))
+            req_to_new_blocks[request.request_id] = (
+                self.kv_cache_manager.get_blocks(request.request_id))
             num_scheduled_tokens[request.request_id] = num_new_tokens
             token_budget -= num_new_tokens
             request.status = RequestStatus.RUNNING
@@ -546,16 +545,16 @@ def schedule(self) -> SchedulerOutput:
         )
         # Construct the scheduler output.
         new_reqs_data = [
-            NewRequestData.from_request(req,
-                                        req_to_new_block_ids[req.request_id])
+            NewRequestData.from_request(
+                req, req_to_new_blocks[req.request_id].get_block_ids())
             for req in scheduled_new_reqs
         ]
         cached_reqs_data = self._make_cached_request_data(
             scheduled_running_reqs,
             scheduled_resumed_reqs,
             num_scheduled_tokens,
             scheduled_spec_decode_tokens,
-            req_to_new_block_ids,
+            req_to_new_blocks,
         )
         scheduler_output = SchedulerOutput(
             scheduled_new_reqs=new_reqs_data,
@@ -628,11 +627,11 @@ def _make_cached_request_data(
         resumed_reqs: list[Request],
         num_scheduled_tokens: dict[str, int],
         spec_decode_tokens: dict[str, list[int]],
-        req_to_new_block_ids: dict[str, tuple[list[int], ...]],
+        req_to_new_blocks: dict[str, KVCacheBlocks],
     ) -> CachedRequestData:
         req_ids: list[str] = []
         new_token_ids: list[list[int]] = []
-        new_block_ids: list[tuple[list[int], ...]] = []
+        new_block_ids: list[Optional[tuple[list[int], ...]]] = []
         num_computed_tokens: list[int] = []
 
         use_connector = self.connector is not None
638637 use_connector = self .connector is not None
@@ -655,7 +654,8 @@ def _make_cached_request_data(
655654 # out of bounds errors. TODO: Remove this once the KVConnector
656655 # is updated to handle token IDs properly.
657656 new_token_ids .append ([])
658- new_block_ids .append (req_to_new_block_ids [req_id ])
657+ new_block_ids .append (
658+ req_to_new_blocks [req_id ].get_block_ids (allow_none = True ))
659659 num_computed_tokens .append (req .num_computed_tokens )
660660 # Because resumed_reqs is usually empty, it is more efficient to do
661661 # in-place appending so that we don't need to allocate a new list.
0 commit comments