From b574814928d921396f1a4b773544eab8941f5693 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Sat, 8 Mar 2025 02:54:56 +0000
Subject: [PATCH] Done

Signed-off-by: Jee Jee Li
---
 vllm/worker/model_runner.py | 58 ++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 0ea1d5dcbbb7..473bd901b5b2 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1111,41 +1111,41 @@ def load_model(self) -> None:
         with DeviceMemoryProfiler(self.device) as m:
             time_before_load = time.perf_counter()
             self.model = get_model(vllm_config=self.vllm_config)
+            if self.lora_config:
+                assert supports_lora(
+                    self.model
+                ), f"{self.model.__class__.__name__} does not support LoRA yet."
+
+                if supports_multimodal(self.model):
+                    logger.warning(
+                        "Regarding multimodal models, vLLM currently "
+                        "only supports adding LoRA to language model.")
+                # It's necessary to distinguish between the
+                # max_position_embeddings of VLMs and LLMs.
+                if hasattr(self.model.config, "max_position_embeddings"):
+                    max_pos_embeddings = (
+                        self.model.config.max_position_embeddings)
+                else:
+                    max_pos_embeddings = (
+                        self.model.config.text_config.max_position_embeddings)
+
+                self.lora_manager = LRUCacheWorkerLoRAManager(
+                    self.scheduler_config.max_num_seqs,
+                    self.scheduler_config.max_num_batched_tokens,
+                    self.vocab_size,
+                    self.lora_config,
+                    self.device,
+                    self.model.embedding_modules,
+                    self.model.embedding_padding_modules,
+                    max_position_embeddings=max_pos_embeddings,
+                )
+                self.model = self.lora_manager.create_lora_manager(self.model)
             time_after_load = time.perf_counter()

         self.model_memory_usage = m.consumed_memory
         logger.info("Model loading took %.4f GB and %.6f seconds",
                     self.model_memory_usage / float(2**30),
                     time_after_load - time_before_load)
-
-        if self.lora_config:
-            assert supports_lora(
-                self.model
-            ), f"{self.model.__class__.__name__} does not support LoRA yet."
-
-            if supports_multimodal(self.model):
-                logger.warning("Regarding multimodal models, vLLM currently "
-                               "only supports adding LoRA to language model.")
-            # It's necessary to distinguish between the max_position_embeddings
-            # of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = self.model.config.max_position_embeddings
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
-
-            self.lora_manager = LRUCacheWorkerLoRAManager(
-                self.scheduler_config.max_num_seqs,
-                self.scheduler_config.max_num_batched_tokens,
-                self.vocab_size,
-                self.lora_config,
-                self.device,
-                self.model.embedding_modules,
-                self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
-            )
-            self.model = self.lora_manager.create_lora_manager(self.model)
-
         if self.prompt_adapter_config:
             self.prompt_adapter_manager = LRUCacheWorkerPromptAdapterManager(
                 self.scheduler_config.max_num_seqs,
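
Note for reviewers: the mechanical effect of the hunk above is that LoRA manager
creation now executes inside the DeviceMemoryProfiler context and before
time_after_load is taken, so the memory and time consumed by LoRA setup are
included in the "Model loading took ..." report. The plain-Python sketch below
illustrates that accounting pattern under stated assumptions; ProfilerSketch,
load_base_model, wrap_with_lora, and the fake byte counter are hypothetical
stand-ins for illustration, not vLLM APIs.

    import time

    _FAKE_DEVICE_MEMORY = [0]  # pretend pool of allocated device bytes


    class ProfilerSketch:
        """Stand-in profiler: records bytes consumed between enter/exit."""

        def __enter__(self):
            self._baseline = _FAKE_DEVICE_MEMORY[0]
            return self

        def __exit__(self, *exc):
            self.consumed_memory = _FAKE_DEVICE_MEMORY[0] - self._baseline


    def load_base_model() -> str:
        _FAKE_DEVICE_MEMORY[0] += 10 * 2**30  # weights: pretend 10 GiB
        return "model"


    def wrap_with_lora(model: str) -> str:
        _FAKE_DEVICE_MEMORY[0] += 2**30  # LoRA buffers: pretend 1 GiB
        return model + "+lora"


    with ProfilerSketch() as m:
        time_before_load = time.perf_counter()
        model = load_base_model()
        model = wrap_with_lora(model)  # inside the block, as in the new code
        time_after_load = time.perf_counter()

    # Reports 11.0000 GB; with LoRA wrapping outside the block (the old
    # placement), its 1 GiB would go uncounted.
    print("Model loading took %.4f GB and %.6f seconds" %
          (m.consumed_memory / float(2**30),
           time_after_load - time_before_load))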