@@ -1111,41 +1111,41 @@ def load_model(self) -> None:
        with DeviceMemoryProfiler(self.device) as m:
            time_before_load = time.perf_counter()
            self.model = get_model(vllm_config=self.vllm_config)
+            if self.lora_config:
+                assert supports_lora(
+                    self.model
+                ), f"{self.model.__class__.__name__} does not support LoRA yet."
+
+                if supports_multimodal(self.model):
+                    logger.warning(
+                        "Regarding multimodal models, vLLM currently "
+                        "only supports adding LoRA to language model.")
+                # It's necessary to distinguish between the
+                # max_position_embeddings of VLMs and LLMs.
+                if hasattr(self.model.config, "max_position_embeddings"):
+                    max_pos_embeddings = (
+                        self.model.config.max_position_embeddings)
+                else:
+                    max_pos_embeddings = (
+                        self.model.config.text_config.max_position_embeddings)
+
+                self.lora_manager = LRUCacheWorkerLoRAManager(
+                    self.scheduler_config.max_num_seqs,
+                    self.scheduler_config.max_num_batched_tokens,
+                    self.vocab_size,
+                    self.lora_config,
+                    self.device,
+                    self.model.embedding_modules,
+                    self.model.embedding_padding_modules,
+                    max_position_embeddings=max_pos_embeddings,
+                )
+                self.model = self.lora_manager.create_lora_manager(self.model)
        time_after_load = time.perf_counter()

        self.model_memory_usage = m.consumed_memory
        logger.info("Model loading took %.4f GB and %.6f seconds",
                    self.model_memory_usage / float(2**30),
                    time_after_load - time_before_load)
-
-        if self.lora_config:
-            assert supports_lora(
-                self.model
-            ), f"{self.model.__class__.__name__} does not support LoRA yet."
-
-            if supports_multimodal(self.model):
-                logger.warning("Regarding multimodal models, vLLM currently "
-                               "only supports adding LoRA to language model.")
-            # It's necessary to distinguish between the max_position_embeddings
-            # of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = self.model.config.max_position_embeddings
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
-
-            self.lora_manager = LRUCacheWorkerLoRAManager(
-                self.scheduler_config.max_num_seqs,
-                self.scheduler_config.max_num_batched_tokens,
-                self.vocab_size,
-                self.lora_config,
-                self.device,
-                self.model.embedding_modules,
-                self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
-            )
-            self.model = self.lora_manager.create_lora_manager(self.model)
-
        if self.prompt_adapter_config:
            self.prompt_adapter_manager = LRUCacheWorkerPromptAdapterManager(
                self.scheduler_config.max_num_seqs,