@@ -7,7 +7,6 @@
 
 import hashlib
 import os
-import sys
 from pathlib import Path
 from typing import Any, List, Optional
 
@@ -24,8 +23,6 @@
 from QEfficient.utils.constants import QEFF_MODELS_DIR
 from QEfficient.utils.logging_utils import logger
 
-INTMAX = sys.maxsize
-
 
 class QEffAutoLoraModelForCausalLM(QEFFAutoModelForCausalLM):
     """
@@ -54,7 +51,7 @@ class QEffAutoLoraModelForCausalLM(QEFFAutoModelForCausalLM):
         m.compile(num_cores=16, device_group=[0])
 
         prompts=["code prompt", "math prompt", "generic"]
-        m.generate(prompts, device_group=[0], prompt_to_lora_id_mapping=[magicoder_id,gsm8k_id,INTMAX])
+        m.generate(prompts, device_group=[0], prompt_to_lora_id_mapping=[magicoder_id,gsm8k_id,0])
 
 
     """
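
Note: with INTMAX gone, id 0 now selects the base model, so any prompt mapped to 0 in prompt_to_lora_id_mapping is served by the base weights with no adapter applied. A minimal sketch of the convention (the ids here are illustrative; in practice they come from load_adapter() via active_adapter_to_id):

    magicoder_id, gsm8k_id = 1, 2  # assumed ids; adapters are numbered from 1
    prompts = ["code prompt", "math prompt", "generic"]
    prompt_to_lora_id_mapping = [magicoder_id, gsm8k_id, 0]  # 0 -> base model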
@@ -148,7 +145,7 @@ def load_adapter(self, adapter_model_id: str, adapter_name: str, **kwargs: Any):
 
         # set active adapter id to current max if adapter_name is new
         if adapter_name not in self.active_adapter_to_id.keys():
-            self.active_adapter_to_id[adapter_name] = self.max_num_adapters
+            self.active_adapter_to_id[adapter_name] = self.max_num_adapters + 1  # reserve 0 for base
 
         # add active adapter to set
         self.active_adapters.add(adapter_name)
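
Since id 0 is reserved for the base model, a newly loaded adapter now gets max_num_adapters + 1, keeping adapter ids contiguous from 1. A hedged mirror of the bookkeeping (not the library's code; assumes max_num_adapters counts adapters loaded so far):

    active_adapter_to_id = {}
    max_num_adapters = 0
    for name in ["adapter_a", "adapter_b"]:  # illustrative names
        if name not in active_adapter_to_id:
            active_adapter_to_id[name] = max_num_adapters + 1  # 0 stays the base model
            max_num_adapters += 1
    print(active_adapter_to_id)  # {'adapter_a': 1, 'adapter_b': 2}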
@@ -168,7 +165,7 @@ def unload_adapter(self, adapter_name: str):
 
         # renumbering of active adapter id
         for index, (key, value) in enumerate(self.active_adapter_to_id.items()):
-            self.active_adapter_to_id[key] = index
+            self.active_adapter_to_id[key] = index + 1
 
         logger.warning(f"Deleting {adapter_name} from active adapters.")
         if self.onnx_path or self.qpc_path:
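
The renumbering after an unload is correspondingly shifted to stay 1-based. A standalone sketch of the effect (illustrative names, not the library's code):

    active_adapter_to_id = {"adapter_a": 1, "adapter_b": 2, "adapter_c": 3}
    del active_adapter_to_id["adapter_b"]
    for index, key in enumerate(active_adapter_to_id):
        active_adapter_to_id[key] = index + 1  # ids stay 1..N; 0 remains the base model
    print(active_adapter_to_id)  # {'adapter_a': 1, 'adapter_c': 2}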
@@ -203,9 +200,9 @@ def load_adapter_weights_to_model(self):
         for i in range(num_hidden_layers):
             for target_module in self.target_modules_for_all_adapters:
                 # stack all adapters weights
-                a_tensor_list = list(range(self.max_num_adapters))
-                b_tensor_list = list(range(self.max_num_adapters))
-                c_tensor_list = list(range(self.max_num_adapters))
+                a_tensor_list = list(range(self.max_num_adapters + 1))
+                b_tensor_list = list(range(self.max_num_adapters + 1))
+                c_tensor_list = list(range(self.max_num_adapters + 1))
 
                 for lora_name, lora_id in self.active_adapter_to_id.items():
                     if (
@@ -232,12 +229,18 @@ def load_adapter_weights_to_model(self):
                         dtype=torch.float16,
                     )
 
+                # dummy zero tensor for base model
+                a_tensor_list[0] = torch.zeros_like(a_tensor_list[1])
+                b_tensor_list[0] = torch.zeros_like(b_tensor_list[1])
+                c_tensor_list[0] = torch.zeros_like(c_tensor_list[1])
+
+                # stack weight tensors
                 stacked_lora_A = (
                     torch.stack(a_tensor_list, dim=0).unsqueeze(1).transpose(2, 3)
-                )  # <num_adapters, 1, in_feature, r>
+                )  # <num_loras, 1, in_feature, r>
                 stacked_lora_B = (
                     torch.stack(b_tensor_list, dim=0).unsqueeze(1).transpose(2, 3)
-                )  # <num_adapters, 1, r, out_feature>
+                )  # <num_loras, 1, r, out_feature>
                 stacked_lora_C = (
                     torch.stack(c_tensor_list, dim=0).unsqueeze(1).unsqueeze(2).unsqueeze(3)
                 )  # <num_loras, 1, 1, 1>
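
Slot 0 of each stacked tensor is all zeros, so a prompt routed to lora_id 0 gathers a zero delta and the layer output reduces to the base projection. A self-contained sketch of why that works (shapes follow the comments above; the actual gather lives elsewhere in the modeling code):

    import torch

    num_loras, in_feature, r, out_feature = 3, 16, 4, 16  # illustrative sizes
    stacked_A = torch.randn(num_loras, 1, in_feature, r)
    stacked_B = torch.randn(num_loras, 1, r, out_feature)
    stacked_A[0].zero_()  # slot 0 = base model
    stacked_B[0].zero_()

    x = torch.randn(1, in_feature)
    delta = x @ stacked_A[0, 0] @ stacked_B[0, 0]  # base-model path, lora_id 0
    assert torch.all(delta == 0)  # no adapter contribution for id 0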
@@ -308,6 +311,7 @@ def export(self, **kwargs) -> str:
         export_dir = kwargs.get("export_dir", None)
 
         # obtain all necessary information to initialize the model
+        assert self.max_num_adapters, "Please use load_adapter() to add at least one adapter; otherwise, refer to QEFFAutoModelForCausalLM for base model usage"
         self.init_adapter_model()
 
         assert self.is_transformed, "Please first run transform on the QEFFAutoModelForCausalLM object"
@@ -411,7 +415,7 @@ def export_and_compile(
     def run_cloud_ai_100(self, prompts: List[str], device_id: List[int] = None, **kwargs):
         assert isinstance(self.qpc_path, str), "Please run compile API first!"
         generation_len = kwargs.pop("generation_len", None)
-        default_mapping = [INTMAX for _ in range(len(prompts))]
+        default_mapping = [0 for _ in range(len(prompts))]
         prompt_to_lora_id_mapping = kwargs.pop("prompt_to_lora_id_mapping", default_mapping)
         return QEfficient.cloud_ai_100_exec_kv(
             self.tokenizer,
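
With the default mapping now all zeros rather than INTMAX, calling generate without prompt_to_lora_id_mapping runs every prompt on the base model:

    # assuming m is a compiled QEffAutoLoraModelForCausalLM:
    m.generate(prompts, device_group=[0])  # same as prompt_to_lora_id_mapping=[0] * len(prompts)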