17
17
from QEfficient .base .common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP , QEFF_MODEL_TYPE , QEFFCommonLoader
18
18
from QEfficient .base .modeling_qeff import QEFFBaseModel
19
19
from QEfficient .exporter .export_utils import export_onnx , fix_onnx_fp16 , generate_input_files , run_model_on_ort
20
+ from QEfficient .lora .auto import QEffAutoLoraModelForCausalLM
20
21
from QEfficient .transformers .modeling_utils import get_lists_of_cb_qeff_models
21
22
from QEfficient .transformers .models .modeling_auto import QEFFAutoModelForCausalLM
22
23
from QEfficient .utils import load_hf_tokenizer
@@ -149,6 +150,7 @@ def convert_to_cloud_kvstyle(
149
150
tokenizer : Union [PreTrainedTokenizer , PreTrainedTokenizerFast ],
150
151
onnx_dir_path : str ,
151
152
seq_len : int ,
153
+ max_num_adapters : int ,
152
154
) -> str :
153
155
"""
154
156
API to convert model with kv retention and export to ONNX.
@@ -181,7 +183,7 @@ def convert_to_cloud_kvstyle(
181
183
182
184
# Decide path for saving exported ONNX files.
183
185
model_name = export_kvstyle_transformed_model_to_onnx (
184
- model_name , qeff_model .model , tokenizer , onnx_dir_path , seq_len
186
+ model_name , qeff_model .model , tokenizer , onnx_dir_path , seq_len , max_num_adapters
185
187
) # type: ignore
186
188
187
189
# return the model path for automation.
@@ -195,6 +197,7 @@ def export_kvstyle_transformed_model_to_onnx(
195
197
onnx_dir_path : str ,
196
198
seq_len : int ,
197
199
full_batch_size : Optional [int ] = None ,
200
+ max_num_adapters : Optional [int ] = None ,
198
201
) -> str :
199
202
# Disabling requires_grad on all parameters
200
203
for _ , p in enumerate (transformed_model .parameters ()):
@@ -213,6 +216,7 @@ def export_kvstyle_transformed_model_to_onnx(
213
216
prompt_len = Constants .PROMPT_LEN ,
214
217
ctx_len = seq_len ,
215
218
full_batch_size = full_batch_size ,
219
+ max_num_adapters = max_num_adapters ,
216
220
)
217
221
218
222
inputs = input_handler .prepare_pytorch_inputs ()
@@ -318,6 +322,7 @@ def export_for_cloud(
318
322
onnx_dir_path : str ,
319
323
seq_length : int = Constants .SEQ_LEN ,
320
324
full_batch_size : Optional [int ] = None ,
325
+ max_num_adapters : Optional [int ] = None ,
321
326
) -> str :
322
327
# Check if model architecture is supported for continuous batching.
323
328
if full_batch_size and qeff_model .model .config .architectures [0 ] not in get_lists_of_cb_qeff_models .architectures :
@@ -326,14 +331,18 @@ def export_for_cloud(
326
331
)
327
332
328
333
# FIXME: move all this to class instead of here, and just call qeff_model.export here.
329
- if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP .get (qeff_model .__class__ , None ) == QEFF_MODEL_TYPE .CAUSALLM : # type: ignore
334
+ if (
335
+ AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP .get (qeff_model .__class__ , None ) == QEFF_MODEL_TYPE .CAUSALLM
336
+ or qeff_model .__class__ == QEffAutoLoraModelForCausalLM
337
+ ): # type: ignore
330
338
return export_lm_model_for_cloud (
331
339
model_name = model_name ,
332
340
qeff_model = qeff_model , # type: ignore
333
341
tokenizer = tokenizer ,
334
342
onnx_dir_path = onnx_dir_path ,
335
343
seq_length = seq_length ,
336
344
full_batch_size = full_batch_size ,
345
+ max_num_adapters = max_num_adapters ,
337
346
)
338
347
else :
339
348
raise NotImplementedError (
@@ -348,6 +357,7 @@ def export_lm_model_for_cloud(
348
357
onnx_dir_path : str ,
349
358
seq_length : int ,
350
359
full_batch_size : Optional [int ] = None ,
360
+ max_num_adapters : Optional [int ] = None ,
351
361
) -> str :
352
362
if os .path .exists (onnx_dir_path ):
353
363
logger .warning (f"Overriding { onnx_dir_path } " )
@@ -361,6 +371,7 @@ def export_lm_model_for_cloud(
361
371
onnx_dir_path = onnx_dir_path ,
362
372
seq_len = seq_length ,
363
373
full_batch_size = full_batch_size ,
374
+ max_num_adapters = max_num_adapters ,
364
375
) # type: ignore
365
376
366
377
else :
@@ -386,6 +397,7 @@ def qualcomm_efficient_converter(
386
397
kv : bool = True ,
387
398
form_factor : str = "cloud" ,
388
399
full_batch_size : Optional [int ] = None ,
400
+ max_num_adapters : Optional [int ] = None ,
389
401
) -> Tuple [str , str ]:
390
402
"""
391
403
This method is an alias for ``QEfficient.export``.
@@ -466,6 +478,7 @@ def qualcomm_efficient_converter(
466
478
onnx_dir_path = onnx_dir_path ,
467
479
seq_length = seq_length ,
468
480
full_batch_size = full_batch_size ,
481
+ max_num_adapters = max_num_adapters ,
469
482
)
470
483
return onnx_dir_path , generated_onnx_model_path
471
484
else :
0 commit comments