
Commit 8ed2088

Initial commit for finite loras implementation
Signed-off-by: Jou-An Chen <[email protected]>
1 parent 08ca83c

16 files changed: +1044, -12 lines

QEfficient/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from QEfficient.compile.compile_helper import compile
 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
+from QEfficient.lora import QEffAutoLoraModelForCausalLM
 from QEfficient.peft import QEffAutoPeftModelForCausalLM
 from QEfficient.transformers.transform import transform
 
@@ -24,5 +25,6 @@
     "QEffAutoModel",
     "QEFFAutoModelForCausalLM",
     "QEffAutoPeftModelForCausalLM",
+    "QEffAutoLoraModelForCausalLM",
     "QEFFCommonLoader",
 ]
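
With this change, the new adapter-aware class is importable from the package root alongside the existing auto classes. A minimal sketch of the entry point; the from_pretrained-style constructor and the model name are assumptions modeled on the other QEff auto classes, not something this diff shows:

from QEfficient import QEffAutoLoraModelForCausalLM

# Hypothetical usage: the constructor name and argument are assumed to
# mirror the other QEff auto classes (e.g. QEFFAutoModelForCausalLM).
model = QEffAutoLoraModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")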

QEfficient/exporter/export_hf_to_cloud_ai_100.py

Lines changed: 15 additions & 2 deletions
@@ -17,6 +17,7 @@
 from QEfficient.base.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader
 from QEfficient.base.modeling_qeff import QEFFBaseModel
 from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort
+from QEfficient.lora.auto import QEffAutoLoraModelForCausalLM
 from QEfficient.transformers.modeling_utils import get_lists_of_cb_qeff_models
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
 from QEfficient.utils import load_hf_tokenizer

@@ -149,6 +150,7 @@ def convert_to_cloud_kvstyle(
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     onnx_dir_path: str,
     seq_len: int,
+    max_num_adapters: int,
 ) -> str:
     """
     API to convert model with kv retention and export to ONNX.

@@ -181,7 +183,7 @@ def convert_to_cloud_kvstyle(
 
     # Decide path for saving exported ONNX files.
     model_name = export_kvstyle_transformed_model_to_onnx(
-        model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len
+        model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len, max_num_adapters
     )  # type: ignore
 
     # return the model path for automation.

@@ -195,6 +197,7 @@ def export_kvstyle_transformed_model_to_onnx(
     onnx_dir_path: str,
     seq_len: int,
     full_batch_size: Optional[int] = None,
+    max_num_adapters: Optional[int] = None,
 ) -> str:
     # Disabling requires_grad on all parameters
     for _, p in enumerate(transformed_model.parameters()):

@@ -213,6 +216,7 @@ def export_kvstyle_transformed_model_to_onnx(
         prompt_len=Constants.PROMPT_LEN,
         ctx_len=seq_len,
         full_batch_size=full_batch_size,
+        max_num_adapters=max_num_adapters,
     )
 
     inputs = input_handler.prepare_pytorch_inputs()

@@ -318,6 +322,7 @@ def export_for_cloud(
     onnx_dir_path: str,
     seq_length: int = Constants.SEQ_LEN,
     full_batch_size: Optional[int] = None,
+    max_num_adapters: Optional[int] = None,
 ) -> str:
     # Check if model architecture is supported for continuous batching.
     if full_batch_size and qeff_model.model.config.architectures[0] not in get_lists_of_cb_qeff_models.architectures:

@@ -326,14 +331,18 @@ def export_for_cloud(
         )
 
     # FIXME: move all this to class instead of here, and just call qeff_model.export here.
-    if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM:  # type: ignore
+    if (
+        AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM
+        or qeff_model.__class__ == QEffAutoLoraModelForCausalLM
+    ):  # type: ignore
         return export_lm_model_for_cloud(
             model_name=model_name,
             qeff_model=qeff_model,  # type: ignore
             tokenizer=tokenizer,
             onnx_dir_path=onnx_dir_path,
             seq_length=seq_length,
             full_batch_size=full_batch_size,
+            max_num_adapters=max_num_adapters,
         )
     else:
         raise NotImplementedError(

@@ -348,6 +357,7 @@ def export_lm_model_for_cloud(
     onnx_dir_path: str,
     seq_length: int,
     full_batch_size: Optional[int] = None,
+    max_num_adapters: Optional[int] = None,
 ) -> str:
     if os.path.exists(onnx_dir_path):
         logger.warning(f"Overriding {onnx_dir_path}")

@@ -361,6 +371,7 @@ def export_lm_model_for_cloud(
             onnx_dir_path=onnx_dir_path,
             seq_len=seq_length,
             full_batch_size=full_batch_size,
+            max_num_adapters=max_num_adapters,
         )  # type: ignore
 
     else:

@@ -386,6 +397,7 @@ def qualcomm_efficient_converter(
     kv: bool = True,
     form_factor: str = "cloud",
     full_batch_size: Optional[int] = None,
+    max_num_adapters: Optional[int] = None,
 ) -> Tuple[str, str]:
     """
     This method is an alias for ``QEfficient.export``.

@@ -466,6 +478,7 @@ def qualcomm_efficient_converter(
             onnx_dir_path=onnx_dir_path,
             seq_length=seq_length,
             full_batch_size=full_batch_size,
+            max_num_adapters=max_num_adapters,
         )
         return onnx_dir_path, generated_onnx_model_path
     else:
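
Taken together, these hunks thread max_num_adapters from the public converter entry point down through export_for_cloud and export_lm_model_for_cloud to the KV-style ONNX export. A sketch of the resulting call, under the assumption that keywords not visible in this diff (model_name in particular) keep their existing meaning:

from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter

# Sketch only: export a KV-style model with capacity reserved for a
# finite set of LoRA adapters. max_num_adapters is the parameter added
# in this commit; the return pair matches the diff's
# `return onnx_dir_path, generated_onnx_model_path`.
onnx_dir_path, onnx_model_path = qualcomm_efficient_converter(
    model_name="mistralai/Mistral-7B-v0.1",  # hypothetical model choice
    kv=True,
    max_num_adapters=4,
)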

QEfficient/exporter/export_utils.py

Lines changed: 2 additions & 0 deletions
@@ -83,6 +83,8 @@ def export_onnx(
             dynamic_axes[iname] = {0: dynamic_axis_past_key, 2: "ctx_len"}
         elif iname == "batch_index":
             dynamic_axes[iname] = {0: "batch_size"}
+        elif iname == "lora_ids":
+            dynamic_axes[iname] = {0: "batch_size"}
 
     if "past_key.0" in input_names and "attention_mask" in input_names:
         dynamic_axes["attention_mask"] = {0: "batch_size", 1: "ctx_len"}
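
lora_ids is registered exactly like batch_index: a per-sample input whose leading dimension rides the dynamic batch_size axis. A small numpy sketch of the shape this implies, consistent with the (batch, 1) int64 reshapes in the generation changes below:

import numpy as np

# One adapter id per batch entry, shape (batch_size, 1), int64 --
# matching the dtype and reshape used by the text generation runtime.
batch_size = 4
lora_ids = np.array([0, 2, 1, 0], dtype=np.int64).reshape(batch_size, 1)
assert lora_ids.shape == (batch_size, 1)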

QEfficient/generation/text_generation_inference.py

Lines changed: 25 additions & 0 deletions
@@ -230,6 +230,7 @@ def cloud_ai_100_exec_kv(
     write_io_dir: Optional[str] = None,
     automation=False,
     full_batch_size: Optional[int] = None,
+    prompt_to_lora_id_mapping: Optional[List[int]] = None,
 ):
     """
     This method generates output until ``eos`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards.

@@ -277,6 +278,7 @@ def cloud_ai_100_exec_kv(
         stream=stream,
         write_io_dir=write_io_dir,
         full_batch_size=full_batch_size,
+        prompt_to_lora_id_mapping=prompt_to_lora_id_mapping,
     )
     if full_batch_size is None:
         exec_info = [

@@ -313,6 +315,7 @@ def __init__(
         qpc_path: str,
         prompt: List[str],
         full_batch_size: Optional[int] = None,
+        prompt_to_lora_id_mapping: Optional[List[int]] = None,
         ctx_len: Optional[int] = None,
         generation_len: Optional[int] = None,
         device_id: Optional[List[int]] = None,

@@ -342,6 +345,13 @@ def __init__(
             full_batch_size if full_batch_size else self._fetch_full_batch_size()
         )  # Check and fetch full batch size if CB is enabled
 
+        if prompt_to_lora_id_mapping:
+            self.prompt_to_lora_id_mapping_prefill = deque(prompt_to_lora_id_mapping)
+            self.prompt_to_lora_id_mapping_decode = prompt_to_lora_id_mapping
+        else:
+            self.prompt_to_lora_id_mapping_prefill = None
+            self.prompt_to_lora_id_mapping_decode = None
+
         self.set_tokenizer_params()  # set tokenizer params
 
         # Initialize the storage variables.

@@ -460,6 +470,10 @@ def prepare_decode_inputs(self):
         if self.batch_index is not None:
             decode_inputs["batch_index"] = self.batch_index
 
+        if self.prompt_to_lora_id_mapping_decode and self.full_batch_size is not None:
+            first_batch_lora_ids = [self.prompt_to_lora_id_mapping_decode[i] for i in range(self.full_batch_size)]
+            decode_inputs["lora_ids"] = np.array(first_batch_lora_ids, dtype=np.int64).reshape(self.full_batch_size, 1)
+
         return decode_inputs
 
     def _update_decode_input(self, outputs, position_ids, generation_len, decode_batch_id=None):

@@ -547,6 +561,11 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_i
         if decode_batch_id is not None:
             inputs["batch_index"] = decode_batch_id
 
+        if self.prompt_to_lora_id_mapping_prefill:
+            inputs["lora_ids"] = np.array(self.prompt_to_lora_id_mapping_prefill.popleft(), dtype=np.int64).reshape(
+                1, 1
+            )
+
         for i in range(num_chunks):
             chunk_inputs = inputs.copy()
             chunk_inputs["input_ids"] = inputs["input_ids"][

@@ -634,6 +653,12 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):
                 )
 
                 generated_id_current_index[decode_batch_id] += 1
+
+                if self.prompt_to_lora_id_mapping_decode:
+                    decode_inputs["lora_ids"][decode_batch_id] = self.prompt_to_lora_id_mapping_decode[
+                        batch_id_map[decode_batch_id]
+                    ]
+
         return decode_pause_time
 
     def run_decode(self, decode_inputs, generation_len):
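
At runtime the mapping is consumed once per prompt during prefill (popped off the deque and shaped (1, 1)) and re-applied per batch slot during continuous-batching decode. A minimal end-to-end sketch; apart from full_batch_size and prompt_to_lora_id_mapping, the argument names are assumed from the existing cloud_ai_100_exec_kv API, and the qpc_path and model name are placeholders:

from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.utils import load_hf_tokenizer

# Sketch only: entry i of prompt_to_lora_id_mapping picks the adapter
# for prompt i; ids index the finite adapter set fixed at export time.
tokenizer = load_hf_tokenizer("mistralai/Mistral-7B-v0.1")  # hypothetical model
exec_info = cloud_ai_100_exec_kv(
    tokenizer=tokenizer,
    qpc_path="qeff_models/qpcs",  # placeholder path to the compiled QPC
    prompt=["What is AI?", "Translate: hello", "Summarize this.", "What is AI?"],
    full_batch_size=4,
    prompt_to_lora_id_mapping=[0, 1, 2, 0],
)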

QEfficient/lora/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# ----------------------------------------------------------------------------
+
+from QEfficient.lora.auto import QEffAutoLoraModelForCausalLM
+
+__all__ = [
+    "QEffAutoLoraModelForCausalLM",
+]
