
Commit 96ce832

Enable init from QEffAutoPeftModelForCausalLM with finite_adapters flag
Signed-off-by: Jou-An Chen <[email protected]>
1 parent 522355a commit 96ce832
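In short, this commit lets a finite-adapters LoRA model be constructed directly through the PEFT front end. Below is a minimal usage sketch, not taken verbatim from the diff; the predibase/gsm8k adapter and the "gsm8k" name are illustrative.

    # Minimal sketch of the flow this commit enables; adapter repo and name are illustrative.
    from QEfficient import QEffAutoPeftModelForCausalLM

    # With finite_adapters=True, from_pretrained() resolves the base model from the
    # adapter's PeftConfig and returns a QEffAutoLoraModelForCausalLM with the first
    # adapter ("gsm8k") already registered.
    qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained(
        "predibase/gsm8k", "gsm8k", finite_adapters=True
    )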

File tree

5 files changed, +76 -73 lines


QEfficient/__init__.py

Lines changed: 0 additions & 2 deletions

@@ -9,7 +9,6 @@
 from QEfficient.compile.compile_helper import compile
 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
-from QEfficient.lora import QEffAutoLoraModelForCausalLM
 from QEfficient.peft import QEffAutoPeftModelForCausalLM
 from QEfficient.transformers.transform import transform
 

@@ -25,6 +24,5 @@
     "QEffAutoModel",
     "QEFFAutoModelForCausalLM",
     "QEffAutoPeftModelForCausalLM",
-    "QEffAutoLoraModelForCausalLM",
     "QEFFCommonLoader",
 ]
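Since the LoRA class is no longer re-exported from the package root, callers that relied on the old top-level import need the subpackage path instead; a one-line sketch of the new import:

    # Old: from QEfficient import QEffAutoLoraModelForCausalLM  (top-level export removed by this commit)
    from QEfficient.lora import QEffAutoLoraModelForCausalLM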

QEfficient/lora/auto.py

Lines changed: 28 additions & 17 deletions

@@ -24,8 +24,8 @@
 
 class QEffAutoLoraModelForCausalLM(QEFFAutoModelForCausalLM):
     """
-    QEff class for loading models with multiple LoRA adapters.
-    Once exported and compiled, the qpc can perform mixed batch inference with provided prompt_to_lora_id_mapping.
+    QEff class for loading models with multiple LoRA adapters. Currently only Mistral and Llama models are supported.
+    Once exported and compiled, the qpc can perform mixed batch inference with the provided `prompt_to_adapter_mapping`.
 
     Args:
         :model (nn.Module): PyTorch model

@@ -34,21 +34,20 @@ class QEffAutoLoraModelForCausalLM(QEFFAutoModelForCausalLM):
         :adapter_configs (Dict): A dictionary contains lora_name to lora_configs mapping
         :max_num_adapters (int): Total number of active adapters that to be exported and compiled
         :active_adapter_to_id (Dict): A dictionary contains active adapter's lora_name to lora_id mapping
+        :lora_rank (int): The consistent lora rank across all active adapters
+        :target_modules_for_all_adapters (List[str]): The consistent set of target modules across all active adapters
 
     .. code-block:: python
 
-        from QEfficient import QEffAutoLoraModelForCausalLM
+        from QEfficient.lora import QEffAutoLoraModelForCausalLM
 
         m = QEffAutoLoraModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
         m.load_adapter("predibase/gsm8k", "gsm8k")
         m.load_adapter("predibase/magicoder", "magicoder")
-        gsm8k_id = m.set_adapter("gsm8k")
-        magicoder_id = m.set_adapter("magicoder")
-        m.export(full_batch_size=3)
         m.compile(num_cores=16, device_group=[0])
 
         prompts=["code prompt", "math prompt", "generic"]
-        m.generate(prompts, device_group=[0], prompt_to_lora_id_mapping=[magicoder_id,gsm8k_id,0])
+        m.generate(prompts, device_group=[0], prompt_to_adapter_mapping=["magicoder", "gsm8k", "base"])
 
     """

@@ -188,12 +187,10 @@ def unload_adapter(self, adapter_name: str):
 
         return True
 
-    def get_adapter_id(self, adapter_name):
-        "get the adapter_id that maps to the adapter_name"
+    def set_adapter(self, adapter_name: str):
+        raise NotImplementedError("Set adapter is not supported in finite_adapters mode")
 
-        return self.active_adapter_to_id[adapter_name]
-
-    def load_adapter_weights_to_model(self):
+    def _load_adapter_weights_to_model(self):
         "Loads adapter weights to the model's multilora layer in a stacked format"
 
         num_hidden_layers = len(self.model.model.layers)

@@ -256,7 +253,7 @@ def load_adapter_weights_to_model(self):
             module.lora_b_weights.copy_(stacked_lora_b)
             module.lora_scalings.copy_(stacked_lora_s)
 
-    def init_adapter_model(self):
+    def _init_adapter_model(self):
         "Initialize the fixed lora model with multiple adapter weights on standby"
 
         # assume all adapters have same target_modules and ranks

@@ -275,12 +272,23 @@
             )
 
         # load_weight to model
-        self.load_adapter_weights_to_model()
+        self._load_adapter_weights_to_model()
 
     def export(self, export_dir: Optional[str] = None) -> str:
+        """
+        Exports the model to ``ONNX`` format using ``torch.onnx.export``.
+        We currently don't support exporting non-transformed models. Please refer to the ``convert_to_cloud_bertstyle`` function in the **Low-Level API** for a legacy function that supports this.
+
+        ``Optional`` Args:
+            This method does not accept any arguments.
+
+        Returns:
+            :str: Path of the generated ``ONNX`` graph.
+        """
+
         # initialize the adapter model
         assert self.max_num_adapters, "Please use load_adapter() to add at least one adapter; otherwise, refer to QEFFAutoModelForCausalLM for base model usage"
-        self.init_adapter_model()
+        self._init_adapter_model()
 
         bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
         seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN

@@ -338,18 +346,21 @@ def generate(
             :device_id (List[int]): Ids of devices for running the qpc; pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
         ``optional`` Args:
             :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100".
+            :prompt_to_adapter_mapping (List[str]): A list of adapter names mapped one-to-one to the prompts, specifying which adapter each prompt should use; pass "base" for the base model (no adapter).
         """
         if runtime != "AI_100":
             raise ValueError("Only AI_100 runtime is supported right now via generate API")
         if not isinstance(self.qpc_path, Path):
             raise TypeError("Please run compile API first!")
         generation_len = kwargs.pop("generation_len", None)
-        prompt_to_lora_id_mapping = kwargs.pop("prompt_to_lora_id_mapping", [0 for _ in range(len(prompts))])
+        prompt_to_adapter_mapping = kwargs.pop("prompt_to_adapter_mapping", ["base" for _ in range(len(prompts))])
         return QEfficient.cloud_ai_100_exec_kv(
             tokenizer,
             self.qpc_path,
             prompt=prompts,
             device_id=device_id,
             generation_len=generation_len,
-            prompt_to_lora_id_mapping=prompt_to_lora_id_mapping,
+            prompt_to_lora_id_mapping=[
+                self.active_adapter_to_id[name] if name != "base" else 0 for name in prompt_to_adapter_mapping
+            ],
         )
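For clarity, the name-to-id translation that generate() now performs can be read in isolation. The following is an illustrative standalone sketch; the dictionary contents are example ids, not ones produced by load_adapter().

    # Illustrative only: how adapter names become lora ids inside generate().
    active_adapter_to_id = {"gsm8k": 1, "magicoder": 2}  # example ids; real ones come from load_adapter()
    prompt_to_adapter_mapping = ["magicoder", "gsm8k", "base"]

    prompt_to_lora_id_mapping = [
        active_adapter_to_id[name] if name != "base" else 0  # "base" selects the base model (id 0)
        for name in prompt_to_adapter_mapping
    ]
    assert prompt_to_lora_id_mapping == [2, 1, 0]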

QEfficient/peft/auto.py

Lines changed: 14 additions & 2 deletions

@@ -12,7 +12,7 @@
 
 import numpy as np
 import torch
-from peft import AutoPeftModelForCausalLM, PeftModelForCausalLM, load_peft_weights
+from peft import AutoPeftModelForCausalLM, PeftConfig, PeftModelForCausalLM, load_peft_weights
 from torch import nn
 from transformers import GenerationConfig, StoppingCriteria, StoppingCriteriaList
 from transformers.generation.streamers import BaseStreamer

@@ -21,6 +21,7 @@
 from QEfficient.base.onnx_transforms import FP16ClipTransform, OnnxTransform, SplitTensorsTransform
 from QEfficient.base.pytorch_transforms import PytorchTransform
 from QEfficient.generation.cloud_infer import QAICInferenceSession
+from QEfficient.lora import QEffAutoLoraModelForCausalLM
 from QEfficient.peft.onnx_transforms import AdapterWeightsToInputsTransform
 from QEfficient.peft.pytorch_transforms import PeftModelInputsTransform
 from QEfficient.transformers.pytorch_transforms import CustomOpsTransform, KVCacheTransform

@@ -38,6 +39,7 @@ class QEffAutoPeftModelForCausalLM(QEFFBaseModel):
 
     Args:
         :model (nn.Module): PyTorch model
+        :finite_adapters (bool): Set True to enable finite adapter mode with the QEffAutoLoraModelForCausalLM class. Please refer to QEffAutoLoraModelForCausalLM for the API specification.
 
     .. code-block:: python
 

@@ -152,7 +154,17 @@ def from_pretrained(cls, pretrained_name_or_path: str, *args, **kwargs):
         if kwargs.get("use_cache") is False:
             warnings.warn("Overriding to use_cache=True")
         kwargs["use_cache"] = True
-        obj = cls._from_pretrained(pretrained_name_or_path, *args, **kwargs)
+
+        if kwargs.pop("finite_adapters", False):  # initialize through finite_adapters class
+            obj = QEffAutoLoraModelForCausalLM.from_pretrained(
+                pretrained_model_name_or_path=PeftConfig.from_pretrained(
+                    pretrained_name_or_path
+                ).base_model_name_or_path,
+                **kwargs,
+            )
+            obj.load_adapter(pretrained_name_or_path, list(args)[0])
+        else:
+            obj = cls._from_pretrained(pretrained_name_or_path, *args, **kwargs)
         return obj
 
     def export(self, export_dir: Optional[str] = None) -> str:
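The branch added to from_pretrained() is equivalent in spirit to the hypothetical helper below. The function name is mine, and only the calls visible in the diff are used; this is a sketch, not part of the codebase.

    # Hypothetical helper mirroring the new finite_adapters branch (illustrative, not in the repo).
    from peft import PeftConfig
    from QEfficient.lora import QEffAutoLoraModelForCausalLM

    def init_via_finite_adapters(adapter_repo: str, adapter_name: str, **kwargs):
        # Resolve the base model recorded in the adapter's PeftConfig,
        # build the finite-adapters model, then register the adapter under adapter_name.
        base = PeftConfig.from_pretrained(adapter_repo).base_model_name_or_path
        model = QEffAutoLoraModelForCausalLM.from_pretrained(base, **kwargs)
        model.load_adapter(adapter_repo, adapter_name)
        return model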

examples/lora_models.py

Lines changed: 17 additions & 27 deletions

@@ -7,7 +7,7 @@
 
 ## This example works on continuous batching with different lora adapters in the same batch ##
 
-from QEfficient import QEffAutoLoraModelForCausalLM
+from QEfficient import QEffAutoPeftModelForCausalLM
 from QEfficient.utils import load_hf_tokenizer
 
 base_model_name = "mistralai/Mistral-7B-v0.1"

@@ -17,37 +17,22 @@
 device_group = [0]
 
 ## STEP 1 -- init base model
-
-# **Option1**: Download model weights from hugging face & Init it with QEffAuto model to apply QEff transforms
-# model_hf = AutoModelForCausalLM.from_pretrained(base_model_name)
-# qeff_model = QEffAutoLoraModelForCausalLM(model_hf, continuous_batching=True)
-
-# **Option2**: Initialize the model using from_pretrained() method
-qeff_model = QEffAutoLoraModelForCausalLM.from_pretrained(
-    pretrained_model_name_or_path=base_model_name, continuous_batching=True
+qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained(
+    "predibase/gsm8k", "gsm8k", continuous_batching=True, finite_adapters=True
 )
 
-# (alternative) non-cb initialization
-# qeff_model = QEffAutoLoraModelForCausalLM.from_pretrained(pretrained_model_name_or_path=base_model_name, continuous_batching=False)
+# (alternative) non-cb compilation
+# qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained("predibase/gsm8k", "gsm8k", continuous_batching=False, finite_adapters=True)
 
 ## STEP 2 -- load adapters
-adapter_id_gsm8k = qeff_model.load_adapter("predibase/gsm8k", "gsm8k")
-print(f"Activating gsm8k as adapter_id {adapter_id_gsm8k}")
-
-adapter_id_tldr = qeff_model.load_adapter("predibase/tldr_content_gen", "tldr_content_gen")
-print(f"Activating tldr_content_gen as adapter_id {adapter_id_tldr}")
+qeff_model.load_adapter("predibase/tldr_content_gen", "tldr_content_gen")
 
-adapter_id_dbpedia = qeff_model.load_adapter("predibase/dbpedia", "dbpedia")
-print(f"Activating dbpedia as adapter_id {adapter_id_dbpedia}")
+qeff_model.load_adapter("predibase/dbpedia", "dbpedia")
 
 # STEP 2 (optional) -- unload adapter
 unload_status = qeff_model.unload_adapter("dbpedia")
 print(f"Unloading dbpedia success: {unload_status}")
 
-# get adapter id
-# NOTE: should rely on get_adapter_id() in case the id obtained at set_adapter() gets updated
-gsm8k_id = qeff_model.get_adapter_id("gsm8k")
-tldr_id = qeff_model.get_adapter_id("tldr_content_gen")
 
 ## STEP 3 -- export & compile qeff model
 qpc_path = qeff_model.compile(

@@ -71,10 +56,6 @@
 # mxint8_kv_cache=True)
 
 ## STEP 4 -- run inference on the generate function
-# prompt_to_lora_id_mapping is a list of lora_ids whose size matches the number of prompts,
-# giving a one-to-one mapping from prompt to lora_id
-# e.g., prompt_to_lora_id_mapping = [{adapter_id_0}, {adapter_id_1}, {adapter_id_0}, {adapter_id_1}, ...]
-# setting 0 means using the base model
 prompts = [
     """Please answer the following question: James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. How many total meters does he run a week?\n\nAnswer:""",
     """The following headline is the headline of a news report. Please write the content of the news passage based on only this headline.\n\nHeadline: Harvard shrank its insect-inspired microrobot to the size of a penny\n\nContent:""",

@@ -90,7 +71,16 @@
     tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
     prompts=prompts,
     device_id=device_group,
-    prompt_to_lora_id_mapping=[gsm8k_id, tldr_id, gsm8k_id, 0, gsm8k_id, tldr_id, gsm8k_id, tldr_id],
+    prompt_to_adapter_mapping=[
+        "gsm8k",
+        "tldr_content_gen",
+        "gsm8k",
+        "base",
+        "gsm8k",
+        "tldr_content_gen",
+        "gsm8k",
+        "tldr_content_gen",
+    ],
 )
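Note that the mapping in the example is positional: prompt i runs with the adapter named at prompt_to_adapter_mapping[i], and "base" means no adapter. A tiny illustrative check of that pairing (standalone, not part of the example script):

    # Illustrative: each prompt is paired position-wise with one adapter name.
    prompts = ["math question", "headline prompt", "generic prompt"]
    prompt_to_adapter_mapping = ["gsm8k", "tldr_content_gen", "base"]

    assert len(prompts) == len(prompt_to_adapter_mapping)
    for prompt, adapter in zip(prompts, prompt_to_adapter_mapping):
        print(f"{adapter:>16} <- {prompt}")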

tests/lora/test_lora_model.py

Lines changed: 17 additions & 25 deletions

@@ -12,7 +12,8 @@
 from peft import LoraConfig
 from transformers import AutoConfig, AutoModelForCausalLM
 
-from QEfficient import QEffAutoLoraModelForCausalLM
+from QEfficient import QEffAutoPeftModelForCausalLM
+from QEfficient.lora import QEffAutoLoraModelForCausalLM
 from QEfficient.utils import load_hf_tokenizer
 
 configs = [

@@ -74,6 +75,18 @@ def test_auto_lora_model_for_causal_lm_from_pretrained(base_model_name, adapter_id_0, adapter_id_1):
     assert len(qeff_model.active_adapter_to_id) == 0
 
 
+# test peft model initialization using from_pretrained approach
+@pytest.mark.parametrize("base_model_name,adapter_id_0,adapter_id_1", model_samples)
+def test_auto_peft_model_for_causal_lm_from_pretrained(base_model_name, adapter_id_0, adapter_id_1):
+    qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained(adapter_id_0, "id_0", finite_adapters=True)
+
+    assert qeff_model.base_model_name == base_model_name
+    assert len(qeff_model.adapter_weights) == 1
+    assert len(qeff_model.adapter_configs) == 1
+    assert qeff_model.max_num_adapters == 1
+    assert len(qeff_model.active_adapter_to_id) == 1
+
+
 # test the init assertion for models that are not supported
 @pytest.mark.parametrize("base_model_name", ["distilbert/distilgpt2"])
 def test_auto_lora_model_for_causal_lm_init_from_unsupported_model(base_model_name):

@@ -156,27 +169,6 @@ def test_auto_lora_model_for_causal_lm_hash():
     assert model_hash_0_1 != model_hash_0_0
 
 
-# test load_adapter() and get_adapter_id()
-@pytest.mark.parametrize("base_model_name,adapter_id_0,adapter_id_1", model_samples[:1])
-def test_auto_lora_model_for_causal_lm_load_get_adapter_id_check(base_model_name, adapter_id_0, adapter_id_1):
-    qeff_model = QEffAutoLoraModelForCausalLM.from_pretrained(base_model_name, num_hidden_layers=1)
-
-    set_id_0 = qeff_model.load_adapter(adapter_id_0, "adapter_0")
-    set_id_1 = qeff_model.load_adapter(adapter_id_1, "adapter_1")
-    assert set_id_1 == set_id_0 + 1
-
-    qeff_model.load_adapter(adapter_id_1, "adapter_2")
-    qeff_model.unload_adapter("adapter_1")
-
-    update_id_0 = qeff_model.get_adapter_id("adapter_0")
-    update_id_2 = qeff_model.get_adapter_id("adapter_2")
-    assert set_id_0 == update_id_0
-    assert set_id_1 == update_id_2
-
-    with pytest.raises(KeyError):
-        qeff_model.get_adapter_id("adapter_1")
-
-
 # test download_adapter(), load_adapter() and unload_adapter()
 @pytest.mark.parametrize("base_model_name,adapter_id_0,adapter_id_1", model_samples[1:])
 def test_auto_lora_model_for_causal_lm_load_unload_adapter(base_model_name, adapter_id_0, adapter_id_1):

@@ -196,8 +188,8 @@ def test_auto_lora_model_for_causal_lm_load_unload_adapter(base_model_name, adapter_id_0, adapter_id_1):
 def test_auto_lora_model_for_causal_lm_export_compile_generate(base_model_name, adapter_id_0, adapter_id_1, tmp_path):
     qeff_model = QEffAutoLoraModelForCausalLM.from_pretrained(base_model_name, num_hidden_layers=1)
 
-    id_0 = qeff_model.load_adapter(adapter_id_0, "adapter_0")
-    id_1 = qeff_model.load_adapter(adapter_id_1, "adapter_1")
+    qeff_model.load_adapter(adapter_id_0, "adapter_0")
+    qeff_model.load_adapter(adapter_id_1, "adapter_1")
 
     # export
     start = perf_counter()

@@ -225,5 +217,5 @@ def test_auto_lora_model_for_causal_lm_export_compile_generate(base_model_name, adapter_id_0, adapter_id_1, tmp_path):
         tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
         prompts=prompts,
         device_id=[0],
-        prompt_to_lora_id_mapping=[id_0, id_1, id_0, 0],
+        prompt_to_adapter_mapping=["adapter_0", "adapter_1", "adapter_0", "base"],
     )
