Address review comments

quic-jouachen · quic-jouachen · commit cdf5f6b9da2c · 2024-11-14T14:02:15.000-08:00
Signed-off-by: Jou-An Chen &lt;quic_jouachen@quicinc.com&gt;
diff --git a/QEfficient/lora/auto.py b/QEfficient/lora/auto.py
@@ -29,13 +29,7 @@ class QEffAutoLoraModelForCausalLM(QEFFAutoModelForCausalLM):
 
     Args:
         :model (nn.Module): PyTorch model
-        :base_model_name (str): Model card name for base model
-        :adapter_weights (Dict): A dictionary contains lora_name to lora_weight mapping
-        :adapter_configs (Dict): A dictionary contains lora_name to lora_configs mapping
-        :max_num_adapters (int): Total number of active adapters that to be exported and compiled
-        :active_adapter_to_id (Dict): A dictionary contains active adapter's lora_name to lora_id mapping
-        :lora_rank (int): The consistent lora rank across all active adapters
-        :target_modules_for_all_adapters (List[str]): The consistent set of target modules across all active adapters
+        :continuous_batching (bool): Whether this model will be used for continuous batching in the future. If this is not set to True here, the model cannot be exported/compiled for continuous batching later.
 
     .. code-block:: python
 
@@ -60,7 +54,6 @@ def __init__(self, model: nn.Module, continuous_batching: bool = False, **kwargs
         self.base_model_name = self.model.model.config._name_or_path
         self.adapter_weights = {}
         self.adapter_configs = {}
-        self.max_num_adapters = 0
         self.active_adapter_to_id = {}
 
         self.lora_rank = 0
@@ -101,11 +94,15 @@ def download_adapter(
         adapter_weight: Optional[dict] = None,
         adapter_config: Optional[PeftConfig] = None,
     ):
-        """Loads a new adapter from huggingface hub or local path into CPU cache
+        """
+        Loads a new adapter from huggingface hub or local path into CPU cache
 
-        Args:
+        ``Mandatory`` Args:
             :adapter_model_id (str): Adapter model ID from huggingface hub or local path
-            :adapter_name (str): Adapter name to be used to set this adapter as current
+            :adapter_name (str): Adapter name to be used to download this adapter
+        ``Optional`` Args:
+            :adapter_weight (dict): Adapter weight tensors in dictionary format
+            :adapter_config (PeftConfig): Adapter config in the format of PeftConfig
         """
 
         # check if adapter name already loaded
@@ -128,7 +125,16 @@ def load_adapter(
         adapter_weight: Optional[dict] = None,
         adapter_config: Optional[PeftConfig] = None,
     ):
-        "Load adapter into CPU cache and Sets active adapter from one of the loaded adapters"
+        """
+        Load adapter into CPU cache and set it as active
+
+        ``Mandatory`` Args:
+            :adapter_model_id (str): Adapter model ID from huggingface hub or local path
+            :adapter_name (str): Adapter name to be used to load this adapter
+        ``Optional`` Args:
+            :adapter_weight (dict): Adapter weight tensors in dictionary format
+            :adapter_config (PeftConfig): Adapter config in the format of PeftConfig
+        """
 
         # check if adapter name already exists and is activated
         if adapter_name in self.active_adapter_to_id.keys():
@@ -151,22 +157,23 @@ def load_adapter(
 
             # set active adapter id to current max if adapter_name is new
             if adapter_name not in self.active_adapter_to_id.keys():
-                self.active_adapter_to_id[adapter_name] = self.max_num_adapters + 1  # reserve 0 for base
-
-                # add active adapter to set
-                self.max_num_adapters = len(self.active_adapter_to_id)
+                self.active_adapter_to_id[adapter_name] = len(self.active_adapter_to_id) + 1  # reserve 0 for base
 
         return self.active_adapter_to_id[adapter_name]
 
     def unload_adapter(self, adapter_name: str):
-        "Deactivate adpater and remove it from CPU cache"
+        """
+        Deactivate adapter and remove it from CPU cache
+
+        ``Mandatory`` Args:
+            :adapter_name (str): Adapter name to be unloaded
+        """
 
         # step1: remove from active list if it's there
         if adapter_name not in self.active_adapter_to_id.keys():
             logger.info(f"Adapter name {adapter_name} is not set active yet")
             return False
 
-        self.max_num_adapters -= 1
         self.active_adapter_to_id.pop(adapter_name)
 
         # renumbering of active adapter id
@@ -197,9 +204,9 @@ def _load_adapter_weights_to_model(self):
         for i in range(num_hidden_layers):
             for target_module in self.target_modules_for_all_adapters:
                 # stack all adapters weights
-                a_tensor_list = list(range(self.max_num_adapters + 1))
-                b_tensor_list = list(range(self.max_num_adapters + 1))
-                s_tensor_list = list(range(self.max_num_adapters + 1))
+                a_tensor_list = list(range(len(self.active_adapter_to_id) + 1))
+                b_tensor_list = list(range(len(self.active_adapter_to_id) + 1))
+                s_tensor_list = list(range(len(self.active_adapter_to_id) + 1))
 
                 for lora_name, lora_id in self.active_adapter_to_id.items():
                     if target_module in ["q_proj", "k_proj", "v_proj", "o_proj"]:
@@ -256,10 +263,6 @@ def _load_adapter_weights_to_model(self):
     def _init_adapter_model(self):
         "Initialize the fixed lora model with multiple adapter weigths standby"
 
-        # assume all adapters have same target_modules and ranks
-        if self.max_num_adapters != len(self.active_adapter_to_id):
-            raise ValueError("Inconsistent max_num_adapters and active adapters")
-
         # set lora rank
         self.lora_rank = list(self.adapter_configs.values())[0].r
 
@@ -268,7 +271,7 @@ def _init_adapter_model(self):
 
         self.target_modules_for_all_adapters = list(self.adapter_configs.values())[0].target_modules
         _, transformed = TargetModulesTransform.apply(
-            self.model, self.target_modules_for_all_adapters, self.lora_rank, self.max_num_adapters
+            self.model, self.target_modules_for_all_adapters, self.lora_rank, len(self.active_adapter_to_id)
         )
 
         # load_weight to model
@@ -287,7 +290,11 @@ def export(self, export_dir: Optional[str] = None) -> str:
         """
 
         # initialize the adapter model
-        assert self.max_num_adapters, "Please use load_adapter() to add at least one adapter; otherwise, refer to QEFFAutoModelForCausalLM for base model usage"
+        if len(self.active_adapter_to_id) == 0:
+            raise ValueError(
+                "Please use load_adapter() to add at least one adapter; otherwise, refer to QEFFAutoModelForCausalLM for base model usage"
+            )
+
         self._init_adapter_model()
 
         bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
@@ -333,6 +340,7 @@ def generate(
         tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
         prompts: List[str],
         device_id: List[int] = None,
+        prompt_to_adapter_mapping: List[str] = None,
         runtime: str = "AI_100",
         **kwargs,
     ):
@@ -342,18 +350,28 @@ def generate(
         If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped.
 
         ``Mandatory`` Args:
+            :tokenizer (PreTrainedTokenizerFast or PreTrainedTokenizer): The tokenizer used in the inference
             :prompts (List[str]): List of prompts to run the execution.
             :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
+            :prompt_to_adapter_mapping (List[str]): The sequence of adapter names is matched with the sequence of prompts, and the corresponding adapter is used for each prompt. Use "base" for the base model (no adapter).
         ``Optional`` Args:
             :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100".
-            :prompt_to_adapter_mapping (List[str]): A list of adapter names that maps to the prompts, specifying which adapter the prompt wants to apply. "base" for base model (no adapter).
+
         """
         if runtime != "AI_100":
             raise ValueError("Only AI_100 runtime is supported right now via generate API")
         if not isinstance(self.qpc_path, Path):
             raise TypeError("Please run compile API first!")
         generation_len = kwargs.pop("generation_len", None)
-        prompt_to_adapter_mapping = kwargs.pop("prompt_to_adapter_mapping", ["base" for _ in range(len(prompts))])
+
+        if not prompt_to_adapter_mapping:
+            prompt_to_adapter_mapping = ["base" for _ in range(len(prompts))]
+
+        if len(prompt_to_adapter_mapping) != len(prompts):
+            raise RuntimeError(
+                f"Number of prompts should match number of prompt_to_adapter_mapping, got len(prompts) = {len(prompts)}, len(prompt_to_adapter_mapping) = {len(prompt_to_adapter_mapping)}"
+            )
+
         return QEfficient.cloud_ai_100_exec_kv(
             tokenizer,
             self.qpc_path,
diff --git a/examples/lora_models.py b/examples/lora_models.py
@@ -17,12 +17,14 @@
 device_group = [0]
 
 ## STEP 1 -- init base model
-qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained(
-    "predibase/gsm8k", "gsm8k", continuous_batching=True, finite_adapters=True
-)
+# qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained(
+#     "predibase/gsm8k", "gsm8k", continuous_batching=True, finite_adapters=True
+# )
 
 # (alternative) non-cb compilation
-# qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained("predibase/gsm8k", "gsm8k", continuous_batching=False, finite_adapters=True)
+qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained(
+    "predibase/gsm8k", "gsm8k", continuous_batching=False, finite_adapters=True
+)
 
 ## STEP 2 -- load adapter
 qeff_model.load_adapter("predibase/tldr_content_gen", "tldr_content_gen")
@@ -35,9 +37,20 @@
 
 
 ## STEP 3 -- export & compile qeff model
+# qpc_path = qeff_model.compile(
+#     batch_size=1,
+#     full_batch_size=full_batch_size,
+#     prefill_seq_len=seq_len,
+#     ctx_len=ctx_len,
+#     num_devices=len(device_group),
+#     num_cores=16,
+#     mxfp6_matmul=True,
+#     mxint8_kv_cache=True,
+# )
+
+# (alternative) non-cb compilation
 qpc_path = qeff_model.compile(
-    batch_size=1,
-    full_batch_size=full_batch_size,
+    batch_size=2,
     prefill_seq_len=seq_len,
     ctx_len=ctx_len,
     num_devices=len(device_group),
@@ -46,15 +59,6 @@
     mxint8_kv_cache=True,
 )
 
-# (alternative) non-cb compilation
-# qpc_path = qeff_model.compile(batch_size=2,
-#                               prefill_seq_len=seq_len,
-#                               ctx_len=ctx_len,
-#                               num_devices=len(device_group),
-#                               num_cores=16,
-#                               mxfp6_matmul=True,
-#                               mxint8_kv_cache=True)
-
 ## STEP 4 -- run inference on the generate function
 prompts = [
     """Please answer the following question: James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?\n\nAnswer:""",
diff --git a/tests/lora/test_lora_model.py b/tests/lora/test_lora_model.py
@@ -59,7 +59,6 @@ def test_auto_lora_model_for_causal_lm_init(base_model_name, adapter_id_0, adapt
     assert qeff_model.base_model_name == base_model_name
     assert len(qeff_model.adapter_weights) == 0
     assert len(qeff_model.adapter_configs) == 0
-    assert qeff_model.max_num_adapters == 0
     assert len(qeff_model.active_adapter_to_id) == 0
 
 
@@ -71,7 +70,6 @@ def test_auto_lora_model_for_causal_lm_from_pretrained(base_model_name, adapter_
     assert qeff_model.base_model_name == base_model_name
     assert len(qeff_model.adapter_weights) == 0
     assert len(qeff_model.adapter_configs) == 0
-    assert qeff_model.max_num_adapters == 0
     assert len(qeff_model.active_adapter_to_id) == 0
 
 
@@ -83,7 +81,6 @@ def test_auto_peft_model_for_causal_lm_from_pretrained(base_model_name, adapter_
     assert qeff_model.base_model_name == base_model_name
     assert len(qeff_model.adapter_weights) == 1
     assert len(qeff_model.adapter_configs) == 1
-    assert qeff_model.max_num_adapters == 1
     assert len(qeff_model.active_adapter_to_id) == 1