
Commit c3c00fc

Fix base model inference index INTMAX issue
Signed-off-by: Jou-An Chen <[email protected]>
1 parent 68cdf3b commit c3c00fc

5 files changed: +28 -28 lines changed

QEfficient/generation/text_generation_inference.py

Lines changed: 4 additions & 4 deletions

@@ -654,10 +654,10 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):

                 generated_id_current_index[decode_batch_id] += 1

-            if self.prompt_to_lora_id_mapping_decode:
-                decode_inputs["lora_ids"][decode_batch_id] = self.prompt_to_lora_id_mapping_decode[
-                    batch_id_map[decode_batch_id]
-                ]
+            if self.prompt_to_lora_id_mapping_decode:
+                decode_inputs["lora_ids"][decode_batch_id] = self.prompt_to_lora_id_mapping_decode[
+                    batch_id_map[decode_batch_id]
+                ]

         return decode_pause_time
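
A minimal sketch of what this hunk does, assuming the names shown in the diff (prompt_to_lora_id_mapping_decode, batch_id_map, decode_inputs["lora_ids"]); the helper name and the sample values below are hypothetical, not the library implementation. When a decode slot is refilled during continuous batching, its lora_ids entry is refreshed from the per-prompt mapping, and an id of 0 now selects the base model instead of the old INTMAX sentinel.

import numpy as np

def refill_lora_id(decode_inputs, decode_batch_id, batch_id_map, prompt_to_lora_id_mapping_decode):
    # Point decode slot `decode_batch_id` at the adapter id of the prompt it now serves;
    # id 0 means "no adapter", i.e. plain base-model weights.
    if prompt_to_lora_id_mapping_decode:
        decode_inputs["lora_ids"][decode_batch_id] = prompt_to_lora_id_mapping_decode[
            batch_id_map[decode_batch_id]
        ]
    return decode_inputs

# hypothetical values: slot 1 picks up the 4th prompt, which is mapped to the base model (0)
decode_inputs = {"lora_ids": np.zeros((2, 1), dtype=np.int64)}
refill_lora_id(decode_inputs, 1, {0: 0, 1: 3}, [2, 1, 1, 0])
print(decode_inputs["lora_ids"][1])  # [0]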

QEfficient/lora/auto.py

Lines changed: 16 additions & 12 deletions

@@ -7,7 +7,6 @@

 import hashlib
 import os
-import sys
 from pathlib import Path
 from typing import Any, List, Optional

@@ -24,8 +23,6 @@
 from QEfficient.utils.constants import QEFF_MODELS_DIR
 from QEfficient.utils.logging_utils import logger

-INTMAX = sys.maxsize
-

 class QEffAutoLoraModelForCausalLM(QEFFAutoModelForCausalLM):
     """
@@ -54,7 +51,7 @@ class QEffAutoLoraModelForCausalLM(QEFFAutoModelForCausalLM):
         m.compile(num_cores=16, device_group=[0])

         prompts=["code prompt", "math prompt", "generic"]
-        m.generate(prompts, device_group=[0], prompt_to_lora_id_mapping=[magicoder_id,gsm8k_id,INTMAX])
+        m.generate(prompts, device_group=[0], prompt_to_lora_id_mapping=[magicoder_id,gsm8k_id,0])

     """

@@ -148,7 +145,7 @@ def load_adapter(self, adapter_model_id: str, adapter_name: str, **kwargs: Any):

         # set active adapter id to current max if adapter_name is new
         if adapter_name not in self.active_adapter_to_id.keys():
-            self.active_adapter_to_id[adapter_name] = self.max_num_adapters
+            self.active_adapter_to_id[adapter_name] = self.max_num_adapters + 1  # reserve 0 for base

         # add active adapter to set
         self.active_adapters.add(adapter_name)
@@ -168,7 +165,7 @@ def unload_adapter(self, adapter_name: str):

         # renumbering of active adapter id
         for index, (key, value) in enumerate(self.active_adapter_to_id.items()):
-            self.active_adapter_to_id[key] = index
+            self.active_adapter_to_id[key] = index + 1

         logger.warning(f"Deleting {adapter_name} from active adapters.")
         if self.onnx_path or self.qpc_path:
@@ -203,9 +200,9 @@ def load_adapter_weights_to_model(self):
         for i in range(num_hidden_layers):
             for target_module in self.target_modules_for_all_adapters:
                 # stack all adapters weights
-                a_tensor_list = list(range(self.max_num_adapters))
-                b_tensor_list = list(range(self.max_num_adapters))
-                c_tensor_list = list(range(self.max_num_adapters))
+                a_tensor_list = list(range(self.max_num_adapters + 1))
+                b_tensor_list = list(range(self.max_num_adapters + 1))
+                c_tensor_list = list(range(self.max_num_adapters + 1))

                 for lora_name, lora_id in self.active_adapter_to_id.items():
                     if (
@@ -232,12 +229,18 @@
                         dtype=torch.float16,
                     )

+                # dummy zero tensor for base model
+                a_tensor_list[0] = torch.zeros_like(a_tensor_list[1])
+                b_tensor_list[0] = torch.zeros_like(b_tensor_list[1])
+                c_tensor_list[0] = torch.zeros_like(c_tensor_list[1])
+
+                # stack weight tensors
                 stacked_lora_A = (
                     torch.stack(a_tensor_list, dim=0).unsqueeze(1).transpose(2, 3)
-                )  # <num_adapters, 1, in_feature, r>
+                )  # <num_loras, 1, in_feature, r>
                 stacked_lora_B = (
                     torch.stack(b_tensor_list, dim=0).unsqueeze(1).transpose(2, 3)
-                )  # <num_adapters, 1, r, out_feature>
+                )  # <num_loras, 1, r, out_feature>
                 stacked_lora_C = (
                     torch.stack(c_tensor_list, dim=0).unsqueeze(1).unsqueeze(2).unsqueeze(3)
                 )  # <num_loras, 1, 1, 1>
@@ -308,6 +311,7 @@ def export(self, **kwargs) -> str:
         export_dir = kwargs.get("export_dir", None)

         # obtain all necessary information to initialize the model
+        assert self.max_num_adapters, "Please use load_adapter() to add at least one adapter; otherwise, refer to QEFFAutoModelForCausalLM for base model usage"
         self.init_adapter_model()

         assert self.is_transformed, "Please first run transform on the QEFFAutoModelForCausalLM object"
@@ -411,7 +415,7 @@ def export_and_compile(
     def run_cloud_ai_100(self, prompts: List[str], device_id: List[int] = None, **kwargs):
         assert isinstance(self.qpc_path, str), "Please run compile API first!"
         generation_len = kwargs.pop("generation_len", None)
-        default_mapping = [INTMAX for _ in range(len(prompts))]
+        default_mapping = [0 for _ in range(len(prompts))]
         prompt_to_lora_id_mapping = kwargs.pop("prompt_to_lora_id_mapping", default_mapping)
         return QEfficient.cloud_ai_100_exec_kv(
             self.tokenizer,
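
The id bookkeeping above can be summarized with a small, hedged sketch (not the actual class; the names below are illustrative only): adapter ids are now 1-based, 0 is permanently reserved for the base model, and unloading an adapter renumbers the survivors starting from 1.

class AdapterRegistry:
    def __init__(self):
        self.active_adapter_to_id = {}   # adapter_name -> id (1-based)
        self.max_num_adapters = 0

    def load_adapter(self, adapter_name):
        if adapter_name not in self.active_adapter_to_id:
            # reserve 0 for the base model, so the first adapter gets id 1
            self.active_adapter_to_id[adapter_name] = self.max_num_adapters + 1
            self.max_num_adapters += 1
        return self.active_adapter_to_id[adapter_name]

    def unload_adapter(self, adapter_name):
        self.active_adapter_to_id.pop(adapter_name, None)
        self.max_num_adapters = len(self.active_adapter_to_id)
        # renumber the remaining adapters, still starting from 1
        for index, key in enumerate(self.active_adapter_to_id):
            self.active_adapter_to_id[key] = index + 1

reg = AdapterRegistry()
gsm8k_id = reg.load_adapter("gsm8k")   # -> 1
tldr_id = reg.load_adapter("tldr")     # -> 2
reg.unload_adapter("gsm8k")
print(reg.active_adapter_to_id)        # {'tldr': 1}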

QEfficient/lora/layers.py

Lines changed: 3 additions & 3 deletions

@@ -21,14 +21,14 @@ def multilora_init(self, lora_rank, max_num_adapters):
         self.lora_rank = lora_rank

         self.lora_weight_A = nn.Parameter(
-            self.weight.new_zeros(self.max_num_adapters, 1, self.in_features, self.lora_rank)
+            self.weight.new_zeros(self.max_num_adapters + 1, 1, self.in_features, self.lora_rank)
         )
         self.lora_weight_A.requires_grad = False
         self.lora_weight_B = nn.Parameter(
-            self.weight.new_zeros(self.max_num_adapters, 1, self.lora_rank, self.out_features)
+            self.weight.new_zeros(self.max_num_adapters + 1, 1, self.lora_rank, self.out_features)
         )
         self.lora_weight_B.requires_grad = False
-        self.lora_weight_C = torch.full((self.max_num_adapters, 1, 1, 1), 1.0, dtype=torch.float)
+        self.lora_weight_C = torch.full((self.max_num_adapters + 1, 1, 1, 1), 1.0, dtype=torch.float)

         nn.init.kaiming_uniform_(self.lora_weight_A, a=math.sqrt(5))
         nn.init.zeros_(self.lora_weight_B)
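
Why the extra slot works: a hedged sketch of the gathered-LoRA arithmetic (assumed semantics, not the QEfficient kernel). With the +1 slot holding all-zero A and B weights, selecting lora_id 0 makes the low-rank delta x @ A @ B * C vanish, so the layer reduces to the plain base linear projection.

import torch

def multilora_linear(x, weight, lora_A, lora_B, lora_C, lora_id):
    # lora_A: <num_loras, 1, in_features, r>, lora_B: <num_loras, 1, r, out_features>
    base = x @ weight.T
    delta = (x @ lora_A[lora_id, 0]) @ lora_B[lora_id, 0] * lora_C[lora_id, 0, 0, 0]
    return base + delta

in_f, out_f, r, num_loras = 8, 8, 4, 3            # slot 0 = base, slots 1..2 = adapters
weight = torch.randn(out_f, in_f)
lora_A = torch.randn(num_loras, 1, in_f, r)
lora_B = torch.randn(num_loras, 1, r, out_f)
lora_C = torch.ones(num_loras, 1, 1, 1)
lora_A[0].zero_(); lora_B[0].zero_()              # the "dummy zero tensor for base model"

x = torch.randn(2, in_f)
assert torch.allclose(multilora_linear(x, weight, lora_A, lora_B, lora_C, 0), x @ weight.T)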

examples/lora_models.py

Lines changed: 4 additions & 5 deletions

@@ -7,12 +7,9 @@

 ## This example works on continuous batching with different lora adapters in the same batch ##

-import sys

 from QEfficient import QEffAutoLoraModelForCausalLM

-INTMAX = sys.maxsize
-
 base_model_name = "mistralai/Mistral-7B-v0.1"
 seq_len = 128
 ctx_len = 256
@@ -67,7 +64,7 @@
 # prompt_to_lora_id_mapping is a list of lora_id of which the size matches num of prompts
 # and is a one-on-one mapping for the prompt-to-loraid
 # e.g., prompt_to_lora_id_mapping = [{adapter_id_0}, {adapter_id_1}, {adapter_id_0}, {adapter_id_1}, ...]
-# setting INTMAX means using base model
+# setting 0 means using base model
 prompts = [
     """Please answer the following question: James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. How many total meters does he run a week?\n\nAnswer:""",
     """The following headline is the headline of a news report. Please write the content of the news passage based on only this headline.\n\nHeadline: Harvard shrank its insect-inspired microrobot to the size of a penny\n\nContent:""",
@@ -81,9 +78,11 @@
 qeff_model.generate(
     prompts,
     device_group,
-    prompt_to_lora_id_mapping=[gsm8k_id, tldr_id, gsm8k_id, INTMAX, gsm8k_id, tldr_id, gsm8k_id, tldr_id],
+    prompt_to_lora_id_mapping=[0, 0, 0, 0, 0, 0, 0, 0],
 )

+# [gsm8k_id, tldr_id, gsm8k_id, 0, gsm8k_id, tldr_id, gsm8k_id, tldr_id]
+
 """
 expected response:
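
A hedged convenience sketch (not part of the example file): since load_adapter() hands out 1-based ids and 0 now means the base model, a per-prompt mapping can be assembled from adapter names with a fallback to 0. The helper name and the literal dict below are hypothetical; active_adapter_to_id is the name-to-id dict maintained in auto.py above.

def mapping_from_names(adapter_names, active_adapter_to_id):
    # prompts without an adapter (None or unknown name) fall back to the base model, id 0
    return [active_adapter_to_id.get(name, 0) for name in adapter_names]

# hypothetical usage: the third prompt runs on the base model
print(mapping_from_names(["gsm8k", "tldr", None], {"gsm8k": 1, "tldr": 2}))  # [1, 2, 0]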

tests/lora/test_lora_model.py

Lines changed: 1 addition & 4 deletions

@@ -4,7 +4,6 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
-import sys
 from pathlib import Path
 from time import perf_counter

@@ -15,8 +14,6 @@

 from QEfficient import QEffAutoLoraModelForCausalLM

-INTMAX = sys.maxsize
-
 configs = [
     pytest.param(
         AutoConfig.for_model(
@@ -226,4 +223,4 @@ def test_auto_lora_model_for_causal_lm_export_compile_generate(base_model_name,

     # test generate
     prompts = ["hello!", "hi", "hello, my name is", "hey"]
-    qeff_model.generate(prompts, [0], prompt_to_lora_id_mapping=[id_0, id_1, id_0, INTMAX])
+    qeff_model.generate(prompts, [0], prompt_to_lora_id_mapping=[id_0, id_1, id_0, 0])
