
Commit 8d48e39

Address review comments
Signed-off-by: Jou-An Chen <[email protected]>
1 parent 96ce832 commit 8d48e39


8 files changed, +69 −51 lines changed


QEfficient/peft/auto.py

Lines changed: 2 additions & 1 deletion
@@ -21,7 +21,7 @@
 from QEfficient.base.onnx_transforms import FP16ClipTransform, OnnxTransform, SplitTensorsTransform
 from QEfficient.base.pytorch_transforms import PytorchTransform
 from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.lora import QEffAutoLoraModelForCausalLM
+from QEfficient.peft.lora import QEffAutoLoraModelForCausalLM
 from QEfficient.peft.onnx_transforms import AdapterWeightsToInputsTransform
 from QEfficient.peft.pytorch_transforms import PeftModelInputsTransform
 from QEfficient.transformers.pytorch_transforms import CustomOpsTransform, KVCacheTransform
@@ -147,6 +147,7 @@ def from_pretrained(cls, pretrained_name_or_path: str, *args, **kwargs):
         """
         Args:
             :pretrained_name_or_path (str): Model card name from huggingface or local path to model directory.
+            :finite_adapters (bool): set True to enable finite adapter mode with QEffAutoLoraModelForCausalLM class. Please refer to QEffAutoLoraModelForCausalLM for API specification.
             :args, kwargs: Additional arguments to pass to peft.AutoPeftModelForCausalLM.
         """
         if kwargs.get("full_batch_size"):
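
For context, the ``finite_adapters`` flag documented above routes ``from_pretrained`` to the finite-adapter LoRA implementation. A minimal sketch of opting in, mirroring the call that appears in examples/lora_models.py later in this commit (the adapter card and adapter name are taken from there):

from QEfficient import QEffAutoPeftModelForCausalLM

# With finite_adapters=True, from_pretrained constructs a QEffAutoLoraModelForCausalLM
# instead of the default PEFT wrapper (see the class further down this commit).
qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained(
    "predibase/gsm8k", "gsm8k", continuous_batching=False, finite_adapters=True
)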

QEfficient/lora/__init__.py renamed to QEfficient/peft/lora/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 #
 # ----------------------------------------------------------------------------
 
-from QEfficient.lora.auto import QEffAutoLoraModelForCausalLM
+from QEfficient.peft.lora.auto import QEffAutoLoraModelForCausalLM
 
 __all__ = [
     "QEffAutoLoraModelForCausalLM",

QEfficient/lora/auto.py renamed to QEfficient/peft/lora/auto.py

Lines changed: 52 additions & 34 deletions
@@ -29,17 +29,11 @@ class QEffAutoLoraModelForCausalLM(QEFFAutoModelForCausalLM):
 
     Args:
         :model (nn.Module): PyTorch model
-        :base_model_name (str): Model card name for base model
-        :adapter_weights (Dict): A dictionary contains lora_name to lora_weight mapping
-        :adapter_configs (Dict): A dictionary contains lora_name to lora_configs mapping
-        :max_num_adapters (int): Total number of active adapters that to be exported and compiled
-        :active_adapter_to_id (Dict): A dictionary contains active adapter's lora_name to lora_id mapping
-        :lora_rank (int): The consistent lora rank across all active adapters
-        :target_modules_for_all_adapters (List[str]): The consistent set of target modules across all active adapters
+        :continuous_batching (bool): Weather this model will be used for continuous batching in future. If this is not set True here, the model can not be exported/compiled for continuous batching later.
 
     .. code-block:: python
 
-        from QEfficient.lora import QEffAutoLoraModelForCausalLM
+        from QEfficient.peft.lora import QEffAutoLoraModelForCausalLM
 
         m = QEffAutoPeftModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
         m.load_adapter("predibase/gsm8k", "gsm8k")
@@ -53,14 +47,13 @@ class QEffAutoLoraModelForCausalLM(QEFFAutoModelForCausalLM):
 
     def __init__(self, model: nn.Module, continuous_batching: bool = False, **kwargs) -> None:
         super().__init__(model, continuous_batching)
-        assert (
-            type(self.model).__name__ == "QEffMistralForCausalLM" or type(self.model).__name__ == "QEffLlamaForCausalLM"
-        ), f"Only QEffMistralForCausalLM and QEffLlamaForCausalLM model are supported but get {type(self.model).__name__}"
+        if self.model.__class__.__name__ not in ["QEffMistralForCausalLM", "QEffLlamaForCausalLM"]:
+            raise NotImplementedError(
+                f"Only QEffMistralForCausalLM and QEffLlamaForCausalLM model are supported but get {self.model.__class__.__name__}"
+            )
 
-        self.base_model_name = self.model.model.config._name_or_path
         self.adapter_weights = {}
         self.adapter_configs = {}
-        self.max_num_adapters = 0
         self.active_adapter_to_id = {}
 
         self.lora_rank = 0
@@ -101,11 +94,15 @@ def download_adapter(
         adapter_weight: Optional[dict] = None,
         adapter_config: Optional[PeftConfig] = None,
     ):
-        """Loads a new adapter from huggingface hub or local path into CPU cache
+        """
+        Loads a new adapter from huggingface hub or local path into CPU cache
 
-        Args:
+        ``Mandatory`` Args:
             :adapter_model_id (str): Adapter model ID from huggingface hub or local path
-            :adapter_name (str): Adapter name to be used to set this adapter as current
+            :adapter_name (str): Adapter name to be used to downloaded this adapter
+        ``Optional`` Args:
+            :adapter_weight (dict): Adapter weight tensors in dictionary format
+            :adapter_config (PeftConfig): Adapter config in the format of PeftConfig
         """
 
         # check if adapter name already loaded
@@ -128,7 +125,16 @@ def load_adapter(
         adapter_weight: Optional[dict] = None,
         adapter_config: Optional[PeftConfig] = None,
     ):
-        "Load adapter into CPU cache and Sets active adapter from one of the loaded adapters"
+        """
+        Load adapter into CPU cache and set it as active
+
+        ``Mandatory`` Args:
+            :adapter_model_id (str): Adapter model ID from huggingface hub or local path
+            :adapter_name (str): Adapter name to be used to load this adapter
+        ``Optional`` Args:
+            :adapter_weight (dict): Adapter weight tensors in dictionary format
+            :adapter_config (PeftConfig): Adapter config in the format of PeftConfig
+        """
 
         # check if adapter name already exist and activated
         if adapter_name in self.active_adapter_to_id.keys():
@@ -151,22 +157,23 @@ def load_adapter(
 
         # set active adapter id to current max if adapter_name is new
         if adapter_name not in self.active_adapter_to_id.keys():
-            self.active_adapter_to_id[adapter_name] = self.max_num_adapters + 1  # reserve 0 for base
-
-        # add active adapter to set
-        self.max_num_adapters = len(self.active_adapter_to_id)
+            self.active_adapter_to_id[adapter_name] = len(self.active_adapter_to_id) + 1  # reserve 0 for base
 
         return self.active_adapter_to_id[adapter_name]
 
     def unload_adapter(self, adapter_name: str):
-        "Deactivate adpater and remove it from CPU cache"
+        """
+        Deactivate adpater and remove it from CPU cache
+
+        ``Mandatory`` Args:
+            :adapter_name (str): Adapter name to be unloaded
+        """
 
         # step1: remove from active list if it's there
         if adapter_name not in self.active_adapter_to_id.keys():
             logger.info(f"Adapter name {adapter_name} is not set active yet")
             return False
 
-        self.max_num_adapters -= 1
         self.active_adapter_to_id.pop(adapter_name)
 
         # renumbering of active adapter id
@@ -197,9 +204,9 @@ def _load_adapter_weights_to_model(self):
         for i in range(num_hidden_layers):
             for target_module in self.target_modules_for_all_adapters:
                 # stack all adapters weights
-                a_tensor_list = list(range(self.max_num_adapters + 1))
-                b_tensor_list = list(range(self.max_num_adapters + 1))
-                s_tensor_list = list(range(self.max_num_adapters + 1))
+                a_tensor_list = list(range(len(self.active_adapter_to_id) + 1))
+                b_tensor_list = list(range(len(self.active_adapter_to_id) + 1))
+                s_tensor_list = list(range(len(self.active_adapter_to_id) + 1))
 
                 for lora_name, lora_id in self.active_adapter_to_id.items():
                     if target_module in ["q_proj", "k_proj", "v_proj", "o_proj"]:
@@ -256,10 +263,6 @@ def _load_adapter_weights_to_model(self):
     def _init_adapter_model(self):
         "Initialize the fixed lora model with multiple adapter weigths standby"
 
-        # assume all adapters have same target_modules and ranks
-        if self.max_num_adapters != len(self.active_adapter_to_id):
-            raise ValueError("Inconsistent max_num_adapters and active adapters")
-
         # set lora rank
         self.lora_rank = list(self.adapter_configs.values())[0].r
 
@@ -268,7 +271,7 @@ def _init_adapter_model(self):
 
         self.target_modules_for_all_adapters = list(self.adapter_configs.values())[0].target_modules
         _, transformed = TargetModulesTransform.apply(
-            self.model, self.target_modules_for_all_adapters, self.lora_rank, self.max_num_adapters
+            self.model, self.target_modules_for_all_adapters, self.lora_rank, len(self.active_adapter_to_id)
         )
 
         # load_weight to model
@@ -287,7 +290,11 @@ def export(self, export_dir: Optional[str] = None) -> str:
         """
 
         # initialize the adapter model
-        assert self.max_num_adapters, "Please use load_adapter() to add at least one adapter; otherwise, refer to QEFFAutoModelForCausalLM for base model usage"
+        if len(self.active_adapter_to_id) == 0:
+            raise ValueError(
+                "Please use load_adapter() to add at least one adapter; otherwise, refer to QEFFAutoModelForCausalLM for base model usage"
+            )
+
         self._init_adapter_model()
 
         bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
@@ -333,6 +340,7 @@ def generate(
         tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
         prompts: List[str],
         device_id: List[int] = None,
+        prompt_to_adapter_mapping: List[str] = None,
         runtime: str = "AI_100",
         **kwargs,
     ):
@@ -342,18 +350,28 @@ def generate(
         If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped.
 
         ``Mandatory`` Args:
+            :tokenizer (PreTrainedTokenizerFast or PreTrainedTokenizer): The tokenizer used in the inference
             :prompts (List[str]): List of prompts to run the execution.
             :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
+            :prompt_to_adapter_mapping (List[str]): The sequence of the adapter names will be matched with sequence of prompts and corresponding adapters will be used for the prompts."base" for base model (no adapter).
         ``optional`` Args:
             :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100".
-            :prompt_to_adapter_mapping (List[str]): A list of adapter names that maps to the prompts, specifying which adapter the prompt wants to apply. "base" for base model (no adapter).
+
         """
         if runtime != "AI_100":
             raise ValueError("Only AI_100 runtime is supported right now via generate API")
         if not isinstance(self.qpc_path, Path):
             raise TypeError("Please run compile API first!")
         generation_len = kwargs.pop("generation_len", None)
-        prompt_to_adapter_mapping = kwargs.pop("prompt_to_adapter_mapping", ["base" for _ in range(len(prompts))])
+
+        if not prompt_to_adapter_mapping:
+            prompt_to_adapter_mapping = ["base" for _ in range(len(prompts))]
+
+        if len(prompt_to_adapter_mapping) != len(prompts):
+            raise RuntimeError(
+                f"Number of prompts should match number of prompt_to_adapter_mapping, got len(prompts) = {len(prompts)}, len(prompt_to_adapter_mapping) = {len(prompt_to_adapter_mapping)}"
+            )
+
         return QEfficient.cloud_ai_100_exec_kv(
             tokenizer,
             self.qpc_path,
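
Putting the new surface together, here is a hedged end-to-end sketch of the finite-adapter flow after this change. The base model card comes from the class docstring above and the adapter cards from examples/lora_models.py below; the compile arguments, prompts, and the use of transformers.AutoTokenizer (rather than the repo's load_hf_tokenizer helper) are illustrative assumptions, not code from this commit:

from transformers import AutoTokenizer

from QEfficient.peft.lora import QEffAutoLoraModelForCausalLM

base_model = "mistralai/Mistral-7B-v0.1"
qeff_model = QEffAutoLoraModelForCausalLM.from_pretrained(base_model)

# load_adapter() returns the adapter id; ids are now derived from
# len(active_adapter_to_id) + 1, with id 0 reserved for the base model.
gsm8k_id = qeff_model.load_adapter("predibase/gsm8k", "gsm8k")
tldr_id = qeff_model.load_adapter("predibase/tldr_content_gen", "tldr_content_gen")

# export() now raises ValueError (instead of asserting) when no adapter is loaded.
qeff_model.export()
qeff_model.compile(prefill_seq_len=32, ctx_len=256, num_cores=16)  # illustrative values

tokenizer = AutoTokenizer.from_pretrained(base_model)
prompts = ["What is 12 times 7?", "Summarize the release notes in one line.", "Hello!"]

# prompt_to_adapter_mapping is now an explicit parameter; its length must match
# len(prompts), and "base" runs a prompt on the base model with no adapter.
qeff_model.generate(
    tokenizer=tokenizer,
    prompts=prompts,
    device_id=[0],
    prompt_to_adapter_mapping=["gsm8k", "tldr_content_gen", "base"],
)
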
File renamed without changes.
File renamed without changes.
File renamed without changes.

examples/lora_models.py

Lines changed: 12 additions & 8 deletions
@@ -22,7 +22,9 @@
 )
 
 # (alternative) non-cb compilation
-# qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained("predibase/gsm8k", "gsm8k", continuous_batching=False, finite_adapters=True)
+# qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained(
+#     "predibase/gsm8k", "gsm8k", continuous_batching=False, finite_adapters=True
+# )
 
 ## STEP 2 -- load adapter adapter
 qeff_model.load_adapter("predibase/tldr_content_gen", "tldr_content_gen")
@@ -47,13 +49,15 @@
 )
 
 # (alternative) non-cb compilation
-# qpc_path = qeff_model.compile(batch_size=2,
-#                               prefill_seq_len=seq_len,
-#                               ctx_len=ctx_len,
-#                               num_devices=len(device_group),
-#                               num_cores=16,
-#                               mxfp6_matmul=True,
-#                               mxint8_kv_cache=True)
+# qpc_path = qeff_model.compile(
+#     batch_size=2,
+#     prefill_seq_len=seq_len,
+#     ctx_len=ctx_len,
+#     num_devices=len(device_group),
+#     num_cores=16,
+#     mxfp6_matmul=True,
+#     mxint8_kv_cache=True,
+# )
 
 ## STEP 4 -- run inference on the generate function
 prompts = [
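
The hunk stops at the start of STEP 4, so the script's actual prompt list is not shown here. As a purely hypothetical illustration of how that step pairs prompts with the two adapters loaded in STEP 2 (the prompts below are invented, and tokenizer / device_group are assumed to be defined earlier in the script):

prompts = [
    "James buys 5 packs of beef. Each pack weighs 4 pounds at $5.50 per pound. How much did he pay?",  # invented prompt
    "Generate a one-line summary of the following content: ...",                                       # invented prompt
]

# One adapter name per prompt; "base" would run a prompt on the base model without an adapter.
qeff_model.generate(
    tokenizer=tokenizer,
    prompts=prompts,
    device_id=device_group,
    prompt_to_adapter_mapping=["gsm8k", "tldr_content_gen"],
)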

tests/lora/test_lora_model.py

Lines changed: 2 additions & 7 deletions
@@ -13,7 +13,7 @@
 from transformers import AutoConfig, AutoModelForCausalLM
 
 from QEfficient import QEffAutoPeftModelForCausalLM
-from QEfficient.lora import QEffAutoLoraModelForCausalLM
+from QEfficient.peft.lora import QEffAutoLoraModelForCausalLM
 from QEfficient.utils import load_hf_tokenizer
 
 configs = [
@@ -56,10 +56,8 @@ def test_auto_lora_model_for_causal_lm_init(base_model_name, adapter_id_0, adapt
     model_hf = AutoModelForCausalLM.from_pretrained(base_model_name)
     qeff_model = QEffAutoLoraModelForCausalLM(model_hf)
 
-    assert qeff_model.base_model_name == base_model_name
     assert len(qeff_model.adapter_weights) == 0
     assert len(qeff_model.adapter_configs) == 0
-    assert qeff_model.max_num_adapters == 0
     assert len(qeff_model.active_adapter_to_id) == 0
 
 
@@ -68,10 +66,8 @@ def test_auto_lora_model_for_causal_lm_init(base_model_name, adapter_id_0, adapt
 def test_auto_lora_model_for_causal_lm_from_pretrained(base_model_name, adapter_id_0, adapter_id_1):
     qeff_model = QEffAutoLoraModelForCausalLM.from_pretrained(pretrained_model_name_or_path=base_model_name)
 
-    assert qeff_model.base_model_name == base_model_name
     assert len(qeff_model.adapter_weights) == 0
     assert len(qeff_model.adapter_configs) == 0
-    assert qeff_model.max_num_adapters == 0
     assert len(qeff_model.active_adapter_to_id) == 0
 
 
@@ -80,10 +76,9 @@ def test_auto_lora_model_for_causal_lm_from_pretrained(base_model_name, adapter_
 def test_auto_peft_model_for_causal_lm_from_pretrained(base_model_name, adapter_id_0, adapter_id_1):
     qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained(adapter_id_0, "id_0", finite_adapters=True)
 
-    assert qeff_model.base_model_name == base_model_name
+    assert isinstance(qeff_model, QEffAutoLoraModelForCausalLM)
     assert len(qeff_model.adapter_weights) == 1
     assert len(qeff_model.adapter_configs) == 1
-    assert qeff_model.max_num_adapters == 1
     assert len(qeff_model.active_adapter_to_id) == 1
 
 
0 commit comments
