
Commit 4042626

[Slim-LM] Enable loading from AWQ pre-quantized weight. (mlc-ai#1114)
* [SLM] Enable loading from AWQ pre-quantized weight.
* remove awq_loader.py
* Update to the latest commit
* Delete llama_parameter.py
* update unittest
* fix lint
* upd
* add Llama-2-7B-AWQ
1 parent 9869ca6 commit 4042626

File tree

11 files changed: +650 -14 lines


python/mlc_chat/compiler/loader/huggingface_loader.py

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ def __init__(
         self.cached_files = {}
         self.torch_to_path = {}
         self.quantize_param_map = quantize_param_map
-        if path.suffix in (".bin", ".safetensors"):
+        if path.suffix in (".bin", ".safetensors", ".pt"):
             self._load_file(path)
             for name in self.cached_files[path].keys():
                 self.torch_to_path[name] = path
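
The functional change is the accepted suffix list: AWQ pre-quantized checkpoints are typically distributed as single .pt files, so the loader now routes them through the same single-file path as .bin and .safetensors. Below is a minimal, self-contained sketch of that routing; the helper name, the .json branch, and the example filename are illustrative assumptions, not part of the loader itself.

from pathlib import Path

def classify_weight_file(path: Path) -> str:
    """Illustrative stand-in for the loader's suffix check."""
    if path.suffix in (".bin", ".safetensors", ".pt"):
        return "single-file checkpoint"     # .pt now accepted for AWQ weights
    if path.suffix == ".json":
        return "sharded checkpoint index"   # hypothetical index-file branch
    raise ValueError(f"unsupported weight file: {path}")

print(classify_weight_file(Path("llama-2-7b-w4-g128-awq.pt")))  # single-file checkpoint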

python/mlc_chat/compiler/model/llama_loader.py

Lines changed: 74 additions & 0 deletions
@@ -9,6 +9,7 @@
 from ..loader import ExternMapping
 from ..quantization import Quantization
 from .llama_model import LlamaConfig, LlamaForCasualLM
+from .llama_quantization import awq_quant
 
 
 def huggingface(model_config: LlamaConfig, quantization: Quantization) -> ExternMapping:
@@ -82,3 +83,76 @@ def huggingface(model_config: LlamaConfig, quantization: Quantization) -> Extern
             ),
         )
     return mapping
+
+
+def awq(model_config: LlamaConfig, quantization: Quantization) -> ExternMapping:
+    """Returns a parameter mapping that maps from the names of MLC LLM parameters to
+    the names of AWQ parameters.
+    Parameters
+    ----------
+    model_config : LlamaConfig
+        The configuration of the Llama model.
+
+    quantization : Quantization
+        The quantization configuration.
+
+    Returns
+    -------
+    param_map : ExternMapping
+        The parameter mapping from MLC to AWQ.
+    """
+    model, _ = awq_quant(model_config, quantization)
+    _, _named_params = model.export_tvm(spec=model.get_default_spec())
+    named_parameters = dict(_named_params)
+
+    mapping = ExternMapping()
+
+    for i in range(model_config.num_hidden_layers):
+        # Add QKV in self attention
+        attn = f"model.layers.{i}.self_attn"
+        for quantize_suffix in ["qweight", "qzeros", "scales"]:
+            mlc_name = f"{attn}.qkv_proj.{quantize_suffix}"
+            assert mlc_name in named_parameters
+            mlc_param = named_parameters[mlc_name]
+            mapping.add_mapping(
+                mlc_name,
+                [
+                    f"{attn}.q_proj.{quantize_suffix}",
+                    f"{attn}.k_proj.{quantize_suffix}",
+                    f"{attn}.v_proj.{quantize_suffix}",
+                ],
+                functools.partial(
+                    lambda q, k, v, dtype: np.concatenate([q, k, v], axis=0).astype(dtype),
+                    dtype=mlc_param.dtype,
+                ),
+            )
+
+        # Concat gate and up in MLP
+        mlp = f"model.layers.{i}.mlp"
+        for quantize_suffix in ["qweight", "qzeros", "scales"]:
+            mlc_name = f"{mlp}.gate_up_proj.{quantize_suffix}"
+            assert mlc_name in named_parameters
+            mlc_param = named_parameters[mlc_name]
+            mapping.add_mapping(
+                mlc_name,
+                [
+                    f"{mlp}.gate_proj.{quantize_suffix}",
+                    f"{mlp}.up_proj.{quantize_suffix}",
+                ],
+                functools.partial(
+                    lambda gate, up, dtype: np.concatenate([gate, up], axis=0).astype(dtype),
+                    dtype=mlc_param.dtype,
+                ),
+            )
+
+        # inv_freq is not used in the model
+        mapping.add_unused(f"{attn}.rotary_emb.inv_freq")
+
+    for mlc_name, mlc_param in named_parameters.items():
+        if mlc_name not in mapping.param_map:
+            mapping.add_mapping(
+                mlc_name,
+                [mlc_name],
+                functools.partial(lambda x, dtype: x.astype(dtype), dtype=mlc_param.dtype),
+            )
+    return mapping
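
To see what the new awq() mapping records, here is a minimal, self-contained sketch of the name fan-in for a single layer. It mirrors the mapping built above with a plain dict; ExternMapping itself and the dtype-casting concatenation functions are omitted, so this is illustration only.

layer = 0
attn = f"model.layers.{layer}.self_attn"
mlp = f"model.layers.{layer}.mlp"

fan_in = {}
for suffix in ("qweight", "qzeros", "scales"):
    # One MLC parameter is assembled from several AWQ parameters.
    fan_in[f"{attn}.qkv_proj.{suffix}"] = [
        f"{attn}.q_proj.{suffix}",
        f"{attn}.k_proj.{suffix}",
        f"{attn}.v_proj.{suffix}",
    ]
    fan_in[f"{mlp}.gate_up_proj.{suffix}"] = [
        f"{mlp}.gate_proj.{suffix}",
        f"{mlp}.up_proj.{suffix}",
    ]

for mlc_name, awq_names in fan_in.items():
    print(mlc_name, "<-", awq_names)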

python/mlc_chat/compiler/model/llama_quantization.py

Lines changed: 18 additions & 1 deletion
@@ -5,7 +5,7 @@
 from tvm.relax.frontend import nn
 
 from ..loader import QuantizeMapping
-from ..quantization import GroupQuantize
+from ..quantization import AWQQuantize, GroupQuantize
 from .llama_model import LlamaConfig, LlamaForCasualLM
 
 
@@ -15,6 +15,23 @@ def group_quant(
 ) -> Tuple[nn.Module, QuantizeMapping]:
     """Quantize a Llama2 model using group quantization."""
     model: nn.Module = LlamaForCasualLM(model_config)
+    model.to(quantization.model_dtype)
+    quant_map = QuantizeMapping({}, {})
+    model = quantization.quantize_model(
+        model,
+        quant_map,
+        "",
+    )
+    return model, quant_map
+
+
+def awq_quant(
+    model_config: LlamaConfig,
+    quantization: AWQQuantize,
+) -> Tuple[nn.Module, QuantizeMapping]:
+    """Quantize a Llama2 model using Activation-aware Weight Quantization(AWQ)."""
+    model: nn.Module = LlamaForCasualLM(model_config)
+    model.to(quantization.model_dtype)
     quant_map = QuantizeMapping({}, {})
     model = quantization.quantize_model(
         model,
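
Both helpers follow the same pattern: instantiate the unquantized LlamaForCasualLM, cast it to the quantization's model_dtype, then let quantize_model rewrite the module tree while recording parameter transforms in a QuantizeMapping. A self-contained sketch of that pattern using stand-in classes (the real ones live in mlc_chat.compiler and tvm.relax.frontend.nn; everything below is a mock for illustration, not the actual API):

from typing import Tuple

class MockModel:
    """Stand-in for LlamaForCasualLM (an nn.Module)."""
    def to(self, dtype: str) -> None:
        self.dtype = dtype

class MockQuantizeMapping:
    """Stand-in for QuantizeMapping: holds param-name and transform maps."""
    def __init__(self, param_map: dict, map_func: dict) -> None:
        self.param_map, self.map_func = param_map, map_func

class MockAWQQuantize:
    """Stand-in for AWQQuantize."""
    model_dtype = "float16"
    def quantize_model(self, model, quant_map, name_prefix):
        # A real implementation would swap Linear layers for AWQ layers and
        # populate quant_map; the mock returns the model untouched.
        return model

def awq_quant(config: dict, quantization: MockAWQQuantize) -> Tuple[MockModel, MockQuantizeMapping]:
    model = MockModel()
    model.to(quantization.model_dtype)        # cast params to the target dtype
    quant_map = MockQuantizeMapping({}, {})
    model = quantization.quantize_model(model, quant_map, "")
    return model, quant_map

model, quant_map = awq_quant({}, MockAWQQuantize())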

python/mlc_chat/compiler/model/model.py

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ class Model:
         source={
             "huggingface-torch": llama_loader.huggingface,
             "huggingface-safetensor": llama_loader.huggingface,
+            "awq": llama_loader.awq,
         },
         quantize={
             "group-quant": llama_quantization.group_quant,
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
"""A subpackage for quantization and dequantization algorithms"""
2+
from .awq_quantization import AWQQuantize
23
from .group_quantization import GroupQuantize
34
from .quantization import QUANTIZATION, Quantization
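
With this re-export, AWQQuantize becomes importable from the subpackage root next to the existing symbols. A one-line usage sketch, assuming the mlc_chat package from this commit is on the Python path:

# Illustrative only; requires mlc_chat as of this commit to be installed.
from mlc_chat.compiler.quantization import AWQQuantize, GroupQuantize, QUANTIZATION, Quantization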
