Commit e25717b

Update
1 parent 410d475 commit e25717b

File tree: 8 files changed, +150 −79 lines changed

python/mlc_chat/compiler/model/llama.py

Lines changed: 2 additions & 27 deletions
@@ -2,43 +2,18 @@
 Implementation for Llama2 architecture.
 TODO: add docstring
 """
-import dataclasses
 import math
-from typing import Any, Dict, Optional
+from typing import Optional
 
 from tvm import te, tir
 from tvm.relax.frontend import nn
 from tvm.relax.frontend.nn import Tensor, op
 
-from ...support.config import ConfigBase
+from .llama_config import LlamaConfig
 
 # pylint: disable=invalid-name,missing-docstring
 
 
-@dataclasses.dataclass
-class LlamaConfig(ConfigBase):  # pylint: disable=too-many-instance-attributes
-    hidden_act: str
-    hidden_size: int
-    intermediate_size: int
-    num_attention_heads: int
-    num_hidden_layers: int
-    rms_norm_eps: float
-    vocab_size: int
-    max_sequence_length: int = 2048
-    position_embedding_base: int = 10000
-    num_key_value_heads: int = 0
-    kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)
-    head_dim: int = 0
-
-    def __post_init__(self):
-        if self.num_key_value_heads == 0:
-            self.num_key_value_heads = self.num_attention_heads
-        if self.head_dim == 0:
-            self.head_dim = self.hidden_size // self.num_attention_heads
-        assert self.num_attention_heads % self.num_key_value_heads == 0
-        assert self.head_dim * self.num_attention_heads == self.hidden_size
-
-
 class RotaryEmbedding(nn.Module):
     def __init__(self, config: LlamaConfig):
         super().__init__()
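
Since llama.py now imports LlamaConfig from llama_config, existing call sites that import the config from the model module keep working unchanged, as the updated test below confirms:

    from mlc_chat.compiler.model.llama import LlamaConfig  # still resolves, via the new import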

python/mlc_chat/compiler/model/llama_config.py

Lines changed: 36 additions & 0 deletions
@@ -1,4 +1,40 @@
 """Common configuration for Llama models."""
+import dataclasses
+from typing import Any, Dict
+
+from ...support.config import ConfigBase
+
+
+@dataclasses.dataclass
+class LlamaConfig(ConfigBase):  # pylint: disable=too-many-instance-attributes
+    """Configuration of the Llama model."""
+
+    hidden_act: str
+    hidden_size: int
+    intermediate_size: int
+    num_attention_heads: int
+    num_hidden_layers: int
+    rms_norm_eps: float
+    vocab_size: int
+    max_sequence_length: int = 2048
+    position_embedding_base: int = 10000
+    num_key_value_heads: int = 0
+    kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)
+    head_dim: int = 0
+
+    def __post_init__(self):
+        if self.num_key_value_heads == 0:
+            self.num_key_value_heads = self.num_attention_heads
+        if self.head_dim == 0:
+            self.head_dim = self.hidden_size // self.num_attention_heads
+        assert self.num_attention_heads % self.num_key_value_heads == 0
+        assert self.head_dim * self.num_attention_heads == self.hidden_size
+
+    @staticmethod
+    def from_predefined(name: str) -> "LlamaConfig":
+        """Create a LlamaConfig from a predefined configuration."""
+        return LlamaConfig.from_dict(CONFIG[name])
+
 
 CONFIG = {
     "llama2_7b": {

python/mlc_chat/compiler/parameter/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -2,4 +2,5 @@
 A subpackage of the compiler that represents mapping between external parameters, quantized
 parameters and parameters in MLC-defined models.
 """
+from .hf_torch_loader import HFTorchLoader
 from .mapping import ExternMapping, QuantizeMapping
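
The re-export lets callers import the loader from the subpackage root, which the new test at the bottom of this commit relies on:

    from mlc_chat.compiler.parameter import HFTorchLoader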

python/mlc_chat/compiler/parameter/hf_torch_loader.py

Lines changed: 30 additions & 18 deletions
@@ -4,17 +4,17 @@
 import json
 import logging
 import time
-from collections import defaultdict
+from collections import OrderedDict, defaultdict
 from contextlib import contextmanager
 from pathlib import Path
 from typing import Dict, Iterator, List, Set, Tuple
 
 import numpy as np
 from tqdm import tqdm
-from tqdm.contrib.logging import logging_redirect_tqdm
 from tvm.runtime import NDArray
+from tvm.runtime.ndarray import array as as_ndarray
 
-from .mapping import ExternMapping, QuantizeMapping
+from .mapping import ExternMapping
 
 logger = logging.getLogger(__name__)
@@ -140,22 +140,32 @@ def __init__(
         _check_parameter_usage(extern_param_map, set(self.torch_to_path.keys()))
 
     def load(self) -> Iterator[Tuple[str, NDArray]]:
+        """Load the parameters and yield the MLC parameter and its value."""
         mlc_names = _loading_order(self.extern_param_map, self.torch_to_path)
-        with logging_redirect_tqdm():
-            for mlc_name in tqdm(mlc_names):
-                param = self._load_mlc_param(mlc_name)
-                yield mlc_name, param
+        for mlc_name in tqdm(mlc_names):
+            param = self._load_mlc_param(mlc_name)
+            yield mlc_name, param
         cached_files = list(self.cached_files.keys())
         for path in cached_files:
             self._unload_file(path)
-        # logger.info(
-        #     "Time used in PyTorch loading: %.3f sec. Total %.3f GB loaded",
-        #     self.stats_load_time_sec,
-        #     self.stats_load_data_gb,
-        # )
+
+        logger.info(
+            "Time used: "
+            "PyTorch loading: %.3f sec; "
+            "Pre-quantization mapping: %.3f sec; "
+            "Quantization: %.3f sec",
+            self.stats.load_time_sec,
+            self.stats.map_time_sec,
+            self.stats.quant_time_sec,
+        )
+        logger.info(
+            "Memory usage: Total size loaded from disk: %.3f GB; Peak memory usage: %.3f GB",
+            self.stats.total_memory_gb,
+            self.stats.max_memory_gb,
+        )
 
     def _load_mlc_param(self, mlc_name: str) -> np.ndarray:
-        torch_names = self.extern_param_map.name_map[mlc_name]
+        torch_names = self.extern_param_map.param_map[mlc_name]
         files_required = {self.torch_to_path[p] for p in torch_names}
         files_existing = set(self.cached_files.keys())
         files_to_load = files_required - files_existing
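
The stats object is not part of this diff; below is a minimal sketch of the interface the logging above assumes, with field and method names taken from their uses in this file (the class name is hypothetical):

    import time
    from contextlib import contextmanager
    from dataclasses import dataclass


    @dataclass
    class _LoaderStats:  # hypothetical stand-in for the loader's `stats` attribute
        load_time_sec: float = 0.0
        map_time_sec: float = 0.0
        quant_time_sec: float = 0.0
        total_memory_gb: float = 0.0
        max_memory_gb: float = 0.0

        @contextmanager
        def timer(self, attr: str):
            # Accumulate wall-clock time into the named counter,
            # e.g. `with stats.timer("map_time_sec"): ...`
            start = time.time()
            yield
            setattr(self, attr, getattr(self, attr) + time.time() - start)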
@@ -176,6 +186,7 @@ def _load_mlc_param(self, mlc_name: str) -> np.ndarray:
         with self.stats.timer("map_time_sec"):
             param = self.extern_param_map.map_func[mlc_name](*torch_params)
         logger.info(' Parameter: "%s", shape: %s, dtype: %s', mlc_name, param.shape, param.dtype)
+        param = as_ndarray(param)
         return param
 
     def _load_file(self, path: Path) -> None:
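
The added as_ndarray call (an alias of tvm.runtime.ndarray.array, i.e. tvm.nd.array) converts the mapped NumPy array to a TVM NDArray, so load() yields the Tuple[str, NDArray] its signature declares:

    import numpy as np
    from tvm.runtime.ndarray import array as as_ndarray

    param = as_ndarray(np.zeros((4, 4), dtype="float32"))  # a tvm.nd.NDArray on CPU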
@@ -197,7 +208,7 @@ def _unload_file(self, path: Path) -> None:
 
 
 def _check_parameter_usage(param_map: ExternMapping, torch_weights: Set[str]):
-    used_torch_names = set(sum(param_map.name_map.values(), ()))
+    used_torch_names = set(sum(param_map.param_map.values(), ()))
     # Check 1. All PyTorch parameters in the weight files are used unless explicitly specified
     unused_torch_names = torch_weights - used_torch_names - param_map.unused_params
     if unused_torch_names:
@@ -233,16 +244,17 @@ def _loading_order(param_map: ExternMapping, torch_to_path: Dict[str, Path]) ->
         path_to_torch[path].append(torch_name)
     # Step 2. Build a map from torch parameters to MLC parameters
     torch_to_mlc = defaultdict(list)
-    for mlc_name, torch_names in param_map.name_map.items():
+    for mlc_name, torch_names in param_map.param_map.items():
         for torch_name in torch_names:
             torch_to_mlc[torch_name].append(mlc_name)
     # Step 3. Construct the ordering that ensures file locality
-    order = []
+    order = OrderedDict()
     for _, torch_names in path_to_torch.items():
         for torch_name in torch_names:
             for mlc_name in torch_to_mlc[torch_name]:
-                order.append(mlc_name)
-    return order
+                if mlc_name not in order:
+                    order[mlc_name] = 1
+    return list(order.keys())
 
 
 __all__ = ["HFTorchLoader"]

python/mlc_chat/support/tqdm.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+"""Utils to better use tqdm"""
+import contextlib
+import inspect
+import io
+
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm as _redirect_logging
+
+
+@contextlib.contextmanager
+def _redirect_print():
+    old_print = print
+
+    def new_print(*args, **kwargs):
+        with io.StringIO() as output:
+            kwargs["file"] = output
+            kwargs["end"] = ""
+            old_print(*args, **kwargs)
+            content = output.getvalue()
+        tqdm.write(content)
+
+    try:
+        inspect.builtins.print = new_print
+        yield
+    finally:
+        inspect.builtins.print = old_print
+
+
+@contextlib.contextmanager
+def redirect():
+    """Redirect tqdm output to logging and print."""
+
+    with _redirect_logging():
+        with _redirect_print():
+            yield
+
+
+__all__ = ["tqdm", "redirect"]

tests/python/model/test_llama.py

Lines changed: 1 addition & 2 deletions
@@ -1,12 +1,11 @@
 # pylint: disable=invalid-name,missing-docstring
 import pytest
 from mlc_chat.compiler.model.llama import LlamaConfig, LlamaForCasualLM
-from mlc_chat.compiler.model.llama_config import CONFIG
 
 
 @pytest.mark.parametrize("model_name", ["llama2_7b", "llama2_13b", "llama2_70b"])
 def test_llama2_creation(model_name: str):
-    config = LlamaConfig.from_dict(CONFIG[model_name])
+    config = LlamaConfig.from_predefined(model_name)
     model = LlamaForCasualLM(config)
     mod, named_params = model.export_tvm(spec=model.get_default_spec())
     mod.show(black_format=False)

tests/python/parameter/hf_torch_loader.py

Lines changed: 0 additions & 32 deletions — this file was deleted.

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+# pylint: disable=missing-docstring
+import logging
+from pathlib import Path
+
+import pytest
+from mlc_chat.compiler.model.llama import LlamaConfig
+from mlc_chat.compiler.model.llama_parameter import hf_torch
+from mlc_chat.compiler.parameter import HFTorchLoader
+from mlc_chat.support import tqdm
+
+logging.basicConfig(
+    level=logging.DEBUG,
+    style="{",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    format="[{asctime}] {levelname} {filename}:{lineno}: {message}",
+)
+
+
+@pytest.mark.parametrize(
+    "base_path",
+    [
+        "./dist/models/Llama-2-7b-hf",
+        "./dist/models/Llama-2-13b-hf",
+        "./dist/models/Llama-2-70b-hf",
+    ],
+)
+def test_load_llama(base_path: str):
+    base_path = Path(base_path)
+    path_config = base_path / "config.json"
+    path_params = base_path / "pytorch_model.bin.index.json"
+
+    config = LlamaConfig.from_file(path_config)
+    loader = HFTorchLoader(path=path_params, extern_param_map=hf_torch(config))
+    with tqdm.redirect():
+        for _name, _param in loader.load():
+            ...
+
+
+if __name__ == "__main__":
+    test_load_llama(base_path="./dist/models/Llama-2-7b-hf")
+    test_load_llama(base_path="./dist/models/Llama-2-13b-hf")
+    test_load_llama(base_path="./dist/models/Llama-2-70b-hf")
