@@ -15,17 +15,15 @@

 from __future__ import annotations

-import copy
 import json
 import os
 import shutil
 import time
-from typing import Any, Dict, List, Optional, Union, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union

 import torch
 import torch.nn as nn
 from packaging import version
-from torch import autocast
 from transformers import AutoModelForCausalLM, PreTrainedModel, PreTrainedTokenizerBase, modeling_utils

 from ..nn_modules.hooked_linear import replace_linear_with_hooked_linear
@@ -36,11 +34,12 @@
 from ..utils.device import get_cpu_usage_memory, get_gpu_usage_memory
 from ..utils.importer import select_quant_linear
 from ..utils.logger import setup_logger
-from ..utils.model import (MODALITY, check_to_quantized, find_layers, get_device, get_module_by_name_prefix,
-                           get_moe_layer_modules, move_to, nested_move_to, normalize_tokenizer, pack_model, get_module)
+from ..utils.model import (MODALITY, check_to_quantized, find_layers, get_device,
+                           get_module, get_module_by_name_prefix, get_moe_layer_modules,
+                           move_to, nested_move_to, normalize_tokenizer, pack_model)
 from ..utils.progress import ProgressBar
 from ..utils.torch import torch_empty_cache
-from ._const import CPU, DEVICE, CUDA, SUPPORTS_MODULE_TYPES
+from ._const import CPU, DEVICE, SUPPORTS_MODULE_TYPES
 from .loader import ModelLoader
 from .writer import (QUANT_LOG_DAMP, QUANT_LOG_FWD_TIME, QUANT_LOG_LAYER,
                      QUANT_LOG_LOSS, QUANT_LOG_MODULE, QUANT_LOG_TIME, ModelWriter)
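
The two hunks above are pure import hygiene: unused imports (copy, autocast, CUDA) are dropped and the remaining names are alphabetized. A minimal sketch of the same normalization done mechanically with the isort library (isort is not part of this diff; it is shown only as an illustration):

    import isort

    messy = "from typing import Any, Dict, List, Optional, Union, Tuple\n"
    # isort alphabetizes the names inside a from-import, producing the
    # same ordering the hunk above applies by hand.
    print(isort.code(messy))
    # -> from typing import Any, Dict, List, Optional, Tuple, Union
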
@@ -402,8 +401,8 @@ def collate_batch(batch):
                 tied_keys = self.model._tied_weights_keys
                 for item in tied_keys:
                     if self.lm_head in item:
-                        raise NotImplementedError(f"quantizing lm_head with tied weights has not been supported "
-                                                  f"currently")
+                        raise NotImplementedError("quantizing lm_head with tied weights has not been supported "
+                                                  "currently")

             lm_head_module = get_module(self.model, key=self.lm_head)
             if get_module(self.model, key=self.lm_head) is None:
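
For context, `_tied_weights_keys` is the transformers attribute this guard inspects: models such as GPT-2 share the lm_head weight with the input embedding table, so quantizing one would silently corrupt the other. A minimal standalone sketch of the same check, assuming "gpt2" purely as an example checkpoint:

    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("gpt2")  # gpt2 ties lm_head to wte

    tied_keys = getattr(model, "_tied_weights_keys", None) or []
    if getattr(model.config, "tie_word_embeddings", False) and any(
            "lm_head" in key for key in tied_keys):
        # Same condition the hunk above raises on: lm_head shares storage
        # with the embeddings, so it cannot be quantized independently.
        print("lm_head is tied to the embeddings; refusing to quantize it")
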
@@ -566,7 +565,7 @@ def store_lm_head_input_hook(_, args, kwargs):
         for i in layer_pb:
             is_lm_head = i >= layer_count
             if is_lm_head:
-                layer_pb.set_description(f"Quantizing lm_head")
+                layer_pb.set_description("Quantizing lm_head")
                 layer = get_module(self.model, key=self.lm_head)
                 if self.quantize_config.lm_head and not self.quantize_config.lm_head_low_gpu_mem_usage:
                     layer_inputs = lm_head_inputs
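
The string changes in the last two hunks are the same micro-fix: an f-string with no placeholders is just a plain literal wearing an extra prefix (linters such as ruff report this as F541). A one-line check, purely illustrative:

    # An f-string without placeholders evaluates to the identical string,
    # so dropping the prefix changes nothing at runtime.
    assert f"Quantizing lm_head" == "Quantizing lm_head"
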