4 changes: 3 additions & 1 deletion examples/benchmark/generation_speed.py
@@ -23,11 +23,13 @@

import torch
from datasets import Dataset, load_dataset
from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from logbar import LogBar
from transformers import AutoTokenizer, GenerationConfig
from transformers.generation.logits_process import LogitsProcessor

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig


logger = LogBar.shared()

random.seed(0)
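The example-script changes above (and the similar ones below) all follow the same pattern: the first-party `gptqmodel` imports are moved out of the third-party block into their own group, separated by two blank lines. A minimal sketch of the layout the scripts converge on, assuming the usual stdlib / third-party / first-party grouping (the group labels are editorial, not part of the diff):

```python
# Import layout after the change (grouping labels added for illustration).

# third-party packages
import torch
from datasets import Dataset, load_dataset
from logbar import LogBar
from transformers import AutoTokenizer, GenerationConfig
from transformers.generation.logits_process import LogitsProcessor

# first-party package, now in its own group below the third-party imports
from gptqmodel import BACKEND, GPTQModel, QuantizeConfig


logger = LogBar.shared()
```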
2 changes: 2 additions & 0 deletions examples/benchmark/ipex.py
@@ -20,6 +20,7 @@
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


try:
from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf
bind_cores_for_best_perf()
@@ -29,6 +30,7 @@

import argparse


parser = argparse.ArgumentParser(description="Benchmark IPEX vs HF on a pre-trained model.")
parser.add_argument("--model", type=str, required=True, help="Path or name of the pre-trained model.")
parser.add_argument("--cores", type=int, default=8, help="Number of CPU cores to use.")
4 changes: 3 additions & 1 deletion examples/benchmark/perplexity.py
@@ -17,9 +17,11 @@
import argparse
import os

from gptqmodel.utils import Perplexity
from transformers import AutoTokenizer

from gptqmodel.utils import Perplexity


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

if __name__ == "__main__":
4 changes: 3 additions & 1 deletion examples/evaluation/run_language_modeling_task.py
@@ -18,10 +18,12 @@

import datasets
import torch
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import LanguageModelingTask
from gptqmodel.utils.torch import torch_empty_cache
from transformers import AutoTokenizer


DATASET = "tatsu-lab/alpaca"
WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n"
4 changes: 3 additions & 1 deletion examples/evaluation/run_sequence_classification_task.py
@@ -19,10 +19,12 @@

import datasets
import torch
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import SequenceClassificationTask
from gptqmodel.utils.torch import torch_empty_cache
from transformers import AutoTokenizer


DATASET = "cardiffnlp/tweet_sentiment_multilingual"
TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"
4 changes: 3 additions & 1 deletion examples/evaluation/run_text_summarization_task.py
@@ -19,10 +19,12 @@

import datasets
import torch
from transformers import AutoTokenizer, GenerationConfig

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import TextSummarizationTask
from gptqmodel.utils.torch import torch_empty_cache
from transformers import AutoTokenizer, GenerationConfig


os.system("pip install py7zr")

1 change: 1 addition & 0 deletions examples/inference/run_transformers.py
@@ -16,6 +16,7 @@

from transformers import AutoModelForCausalLM, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))
4 changes: 3 additions & 1 deletion examples/inference/run_with_different_backends.py
@@ -19,9 +19,11 @@
import sys
from argparse import ArgumentParser

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
pretrained_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "./TinyLlama/TinyLlama-1.1B-Chat-v1.0-4bit-128g"
4 changes: 3 additions & 1 deletion examples/quantization/basic_usage.py
@@ -16,9 +16,11 @@

import os

from gptqmodel import GPTQModel, QuantizeConfig, get_best_device
from transformers import AutoTokenizer

from gptqmodel import GPTQModel, QuantizeConfig, get_best_device


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
4 changes: 3 additions & 1 deletion examples/quantization/basic_usage_autoround.py
@@ -15,9 +15,11 @@
# limitations under the License.

import torch
from transformers import AutoTokenizer

from gptqmodel import GPTQModel
from gptqmodel.quantization.config import AutoRoundQuantizeConfig # noqa: E402
from transformers import AutoTokenizer


pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "./autoround/TinyLlama-1.1B-Chat-v1.0-4bit-128g"
4 changes: 3 additions & 1 deletion examples/quantization/basic_usage_wikitext2.py
@@ -16,9 +16,11 @@

import torch
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer

from gptqmodel import GPTQModel, QuantizeConfig


pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g"

1 change: 1 addition & 0 deletions examples/quantization/transformers_usage.py
@@ -16,6 +16,7 @@

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig


model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
1 change: 1 addition & 0 deletions gptqmodel/__init__.py
@@ -22,6 +22,7 @@
from .utils.exllama import exllama_set_max_input_length
from .version import __version__


if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']:
try:
from modelscope.utils.hf_util.patcher import patch_hub
12 changes: 6 additions & 6 deletions gptqmodel/adapter/adapter.py
@@ -10,7 +10,7 @@
from .peft import LoraConfig
from .remote import resolve_path

logger = setup_logger()
log = setup_logger()
LORA_MERGED_WEIGHT_PATHS = [None, ""]
HF_ADAPTER_FILE_NAME = "adapter_model.safetensors"
HF_ADAPTER_CONFIG_FILE_NAME = "adapter_config.json"
@@ -30,7 +30,7 @@ def get(cls, path: str) -> Optional[Tuple[LoraConfig, Dict[str, torch.Tensor]]]:

@classmethod
def reset(cls):
logger.info("Adapter Cache: Resetting cache")
log.info("Adapter Cache: Resetting cache")
cls.cache = {}

@classmethod
@@ -181,10 +181,10 @@ def post_init(self, weight_key: str, device:torch.device, lora_A: torch.Tensor=N
# we have consumed all modules
if len(lora_weights) == 0:
AdapterCache.remove(self.path)
logger.info("Adapter: Consumed all Lora weights")
log.info("Adapter: Consumed all Lora weights")

else:
logger.warn(f"Adapter: Lora weights not found for `{weight_key}`")
log.warn(f"Adapter: Lora weights not found for `{weight_key}`")

assert lora_A is not None and lora_B is not None, f"Adapter: `lora_A` and `lora_B` must both be present in the weights: actual = `{lora_A}` and `{lora_B}`"

@@ -198,7 +198,7 @@ def post_init(self, weight_key: str, device:torch.device, lora_A: torch.Tensor=N
# print(f"Adapter: {self.name()}, loaded lora_A shape: {lora_A.shape}")
# print(f"Adapter: {self.name()}, loaded lora_B shape: {lora_B.shape}")
if lora_A.dtype != torch.float16 or lora_A.dtype != torch.float16:
logger.warn(f"Adapter: `lora_A` and `lora_B` tensors should be of dtype = `torch.float16`: actual = `[{lora_A.dtype}, {lora_A.dtype}]`.")
log.warn(f"Adapter: `lora_A` and `lora_B` tensors should be of dtype = `torch.float16`: actual = `[{lora_A.dtype}, {lora_A.dtype}]`.")

self.lora_A = lora_A.to(device=device, dtype=torch.float16)
self.lora_B = lora_B.to(device=device, dtype=torch.float16)
@@ -216,7 +216,7 @@ def dynamic_rank_override(self, lora_cfg: LoraConfig, weight_key: str) -> bool:
# first do string full match, then suffix match, then regex match
if weight_key == k or k.endswith(weight_key) or re.match(k, weight_key):
self.rank = v
logger.info(f"Adapter: Base Lora `rank` = `{self.rank}` has been overridden by `{k}` due to dynamic `LoraConfig.rank_pattern` control.")
log.info(f"Adapter: Base Lora `rank` = `{self.rank}` has been overridden by `{k}` due to dynamic `LoraConfig.rank_pattern` control.")
return True

return False
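The remaining changes in the gptqmodel package rename the module-level logger handle from `logger` to `log` while keeping `setup_logger()` as the factory, so call sites read `log.info(...)` / `log.warn(...)`. A minimal sketch of that convention as it appears after this PR; the helper function is purely illustrative, and this assumes `setup_logger()` returns an object exposing `info` and `warn`, as the call sites in this diff do:

```python
# Inside a gptqmodel submodule: one shared module-level handle named `log`.
from ..utils.logger import setup_logger

log = setup_logger()


def reset_cache(cache: dict) -> None:
    # Hypothetical helper, shown only to illustrate the logging convention.
    log.info("Adapter Cache: Resetting cache")
    cache.clear()
```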
6 changes: 3 additions & 3 deletions gptqmodel/eora/eora.py
@@ -22,7 +22,7 @@
from ..looper.named_module import NamedModule
from ..utils.logger import setup_logger

logger = setup_logger()
log = setup_logger()

def eora_process_input(input: Tensor, name: str, eigen_scaling_diag_matrix: Dict[str, torch.dtype], sample_size: int):
inp = input[0].to(dtype=torch.float32)
@@ -54,7 +54,7 @@ def eora_compute_lora(
L, Q = torch.linalg.eigh(raw_scaling_diag_matrix)
if (L < 0).any():
## When expanding the calibration data size for EoRA, I suggest maintaining the balance by allocating 50% to general input (C4) and the remaining 50% to downstream task data.
logger.warn(f"Found negative eigenvalues in `{module.name}`. Please increase your calibration data set for EoRA.")
log.warn(f"Found negative eigenvalues in `{module.name}`. Please increase your calibration data set for EoRA.")
minimum = torch.min(L[L > 0])
L[L < 0] = minimum

@@ -64,7 +64,7 @@
try:
scaling_matrix_inv = torch.linalg.inv(scaling_diag_matrix)
except Exception:
logger.warn("`scaling_diag_matrix` is not full rank!") # TODO: assert?
log.warn("`scaling_diag_matrix` is not full rank!") # TODO: assert?
scaling_diag_matrix += 1e-6 * torch.eye(scaling_diag_matrix.shape[0]).to(device)
scaling_matrix_inv = torch.linalg.inv(scaling_diag_matrix)

2 changes: 1 addition & 1 deletion gptqmodel/looper/dequantize_processor.py
@@ -22,7 +22,7 @@
from ..utils.logger import setup_logger
from ..utils.torch import torch_compile

logger = setup_logger()
log = setup_logger()

class DequantizeProcessor(LoopProcessor):
def __init__(self, quantized_modules: Dict[str, TorchQuantLinear]):
4 changes: 2 additions & 2 deletions gptqmodel/looper/eora_processor.py
@@ -34,7 +34,7 @@
from ..utils.model import move_to
from ..utils.torch import torch_compile, torch_sync

logger = setup_logger()
log = setup_logger()


class EoraProcessor(LoopProcessor):
@@ -182,7 +182,7 @@ def process(self, module: NamedModule):
stat["dynamic"] = self.qcfg.dynamic_get(layer_name=module.full_name)

self.log.append(stat)
logger.info(stat)
log.info(stat)

# logger.info(f"Quantizing module END: {name}, {gptq[name].shape()}")
self.result_save(module.full_name, {
6 changes: 3 additions & 3 deletions gptqmodel/looper/gptq_processor.py
@@ -32,7 +32,7 @@
from ..utils.model import move_to, pack_model
from ..utils.torch import torch_sync

logger = setup_logger()
log = setup_logger()

class GPTQProcessor(LoopProcessor):
def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare_dataset_func,
@@ -90,7 +90,7 @@ def preprocess(self, module: NamedModule, buffered_fwd: bool):
# deepseek has massive # of sub-modules per layer, causing vram pressure
# buffered mode is slower due to gpu<->cpu movement
if buffered_fwd: # TODO tweak this number for masive MoE
logger.info(f"Experimental: enabling fwd buffered mode for: `{module.name}`")
log.info(f"Experimental: enabling fwd buffered mode for: `{module.name}`")
tmp.fwd_inputs_buffered = True

tmp.quantizer.configure(
@@ -160,7 +160,7 @@ def process(self, module: NamedModule):
stat["dynamic"] = self.qcfg.dynamic_get(layer_name=module.full_name)

self.log.append(stat)
logger.info(stat)
log.info(stat)

self.result_save(module.full_name, {
"scale": move_to(scale, device=CPU, stream=self.stream),
6 changes: 3 additions & 3 deletions gptqmodel/looper/loop_processor.py
@@ -27,7 +27,7 @@
from ..utils.device import get_cpu_usage_memory, get_gpu_usage_memory
from ..utils.logger import setup_logger

logger = setup_logger()
log = setup_logger()


# LoopProcessor is a singleton(), not per module instance
@@ -91,7 +91,7 @@ def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare
min_calibration_dataset_size = 256
min_calibration_dataset_input_ids_avg_length = 256
if len(calibration_dataset) < min_calibration_dataset_size:
logger.warning(f"Calibration dataset size should be more than {min_calibration_dataset_size}. "
log.warn(f"Calibration dataset size should be more than {min_calibration_dataset_size}. "
f"Current: {len(calibration_dataset)}.")

calibration_dataset = prepare_dataset_func(calibration_dataset=calibration_dataset,
@@ -119,7 +119,7 @@
avg = total_input_ids_length / len(calibration_dataset)

if avg < min_calibration_dataset_input_ids_avg_length:
logger.warning(f"The average length of input_ids of calibration_dataset should be greater than "
log.warn(f"The average length of input_ids of calibration_dataset should be greater than "
f"{min_calibration_dataset_input_ids_avg_length}: actual avg: {avg}.")

self.num_batches = len(calibration_dataset)
8 changes: 4 additions & 4 deletions gptqmodel/looper/module_looper.py
@@ -34,7 +34,7 @@
get_moe_layer_modules, move_to, nested_move_to)
from ..utils.torch import torch_empty_cache

logger = setup_logger()
log = setup_logger()

class ModuleLooper():
def __init__(self, model: BaseGPTQModel, processors: List[LoopProcessor]):
@@ -192,7 +192,7 @@ def loop(self, auto_gc=True, calibration_enable_gpu_cache=True, buffered_fwd=Fal
num_experts=num_experts)

layer_count = len(layers)
quant_modules_pb = (logger.pb(range(layer_count + 1 if self.gptq_model.quantize_config.lm_head else layer_count))
quant_modules_pb = (log.pb(range(layer_count + 1 if self.gptq_model.quantize_config.lm_head else layer_count))
.manual()
.set(left_steps_offset=1))

@@ -419,15 +419,15 @@ def loop(self, auto_gc=True, calibration_enable_gpu_cache=True, buffered_fwd=Fal
# ignore log
pass
else:
logger.info(f"{reverse_p.name()} summary:\n{reverse_p.log}")
log.info(f"{reverse_p.name()} summary:\n{reverse_p.log}")

processor_name = reverse_p.name()
total_log[processor_name] = reverse_p.log
if processor_name == "gptq":
self.gptq_model.quant_log = reverse_p.log

for module_log in reverse_p.log:
logger.info(module_log)
log.info(module_log)
reverse_p.log_plotly()

reverse_p.finalize(model=self.gptq_model, **kwargs)
1 change: 1 addition & 0 deletions gptqmodel/models/_const.py
@@ -25,6 +25,7 @@
from ..utils.rocm import IS_ROCM
from ..utils.torch import HAS_CUDA, HAS_MPS, HAS_XPU


CPU = device("cpu")
CUDA = device("cuda")
CUDA_0 = device("cuda:0")