diff --git a/examples/benchmark/generation_speed.py b/examples/benchmark/generation_speed.py index 65f12b5cf..1fe340e87 100644 --- a/examples/benchmark/generation_speed.py +++ b/examples/benchmark/generation_speed.py @@ -23,11 +23,13 @@ import torch from datasets import Dataset, load_dataset -from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from logbar import LogBar from transformers import AutoTokenizer, GenerationConfig from transformers.generation.logits_process import LogitsProcessor +from gptqmodel import BACKEND, GPTQModel, QuantizeConfig + + logger = LogBar.shared() random.seed(0) diff --git a/examples/benchmark/ipex.py b/examples/benchmark/ipex.py index f6d495788..170e96728 100644 --- a/examples/benchmark/ipex.py +++ b/examples/benchmark/ipex.py @@ -20,6 +20,7 @@ import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + try: from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf bind_cores_for_best_perf() @@ -29,6 +30,7 @@ import argparse + parser = argparse.ArgumentParser(description="Benchmark IPEX vs HF on a pre-trained model.") parser.add_argument("--model", type=str, required=True, help="Path or name of the pre-trained model.") parser.add_argument("--cores", type=int, default=8, help="Number of CPU cores to use.") diff --git a/examples/benchmark/perplexity.py b/examples/benchmark/perplexity.py index edadcb32f..0968d5193 100644 --- a/examples/benchmark/perplexity.py +++ b/examples/benchmark/perplexity.py @@ -17,9 +17,11 @@ import argparse import os -from gptqmodel.utils import Perplexity from transformers import AutoTokenizer +from gptqmodel.utils import Perplexity + + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" if __name__ == "__main__": diff --git a/examples/evaluation/run_language_modeling_task.py b/examples/evaluation/run_language_modeling_task.py index fce213b48..f31d6fa2d 100644 --- a/examples/evaluation/run_language_modeling_task.py +++ b/examples/evaluation/run_language_modeling_task.py @@ -18,10 +18,12 @@ import datasets import torch +from transformers import AutoTokenizer + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from gptqmodel.eval_tasks import LanguageModelingTask from gptqmodel.utils.torch import torch_empty_cache -from transformers import AutoTokenizer + DATASET = "tatsu-lab/alpaca" WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n" diff --git a/examples/evaluation/run_sequence_classification_task.py b/examples/evaluation/run_sequence_classification_task.py index 36d0324c3..38790bc84 100644 --- a/examples/evaluation/run_sequence_classification_task.py +++ b/examples/evaluation/run_sequence_classification_task.py @@ -19,10 +19,12 @@ import datasets import torch +from transformers import AutoTokenizer + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from gptqmodel.eval_tasks import SequenceClassificationTask from gptqmodel.utils.torch import torch_empty_cache -from transformers import AutoTokenizer + DATASET = "cardiffnlp/tweet_sentiment_multilingual" TEMPLATE = "Question:What's the sentiment of the given text? 
Choices are {labels}.\nText: {text}\nAnswer:" diff --git a/examples/evaluation/run_text_summarization_task.py b/examples/evaluation/run_text_summarization_task.py index a1edb620a..a4abb9829 100644 --- a/examples/evaluation/run_text_summarization_task.py +++ b/examples/evaluation/run_text_summarization_task.py @@ -19,10 +19,12 @@ import datasets import torch +from transformers import AutoTokenizer, GenerationConfig + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from gptqmodel.eval_tasks import TextSummarizationTask from gptqmodel.utils.torch import torch_empty_cache -from transformers import AutoTokenizer, GenerationConfig + os.system("pip install py7zr") diff --git a/examples/inference/run_transformers.py b/examples/inference/run_transformers.py index bc9bed650..4b8fc18d9 100644 --- a/examples/inference/run_transformers.py +++ b/examples/inference/run_transformers.py @@ -16,6 +16,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ") quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ") print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0])) diff --git a/examples/inference/run_with_different_backends.py b/examples/inference/run_with_different_backends.py index 6ea5cbd5d..5d08066cd 100644 --- a/examples/inference/run_with_different_backends.py +++ b/examples/inference/run_with_different_backends.py @@ -19,9 +19,11 @@ import sys from argparse import ArgumentParser -from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device from transformers import AutoTokenizer +from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device + + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" pretrained_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "./TinyLlama/TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/basic_usage.py b/examples/quantization/basic_usage.py index 39eada708..6819bc4fe 100644 --- a/examples/quantization/basic_usage.py +++ b/examples/quantization/basic_usage.py @@ -16,9 +16,11 @@ import os -from gptqmodel import GPTQModel, QuantizeConfig, get_best_device from transformers import AutoTokenizer +from gptqmodel import GPTQModel, QuantizeConfig, get_best_device + + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" diff --git a/examples/quantization/basic_usage_autoround.py b/examples/quantization/basic_usage_autoround.py index 436a18ba1..0c27ed7b1 100644 --- a/examples/quantization/basic_usage_autoround.py +++ b/examples/quantization/basic_usage_autoround.py @@ -15,9 +15,11 @@ # limitations under the License. 
import torch +from transformers import AutoTokenizer + from gptqmodel import GPTQModel from gptqmodel.quantization.config import AutoRoundQuantizeConfig # noqa: E402 -from transformers import AutoTokenizer + pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "./autoround/TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/basic_usage_wikitext2.py b/examples/quantization/basic_usage_wikitext2.py index ac1ba63d9..95ba908ad 100644 --- a/examples/quantization/basic_usage_wikitext2.py +++ b/examples/quantization/basic_usage_wikitext2.py @@ -16,9 +16,11 @@ import torch from datasets import load_dataset -from gptqmodel import GPTQModel, QuantizeConfig from transformers import AutoTokenizer +from gptqmodel import GPTQModel, QuantizeConfig + + pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/transformers_usage.py b/examples/quantization/transformers_usage.py index c9e15b5fb..75b1e7a74 100755 --- a/examples/quantization/transformers_usage.py +++ b/examples/quantization/transformers_usage.py @@ -16,6 +16,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig + model_id = "facebook/opt-125m" tokenizer = AutoTokenizer.from_pretrained(model_id) dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."] diff --git a/gptqmodel/__init__.py b/gptqmodel/__init__.py index 4a13698b4..68f4e6ef7 100644 --- a/gptqmodel/__init__.py +++ b/gptqmodel/__init__.py @@ -22,6 +22,7 @@ from .utils.exllama import exllama_set_max_input_length from .version import __version__ + if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']: try: from modelscope.utils.hf_util.patcher import patch_hub diff --git a/gptqmodel/adapter/adapter.py b/gptqmodel/adapter/adapter.py index af7ca0ed2..f6cea791e 100644 --- a/gptqmodel/adapter/adapter.py +++ b/gptqmodel/adapter/adapter.py @@ -10,7 +10,7 @@ from .peft import LoraConfig from .remote import resolve_path -logger = setup_logger() +log = setup_logger() LORA_MERGED_WEIGHT_PATHS = [None, ""] HF_ADAPTER_FILE_NAME = "adapter_model.safetensors" HF_ADAPTER_CONFIG_FILE_NAME = "adapter_config.json" @@ -30,7 +30,7 @@ def get(cls, path: str) -> Optional[Tuple[LoraConfig, Dict[str, torch.Tensor]]]: @classmethod def reset(cls): - logger.info("Adapter Cache: Resetting cache") + log.info("Adapter Cache: Resetting cache") cls.cache = {} @classmethod @@ -181,10 +181,10 @@ def post_init(self, weight_key: str, device:torch.device, lora_A: torch.Tensor=N # we have consumed all modules if len(lora_weights) == 0: AdapterCache.remove(self.path) - logger.info("Adapter: Consumed all Lora weights") + log.info("Adapter: Consumed all Lora weights") else: - logger.warn(f"Adapter: Lora weights not found for `{weight_key}`") + log.warn(f"Adapter: Lora weights not found for `{weight_key}`") assert lora_A is not None and lora_B is not None, f"Adapter: `lora_A` and `lora_B` must both be present in the weights: actual = `{lora_A}` and `{lora_B}`" @@ -198,7 +198,7 @@ def post_init(self, weight_key: str, device:torch.device, lora_A: torch.Tensor=N # print(f"Adapter: {self.name()}, loaded lora_A shape: {lora_A.shape}") # print(f"Adapter: {self.name()}, loaded lora_B shape: {lora_B.shape}") if lora_A.dtype != torch.float16 or lora_A.dtype != torch.float16: - logger.warn(f"Adapter: 
`lora_A` and `lora_B` tensors should be of dtype = `torch.float16`: actual = `[{lora_A.dtype}, {lora_A.dtype}]`.") + log.warn(f"Adapter: `lora_A` and `lora_B` tensors should be of dtype = `torch.float16`: actual = `[{lora_A.dtype}, {lora_A.dtype}]`.") self.lora_A = lora_A.to(device=device, dtype=torch.float16) self.lora_B = lora_B.to(device=device, dtype=torch.float16) @@ -216,7 +216,7 @@ def dynamic_rank_override(self, lora_cfg: LoraConfig, weight_key: str) -> bool: # first do string full match, then suffix match, then regex match if weight_key == k or k.endswith(weight_key) or re.match(k, weight_key): self.rank = v - logger.info(f"Adapter: Base Lora `rank` = `{self.rank}` has been overridden by `{k}` due to dynamic `LoraConfig.rank_pattern` control.") + log.info(f"Adapter: Base Lora `rank` = `{self.rank}` has been overridden by `{k}` due to dynamic `LoraConfig.rank_pattern` control.") return True return False diff --git a/gptqmodel/eora/eora.py b/gptqmodel/eora/eora.py index 6f1ea09f0..956b13ef3 100644 --- a/gptqmodel/eora/eora.py +++ b/gptqmodel/eora/eora.py @@ -22,7 +22,7 @@ from ..looper.named_module import NamedModule from ..utils.logger import setup_logger -logger = setup_logger() +log = setup_logger() def eora_process_input(input: Tensor, name: str, eigen_scaling_diag_matrix: Dict[str, torch.dtype], sample_size: int): inp = input[0].to(dtype=torch.float32) @@ -54,7 +54,7 @@ def eora_compute_lora( L, Q = torch.linalg.eigh(raw_scaling_diag_matrix) if (L < 0).any(): ## When expanding the calibration data size for EoRA, I suggest maintaining the balance by allocating 50% to general input (C4) and the remaining 50% to downstream task data. - logger.warn(f"Found negative eigenvalues in `{module.name}`. Please increase your calibration data set for EoRA.") + log.warn(f"Found negative eigenvalues in `{module.name}`. Please increase your calibration data set for EoRA.") minimum = torch.min(L[L > 0]) L[L < 0] = minimum @@ -64,7 +64,7 @@ def eora_compute_lora( try: scaling_matrix_inv = torch.linalg.inv(scaling_diag_matrix) except Exception: - logger.warn("`scaling_diag_matrix` is not full rank!") # TODO: assert? + log.warn("`scaling_diag_matrix` is not full rank!") # TODO: assert? 
scaling_diag_matrix += 1e-6 * torch.eye(scaling_diag_matrix.shape[0]).to(device) scaling_matrix_inv = torch.linalg.inv(scaling_diag_matrix) diff --git a/gptqmodel/looper/dequantize_processor.py b/gptqmodel/looper/dequantize_processor.py index 26f7fd674..7ef71a0f1 100644 --- a/gptqmodel/looper/dequantize_processor.py +++ b/gptqmodel/looper/dequantize_processor.py @@ -22,7 +22,7 @@ from ..utils.logger import setup_logger from ..utils.torch import torch_compile -logger = setup_logger() +log = setup_logger() class DequantizeProcessor(LoopProcessor): def __init__(self, quantized_modules: Dict[str, TorchQuantLinear]): diff --git a/gptqmodel/looper/eora_processor.py b/gptqmodel/looper/eora_processor.py index 5da732acc..6eee539f6 100644 --- a/gptqmodel/looper/eora_processor.py +++ b/gptqmodel/looper/eora_processor.py @@ -34,7 +34,7 @@ from ..utils.model import move_to from ..utils.torch import torch_compile, torch_sync -logger = setup_logger() +log = setup_logger() class EoraProcessor(LoopProcessor): @@ -182,7 +182,7 @@ def process(self, module: NamedModule): stat["dynamic"] = self.qcfg.dynamic_get(layer_name=module.full_name) self.log.append(stat) - logger.info(stat) + log.info(stat) # logger.info(f"Quantizing module END: {name}, {gptq[name].shape()}") self.result_save(module.full_name, { diff --git a/gptqmodel/looper/gptq_processor.py b/gptqmodel/looper/gptq_processor.py index c57da50e5..e7dc7740f 100644 --- a/gptqmodel/looper/gptq_processor.py +++ b/gptqmodel/looper/gptq_processor.py @@ -32,7 +32,7 @@ from ..utils.model import move_to, pack_model from ..utils.torch import torch_sync -logger = setup_logger() +log = setup_logger() class GPTQProcessor(LoopProcessor): def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare_dataset_func, @@ -90,7 +90,7 @@ def preprocess(self, module: NamedModule, buffered_fwd: bool): # deepseek has massive # of sub-modules per layer, causing vram pressure # buffered mode is slower due to gpu<->cpu movement if buffered_fwd: # TODO tweak this number for masive MoE - logger.info(f"Experimental: enabling fwd buffered mode for: `{module.name}`") + log.info(f"Experimental: enabling fwd buffered mode for: `{module.name}`") tmp.fwd_inputs_buffered = True tmp.quantizer.configure( @@ -160,7 +160,7 @@ def process(self, module: NamedModule): stat["dynamic"] = self.qcfg.dynamic_get(layer_name=module.full_name) self.log.append(stat) - logger.info(stat) + log.info(stat) self.result_save(module.full_name, { "scale": move_to(scale, device=CPU, stream=self.stream), diff --git a/gptqmodel/looper/loop_processor.py b/gptqmodel/looper/loop_processor.py index 13fba7c86..eb01996dd 100644 --- a/gptqmodel/looper/loop_processor.py +++ b/gptqmodel/looper/loop_processor.py @@ -27,7 +27,7 @@ from ..utils.device import get_cpu_usage_memory, get_gpu_usage_memory from ..utils.logger import setup_logger -logger = setup_logger() +log = setup_logger() # LoopProcessor is a singleton(), not per module instance @@ -91,7 +91,7 @@ def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare min_calibration_dataset_size = 256 min_calibration_dataset_input_ids_avg_length = 256 if len(calibration_dataset) < min_calibration_dataset_size: - logger.warning(f"Calibration dataset size should be more than {min_calibration_dataset_size}. " + log.warn(f"Calibration dataset size should be more than {min_calibration_dataset_size}. 
" f"Current: {len(calibration_dataset)}.") calibration_dataset = prepare_dataset_func(calibration_dataset=calibration_dataset, @@ -119,7 +119,7 @@ def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare avg = total_input_ids_length / len(calibration_dataset) if avg < min_calibration_dataset_input_ids_avg_length: - logger.warning(f"The average length of input_ids of calibration_dataset should be greater than " + log.warn(f"The average length of input_ids of calibration_dataset should be greater than " f"{min_calibration_dataset_input_ids_avg_length}: actual avg: {avg}.") self.num_batches = len(calibration_dataset) diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index d897517b9..16684a008 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -34,7 +34,7 @@ get_moe_layer_modules, move_to, nested_move_to) from ..utils.torch import torch_empty_cache -logger = setup_logger() +log = setup_logger() class ModuleLooper(): def __init__(self, model: BaseGPTQModel, processors: List[LoopProcessor]): @@ -192,7 +192,7 @@ def loop(self, auto_gc=True, calibration_enable_gpu_cache=True, buffered_fwd=Fal num_experts=num_experts) layer_count = len(layers) - quant_modules_pb = (logger.pb(range(layer_count + 1 if self.gptq_model.quantize_config.lm_head else layer_count)) + quant_modules_pb = (log.pb(range(layer_count + 1 if self.gptq_model.quantize_config.lm_head else layer_count)) .manual() .set(left_steps_offset=1)) @@ -419,7 +419,7 @@ def loop(self, auto_gc=True, calibration_enable_gpu_cache=True, buffered_fwd=Fal # ignore log pass else: - logger.info(f"{reverse_p.name()} summary:\n{reverse_p.log}") + log.info(f"{reverse_p.name()} summary:\n{reverse_p.log}") processor_name = reverse_p.name() total_log[processor_name] = reverse_p.log @@ -427,7 +427,7 @@ def loop(self, auto_gc=True, calibration_enable_gpu_cache=True, buffered_fwd=Fal self.gptq_model.quant_log = reverse_p.log for module_log in reverse_p.log: - logger.info(module_log) + log.info(module_log) reverse_p.log_plotly() reverse_p.finalize(model=self.gptq_model, **kwargs) diff --git a/gptqmodel/models/_const.py b/gptqmodel/models/_const.py index 083418973..6152e9dfa 100644 --- a/gptqmodel/models/_const.py +++ b/gptqmodel/models/_const.py @@ -25,6 +25,7 @@ from ..utils.rocm import IS_ROCM from ..utils.torch import HAS_CUDA, HAS_MPS, HAS_XPU + CPU = device("cpu") CUDA = device("cuda") CUDA_0 = device("cuda:0") diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py index a3c1a5aea..ef611839a 100644 --- a/gptqmodel/models/auto.py +++ b/gptqmodel/models/auto.py @@ -20,18 +20,20 @@ from ..utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() if not os.environ.get("PYTORCH_CUDA_ALLOC_CONF", None): os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'expandable_segments:True' - logger.info("ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.") + log.info("ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.") if not os.environ.get("CUDA_DEVICE_ORDER", None): os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID' - logger.info("ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.") + log.info("ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.") import sys # noqa: E402 + # TODO: waiting for pytorch implementgation of aten ops for MPS if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" @@ -106,6 +108,7 @@ from .definitions.xverse 
import XverseGPTQ # noqa: E402 from .definitions.yi import YiGPTQ # noqa: E402 + # make quants and inference more determinisitc torch.manual_seed(787) random.seed(787) @@ -247,13 +250,13 @@ def from_pretrained( ) -> BaseGPTQModel: if hasattr(AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code), "quantization_config"): - logger.warning("Model is already quantized, will use `from_quantized` to load quantized model.\n" + log.warn("Model is already quantized, will use `from_quantized` to load quantized model.\n" "If you want to quantize the model, please pass un_quantized model path or id, and use " "`from_pretrained` with `quantize_config`.") return cls.from_quantized(model_id_or_path, trust_remote_code=trust_remote_code) if quantize_config and quantize_config.dynamic: - logger.warning( + log.warn( "GPTQModel's per-module `dynamic` quantization feature is currently not upstreamed to hf/vllm/sglang. If you're using vllm, you need to install this PR: https://github.com/vllm-project/vllm/pull/7086") model_type = check_and_get_model_type(model_id_or_path, trust_remote_code) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 590b851d7..b74ba22b0 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -28,8 +28,14 @@ from packaging import version from packaging.version import Version from tokenicer import Tokenicer -from transformers import (AutoModelForCausalLM, AutoProcessor, PreTrainedModel, - PreTrainedTokenizerBase, ProcessorMixin, modeling_utils) +from transformers import ( + AutoModelForCausalLM, + AutoProcessor, + PreTrainedModel, + PreTrainedTokenizerBase, + ProcessorMixin, + modeling_utils, +) from ..adapter.adapter import Adapter from ..nn_modules.hooked_linear import replace_linear_with_hooked_linear @@ -43,13 +49,31 @@ from ..utils.hf import autofix_hf_model_config from ..utils.importer import select_quant_linear from ..utils.logger import setup_logger -from ..utils.model import (MODALITY, check_to_quantized, find_modules, get_device, get_module, - get_module_by_name_prefix, get_moe_layer_modules, move_to, nested_move_to, pack_model) +from ..utils.model import ( + MODALITY, + check_to_quantized, + find_modules, + get_device, + get_module, + get_module_by_name_prefix, + get_moe_layer_modules, + move_to, + nested_move_to, + pack_model, +) from ..utils.torch import torch_compile, torch_empty_cache from ._const import CALIBRATION_DATASET_CONCAT_CHAR, CPU, DEFAULT_MAX_SHARD_SIZE, DEVICE, SUPPORTS_MODULE_TYPES from .loader import ModelLoader -from .writer import (PROCESS_LOG_FWD_TIME, PROCESS_LOG_LAYER, PROCESS_LOG_MODULE, - PROCESS_LOG_TIME, QUANT_LOG_DAMP, QUANT_LOG_LOSS, ModelWriter) +from .writer import ( + PROCESS_LOG_FWD_TIME, + PROCESS_LOG_LAYER, + PROCESS_LOG_MODULE, + PROCESS_LOG_TIME, + QUANT_LOG_DAMP, + QUANT_LOG_LOSS, + ModelWriter, +) + # pytorch 2.6.0 fixes many compilation errors TORCH_MIN_VERSION_STR = '2.6.0' @@ -63,7 +87,7 @@ def check_support_param_buffer_assignment(*args, **kwargs): # See https://github.com/huggingface/transformers/issues/34366 modeling_utils.check_support_param_buffer_assignment = check_support_param_buffer_assignment -logger = setup_logger() +log = setup_logger() class BaseGPTQModel(nn.Module): # these modules are non-repeating and at the root level @@ -183,10 +207,10 @@ def __init__( if all(hasattr(m.adapter, name) for name in Lora.parameter_keys()): loaded_loras += 1 - logger.info(f"Adapter: `{loaded_loras}` EoRA/Lora adapters loaded for `{len(qmodules)}` modules.") + 
log.info(f"Adapter: `{loaded_loras}` EoRA/Lora adapters loaded for `{len(qmodules)}` modules.") # print kernel info: - logger.info(f"Kernel: loaded -> `[{', '.join(cls.__name__ for cls in self.kernels())}]`") + log.info(f"Kernel: loaded -> `[{', '.join(cls.__name__ for cls in self.kernels())}]`") def prepare_dataset( self, @@ -566,7 +590,7 @@ def quantize_old( min_calibration_dataset_input_ids_avg_length = 256 if len(calibration_dataset) < min_calibration_dataset_size: - logger.warning(f"Calibration dataset size should be more than {min_calibration_dataset_size}. " + log.warn(f"Calibration dataset size should be more than {min_calibration_dataset_size}. " f"Current: {len(calibration_dataset)}.") if self.quantize_config.format == FORMAT.BITBLAS: @@ -599,7 +623,7 @@ def quantize_old( avg = total_input_ids_length / len(calibration_dataset) if avg < min_calibration_dataset_input_ids_avg_length: - logger.warning(f"The average length of input_ids of calibration_dataset should be greater than " + log.warn(f"The average length of input_ids of calibration_dataset should be greater than " f"{min_calibration_dataset_input_ids_avg_length}: actual avg: {avg}.") if isinstance(self.quantize_config, AutoRoundQuantizeConfig): @@ -820,7 +844,7 @@ def store_input_hook(_, args, kwargs): quantizers = {} layer_count = len(layers) - quant_modules_pb = logger.pb(range(layer_count + 1 if self.quantize_config.lm_head else layer_count)).manual() + quant_modules_pb = log.pb(range(layer_count + 1 if self.quantize_config.lm_head else layer_count)).manual() gpu_memorys = [] cpu_memorys = [] durations = [] @@ -881,7 +905,7 @@ def store_input_hook(_, args, kwargs): layer_name = self.lm_head if is_lm_head_module else f"{self.layers_node}.{module_index}.{name}" if self.quantize_config.dynamic_get(layer_name=layer_name) == False: # noqa: E712 - logger.info(f"skip module: {layer_name}") + log.info(f"skip module: {layer_name}") skipped_modules.append(name) continue @@ -903,7 +927,7 @@ def store_input_hook(_, args, kwargs): # deepseek has massive # of sub-modules per layer, causing vram pressure # buffered mode is slower due to gpu<->cpu movement if buffered_fwd: # TODO tweak this number for masive MoE - logger.info(f"Experimental: enabling fwd buffered mode for: `{name}`") + log.info(f"Experimental: enabling fwd buffered mode for: `{name}`") tmp.fwd_inputs_buffered = True tmp.quantizer.configure( @@ -1016,7 +1040,7 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor): stat["dynamic"] = self.quantize_config.dynamic_get(layer_name=layer_name) self.quant_log.append(stat) - logger.info(stat) + log.info(stat) quantizers[layer_name] = ( gptq[name].quantizer.to(CPU), @@ -1079,9 +1103,9 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor): if auto_gc: torch_empty_cache() - logger.info(f"Quantization summary:\n{self.quant_log}") + log.info(f"Quantization summary:\n{self.quant_log}") for module_log in self.quant_log: - logger.info(module_log) + log.info(module_log) if task is not None: x = list(range(layer_count)) gpu_fig = create_plotly(x=x, y=gpu_memorys, xaxis_title="layer", yaxis_title="GPU usage (GB)") @@ -1153,7 +1177,7 @@ def push_to_hub(self, exists_ok: bool = False, # set to true if repo already exists token: Optional[str] = None): - logger.error("`push_to_hub()` api cannot be used on the model instance. Please use `GPTQModel.push_to_hub()` static api instead.") + log.error("`push_to_hub()` api cannot be used on the model instance. 
Please use `GPTQModel.push_to_hub()` static api instead.") def save( self, @@ -1199,31 +1223,31 @@ def kernels(self) -> List[Type[BaseQuantLinear]]: return list(loaded_kernels) def compile(self, backend: str = "inductor", mode: str = None, fullgraph: bool = False): - logger.warn("Deprecation: `model.compile()` is deprecated. Please use `model.optimize()` instead.") + log.warn("Deprecation: `model.compile()` is deprecated. Please use `model.optimize()` instead.") return self.optimize(backend=backend, mode=mode, fullgraph=fullgraph) def optimize(self, backend: str = "inductor", mode: str = None, fullgraph: bool = False): if not self.quantized: - logger.warning("model is not quantized, skip compiling...") + log.warn("model is not quantized, skip compiling...") return self if Version(torch.__version__) < PYTORCH_MIN_VERSION_WITH_COMPILE: self.compiled = False - logger.warning(f"To use compile(), you need to have torch version >= {TORCH_MIN_VERSION_STR}, please " + log.warn(f"To use compile(), you need to have torch version >= {TORCH_MIN_VERSION_STR}, please " f"upgrade it by `pip install torch -U`") return self # needed by eora # torch._dynamo.config.capture_scalar_outputs = True - logger.info(f"Compiling qlinear modules with backend: `{backend}`, mode: `{mode}`") + log.info(f"Compiling qlinear modules with backend: `{backend}`, mode: `{mode}`") modules = find_modules(self.model, layers=[BaseQuantLinear]) for name in modules.keys(): modules[name].optimize(fullgraph=False, backend=backend, mode=mode) # supress errors until PyTorch fixed: https://github.com/pytorch/pytorch/issues/132635 # torch._dynamo.config.suppress_errors = True - logger.info(f"Compiling model with backend: `{backend}`, mode: `{mode}`") + log.info(f"Compiling model with backend: `{backend}`, mode: `{mode}`") self.model = torch_compile(self.model, fullgraph=fullgraph, backend=backend, mode=mode) diff --git a/gptqmodel/models/definitions/gemma2.py b/gptqmodel/models/definitions/gemma2.py index c7b983402..c528e688c 100644 --- a/gptqmodel/models/definitions/gemma2.py +++ b/gptqmodel/models/definitions/gemma2.py @@ -18,7 +18,8 @@ from ...utils.logger import setup_logger from ..base import BaseGPTQModel -logger = setup_logger() + +log = setup_logger() SUPPORT_ERR = "Currently, only vLLM/SGLang with flashinfer enabled can correctly inference a quantized Gemma2-27B model. Pre-quantized model with sample vLLM code: https://huggingface.co/ModelCloud/gemma-2-27b-it-gptq-4bit ." @@ -44,7 +45,7 @@ def __init__(self, *args, **kwargs): # The gemma-2 model 9b has 42 hidden layers, while the gemma-2 model 27b has 46 hidden layers. if num_hidden_layers > 42: if not self.quantized: - logger.warning(SUPPORT_ERR) + log.warn(SUPPORT_ERR) return # quantized gemma-2 27b model only support vLLM/SGLang load. 
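The recurring change across these hunks is the rename of the module-level `logger` handle to `log`, together with the move from `logger.warning(...)` / `logger.warning_once(...)` to `log.warn(...)` / `log.warn.once(...)`. The sketch below is illustrative only and is not part of the patch: it collects the call pattern the diff converges on, assuming `setup_logger()` hands back the same shared LogBar instance that the example scripts obtain directly via `LogBar.shared()`. Every call used here appears somewhere in the hunks; nothing further about the LogBar API is implied.

from logbar import LogBar

# Module-level handle, now named `log` instead of `logger` (assumed equivalent to
# gptqmodel.utils.logger.setup_logger() for the purpose of this sketch).
log = LogBar.shared()

log.info("Kernel: loaded")
log.warn("Model: can't get model's sequence length from model config, will set to 4096.")
log.info.once("Optimize: compilation triggered.")    # de-duplicated variant replacing logger.warning_once(...)
log.warn.once("Kernel: Marlin FP16 mode is activated with reduced accuracy.")

# Progress bars hang off the same handle (pattern mirrored from the mlx.py hunk below).
pb = log.pb(["layer.0", "layer.1", "layer.2"]).title("Format: Converting to mlx ->").manual()
for name in pb:
    pb.subtitle(name).draw()  # manual mode: the loop body updates and redraws the bar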
diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 6742d267b..6fc430aa5 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -23,6 +23,7 @@ import torch import transformers + if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']: try: from modelscope import snapshot_download @@ -47,12 +48,23 @@ from ..utils.importer import auto_select_device, normalize_device_device_map, select_quant_linear from ..utils.logger import setup_logger from ..utils.marlin import _validate_marlin_compatibility, _validate_marlin_device_support -from ..utils.model import (auto_dtype, convert_gptq_v1_to_v2_format, find_modules, get_checkpoints, - get_moe_layer_modules, gptqmodel_post_init, load_checkpoint_in_model_then_tie_weights, - make_quant, simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes) +from ..utils.model import ( + auto_dtype, + convert_gptq_v1_to_v2_format, + find_modules, + get_checkpoints, + get_moe_layer_modules, + gptqmodel_post_init, + load_checkpoint_in_model_then_tie_weights, + make_quant, + simple_dispatch_model, + verify_model_hash, + verify_sharded_model_hashes, +) from ._const import DEVICE, SUPPORTED_MODELS, normalize_device -logger = setup_logger() + +log = setup_logger() ATTN_IMPLEMENTATION = "attn_implementation" USE_FLASH_ATTENTION_2 = "use_flash_attention_2" @@ -191,7 +203,7 @@ def skip(*args, **kwargs): model.seqlen = model_config[key] break else: - logger.warning("Model: can't get model's sequence length from model config, will set to 4096.") + log.warn("Model: can't get model's sequence length from model config, will set to 4096.") model.seqlen = 4096 model.eval() @@ -395,7 +407,7 @@ def from_quantized( verfieid = verify_model_hash(model_save_name, verify_hash) if not verfieid: raise ValueError(f"Hash verification failed for {model_save_name}") - logger.info(f"Hash verification succeeded for {model_save_name}") + log.info(f"Hash verification succeeded for {model_save_name}") # == step2: convert model to gptq-model (replace Linear with QuantLinear) == # def skip(*args, **kwargs): @@ -432,7 +444,7 @@ def skip(*args, **kwargs): elif is_flash_attn_2_available() and not has_attn_implementation: args = {USE_FLASH_ATTENTION_2: True} - logger.info("Optimize: Auto enabling flash attention2") + log.info("Optimize: Auto enabling flash attention2") model = cls.loader.from_config( config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, **args @@ -457,7 +469,7 @@ def skip(*args, **kwargs): ): # log non-lm-head quantized modules only if name is not cls.lm_head: - logger.info(f"The layer {name} is not quantized.") + log.info(f"The layer {name} is not quantized.") del modules[name] preload_qlinear_kernel = make_quant( @@ -571,7 +583,7 @@ def skip(*args, **kwargs): model.seqlen = model_config[key] break else: - logger.warning("can't get model's sequence length from model config, will set to 4096.") + log.warn("can't get model's sequence length from model config, will set to 4096.") model.seqlen = 4096 # Any post-initialization that require device information, for example buffers initialization on device. 
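The other mechanical change running through the diff is import layout: standard-library imports first, third-party imports (`datasets`, `torch`, `transformers`) next, first-party `gptqmodel` imports last, long parenthesized import lists split one name per line with a trailing comma, and two blank lines between the import block and the first statement. A small illustration of that target layout follows; the module shown is hypothetical, and the specific names are taken from imports that already appear elsewhere in this diff, purely to demonstrate the grouping.

import os

import torch
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.utils.logger import setup_logger
from gptqmodel.utils.model import (
    convert_gptq_v2_to_v1_format,
    find_modules,
    make_quant,
)


log = setup_logger()  # two blank lines separate the import block from module-level code

Keeping first-party imports last makes each file's local dependencies easy to scan, and the layout matches what isort-style tooling produces when `gptqmodel` is declared as the first-party package; the exact formatter and configuration are not shown in this diff.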
diff --git a/gptqmodel/models/writer.py b/gptqmodel/models/writer.py index 7100812d8..392859ade 100644 --- a/gptqmodel/models/writer.py +++ b/gptqmodel/models/writer.py @@ -37,19 +37,37 @@ from ..adapter.adapter import HF_ADAPTER_FILE_NAME, HF_ADAPTER_WEIGHT_KEY_PREFIX, Lora from ..adapter.peft import LoraConfig -from ..quantization.config import (FORMAT, META_FIELD_DAMP_AUTO_INCREMENT, META_FIELD_DAMP_PERCENT, META_FIELD_MSE, - META_FIELD_QUANTIZER, META_FIELD_STATIC_GROUPS, META_FIELD_TRUE_SEQUENTIAL, - META_FIELD_URI, META_QUANTIZER_GPTQMODEL, META_VALUE_URI, MIN_VERSION_WITH_V2) +from ..quantization.config import ( + FORMAT, + META_FIELD_DAMP_AUTO_INCREMENT, + META_FIELD_DAMP_PERCENT, + META_FIELD_MSE, + META_FIELD_QUANTIZER, + META_FIELD_STATIC_GROUPS, + META_FIELD_TRUE_SEQUENTIAL, + META_FIELD_URI, + META_QUANTIZER_GPTQMODEL, + META_VALUE_URI, + MIN_VERSION_WITH_V2, +) from ..utils.backend import BACKEND from ..utils.logger import setup_logger -from ..utils.model import (convert_gptq_v2_to_v1_format, copy_py_files, find_modules, - get_model_files_size, get_moe_layer_modules, get_state_dict_for_save, - load_checkpoint_in_model_then_tie_weights, make_quant) +from ..utils.model import ( + convert_gptq_v2_to_v1_format, + copy_py_files, + find_modules, + get_model_files_size, + get_moe_layer_modules, + get_state_dict_for_save, + load_checkpoint_in_model_then_tie_weights, + make_quant, +) from ..utils.torch import torch_empty_cache from ..version import __version__ from ._const import CPU, DEFAULT_MAX_SHARD_SIZE -logger = setup_logger() + +log = setup_logger() PROCESS_LOG_NAME = "process" PROCESS_LOG_LAYER = "layer" @@ -67,7 +85,7 @@ def save_pretrained( save_dir: str, **kwargs, ): - logger.warning("You are using save_pretrained, which will re-direct to save_quantized.") + log.warn("You are using save_pretrained, which will re-direct to save_quantized.") self.save_quantized(save_dir=save_dir, **kwargs) cls.save_pretrained = save_pretrained @@ -96,7 +114,7 @@ def _eora_save(self, save_dir: str, model_save_dir: str): for lora_key, lora_weight in d.items(): assert isinstance(lora_weight, torch.Tensor) weights[f"{key}.{lora_key}"] = lora_weight - logger.info(f"Adapter: EoRA weights found -> `{key}.{lora_key}`, rank = `{lora_rank}`") + log.info(f"Adapter: EoRA weights found -> `{key}.{lora_key}`, rank = `{lora_rank}`") weight_file_path = f"{save_dir.removesuffix('/')}/{HF_ADAPTER_FILE_NAME}" @@ -112,7 +130,7 @@ def _eora_save(self, save_dir: str, model_save_dir: str): rank_pattern=rank_pattern) lora_cfg.save_pretrained(save_dir=save_dir) - logger.info(f"Adapter: Saving EoRA weights to -> `{save_dir}`") + log.info(f"Adapter: Saving EoRA weights to -> `{save_dir}`") os.makedirs(os.path.dirname(save_dir), exist_ok=True) save_file(tensors=weights, filename=weight_file_path, metadata={"format": "pt"}) @@ -146,7 +164,7 @@ def save_quantized( if len(meta_quantizer.split(":")) == 2: quantizers.append(meta_quantizer.replace(" ","")) else: - logger.warning(f"meta_quantizer: '{meta_quantizer}' format is invalid, expected: 'quantizer_name:version'") + log.warn(f"meta_quantizer: '{meta_quantizer}' format is invalid, expected: 'quantizer_name:version'") # write gptqmodel tooling fingerprint to config self.quantize_config.meta_set_versionable( @@ -192,7 +210,7 @@ def save_quantized( raise ValueError("Save aborted as model is not quantized. 
Please call `quantize()` first.") if quantize_config.format == FORMAT.GPTQ_V2: - logger.warning( + log.warn( f"Using 'format = {FORMAT.GPTQ_V2}': the serialized model is only supported by GPTQModel version >= {MIN_VERSION_WITH_V2}." ) @@ -273,7 +291,7 @@ def debug_saved_config(path): model_save_name = model_base_name + ".safetensors" if not self.qlinear_kernel.SUPPORTS_SHARDS and max_shard_size is not None: - logger.warning("Sharding is not supported for this quant. Disabling sharding.") + log.warn("Sharding is not supported for this quant. Disabling sharding.") max_shard_size = None if max_shard_size is None: @@ -282,7 +300,7 @@ def debug_saved_config(path): elif not isinstance(safetensors_metadata, dict): raise TypeError("safetensors_metadata must be a dictionary.") else: - logger.debug(f"Received safetensors_metadata: {safetensors_metadata}") + log.debug(f"Received safetensors_metadata: {safetensors_metadata}") new_safetensors_metadata = {} converted_keys = False for key, value in safetensors_metadata.items(): @@ -296,13 +314,13 @@ def debug_saved_config(path): f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}" ) if new_key in new_safetensors_metadata: - logger.warning( + log.warn( f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting." ) new_safetensors_metadata[new_key] = new_value safetensors_metadata = new_safetensors_metadata if converted_keys: - logger.debug( + log.debug( f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}" ) @@ -342,7 +360,7 @@ def debug_saved_config(path): elif not isinstance(safetensors_metadata, dict): raise TypeError("safetensors_metadata must be a dictionary.") else: - logger.debug(f"Received safetensors_metadata: {safetensors_metadata}") + log.debug(f"Received safetensors_metadata: {safetensors_metadata}") new_safetensors_metadata = {} converted_keys = False for key, value in safetensors_metadata.items(): @@ -355,12 +373,12 @@ def debug_saved_config(path): raise TypeError( f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}") if new_key in new_safetensors_metadata: - logger.warning( + log.warn( f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting.") new_safetensors_metadata[new_key] = new_value safetensors_metadata = new_safetensors_metadata if converted_keys: - logger.debug( + log.debug( f"One or more safetensors_metadata keys or values had to be converted to str(). 
Final safetensors_metadata: {safetensors_metadata}") # Format is required to enable Accelerate to load the metadata @@ -394,9 +412,9 @@ def debug_saved_config(path): size_diff_mb = pre_quantized_size_mb - total_size_mb size_diff_gb = size_diff_mb / 1024 percent_diff = (size_diff_mb / pre_quantized_size_mb) * 100 - logger.info(f"Pre-Quantized model size: {pre_quantized_size_mb:.2f}MB, {pre_quantized_size_gb:.2f}GB") - logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB") - logger.info(f"Size difference: {size_diff_mb:.2f}MB, {size_diff_gb:.2f}GB - {percent_diff:.2f}%") + log.info(f"Pre-Quantized model size: {pre_quantized_size_mb:.2f}MB, {pre_quantized_size_gb:.2f}GB") + log.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB") + log.info(f"Size difference: {size_diff_mb:.2f}MB, {size_diff_gb:.2f}GB - {percent_diff:.2f}%") # need to copy .py files for model/tokenizers not yet merged to HF transformers if self.trust_remote_code: @@ -457,7 +475,7 @@ def skip(*args, **kwargs): ): # log non-lm-head quantizerd modules only if name is not self.lm_head: - logger.info(f"The layer {name} is not quantized.") + log.info(f"The layer {name} is not quantized.") del modules[name] make_quant( diff --git a/gptqmodel/nn_modules/qlinear/__init__.py b/gptqmodel/nn_modules/qlinear/__init__.py index 6705e8594..60778766f 100644 --- a/gptqmodel/nn_modules/qlinear/__init__.py +++ b/gptqmodel/nn_modules/qlinear/__init__.py @@ -28,7 +28,8 @@ from ...utils.backend import BACKEND from ...utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() class BaseQuantLinear(nn.Module): SUPPORTS_BITS: List[int] = None @@ -346,7 +347,7 @@ def validate_device(cls, device: DEVICE): # override me, to perform any torch.compile logic on the kernel pre forward def optimize(self, backend: str = "inductor", mode: str = None, fullgraph: bool = False): self.optimized = True - logger.info.once(f"Optimize: `{self.__class__.__name__}` compilation triggered.") + log.info.once(f"Optimize: `{self.__class__.__name__}` compilation triggered.") pass class PackableQuantLinear(BaseQuantLinear): diff --git a/gptqmodel/nn_modules/qlinear/bitblas.py b/gptqmodel/nn_modules/qlinear/bitblas.py index 18aeef7b3..f924c36ca 100644 --- a/gptqmodel/nn_modules/qlinear/bitblas.py +++ b/gptqmodel/nn_modules/qlinear/bitblas.py @@ -30,7 +30,8 @@ from ...utils import BACKEND from ...utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() BITBLAS_TARGET = None BITBLAS_DATABASE_PATH = None @@ -253,13 +254,13 @@ def _get_or_create_bitblas_operator(self, config, enable_tuning): global_operator_cache.save_into_database( BITBLAS_DATABASE_PATH, BITBLAS_TARGET ) - logger.info( + log.info( "BitBLAS Tuning done, appended operator to global_operator_cache." ) else: - logger.info("BitBLAS Operator created.") + log.info("BitBLAS Operator created.") else: - logger.info("BitBLAS Operator found in global_operator_cache.") + log.info("BitBLAS Operator found in global_operator_cache.") return bitblas_matmul def reset_parameters(self): diff --git a/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py b/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py index 2f689846e..f3de0dd8c 100644 --- a/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py +++ b/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py @@ -23,7 +23,8 @@ from ...utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() TARGET_MISSING_ERROR = ( "TVM target not found. 
Please set the TVM target environment variable using `export TVM_TARGET=`, " @@ -44,10 +45,10 @@ def check_target(best, default): if check_target(best_match, "cuda") == best_match: match = best_match if score >= MATCH_THRESHOLD else "cuda" - logger.info(f"found best match: {match}") + log.info(f"found best match: {match}") return match else: - logger.warning(TARGET_MISSING_ERROR) + log.warn(TARGET_MISSING_ERROR) return "cuda" diff --git a/gptqmodel/nn_modules/qlinear/dynamic_cuda.py b/gptqmodel/nn_modules/qlinear/dynamic_cuda.py index ec7bb1166..e909159dc 100644 --- a/gptqmodel/nn_modules/qlinear/dynamic_cuda.py +++ b/gptqmodel/nn_modules/qlinear/dynamic_cuda.py @@ -24,7 +24,8 @@ from ...utils.backend import BACKEND from ...utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() gptqmodel_cuda_import_exception = None diff --git a/gptqmodel/nn_modules/qlinear/exllama.py b/gptqmodel/nn_modules/qlinear/exllama.py index ee4beb18f..29e446dfe 100644 --- a/gptqmodel/nn_modules/qlinear/exllama.py +++ b/gptqmodel/nn_modules/qlinear/exllama.py @@ -26,6 +26,7 @@ from ...nn_modules.qlinear import PackableQuantLinear from ...utils.backend import BACKEND + exllama_import_exception = None try: from gptqmodel_exllama_kernels import make_q4, q4_matmul @@ -169,7 +170,7 @@ def forward(self, x: torch.Tensor): x_dtype = x.dtype if x_dtype != torch.float16: - logger.warning_once( + logger.warn.once( f"Exllama kernel requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model." ) diff --git a/gptqmodel/nn_modules/qlinear/exllama_eora.py b/gptqmodel/nn_modules/qlinear/exllama_eora.py index e957df188..6c084cafd 100644 --- a/gptqmodel/nn_modules/qlinear/exllama_eora.py +++ b/gptqmodel/nn_modules/qlinear/exllama_eora.py @@ -25,6 +25,7 @@ from ...nn_modules.qlinear import BaseQuantLinear from ...utils.logger import setup_logger + exllama_v2v_import_exception = None try: diff --git a/gptqmodel/nn_modules/qlinear/exllamav2.py b/gptqmodel/nn_modules/qlinear/exllamav2.py index d08d2b266..788fb372a 100644 --- a/gptqmodel/nn_modules/qlinear/exllamav2.py +++ b/gptqmodel/nn_modules/qlinear/exllamav2.py @@ -26,13 +26,14 @@ from ...utils.backend import BACKEND from ...utils.logger import setup_logger + exllama_v2_import_exception = None try: from gptqmodel_exllamav2_kernels import gemm_half_q_half, make_q_matrix except ImportError as e: exllama_v2_import_exception = e -logger = setup_logger() +log = setup_logger() @@ -225,7 +226,7 @@ def forward(self, x: torch.Tensor, force_cuda=False): x_dtype = x.dtype if x_dtype != torch.float16: - logger.warning_once( + log.warn.once( f"Exllama v2 kernel requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model." ) diff --git a/gptqmodel/nn_modules/qlinear/ipex.py b/gptqmodel/nn_modules/qlinear/ipex.py index 870f89639..390dee27c 100644 --- a/gptqmodel/nn_modules/qlinear/ipex.py +++ b/gptqmodel/nn_modules/qlinear/ipex.py @@ -25,7 +25,8 @@ from ...utils.torch import torch_compile from . 
import PackableQuantLinear -logger = setup_logger() + +log = setup_logger() BITS_DTYPE_MAPPING = { 4: "int4_clip", diff --git a/gptqmodel/nn_modules/qlinear/marlin.py b/gptqmodel/nn_modules/qlinear/marlin.py index 69e084b35..142170be8 100644 --- a/gptqmodel/nn_modules/qlinear/marlin.py +++ b/gptqmodel/nn_modules/qlinear/marlin.py @@ -30,13 +30,14 @@ from ...utils.logger import setup_logger from ...utils.rocm import IS_ROCM + marlin_import_exception = None try: import gptqmodel_marlin_kernels except ImportError as e: marlin_import_exception = e -logger = setup_logger() +log = setup_logger() GPTQ_MARLIN_TILE = 16 GPTQ_MARLIN_MIN_THREAD_N = 64 @@ -225,7 +226,7 @@ def __init__( self.fp32 = True if self.backend in [BACKEND.MARLIN, BACKEND.AUTO] else False if not self.fp32: - logger.warn.once("Kernel: Marlin FP16 mode is activated with reduced accuracy. Use default Marlin model for improved inference quality.") + log.warn.once("Kernel: Marlin FP16 mode is activated with reduced accuracy. Use default Marlin model for improved inference quality.") # Determine sharding if marlin_repeat_scales_on_all_ranks(desc_act, diff --git a/gptqmodel/nn_modules/qlinear/torch.py b/gptqmodel/nn_modules/qlinear/torch.py index 255e18fa8..3bcfea203 100644 --- a/gptqmodel/nn_modules/qlinear/torch.py +++ b/gptqmodel/nn_modules/qlinear/torch.py @@ -26,7 +26,8 @@ from ...utils.logger import setup_logger from ...utils.torch import torch_compile -logger = setup_logger() + +log = setup_logger() class TorchQuantLinear(PackableQuantLinear): SUPPORTS_BITS = [2, 3, 4, 8] diff --git a/gptqmodel/nn_modules/qlinear/tritonv2.py b/gptqmodel/nn_modules/qlinear/tritonv2.py index 4484c6e6c..bc75029f6 100644 --- a/gptqmodel/nn_modules/qlinear/tritonv2.py +++ b/gptqmodel/nn_modules/qlinear/tritonv2.py @@ -25,6 +25,7 @@ from ...utils.logger import setup_logger from . import PackableQuantLinear + try: import triton import triton.language as tl @@ -43,7 +44,7 @@ class TritonModuleMixin: TRITON_INSTALL_HINT = "Trying to use the triton backend, but it could not be imported. Please install triton by 'pip install gptqmodel[triton] --no-build-isolation'" TRITON_XPU_INSTALL_HINT = "Trying to use the triton backend and xpu device, but it could not be imported. Please install triton by [intel-xpu-backend-for-triton](https://github.com/intel/intel-xpu-backend-for-triton)" -logger = setup_logger() +log = setup_logger() class TritonV2QuantLinear(PackableQuantLinear, TritonModuleMixin): diff --git a/gptqmodel/nn_modules/triton_utils/custom_autotune.py b/gptqmodel/nn_modules/triton_utils/custom_autotune.py index 72a9eedbe..9bce135cc 100644 --- a/gptqmodel/nn_modules/triton_utils/custom_autotune.py +++ b/gptqmodel/nn_modules/triton_utils/custom_autotune.py @@ -21,6 +21,7 @@ import triton + # code based https://github.com/fpgaminer/GPTQ-triton """ Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100. diff --git a/gptqmodel/nn_modules/triton_utils/kernels.py b/gptqmodel/nn_modules/triton_utils/kernels.py index 27ebfdffd..9d59fbb9a 100644 --- a/gptqmodel/nn_modules/triton_utils/kernels.py +++ b/gptqmodel/nn_modules/triton_utils/kernels.py @@ -22,7 +22,8 @@ from ...utils.logger import setup_logger from . 
import custom_autotune -logger = setup_logger() + +log = setup_logger() # code based https://github.com/fpgaminer/GPTQ-triton diff --git a/gptqmodel/quantization/__init__.py b/gptqmodel/quantization/__init__.py index ba5c5d889..7386923aa 100644 --- a/gptqmodel/quantization/__init__.py +++ b/gptqmodel/quantization/__init__.py @@ -14,7 +14,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_JSON, QUANT_CONFIG_FILENAME, - QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) +from .config import ( + FORMAT, + FORMAT_FIELD_CODE, + FORMAT_FIELD_JSON, + QUANT_CONFIG_FILENAME, + QUANT_METHOD, + QUANT_METHOD_FIELD, + BaseQuantizeConfig, + QuantizeConfig, +) from .gptq import GPTQ from .quantizer import Quantizer, quantize diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py index f2b9734e2..b15d5aa0f 100644 --- a/gptqmodel/quantization/config.py +++ b/gptqmodel/quantization/config.py @@ -29,7 +29,8 @@ from ..adapter.adapter import Lora, normalize_adapter from ..utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() FORMAT_FIELD_CODE = "format" FORMAT_FIELD_JSON = "checkpoint_format" @@ -138,7 +139,7 @@ def dynamic_get(dynamic: Dict[str, Dict[str, Union[int, bool]]], module_name: st if isinstance(sub_value, Dict): return sub_value.get(sub_key, default) else: - logger.info(f"QuantConfig: Dynamic `sub_key`: `{sub_key}` failed extraction from `sub_value`: `{sub_value}`") + log.info(f"QuantConfig: Dynamic `sub_key`: `{sub_key}` failed extraction from `sub_value`: `{sub_value}`") else: return overrides.get(key, default) return default @@ -333,7 +334,7 @@ def save_pretrained(self, save_dir: str, **kwargs): with open(join(save_dir, QUANT_CONFIG_FILENAME), "w", encoding="utf-8") as f: d = self.to_dict() json_str = json.dumps(d, indent=2) - logger.info(f"Saved Quantize Config: \n{json_str}") + log.info(f"Saved Quantize Config: \n{json_str}") f.write(json_str) @classmethod @@ -386,17 +387,17 @@ def from_quant_config(cls, quantize_cfg, format: str = None): elif key in field_names: normalized[key] = val else: - logger.info(f"QuantizeConfig: Ignoring unknown parameter in the quantization configuration: {key}.") + log.info(f"QuantizeConfig: Ignoring unknown parameter in the quantization configuration: {key}.") if format_auto_inferred: - logger.info(f"QuantizeConfig: `{FORMAT_FIELD_JSON}` is missing from the quantization configuration and is automatically inferred to {normalized[FORMAT_FIELD_CODE]}") + log.info(f"QuantizeConfig: `{FORMAT_FIELD_JSON}` is missing from the quantization configuration and is automatically inferred to {normalized[FORMAT_FIELD_CODE]}") if normalized[FORMAT_FIELD_CODE] in {FORMAT.BITBLAS}: # AWQ and Marlin do not reorder the rows. normalized["desc_act"] = False if "sym" not in normalized: - logger.warning( + log.warn( "QuantizeConfig: config does not contain `sym` (symmetric quantization). This may result in silent errors. Defaulting to `sym=True`." 
) @@ -483,7 +484,7 @@ def calculate_bits_per_weight(self): else: # there is only one scale int32 + one qzero int32 per entire module so overall it contributes to close to 0 bpw bpw = self.bits - logger.info(f"Estimated Quantization BPW (bits per weight): {bpw} bpw, based on [bits: {self.bits}, group_size: {self.group_size}]") + log.info(f"Estimated Quantization BPW (bits per weight): {bpw} bpw, based on [bits: {self.bits}, group_size: {self.group_size}]") @dataclass class AutoRoundQuantizeConfig(QuantizeConfig): @@ -549,4 +550,4 @@ def to_dict(self): class BaseQuantizeConfig(QuantizeConfig): def __init__(self, **kwargs): super().__init__(**kwargs) - logger.warning("QuantizeConfig: BaseQuantizeConfig is re-named and pending deprecation. Please use `QuantizeConfig` instead.") + log.warn("QuantizeConfig: BaseQuantizeConfig is re-named and pending deprecation. Please use `QuantizeConfig` instead.") diff --git a/gptqmodel/quantization/gptq.py b/gptqmodel/quantization/gptq.py index 45fe11dff..d4e0941f2 100644 --- a/gptqmodel/quantization/gptq.py +++ b/gptqmodel/quantization/gptq.py @@ -32,7 +32,8 @@ from ..utils.torch import torch_sync from .quantizer import HF_OPTIMUM, Quantizer -logger = setup_logger() + +log = setup_logger() torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False @@ -229,10 +230,10 @@ def quantize( break except torch._C._LinAlgError as e: if self.qcfg.damp_auto_increment != 0: - logger.warning(f"Quantization: Current `damp_percent = {damp_percent:.5f}` is too low, auto-incrementing by `{ self.qcfg.damp_auto_increment:.5f}`") + log.warn(f"Quantization: Current `damp_percent = {damp_percent:.5f}` is too low, auto-incrementing by `{ self.qcfg.damp_auto_increment:.5f}`") damp_percent += self.qcfg.damp_auto_increment else: - logger.warning("Quantization: Please increase damp or nsamples for calibration data to avoid the following quant error: current damp_percent=`{damp_percent:.5f}`") + log.warn("Quantization: Please increase damp or nsamples for calibration data to avoid the following quant error: current damp_percent=`{damp_percent:.5f}`") raise e if not (0 < damp_percent < 1): diff --git a/gptqmodel/quantization/quantizer.py b/gptqmodel/quantization/quantizer.py index 985870a1b..993228278 100644 --- a/gptqmodel/quantization/quantizer.py +++ b/gptqmodel/quantization/quantizer.py @@ -22,7 +22,8 @@ from ..quantization import QuantizeConfig from ..utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() HF_OPTIMUM = "hf_optimum" diff --git a/gptqmodel/utils/bitblas.py b/gptqmodel/utils/bitblas.py index c8250838f..10eeef894 100644 --- a/gptqmodel/utils/bitblas.py +++ b/gptqmodel/utils/bitblas.py @@ -25,7 +25,8 @@ from .model import load_checkpoint_in_model_then_tie_weights from .torch import torch_empty_cache -logger = setup_logger() + +log = setup_logger() def prepare_model_for_bitblas_load( model, @@ -41,7 +42,7 @@ def prepare_model_for_bitblas_load( # The model (e.g. model.safetensors) is already serialized in the BitBLAS format, load it directly. if qcfg.format == FORMAT.BITBLAS: # if the checkpoint is already in bitblas format, we can load it directly. 
- logger.info(f"Loading a GPTQ model, detected BitBLAS serialized format at {model_save_name}.") + log.info(f"Loading a GPTQ model, detected BitBLAS serialized format at {model_save_name}.") model = convert_to_bitblas(model, quant_linear_class, qcfg, sym, desc_act, repack=False) load_checkpoint_in_model_then_tie_weights( model, @@ -91,7 +92,7 @@ def convert_to_bitblas(model, model_quantlinear, qcfg: QuantizeConfig, sym: bool # Note that due to tvm compilation of per layer modules shapes, the first layer loop is # relatively much slower if caching is not available. estimate time remaining is highly inaccurate - for name, module in logger.pb(list(model.named_modules())).title(message): + for name, module in log.pb(list(model.named_modules())).title(message): if not isinstance(module, model_quantlinear): continue diff --git a/gptqmodel/utils/hf.py b/gptqmodel/utils/hf.py index 8a7fc4d8e..b2d9c431b 100644 --- a/gptqmodel/utils/hf.py +++ b/gptqmodel/utils/hf.py @@ -2,26 +2,27 @@ from ..utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() # TODO FIXME! Pre-quantized use AutoModelForCausalLM.from_pretrained() but post-quantized use AutoModelForCausalLM.from_config() def autofix_hf_model_config(model: PreTrainedModel, path: str = None): if model.can_generate(): # sync config first if path: - logger.info(f"Model: Loaded `generation_config`: {model.generation_config}") + log.info(f"Model: Loaded `generation_config`: {model.generation_config}") try: cfg = GenerationConfig.from_pretrained(pretrained_model_name=path) if cfg != model.generation_config: model.generation_config = cfg - logger.info( + log.info( "Model: Auto-fixed `generation_config` mismatch between model and `generation_config.json`.") - logger.info(f"Model: Updated `generation_config`: {model.generation_config}") + log.info(f"Model: Updated `generation_config`: {model.generation_config}") else: pass # logger.info(f"Model: loaded `generation_config` matching `generation_config.json`.") except Exception: - logger.info("Model: `generation_config.json` not found. Skipped checking.") + log.info("Model: `generation_config.json` not found. 
Skipped checking.") # print(f"Before autofix_hf_model_config: {model.generation_config}") autofix_hf_generation_config(model.generation_config) @@ -51,5 +52,5 @@ def autofix_hf_generation_config(cfg: GenerationConfig): # fix wrong do_sample if errors > 0: cfg.do_sample = True - logger.info("Model: Auto-Fixed `generation_config` by setting `do_sample=True`.") + log.info("Model: Auto-Fixed `generation_config` by setting `do_sample=True`.") diff --git a/gptqmodel/utils/importer.py b/gptqmodel/utils/importer.py index ea7b7aca6..8db2bacae 100644 --- a/gptqmodel/utils/importer.py +++ b/gptqmodel/utils/importer.py @@ -19,6 +19,7 @@ from typing import Dict, List, Optional, Type, Union import torch + from gptqmodel.adapter.adapter import Adapter from ..models._const import DEVICE, normalize_device @@ -38,8 +39,9 @@ from .rocm import IS_ROCM from .torch import HAS_CUDA, HAS_MPS, HAS_XPU + message_logged = False -logger = setup_logger() +log = setup_logger() AUTO_SELECT_BACKEND_ORDER = OrderedDict({ BACKEND.MARLIN: MarlinQuantLinear, # optimized for bs > 1 @@ -197,7 +199,7 @@ def select_quant_linear( adapter=adapter, ) if os.environ.get("DEBUG") and not validate: - logger.info(f"skip {k} for {str(err)}") + log.info(f"skip {k} for {str(err)}") if validate: if pack: check_pack_func = issubclass(cls, PackableQuantLinear) @@ -205,7 +207,7 @@ def select_quant_linear( #if not message_logged: # logger.info(f"Auto pick kernel based on compatibility: {cls}") # message_logged = True - logger.info(f"Kernel: Auto-selection: adding candidate `{cls.__name__}`") + log.info(f"Kernel: Auto-selection: adding candidate `{cls.__name__}`") validated_qlinears.append(cls) if not multi_select: return cls @@ -213,7 +215,7 @@ def select_quant_linear( #if not message_logged: # logger.info(f"Auto pick kernel based on compatibility: {cls}") # message_logged = True - logger.info(f"Kernel: Auto-selection: adding candidate `{cls.__name__}`") + log.info(f"Kernel: Auto-selection: adding candidate `{cls.__name__}`") validated_qlinears.append(cls) if not multi_select: return cls @@ -249,7 +251,7 @@ def select_quant_linear( cpu_vendor = Device("cpu").vendor if cpu_vendor != "intel": - logger.warning(f"Kernel: IPEX on cpu is only validated and optimized for Intel cpu with AVX512, AMX, or XMX. Current cpu vendor: `{cpu_vendor}`.") + log.warn(f"Kernel: IPEX on cpu is only validated and optimized for Intel cpu with AVX512, AMX, or XMX. 
Current cpu vendor: `{cpu_vendor}`.") qlinear = IPEXQuantLinear elif backend == BACKEND.TORCH: diff --git a/gptqmodel/utils/marlin.py b/gptqmodel/utils/marlin.py index cd1dc3c4c..4fbdfdf57 100644 --- a/gptqmodel/utils/marlin.py +++ b/gptqmodel/utils/marlin.py @@ -21,7 +21,8 @@ from ..utils.logger import setup_logger from .rocm import IS_ROCM -logger = setup_logger() + +log = setup_logger() # Validate marlin support def _validate_marlin_device_support() -> bool: diff --git a/gptqmodel/utils/mlx.py b/gptqmodel/utils/mlx.py index 24541400d..3a82a8aa1 100644 --- a/gptqmodel/utils/mlx.py +++ b/gptqmodel/utils/mlx.py @@ -6,9 +6,10 @@ from ..models import BaseGPTQModel from ..nn_modules.qlinear.torch import TorchQuantLinear from ..quantization import FORMAT, QuantizeConfig -from .logger import setup_logger +from .log import setup_logger from .torch import torch_empty_cache + try: import mlx.core as mx from mlx_lm import generate @@ -17,7 +18,7 @@ except ImportError: MLX_AVAILABLE = False -logger = setup_logger() +log = setup_logger() def convert_gptq_to_mlx_weights(model_id_or_path: str, model: Union[PreTrainedModel, BaseGPTQModel], gptq_config: QuantizeConfig, lm_head_name: str): if not MLX_AVAILABLE: @@ -48,7 +49,7 @@ def convert_gptq_to_mlx_weights(model_id_or_path: str, model: Union[PreTrainedMo # Convert weights weights = {} n = 1 - pb = logger.pb(model.named_modules()).title("Format: Converting to mlx ->").manual() + pb = log.pb(model.named_modules()).title("Format: Converting to mlx ->").manual() for name, module in pb: pb.subtitle(f"{name}").draw() if isinstance(module, TorchQuantLinear): @@ -85,11 +86,11 @@ def convert_gptq_to_mlx_weights(model_id_or_path: str, model: Union[PreTrainedMo mlx_model = model_class(model_args_class.from_dict(config)) # Load and quantize weights - logger.info("Starting MLX quantization...") + log.info("Starting MLX quantization...") mlx_model.load_weights(list(weights.items())) weights, mlx_config = quantize_model(mlx_model, config, q_group_size=gptq_config["group_size"], q_bits=gptq_config["bits"]) - logger.info("MLX quantization completed") + log.info("MLX quantization completed") return weights, mlx_config diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py index 80137274a..d9861f4de 100644 --- a/gptqmodel/utils/model.py +++ b/gptqmodel/utils/model.py @@ -34,18 +34,25 @@ import torch import torch.nn as nn import transformers -from gptqmodel.nn_modules.qlinear.exllama_eora import ExllamaEoraQuantLinear -from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear from huggingface_hub import HfApi, hf_hub_download from packaging import version from transformers import AutoConfig, PretrainedConfig from transformers.pytorch_utils import id_tensor_storage from transformers.utils.hub import cached_file +from gptqmodel.nn_modules.qlinear.exllama_eora import ExllamaEoraQuantLinear +from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear + from ..adapter.adapter import Adapter from ..looper.named_module import NamedModule -from ..models._const import (CPU, DEVICE, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, - EXPERT_INDEX_PLACEHOLDER, SUPPORTED_MODELS, SUPPORTS_MODULE_TYPES) +from ..models._const import ( + CPU, + DEVICE, + EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, + EXPERT_INDEX_PLACEHOLDER, + SUPPORTED_MODELS, + SUPPORTS_MODULE_TYPES, +) from ..nn_modules.qlinear import BaseQuantLinear from ..nn_modules.qlinear.exllama import ExllamaQuantLinear from ..nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear @@ -54,10 +61,11 @@ from ..quantization.config 
import FORMAT_FIELD_JSON, dynamic_get from .backend import BACKEND from .importer import select_quant_linear -from .logger import setup_logger +from .log import setup_logger from .torch import torch_empty_cache, torch_new_stream_ctx -logger = setup_logger() + +log = setup_logger() def recurse_getattr(obj, attr: str): """ @@ -203,7 +211,7 @@ def make_quant( adapter=extension, ) - logger.info(f"Kernel: candidates -> `[{', '.join(cls.__name__ for cls in quant_linear_candidates)}]`") + log.info(f"Kernel: candidates -> `[{', '.join(cls.__name__ for cls in quant_linear_candidates)}]`") # loop over actual QLinear init, catch errors and use fallbacks if applicable for cls in quant_linear_candidates: @@ -228,10 +236,10 @@ def make_quant( backend=backend, adapter=qcfg.adapter, ) - logger.info(f"Kernel: selected -> `{linear_cls.__name__}`.") + log.info(f"Kernel: selected -> `{linear_cls.__name__}`.") return linear_cls except NotImplementedError as e: - logger.info(f"Kernel: skipped -> `{cls}`.") + log.info(f"Kernel: skipped -> `{cls}`.") # only fallback to other quant linears when backend is auto. if backend not in [BACKEND.AUTO, BACKEND.AUTO_TRAINABLE]: @@ -373,7 +381,7 @@ def convert_gptq_v1_to_v2_format( # Limit thread usage to avoid auto-parallizataion regression with tctl.threadpool_limits(limits=1): t = time.time() - logger.info( + log.info( f"Format: Converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to internal `{FORMAT.GPTQ_V2}`.") for _, submodule in model.named_modules(): @@ -454,7 +462,7 @@ def convert_gptq_v1_to_v2_format( else: raise NotImplementedError("Only 2,3,4,8 bits are supported.") - logger.info(f"Format: Conversion complete: {time.time() - t}s") + log.info(f"Format: Conversion complete: {time.time() - t}s") return model @@ -556,7 +564,7 @@ def pack_model( model.to(CPU) - logger.info("Packing model...") + log.info("Packing model...") modules = find_modules(model) @@ -582,7 +590,7 @@ def pack_model( max_workers = 1 with ThreadPoolExecutor(max_workers=max_workers) as executor: - with logger.pb(names).manual() as pb: + with log.pb(names).manual() as pb: def wrapper(name): # TODO FIX, thread pool executor does not advance iterator pb.next() @@ -592,7 +600,7 @@ def wrapper(name): for _ in executor.map(wrapper, names): pass - logger.info("Model packed.") + log.info("Model packed.") return quant_linear_cls @@ -623,7 +631,7 @@ def verify_sharded_model_hashes(jsonPath: str, verify_hash: List[str]): for shard_file, expected_hash in zip(shard_files, verify_hash): if not verify_model_hash(shard_file, expected_hash): - logger.info(f"Hash verification failed for {shard_file}") + log.info(f"Hash verification failed for {shard_file}") return False return True @@ -721,7 +729,7 @@ def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeCon max_input_len = max_input_length else: if max_input_length is not None: - logger.info( + log.info( "Using exllama backend without act-order, the parameter max_input_length was set although not needed, it will be ignored." ) max_input_len = 1 @@ -1019,7 +1027,7 @@ def get_state_dict_for_save(model: nn.Module) -> Dict: del state_dict[name] warn_names.add(name) if len(warn_names) > 0: - logger.warning_once( + log.warn.once( f"Removed shared tensor {warn_names} while saving. 
This should be OK, but check by verifying that you don't receive any warning while reloading", ) return state_dict diff --git a/gptqmodel/utils/openai_server.py b/gptqmodel/utils/openai_server.py index fa9b52177..dce41b413 100644 --- a/gptqmodel/utils/openai_server.py +++ b/gptqmodel/utils/openai_server.py @@ -20,6 +20,7 @@ import torch + try: import uvicorn from fastapi import FastAPI, HTTPException diff --git a/gptqmodel/utils/perplexity.py b/gptqmodel/utils/perplexity.py index 22de74157..f03d64ef7 100644 --- a/gptqmodel/utils/perplexity.py +++ b/gptqmodel/utils/perplexity.py @@ -21,6 +21,7 @@ from datasets import load_dataset, load_from_disk from logbar import LogBar + logger = LogBar.shared() class Perplexity: diff --git a/gptqmodel/utils/rocm.py b/gptqmodel/utils/rocm.py index 4bef3edbd..93da34dcb 100644 --- a/gptqmodel/utils/rocm.py +++ b/gptqmodel/utils/rocm.py @@ -16,4 +16,5 @@ import torch + IS_ROCM = torch.version.hip is not None diff --git a/gptqmodel/utils/sglang.py b/gptqmodel/utils/sglang.py index 3067994b5..7b655cc86 100644 --- a/gptqmodel/utils/sglang.py +++ b/gptqmodel/utils/sglang.py @@ -19,6 +19,7 @@ import torch from transformers import AutoConfig + try: import sglang as sgl SGLANG_AVAILABLE = True diff --git a/gptqmodel/utils/torch.py b/gptqmodel/utils/torch.py index b725a1c57..2bced525a 100644 --- a/gptqmodel/utils/torch.py +++ b/gptqmodel/utils/torch.py @@ -22,6 +22,7 @@ from ..utils.logger import setup_logger + HAS_CUDA = False HAS_XPU = False HAS_MPS = False @@ -29,7 +30,7 @@ STREAM = None # cache -logger = setup_logger() +log = setup_logger() # reset dynamo cache on each model load since during ci loop model inference may exhuast cache torch._dynamo.reset() @@ -62,7 +63,7 @@ def torch_compile(module: Union[torch.nn.Module, Callable], backend:str ="induct try: return torch.compile(module, backend=backend, mode=mode, fullgraph=fullgraph) except BaseException: - logger.warning(f"Failed to compile `{module}`") + log.warn(f"Failed to compile `{module}`") return module def torch_new_stream(): diff --git a/gptqmodel/utils/vllm.py b/gptqmodel/utils/vllm.py index a2ccc092d..ee41f5f14 100644 --- a/gptqmodel/utils/vllm.py +++ b/gptqmodel/utils/vllm.py @@ -18,6 +18,7 @@ import torch + try: from vllm import LLM, SamplingParams diff --git a/setup.py b/setup.py index 5752a3041..9e70703a9 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ import torch from setuptools import find_packages, setup + try: from setuptools.command.bdist_wheel import bdist_wheel as _bdist_wheel except BaseException: diff --git a/tests/benchmark/benchmark.py b/tests/benchmark/benchmark.py index 5aeb3f276..41f4ee9a5 100644 --- a/tests/benchmark/benchmark.py +++ b/tests/benchmark/benchmark.py @@ -15,9 +15,10 @@ # limitations under the License. 
from benchmark_test import BenchmarkTest -from gptqmodel import BACKEND from parameterized import parameterized # noqa: E402 +from gptqmodel import BACKEND + class TestInference(BenchmarkTest): @parameterized.expand( diff --git a/tests/benchmark/benchmark_test.py b/tests/benchmark/benchmark_test.py index 71c30da5e..cd8ca2bc8 100644 --- a/tests/benchmark/benchmark_test.py +++ b/tests/benchmark/benchmark_test.py @@ -17,13 +17,16 @@ import os import time + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import unittest # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 from logbar import LogBar +from gptqmodel import GPTQModel # noqa: E402 + + logger = LogBar.shared() class BenchmarkTest(unittest.TestCase): diff --git a/tests/inference_speed.py b/tests/inference_speed.py index 1e6d5102c..b5601e052 100644 --- a/tests/inference_speed.py +++ b/tests/inference_speed.py @@ -19,15 +19,18 @@ from gptqmodel.utils.torch import torch_empty_cache + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import unittest -from gptqmodel import GPTQModel from logbar import LogBar from transformers import AutoTokenizer +from gptqmodel import GPTQModel + + logger = LogBar.shared() class InferenceSpeed(unittest.TestCase): diff --git a/tests/models/model_test.py b/tests/models/model_test.py index c7f29b9d0..fb220604f 100644 --- a/tests/models/model_test.py +++ b/tests/models/model_test.py @@ -20,14 +20,17 @@ from typing import Dict, List from device_smi import Device + from gptqmodel.models._const import CUDA_0 + if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch from pathlib import Path # noqa: E402 + sys.path.insert(0, f"{str(Path(__file__).resolve().parent.parent)}/models") # noqa: E402 import contextlib # noqa: E402 import shutil # noqa: E402 @@ -37,6 +40,10 @@ import torch.cuda # noqa: E402 import transformers # noqa: E402 from datasets import load_dataset # noqa: E402 +from ovis.image_to_test_dataset import get_calib_dataset # noqa: E402 +from packaging.version import Version # noqa: E402 +from transformers import AutoProcessor, AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 from gptqmodel.quantization import FORMAT # noqa: E402 @@ -44,9 +51,7 @@ from gptqmodel.utils.eval import EVAL # noqa: E402 from gptqmodel.utils.model import MODALITY # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from ovis.image_to_test_dataset import get_calib_dataset # noqa: E402 -from packaging.version import Version # noqa: E402 -from transformers import AutoProcessor, AutoTokenizer # noqa: E402 + RAND_SEED = 898 diff --git a/tests/models/test_gptbigcode.py b/tests/models/test_gptbigcode.py index 78aa52276..bc465ffbb 100644 --- a/tests/models/test_gptbigcode.py +++ b/tests/models/test_gptbigcode.py @@ -17,6 +17,7 @@ import importlib.util import os + # TODO: find how ipex registered it jit interpreter # if intel_extension_for_pytorch was installed, @torch.jit.script in transformers/models/gpt_bigcode/modeling_gpt_bigcode.py will try to use ipex as torchScript interpreter. 
# However, in quantization, tensor were on gpu, which will throw RuntimeError: itensor_view_from_dense expects CPU tensor input diff --git a/tests/models/test_opt.py b/tests/models/test_opt.py index 3467ffd20..cf0303516 100644 --- a/tests/models/test_opt.py +++ b/tests/models/test_opt.py @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from model_test import ModelTest + from gptqmodel import BACKEND from gptqmodel.utils.importer import AUTO_SELECT_BACKEND_ORDER -from model_test import ModelTest class TestOpt(ModelTest): diff --git a/tests/models/test_qwen2_vl.py b/tests/models/test_qwen2_vl.py index a6b50c1c0..65ecf05c7 100644 --- a/tests/models/test_qwen2_vl.py +++ b/tests/models/test_qwen2_vl.py @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from gptqmodel.models.definitions.qwen2_vl import Qwen2VLGPTQ from model_test import ModelTest +from gptqmodel.models.definitions.qwen2_vl import Qwen2VLGPTQ + class TestQwen2_VL(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen2-VL-2B-Instruct" diff --git a/tests/models/test_qwen_15_moe.py b/tests/models/test_qwen_15_moe.py index a95fc4610..1ff0da2d6 100644 --- a/tests/models/test_qwen_15_moe.py +++ b/tests/models/test_qwen_15_moe.py @@ -1,6 +1,7 @@ import unittest import torch + from gptqmodel import BACKEND, GPTQModel diff --git a/tests/tasks/mmlu/_generate_configs.py b/tests/tasks/mmlu/_generate_configs.py index f613f7cd4..28b94616d 100644 --- a/tests/tasks/mmlu/_generate_configs.py +++ b/tests/tasks/mmlu/_generate_configs.py @@ -9,6 +9,7 @@ import yaml from tqdm import tqdm + eval_logger = logging.getLogger("lm-eval") diff --git a/tests/test_adapter_config.py b/tests/test_adapter_config.py index 6c09017e4..dc635087a 100644 --- a/tests/test_adapter_config.py +++ b/tests/test_adapter_config.py @@ -19,11 +19,13 @@ from gptqmodel import QuantizeConfig from gptqmodel.adapter.adapter import Lora, normalize_adapter + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 + lora = "lora" class TestExtensionConfig(unittest.TestCase): diff --git a/tests/test_asym_gptq_v1.py b/tests/test_asym_gptq_v1.py index b115dfd1f..2c9a2176b 100644 --- a/tests/test_asym_gptq_v1.py +++ b/tests/test_asym_gptq_v1.py @@ -17,11 +17,13 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -from gptqmodel.quantization import FORMAT # noqa: E402 # -- end do not touch from models.model_test import ModelTest # noqa: E402 +from gptqmodel.quantization import FORMAT # noqa: E402 + class Test(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" diff --git a/tests/test_bits.py b/tests/test_bits.py index 097f6ca04..c8ee6c022 100644 --- a/tests/test_bits.py +++ b/tests/test_bits.py @@ -19,6 +19,7 @@ from gptqmodel.nn_modules.qlinear.exllama_eora import ExllamaEoraQuantLinear + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import logging # noqa: E402 @@ -26,6 +27,9 @@ import traceback # noqa: E402 import unittest # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear # noqa: E402 @@ -36,8 +40,7 @@ 
from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.utils.eval import EVAL # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 + logger = logging.getLogger(__name__) diff --git a/tests/test_bits_new.py b/tests/test_bits_new.py index 125169453..818387340 100644 --- a/tests/test_bits_new.py +++ b/tests/test_bits_new.py @@ -16,6 +16,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -23,13 +24,14 @@ from typing import Optional # noqa: E402 from datasets import load_dataset # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from tabulate import tabulate # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.adapter.adapter import Lora # noqa: E402 from gptqmodel.utils.eval import EVAL # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from tabulate import tabulate # noqa: E402 def bench(path: str, backend: BACKEND, adapter: Optional[Lora]): diff --git a/tests/test_dynamic.py b/tests/test_dynamic.py index 3e5874507..537e67b1a 100644 --- a/tests/test_dynamic.py +++ b/tests/test_dynamic.py @@ -19,20 +19,22 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import json # noqa: E402 import tempfile # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from parameterized import parameterized # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.quantization import QuantizeConfig # noqa: E402 from gptqmodel.utils import Perplexity, safetensor # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from parameterized import parameterized # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 class TestDynamic(ModelTest): diff --git a/tests/test_estimate_vram.py b/tests/test_estimate_vram.py index ba9b76343..ca9dd5be7 100644 --- a/tests/test_estimate_vram.py +++ b/tests/test_estimate_vram.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 diff --git a/tests/test_eval.py b/tests/test_eval.py index c4d71ba7b..63bc89f19 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -16,18 +16,22 @@ import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import tempfile # noqa: E402 -from typing import Type # noqa: E402 -from typing import Union # noqa: E402 +from typing import ( + Type, # noqa: E402 + Union, # noqa: E402 +) -from gptqmodel import GPTQModel # noqa: E402 -from gptqmodel.utils.eval import EVAL # noqa: E402 from lm_eval.tasks import TaskManager # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 +from gptqmodel.utils.eval import EVAL # noqa: E402 + class TestEval(ModelTest): 
@classmethod diff --git a/tests/test_evalplus.py b/tests/test_evalplus.py index 13d7251b7..775a41240 100644 --- a/tests/test_evalplus.py +++ b/tests/test_evalplus.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch diff --git a/tests/test_flash_attention.py b/tests/test_flash_attention.py index b56a0eecc..e61cd96f1 100644 --- a/tests/test_flash_attention.py +++ b/tests/test_flash_attention.py @@ -17,14 +17,16 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch -from gptqmodel import GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 + class Test(ModelTest): diff --git a/tests/test_group_size.py b/tests/test_group_size.py index 719866080..47900066d 100644 --- a/tests/test_group_size.py +++ b/tests/test_group_size.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import logging # noqa: E402 @@ -24,6 +25,9 @@ import traceback # noqa: E402 import unittest # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear # noqa: E402 @@ -34,8 +38,7 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.utils.eval import EVAL # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 + logger = logging.getLogger(__name__) diff --git a/tests/test_inference_speed.py b/tests/test_inference_speed.py index ed9955b3f..4b97c85a7 100644 --- a/tests/test_inference_speed.py +++ b/tests/test_inference_speed.py @@ -17,12 +17,15 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -from gptqmodel.utils import BACKEND # noqa: E402 # -- end do not touch from inference_speed import InferenceSpeed # noqa: E402 from parameterized import parameterized # noqa: E402 +from gptqmodel.utils import BACKEND # noqa: E402 + + ''' NATIVE_MODEL_ID = /monster/data/model/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortext-v1 BITBLAS_NATIVE_MODEL_ID = /monster/data/model/opt-125M-autoround-lm_head-false-symTrue diff --git a/tests/test_inference_speed_ipex.py b/tests/test_inference_speed_ipex.py index 08cf088b9..0cd974eb1 100644 --- a/tests/test_inference_speed_ipex.py +++ b/tests/test_inference_speed_ipex.py @@ -17,13 +17,15 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch -from gptqmodel.utils import BACKEND from inference_speed import InferenceSpeed from parameterized import parameterized +from gptqmodel.utils import BACKEND + class TestInferenceSpeedIpex(InferenceSpeed): @parameterized.expand( diff --git a/tests/test_ipex_xpu.py b/tests/test_ipex_xpu.py index 50fb9b85c..ab235fdf6 100644 --- a/tests/test_ipex_xpu.py +++ b/tests/test_ipex_xpu.py @@ -17,14 +17,16 @@ # -- do not touch import os + os.environ["CUDA_VISIBLE_DEVICES"] = "" # -- end do not touch import tempfile # noqa: E402 +from models.model_test import ModelTest # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # 
noqa: E402 from gptqmodel.models._const import DEVICE # noqa: E402 -from models.model_test import ModelTest # noqa: E402 class TestsIPEX(ModelTest): diff --git a/tests/test_kernel_output.py b/tests/test_kernel_output.py index be94531ce..125836ffd 100644 --- a/tests/test_kernel_output.py +++ b/tests/test_kernel_output.py @@ -1,6 +1,10 @@ import unittest import torch +from logbar import LogBar +from parameterized import parameterized +from torch import Tensor + from gptqmodel import BACKEND, GPTQModel from gptqmodel.adapter.adapter import Adapter, AdapterCache, Lora from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear @@ -11,9 +15,7 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear from gptqmodel.utils.model import find_modules -from logbar import LogBar -from parameterized import parameterized -from torch import Tensor + log = LogBar.shared() diff --git a/tests/test_lm_eval.py b/tests/test_lm_eval.py index 1ceaffaf1..99364b919 100644 --- a/tests/test_lm_eval.py +++ b/tests/test_lm_eval.py @@ -17,14 +17,16 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: E402 import unittest # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 + from gptqmodel import BACKEND, GPTQModel from gptqmodel.utils.eval import EVAL # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 class TestLmEval(unittest.TestCase): diff --git a/tests/test_lm_head.py b/tests/test_lm_head.py index c5d39bacf..134362790 100644 --- a/tests/test_lm_head.py +++ b/tests/test_lm_head.py @@ -20,12 +20,14 @@ from datasets import load_dataset + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 -from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 # -- end do not touch from models.model_test import ModelTest # noqa: E402 +from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 +from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 + class TestLmHeadLoad(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse" # "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse" diff --git a/tests/test_lora.py b/tests/test_lora.py index 0e50794fb..4b1727fea 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -16,14 +16,16 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch -from gptqmodel import BACKEND, GPTQModel # noqa: E402 -from gptqmodel.adapter.adapter import Lora # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 +from gptqmodel.adapter.adapter import Lora # noqa: E402 + class Test(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128" diff --git a/tests/test_mlx.py b/tests/test_mlx.py index 32ca4125f..d3fa1137b 100644 --- a/tests/test_mlx.py +++ b/tests/test_mlx.py @@ -1,6 +1,7 @@ import os import sys + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" if sys.platform == "darwin": @@ -8,11 +9,12 @@ import tempfile # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 from mlx_lm import generate, load # noqa: E402 from models.model_test import ModelTest # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import GPTQModel # 
noqa: E402 + class TestExport(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen2.5-0.5B-Instruct/gptq_4bits_01-07_14-18-11_maxlen1024_ns1024_descFalse_damp0.1/" diff --git a/tests/test_mlx_generate.py b/tests/test_mlx_generate.py index f3484bfe1..f8581101b 100644 --- a/tests/test_mlx_generate.py +++ b/tests/test_mlx_generate.py @@ -1,14 +1,17 @@ import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import sys # noqa: E402 + if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + class TestMlxGenerate(ModelTest): @classmethod diff --git a/tests/test_modelscope.py b/tests/test_modelscope.py index 22fcf2663..7214a86b4 100644 --- a/tests/test_modelscope.py +++ b/tests/test_modelscope.py @@ -1,9 +1,11 @@ import os + os.environ["GPTQMODEL_USE_MODELSCOPE"] = "True" -from gptqmodel import GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 + class TestLoadModelscope(ModelTest): diff --git a/tests/test_openai_server.py b/tests/test_openai_server.py index 4b2e4f8c3..777ed650c 100644 --- a/tests/test_openai_server.py +++ b/tests/test_openai_server.py @@ -18,8 +18,10 @@ import unittest import openai + from gptqmodel import GPTQModel + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" class TestOpeniServer(unittest.TestCase): diff --git a/tests/test_packable.py b/tests/test_packable.py index 53eff32ee..27f593029 100644 --- a/tests/test_packable.py +++ b/tests/test_packable.py @@ -3,6 +3,9 @@ from typing import Dict import torch +from parameterized import parameterized +from safetensors.torch import load_file + from gptqmodel import BACKEND, GPTQModel from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402 @@ -12,8 +15,6 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.utils.model import convert_gptq_v2_to_v1_format, find_modules -from parameterized import parameterized -from safetensors.torch import load_file class TestPackable(unittest.TestCase): diff --git a/tests/test_packing.py b/tests/test_packing.py index 7b08099a4..b8a6970b4 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -17,17 +17,20 @@ # -- do not touch import os +from parameterized import parameterized + from gptqmodel import BACKEND from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear from gptqmodel.nn_modules.qlinear.ipex import IPEXQuantLinear -from parameterized import parameterized + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 + # isort: off import torch # noqa: E402 import torch.nn as nn # noqa: E402 diff --git a/tests/test_packing_speed.py b/tests/test_packing_speed.py index d6e0f699d..11eff2a62 100644 --- a/tests/test_packing_speed.py +++ b/tests/test_packing_speed.py @@ -19,6 +19,7 @@ from gptqmodel import BACKEND + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -28,6 +29,7 @@ import threadpoolctl # noqa: E402 from parameterized import parameterized # noqa: E402 + # isort: off import torch # noqa: E402 import torch.nn as nn # noqa: E402 diff --git 
a/tests/test_parameter_count.py b/tests/test_parameter_count.py index 599c5823a..260ac2541 100644 --- a/tests/test_parameter_count.py +++ b/tests/test_parameter_count.py @@ -2,11 +2,12 @@ import tempfile import torch.cuda -from gptqmodel import GPTQModel, QuantizeConfig -from gptqmodel.utils.tensor import tensor_parameters from models.model_test import ModelTest from safetensors.torch import load_file +from gptqmodel import GPTQModel, QuantizeConfig +from gptqmodel.utils.tensor import tensor_parameters + class TestsParameterCount(ModelTest): LLAMA_3_2_1B_PARAMETER_COUNT = 1235814400 @@ -19,11 +20,12 @@ class TestsParameterCount(ModelTest): def test_parameter_count(self): import os.path - from gptqmodel import QuantizeConfig - from gptqmodel.utils.tensor import tensor_parameters from huggingface_hub import hf_hub_download from safetensors.torch import load_file + from gptqmodel import QuantizeConfig + from gptqmodel.utils.tensor import tensor_parameters + model_id = "/monster/data/model/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1" if os.path.isdir(model_id): file_path = os.path.join(model_id, "model.safetensors") diff --git a/tests/test_perplexity.py b/tests/test_perplexity.py index a30e81d3c..98454aa26 100644 --- a/tests/test_perplexity.py +++ b/tests/test_perplexity.py @@ -18,6 +18,7 @@ import os import time + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,13 +26,14 @@ import unittest # noqa: E402 from datasets import load_dataset # noqa: E402 +from parameterized import parameterized # noqa: E402 +from transformers import AutoModelForCausalLM, AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.quantization.config import FORMAT, QUANT_METHOD, AutoRoundQuantizeConfig, QuantizeConfig # noqa: E402 from gptqmodel.utils import Perplexity # noqa: E402 from gptqmodel.utils.rocm import IS_ROCM # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from parameterized import parameterized # noqa: E402 -from transformers import AutoModelForCausalLM, AutoTokenizer # noqa: E402 class TestPerplexity(unittest.TestCase): diff --git a/tests/test_post_quant_eora.py b/tests/test_post_quant_eora.py index 152d8c410..c281c27a6 100644 --- a/tests/test_post_quant_eora.py +++ b/tests/test_post_quant_eora.py @@ -16,19 +16,21 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: E402 from typing import Optional # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from tabulate import tabulate # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.adapter.adapter import Lora # noqa: E402 from gptqmodel.utils.eval import EVAL # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from tabulate import tabulate # noqa: E402 def bench(path: str, backend: BACKEND, adapter: Optional[Lora]): diff --git a/tests/test_q4_bitblas.py b/tests/test_q4_bitblas.py index ee819ec39..0d04505b1 100644 --- a/tests/test_q4_bitblas.py +++ b/tests/test_q4_bitblas.py @@ -17,15 +17,17 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 import torch # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from 
gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 class TestQ4BitBLAS(unittest.TestCase): diff --git a/tests/test_q4_cuda.py b/tests/test_q4_cuda.py index 51af7c270..31d45fcf0 100644 --- a/tests/test_q4_cuda.py +++ b/tests/test_q4_cuda.py @@ -17,16 +17,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + class TestsQ4CUDA(ModelTest): diff --git a/tests/test_q4_exllama_v1.py b/tests/test_q4_exllama_v1.py index b6135e75a..30353270a 100644 --- a/tests/test_q4_exllama_v1.py +++ b/tests/test_q4_exllama_v1.py @@ -17,20 +17,23 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 +from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, exllama_set_max_input_length # noqa: E402 from gptqmodel.models._const import EXLLAMA_DEFAULT_MAX_INPUT_LENGTH # noqa: E402 from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402 from gptqmodel.quantization import FORMAT # noqa: E402 from gptqmodel.utils.importer import select_quant_linear # noqa: E402 from gptqmodel.utils.model import gptqmodel_post_init # noqa: E402 -from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 + REFERENCE = torch.Tensor( [ diff --git a/tests/test_q4_exllama_v2.py b/tests/test_q4_exllama_v2.py index cf3ecac42..45c2a1202 100644 --- a/tests/test_q4_exllama_v2.py +++ b/tests/test_q4_exllama_v2.py @@ -17,19 +17,22 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 import torch # noqa: E402 +from test_q4_exllama_v1 import REFERENCE, get_diff # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear # noqa: E402 from gptqmodel.quantization import FORMAT # noqa: E402 from gptqmodel.utils.importer import select_quant_linear # noqa: E402 from gptqmodel.utils.model import gptqmodel_post_init # noqa: E402 -from test_q4_exllama_v1 import REFERENCE, get_diff # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 + GENERATE_EVAL_SIZE = 100 diff --git a/tests/test_q4_ipex.py b/tests/test_q4_ipex.py index efdb3d0ca..1e78fff95 100644 --- a/tests/test_q4_ipex.py +++ b/tests/test_q4_ipex.py @@ -18,13 +18,15 @@ import os import sys + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 -from gptqmodel import BACKEND # noqa: E402 from models.model_test import ModelTest # noqa: E402 +from gptqmodel import BACKEND # noqa: E402 + class TestsIPEX(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" # "bigscience/bloom-560m" diff --git a/tests/test_q4_marlin.py b/tests/test_q4_marlin.py index 044f1dfa4..9b8bbdf56 100644 --- 
a/tests/test_q4_marlin.py +++ b/tests/test_q4_marlin.py @@ -17,16 +17,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 -from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 +from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402 + class TestQ4Marlin(ModelTest): diff --git a/tests/test_q4_torch.py b/tests/test_q4_torch.py index 0b4884147..3a9f68db7 100644 --- a/tests/test_q4_torch.py +++ b/tests/test_q4_torch.py @@ -17,14 +17,16 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + class TestsQ4Torch(ModelTest): GENERATE_EVAL_SIZE_MIN = 20 diff --git a/tests/test_q4_torch_apple.py b/tests/test_q4_torch_apple.py index e51fe5ba8..e9318100d 100644 --- a/tests/test_q4_torch_apple.py +++ b/tests/test_q4_torch_apple.py @@ -17,11 +17,12 @@ import sys # noqa: E402 import torch # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + class TestsQ4Torch(ModelTest): GENERATE_EVAL_SIZE_MIN = 5 diff --git a/tests/test_q4_triton.py b/tests/test_q4_triton.py index c0a7e9a2e..0da3238f8 100644 --- a/tests/test_q4_triton.py +++ b/tests/test_q4_triton.py @@ -17,15 +17,17 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 -from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from models.model_test import ModelTest # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 +from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 + class TestsQ4Triton(ModelTest): model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" diff --git a/tests/test_quant_and_eora.py b/tests/test_quant_and_eora.py index 6a907d4df..813cdfb57 100644 --- a/tests/test_quant_and_eora.py +++ b/tests/test_quant_and_eora.py @@ -16,6 +16,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -23,13 +24,14 @@ from typing import Optional # noqa: E402 from datasets import load_dataset # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from tabulate import tabulate # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.adapter.adapter import Lora # noqa: E402 from gptqmodel.utils.eval import EVAL # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from tabulate import tabulate # noqa: E402 class 
Test(ModelTest): diff --git a/tests/test_quant_and_eora_transformers.py b/tests/test_quant_and_eora_transformers.py index 8071cac46..40086fd9a 100644 --- a/tests/test_quant_and_eora_transformers.py +++ b/tests/test_quant_and_eora_transformers.py @@ -21,6 +21,7 @@ from safetensors.torch import load_file from transformers import AutoModelForCausalLM, AutoTokenizer + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -28,15 +29,17 @@ from typing import Optional # noqa: E402 from datasets import load_dataset # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 -from gptqmodel.adapter.adapter import HF_ADAPTER_FILE_NAME, HF_ADAPTER_WEIGHT_KEY_PREFIX, Lora # noqa: E402 -from gptqmodel.utils.eval import EVAL # noqa: E402 -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 from lm_eval.utils import make_table # noqa: E402 from logbar import LogBar from models.model_test import ModelTest # noqa: E402 from tabulate import tabulate # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 +from gptqmodel.adapter.adapter import HF_ADAPTER_FILE_NAME, HF_ADAPTER_WEIGHT_KEY_PREFIX, Lora # noqa: E402 +from gptqmodel.utils.eval import EVAL # noqa: E402 +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 + + log = LogBar.shared() diff --git a/tests/test_quant_batch.py b/tests/test_quant_batch.py index eace9e815..6ae851594 100644 --- a/tests/test_quant_batch.py +++ b/tests/test_quant_batch.py @@ -17,16 +17,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import GPTQModel # noqa: E402 from gptqmodel.quantization import QuantizeConfig # noqa: E402 from gptqmodel.utils import Perplexity # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 class TestQuantBatch(ModelTest): diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index 9e6736859..910720900 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,15 +26,20 @@ import tempfile # noqa: E402 from datasets import load_dataset # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 -from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 -from gptqmodel.quantization.config import (META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, # noqa: E402 - AutoRoundQuantizeConfig, QuantizeConfig) -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 +from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 +from gptqmodel.quantization.config import ( # noqa: E402 + META_FIELD_QUANTIZER, + META_QUANTIZER_GPTQMODEL, + AutoRoundQuantizeConfig, + QuantizeConfig, +) +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 + class TestQuantization(ModelTest): diff --git a/tests/test_quant_formats_auto_round.py b/tests/test_quant_formats_auto_round.py index 3b99d47e0..03f79bf4e 
100644 --- a/tests/test_quant_formats_auto_round.py +++ b/tests/test_quant_formats_auto_round.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,15 +26,20 @@ import tempfile # noqa: E402 from datasets import load_dataset # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 -from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 -from gptqmodel.quantization.config import (META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, # noqa: E402 - AutoRoundQuantizeConfig, QuantizeConfig) -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 +from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 +from gptqmodel.quantization.config import ( # noqa: E402 + META_FIELD_QUANTIZER, + META_QUANTIZER_GPTQMODEL, + AutoRoundQuantizeConfig, + QuantizeConfig, +) +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 + class TestQuantization(ModelTest): diff --git a/tests/test_quant_formats_ipex.py b/tests/test_quant_formats_ipex.py index a2774d8ad..14780643e 100644 --- a/tests/test_quant_formats_ipex.py +++ b/tests/test_quant_formats_ipex.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,15 +26,20 @@ import tempfile # noqa: E402 from datasets import load_dataset # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 -from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 -from gptqmodel.quantization.config import (META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, # noqa: E402 - AutoRoundQuantizeConfig, QuantizeConfig) -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 +from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 +from gptqmodel.quantization.config import ( # noqa: E402 + META_FIELD_QUANTIZER, + META_QUANTIZER_GPTQMODEL, + AutoRoundQuantizeConfig, + QuantizeConfig, +) +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 + class TestQuantization(ModelTest): diff --git a/tests/test_quant_time.py b/tests/test_quant_time.py index b925a9c0b..ef3c1e2e0 100644 --- a/tests/test_quant_time.py +++ b/tests/test_quant_time.py @@ -16,13 +16,15 @@ import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import time # noqa: E402 +from models.model_test import ModelTest # noqa: E402 + from gptqmodel import GPTQModel # noqa: E402 from gptqmodel.quantization.config import QuantizeConfig # noqa: E402 -from models.model_test import ModelTest # noqa: E402 class TestQuantTime(ModelTest): diff --git a/tests/test_quant_trust_remote.py b/tests/test_quant_trust_remote.py index 312800420..7437e42c7 100644 --- a/tests/test_quant_trust_remote.py +++ b/tests/test_quant_trust_remote.py @@ -17,18 +17,20 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: 
E402 import transformers # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 -from gptqmodel.quantization import FORMAT, QuantizeConfig # noqa: E402 from models.model_test import ModelTest # noqa: E402 from packaging.version import Version # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 +from gptqmodel.quantization import FORMAT, QuantizeConfig # noqa: E402 + class TestQuantWithTrustRemoteTrue(ModelTest): @classmethod diff --git a/tests/test_save_loaded_quantized_model.py b/tests/test_save_loaded_quantized_model.py index 6f85bd14f..75ea967dc 100644 --- a/tests/test_save_loaded_quantized_model.py +++ b/tests/test_save_loaded_quantized_model.py @@ -17,15 +17,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: E402 import unittest # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402 + + MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" class TestSave(unittest.TestCase): diff --git a/tests/test_save_loaded_quantized_model_ipex.py b/tests/test_save_loaded_quantized_model_ipex.py index 70a6e526a..92aef288e 100644 --- a/tests/test_save_loaded_quantized_model_ipex.py +++ b/tests/test_save_loaded_quantized_model_ipex.py @@ -17,15 +17,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: E402 import unittest # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402 + + MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" class TestSave(unittest.TestCase): diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 8610e4af0..2df43e218 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch diff --git a/tests/test_sglang.py b/tests/test_sglang.py index cbc8e6344..9883d09b0 100644 --- a/tests/test_sglang.py +++ b/tests/test_sglang.py @@ -16,14 +16,16 @@ import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import importlib.util # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + class TestLoadSglang(ModelTest): diff --git a/tests/test_sharded.py b/tests/test_sharded.py index fa57c045a..d5524fed4 100644 --- a/tests/test_sharded.py +++ b/tests/test_sharded.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,9 +26,10 @@ import unittest # noqa: E402 import torch # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 + class TestSharded(unittest.TestCase): MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" diff --git a/tests/test_tgi.py b/tests/test_tgi.py index 55136f35d..c8be3e9b4 100644 --- a/tests/test_tgi.py +++ b/tests/test_tgi.py @@ -17,6 +17,7 @@ # -- 
do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import json # noqa: E402 diff --git a/tests/test_tokenicer.py b/tests/test_tokenicer.py index 243359367..a9dad21ad 100644 --- a/tests/test_tokenicer.py +++ b/tests/test_tokenicer.py @@ -16,13 +16,15 @@ import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import unittest # noqa: E402 -from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 from parameterized import parameterized # noqa: E402 +from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 + class TestTokenicer(unittest.TestCase): diff --git a/tests/test_transformers.py b/tests/test_transformers.py index a35a5c32a..3c287549c 100644 --- a/tests/test_transformers.py +++ b/tests/test_transformers.py @@ -15,15 +15,17 @@ # limitations under the License. import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import tempfile # noqa: E402 import unittest # noqa: E402 import transformers # noqa: E402 -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 from packaging.version import Version # noqa: E402 from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig # noqa: E402 +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 + class TestTransformersIntegration(unittest.TestCase): INFERENCE_PROMPT = "Which city is the capital of France? The city name is " diff --git a/tests/test_triton.py b/tests/test_triton.py index cce0c09d1..2050ab6b6 100644 --- a/tests/test_triton.py +++ b/tests/test_triton.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,9 +26,11 @@ import torch # noqa: E402 import torch.utils.benchmark as benchmark # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + + MODEL_ID = "/monster/data/model/Llama-7B-GPTQ" DATASET_ID = "timdettmers/openassistant-guanaco" LEARNING_RATE = 3e-5 diff --git a/tests/test_triton_xpu.py b/tests/test_triton_xpu.py index 110bea6bc..cf61879ad 100644 --- a/tests/test_triton_xpu.py +++ b/tests/test_triton_xpu.py @@ -17,14 +17,16 @@ # -- do not touch import os + os.environ["CUDA_VISIBLE_DEVICES"] = "" # -- end do not touch import tempfile # noqa: E402 +from models.model_test import ModelTest # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.models._const import DEVICE # noqa: E402 -from models.model_test import ModelTest # noqa: E402 class TestTritonXPU(ModelTest): diff --git a/tests/test_verify_hash.py b/tests/test_verify_hash.py index e65f7af3e..1bc22f3c1 100644 --- a/tests/test_verify_hash.py +++ b/tests/test_verify_hash.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch diff --git a/tests/test_vllm.py b/tests/test_vllm.py index 16534b9cb..f84d133cb 100644 --- a/tests/test_vllm.py +++ b/tests/test_vllm.py @@ -17,17 +17,19 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import importlib.util # noqa: E402 import tempfile # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from models.model_test import ModelTest # 
noqa: E402 -from transformers import AutoTokenizer # noqa: E402 class TestLoadVLLM(ModelTest):