diff --git a/examples/benchmark/generation_speed.py b/examples/benchmark/generation_speed.py index 65f12b5cf..1fe340e87 100644 --- a/examples/benchmark/generation_speed.py +++ b/examples/benchmark/generation_speed.py @@ -23,11 +23,13 @@ import torch from datasets import Dataset, load_dataset -from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from logbar import LogBar from transformers import AutoTokenizer, GenerationConfig from transformers.generation.logits_process import LogitsProcessor +from gptqmodel import BACKEND, GPTQModel, QuantizeConfig + + logger = LogBar.shared() random.seed(0) diff --git a/examples/benchmark/ipex.py b/examples/benchmark/ipex.py index f6d495788..170e96728 100644 --- a/examples/benchmark/ipex.py +++ b/examples/benchmark/ipex.py @@ -20,6 +20,7 @@ import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + try: from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf bind_cores_for_best_perf() @@ -29,6 +30,7 @@ import argparse + parser = argparse.ArgumentParser(description="Benchmark IPEX vs HF on a pre-trained model.") parser.add_argument("--model", type=str, required=True, help="Path or name of the pre-trained model.") parser.add_argument("--cores", type=int, default=8, help="Number of CPU cores to use.") diff --git a/examples/benchmark/perplexity.py b/examples/benchmark/perplexity.py index edadcb32f..0968d5193 100644 --- a/examples/benchmark/perplexity.py +++ b/examples/benchmark/perplexity.py @@ -17,9 +17,11 @@ import argparse import os -from gptqmodel.utils import Perplexity from transformers import AutoTokenizer +from gptqmodel.utils import Perplexity + + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" if __name__ == "__main__": diff --git a/examples/evaluation/run_language_modeling_task.py b/examples/evaluation/run_language_modeling_task.py index fce213b48..f31d6fa2d 100644 --- a/examples/evaluation/run_language_modeling_task.py +++ b/examples/evaluation/run_language_modeling_task.py @@ -18,10 +18,12 @@ import datasets import torch +from transformers import AutoTokenizer + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from gptqmodel.eval_tasks import LanguageModelingTask from gptqmodel.utils.torch import torch_empty_cache -from transformers import AutoTokenizer + DATASET = "tatsu-lab/alpaca" WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n" diff --git a/examples/evaluation/run_sequence_classification_task.py b/examples/evaluation/run_sequence_classification_task.py index 36d0324c3..38790bc84 100644 --- a/examples/evaluation/run_sequence_classification_task.py +++ b/examples/evaluation/run_sequence_classification_task.py @@ -19,10 +19,12 @@ import datasets import torch +from transformers import AutoTokenizer + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from gptqmodel.eval_tasks import SequenceClassificationTask from gptqmodel.utils.torch import torch_empty_cache -from transformers import AutoTokenizer + DATASET = "cardiffnlp/tweet_sentiment_multilingual" TEMPLATE = "Question:What's the sentiment of the given text? 
Choices are {labels}.\nText: {text}\nAnswer:" diff --git a/examples/evaluation/run_text_summarization_task.py b/examples/evaluation/run_text_summarization_task.py index a1edb620a..a4abb9829 100644 --- a/examples/evaluation/run_text_summarization_task.py +++ b/examples/evaluation/run_text_summarization_task.py @@ -19,10 +19,12 @@ import datasets import torch +from transformers import AutoTokenizer, GenerationConfig + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig from gptqmodel.eval_tasks import TextSummarizationTask from gptqmodel.utils.torch import torch_empty_cache -from transformers import AutoTokenizer, GenerationConfig + os.system("pip install py7zr") diff --git a/examples/inference/run_transformers.py b/examples/inference/run_transformers.py index bc9bed650..4b8fc18d9 100644 --- a/examples/inference/run_transformers.py +++ b/examples/inference/run_transformers.py @@ -16,6 +16,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ") quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ") print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0])) diff --git a/examples/inference/run_with_different_backends.py b/examples/inference/run_with_different_backends.py index 6ea5cbd5d..5d08066cd 100644 --- a/examples/inference/run_with_different_backends.py +++ b/examples/inference/run_with_different_backends.py @@ -19,9 +19,11 @@ import sys from argparse import ArgumentParser -from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device from transformers import AutoTokenizer +from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device + + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" pretrained_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "./TinyLlama/TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/basic_usage.py b/examples/quantization/basic_usage.py index 39eada708..6819bc4fe 100644 --- a/examples/quantization/basic_usage.py +++ b/examples/quantization/basic_usage.py @@ -16,9 +16,11 @@ import os -from gptqmodel import GPTQModel, QuantizeConfig, get_best_device from transformers import AutoTokenizer +from gptqmodel import GPTQModel, QuantizeConfig, get_best_device + + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" diff --git a/examples/quantization/basic_usage_autoround.py b/examples/quantization/basic_usage_autoround.py index 436a18ba1..0c27ed7b1 100644 --- a/examples/quantization/basic_usage_autoround.py +++ b/examples/quantization/basic_usage_autoround.py @@ -15,9 +15,11 @@ # limitations under the License. 
import torch +from transformers import AutoTokenizer + from gptqmodel import GPTQModel from gptqmodel.quantization.config import AutoRoundQuantizeConfig # noqa: E402 -from transformers import AutoTokenizer + pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "./autoround/TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/basic_usage_wikitext2.py b/examples/quantization/basic_usage_wikitext2.py index ac1ba63d9..95ba908ad 100644 --- a/examples/quantization/basic_usage_wikitext2.py +++ b/examples/quantization/basic_usage_wikitext2.py @@ -16,9 +16,11 @@ import torch from datasets import load_dataset -from gptqmodel import GPTQModel, QuantizeConfig from transformers import AutoTokenizer +from gptqmodel import GPTQModel, QuantizeConfig + + pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/transformers_usage.py b/examples/quantization/transformers_usage.py index c9e15b5fb..75b1e7a74 100755 --- a/examples/quantization/transformers_usage.py +++ b/examples/quantization/transformers_usage.py @@ -16,6 +16,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig + model_id = "facebook/opt-125m" tokenizer = AutoTokenizer.from_pretrained(model_id) dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."] diff --git a/gptqmodel/__init__.py b/gptqmodel/__init__.py index 4a13698b4..68f4e6ef7 100644 --- a/gptqmodel/__init__.py +++ b/gptqmodel/__init__.py @@ -22,6 +22,7 @@ from .utils.exllama import exllama_set_max_input_length from .version import __version__ + if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']: try: from modelscope.utils.hf_util.patcher import patch_hub diff --git a/gptqmodel/adapter/adapter.py b/gptqmodel/adapter/adapter.py index af7ca0ed2..f6cea791e 100644 --- a/gptqmodel/adapter/adapter.py +++ b/gptqmodel/adapter/adapter.py @@ -10,7 +10,7 @@ from .peft import LoraConfig from .remote import resolve_path -logger = setup_logger() +log = setup_logger() LORA_MERGED_WEIGHT_PATHS = [None, ""] HF_ADAPTER_FILE_NAME = "adapter_model.safetensors" HF_ADAPTER_CONFIG_FILE_NAME = "adapter_config.json" @@ -30,7 +30,7 @@ def get(cls, path: str) -> Optional[Tuple[LoraConfig, Dict[str, torch.Tensor]]]: @classmethod def reset(cls): - logger.info("Adapter Cache: Resetting cache") + log.info("Adapter Cache: Resetting cache") cls.cache = {} @classmethod @@ -181,10 +181,10 @@ def post_init(self, weight_key: str, device:torch.device, lora_A: torch.Tensor=N # we have consumed all modules if len(lora_weights) == 0: AdapterCache.remove(self.path) - logger.info("Adapter: Consumed all Lora weights") + log.info("Adapter: Consumed all Lora weights") else: - logger.warn(f"Adapter: Lora weights not found for `{weight_key}`") + log.warn(f"Adapter: Lora weights not found for `{weight_key}`") assert lora_A is not None and lora_B is not None, f"Adapter: `lora_A` and `lora_B` must both be present in the weights: actual = `{lora_A}` and `{lora_B}`" @@ -198,7 +198,7 @@ def post_init(self, weight_key: str, device:torch.device, lora_A: torch.Tensor=N # print(f"Adapter: {self.name()}, loaded lora_A shape: {lora_A.shape}") # print(f"Adapter: {self.name()}, loaded lora_B shape: {lora_B.shape}") if lora_A.dtype != torch.float16 or lora_A.dtype != torch.float16: - logger.warn(f"Adapter: 
`lora_A` and `lora_B` tensors should be of dtype = `torch.float16`: actual = `[{lora_A.dtype}, {lora_A.dtype}]`.") + log.warn(f"Adapter: `lora_A` and `lora_B` tensors should be of dtype = `torch.float16`: actual = `[{lora_A.dtype}, {lora_A.dtype}]`.") self.lora_A = lora_A.to(device=device, dtype=torch.float16) self.lora_B = lora_B.to(device=device, dtype=torch.float16) @@ -216,7 +216,7 @@ def dynamic_rank_override(self, lora_cfg: LoraConfig, weight_key: str) -> bool: # first do string full match, then suffix match, then regex match if weight_key == k or k.endswith(weight_key) or re.match(k, weight_key): self.rank = v - logger.info(f"Adapter: Base Lora `rank` = `{self.rank}` has been overridden by `{k}` due to dynamic `LoraConfig.rank_pattern` control.") + log.info(f"Adapter: Base Lora `rank` = `{self.rank}` has been overridden by `{k}` due to dynamic `LoraConfig.rank_pattern` control.") return True return False diff --git a/gptqmodel/eora/eora.py b/gptqmodel/eora/eora.py index 6f1ea09f0..956b13ef3 100644 --- a/gptqmodel/eora/eora.py +++ b/gptqmodel/eora/eora.py @@ -22,7 +22,7 @@ from ..looper.named_module import NamedModule from ..utils.logger import setup_logger -logger = setup_logger() +log = setup_logger() def eora_process_input(input: Tensor, name: str, eigen_scaling_diag_matrix: Dict[str, torch.dtype], sample_size: int): inp = input[0].to(dtype=torch.float32) @@ -54,7 +54,7 @@ def eora_compute_lora( L, Q = torch.linalg.eigh(raw_scaling_diag_matrix) if (L < 0).any(): ## When expanding the calibration data size for EoRA, I suggest maintaining the balance by allocating 50% to general input (C4) and the remaining 50% to downstream task data. - logger.warn(f"Found negative eigenvalues in `{module.name}`. Please increase your calibration data set for EoRA.") + log.warn(f"Found negative eigenvalues in `{module.name}`. Please increase your calibration data set for EoRA.") minimum = torch.min(L[L > 0]) L[L < 0] = minimum @@ -64,7 +64,7 @@ def eora_compute_lora( try: scaling_matrix_inv = torch.linalg.inv(scaling_diag_matrix) except Exception: - logger.warn("`scaling_diag_matrix` is not full rank!") # TODO: assert? + log.warn("`scaling_diag_matrix` is not full rank!") # TODO: assert? 
scaling_diag_matrix += 1e-6 * torch.eye(scaling_diag_matrix.shape[0]).to(device) scaling_matrix_inv = torch.linalg.inv(scaling_diag_matrix) diff --git a/gptqmodel/looper/dequantize_processor.py b/gptqmodel/looper/dequantize_processor.py index 26f7fd674..7ef71a0f1 100644 --- a/gptqmodel/looper/dequantize_processor.py +++ b/gptqmodel/looper/dequantize_processor.py @@ -22,7 +22,7 @@ from ..utils.logger import setup_logger from ..utils.torch import torch_compile -logger = setup_logger() +log = setup_logger() class DequantizeProcessor(LoopProcessor): def __init__(self, quantized_modules: Dict[str, TorchQuantLinear]): diff --git a/gptqmodel/looper/eora_processor.py b/gptqmodel/looper/eora_processor.py index 5da732acc..6eee539f6 100644 --- a/gptqmodel/looper/eora_processor.py +++ b/gptqmodel/looper/eora_processor.py @@ -34,7 +34,7 @@ from ..utils.model import move_to from ..utils.torch import torch_compile, torch_sync -logger = setup_logger() +log = setup_logger() class EoraProcessor(LoopProcessor): @@ -182,7 +182,7 @@ def process(self, module: NamedModule): stat["dynamic"] = self.qcfg.dynamic_get(layer_name=module.full_name) self.log.append(stat) - logger.info(stat) + log.info(stat) # logger.info(f"Quantizing module END: {name}, {gptq[name].shape()}") self.result_save(module.full_name, { diff --git a/gptqmodel/looper/gptq_processor.py b/gptqmodel/looper/gptq_processor.py index c57da50e5..e7dc7740f 100644 --- a/gptqmodel/looper/gptq_processor.py +++ b/gptqmodel/looper/gptq_processor.py @@ -32,7 +32,7 @@ from ..utils.model import move_to, pack_model from ..utils.torch import torch_sync -logger = setup_logger() +log = setup_logger() class GPTQProcessor(LoopProcessor): def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare_dataset_func, @@ -90,7 +90,7 @@ def preprocess(self, module: NamedModule, buffered_fwd: bool): # deepseek has massive # of sub-modules per layer, causing vram pressure # buffered mode is slower due to gpu<->cpu movement if buffered_fwd: # TODO tweak this number for masive MoE - logger.info(f"Experimental: enabling fwd buffered mode for: `{module.name}`") + log.info(f"Experimental: enabling fwd buffered mode for: `{module.name}`") tmp.fwd_inputs_buffered = True tmp.quantizer.configure( @@ -160,7 +160,7 @@ def process(self, module: NamedModule): stat["dynamic"] = self.qcfg.dynamic_get(layer_name=module.full_name) self.log.append(stat) - logger.info(stat) + log.info(stat) self.result_save(module.full_name, { "scale": move_to(scale, device=CPU, stream=self.stream), diff --git a/gptqmodel/looper/loop_processor.py b/gptqmodel/looper/loop_processor.py index 13fba7c86..eb01996dd 100644 --- a/gptqmodel/looper/loop_processor.py +++ b/gptqmodel/looper/loop_processor.py @@ -27,7 +27,7 @@ from ..utils.device import get_cpu_usage_memory, get_gpu_usage_memory from ..utils.logger import setup_logger -logger = setup_logger() +log = setup_logger() # LoopProcessor is a singleton(), not per module instance @@ -91,7 +91,7 @@ def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare min_calibration_dataset_size = 256 min_calibration_dataset_input_ids_avg_length = 256 if len(calibration_dataset) < min_calibration_dataset_size: - logger.warning(f"Calibration dataset size should be more than {min_calibration_dataset_size}. " + log.warn(f"Calibration dataset size should be more than {min_calibration_dataset_size}. 
" f"Current: {len(calibration_dataset)}.") calibration_dataset = prepare_dataset_func(calibration_dataset=calibration_dataset, @@ -119,7 +119,7 @@ def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare avg = total_input_ids_length / len(calibration_dataset) if avg < min_calibration_dataset_input_ids_avg_length: - logger.warning(f"The average length of input_ids of calibration_dataset should be greater than " + log.warn(f"The average length of input_ids of calibration_dataset should be greater than " f"{min_calibration_dataset_input_ids_avg_length}: actual avg: {avg}.") self.num_batches = len(calibration_dataset) diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index d897517b9..16684a008 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -34,7 +34,7 @@ get_moe_layer_modules, move_to, nested_move_to) from ..utils.torch import torch_empty_cache -logger = setup_logger() +log = setup_logger() class ModuleLooper(): def __init__(self, model: BaseGPTQModel, processors: List[LoopProcessor]): @@ -192,7 +192,7 @@ def loop(self, auto_gc=True, calibration_enable_gpu_cache=True, buffered_fwd=Fal num_experts=num_experts) layer_count = len(layers) - quant_modules_pb = (logger.pb(range(layer_count + 1 if self.gptq_model.quantize_config.lm_head else layer_count)) + quant_modules_pb = (log.pb(range(layer_count + 1 if self.gptq_model.quantize_config.lm_head else layer_count)) .manual() .set(left_steps_offset=1)) @@ -419,7 +419,7 @@ def loop(self, auto_gc=True, calibration_enable_gpu_cache=True, buffered_fwd=Fal # ignore log pass else: - logger.info(f"{reverse_p.name()} summary:\n{reverse_p.log}") + log.info(f"{reverse_p.name()} summary:\n{reverse_p.log}") processor_name = reverse_p.name() total_log[processor_name] = reverse_p.log @@ -427,7 +427,7 @@ def loop(self, auto_gc=True, calibration_enable_gpu_cache=True, buffered_fwd=Fal self.gptq_model.quant_log = reverse_p.log for module_log in reverse_p.log: - logger.info(module_log) + log.info(module_log) reverse_p.log_plotly() reverse_p.finalize(model=self.gptq_model, **kwargs) diff --git a/gptqmodel/models/_const.py b/gptqmodel/models/_const.py index 083418973..6152e9dfa 100644 --- a/gptqmodel/models/_const.py +++ b/gptqmodel/models/_const.py @@ -25,6 +25,7 @@ from ..utils.rocm import IS_ROCM from ..utils.torch import HAS_CUDA, HAS_MPS, HAS_XPU + CPU = device("cpu") CUDA = device("cuda") CUDA_0 = device("cuda:0") diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py index a3c1a5aea..ef611839a 100644 --- a/gptqmodel/models/auto.py +++ b/gptqmodel/models/auto.py @@ -20,18 +20,20 @@ from ..utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() if not os.environ.get("PYTORCH_CUDA_ALLOC_CONF", None): os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'expandable_segments:True' - logger.info("ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.") + log.info("ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.") if not os.environ.get("CUDA_DEVICE_ORDER", None): os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID' - logger.info("ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.") + log.info("ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.") import sys # noqa: E402 + # TODO: waiting for pytorch implementgation of aten ops for MPS if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" @@ -106,6 +108,7 @@ from .definitions.xverse 
import XverseGPTQ # noqa: E402 from .definitions.yi import YiGPTQ # noqa: E402 + # make quants and inference more determinisitc torch.manual_seed(787) random.seed(787) @@ -247,13 +250,13 @@ def from_pretrained( ) -> BaseGPTQModel: if hasattr(AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code), "quantization_config"): - logger.warning("Model is already quantized, will use `from_quantized` to load quantized model.\n" + log.warn("Model is already quantized, will use `from_quantized` to load quantized model.\n" "If you want to quantize the model, please pass un_quantized model path or id, and use " "`from_pretrained` with `quantize_config`.") return cls.from_quantized(model_id_or_path, trust_remote_code=trust_remote_code) if quantize_config and quantize_config.dynamic: - logger.warning( + log.warn( "GPTQModel's per-module `dynamic` quantization feature is currently not upstreamed to hf/vllm/sglang. If you're using vllm, you need to install this PR: https://github.com/vllm-project/vllm/pull/7086") model_type = check_and_get_model_type(model_id_or_path, trust_remote_code) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 590b851d7..b74ba22b0 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -28,8 +28,14 @@ from packaging import version from packaging.version import Version from tokenicer import Tokenicer -from transformers import (AutoModelForCausalLM, AutoProcessor, PreTrainedModel, - PreTrainedTokenizerBase, ProcessorMixin, modeling_utils) +from transformers import ( + AutoModelForCausalLM, + AutoProcessor, + PreTrainedModel, + PreTrainedTokenizerBase, + ProcessorMixin, + modeling_utils, +) from ..adapter.adapter import Adapter from ..nn_modules.hooked_linear import replace_linear_with_hooked_linear @@ -43,13 +49,31 @@ from ..utils.hf import autofix_hf_model_config from ..utils.importer import select_quant_linear from ..utils.logger import setup_logger -from ..utils.model import (MODALITY, check_to_quantized, find_modules, get_device, get_module, - get_module_by_name_prefix, get_moe_layer_modules, move_to, nested_move_to, pack_model) +from ..utils.model import ( + MODALITY, + check_to_quantized, + find_modules, + get_device, + get_module, + get_module_by_name_prefix, + get_moe_layer_modules, + move_to, + nested_move_to, + pack_model, +) from ..utils.torch import torch_compile, torch_empty_cache from ._const import CALIBRATION_DATASET_CONCAT_CHAR, CPU, DEFAULT_MAX_SHARD_SIZE, DEVICE, SUPPORTS_MODULE_TYPES from .loader import ModelLoader -from .writer import (PROCESS_LOG_FWD_TIME, PROCESS_LOG_LAYER, PROCESS_LOG_MODULE, - PROCESS_LOG_TIME, QUANT_LOG_DAMP, QUANT_LOG_LOSS, ModelWriter) +from .writer import ( + PROCESS_LOG_FWD_TIME, + PROCESS_LOG_LAYER, + PROCESS_LOG_MODULE, + PROCESS_LOG_TIME, + QUANT_LOG_DAMP, + QUANT_LOG_LOSS, + ModelWriter, +) + # pytorch 2.6.0 fixes many compilation errors TORCH_MIN_VERSION_STR = '2.6.0' @@ -63,7 +87,7 @@ def check_support_param_buffer_assignment(*args, **kwargs): # See https://github.com/huggingface/transformers/issues/34366 modeling_utils.check_support_param_buffer_assignment = check_support_param_buffer_assignment -logger = setup_logger() +log = setup_logger() class BaseGPTQModel(nn.Module): # these modules are non-repeating and at the root level @@ -183,10 +207,10 @@ def __init__( if all(hasattr(m.adapter, name) for name in Lora.parameter_keys()): loaded_loras += 1 - logger.info(f"Adapter: `{loaded_loras}` EoRA/Lora adapters loaded for `{len(qmodules)}` modules.") + 
log.info(f"Adapter: `{loaded_loras}` EoRA/Lora adapters loaded for `{len(qmodules)}` modules.") # print kernel info: - logger.info(f"Kernel: loaded -> `[{', '.join(cls.__name__ for cls in self.kernels())}]`") + log.info(f"Kernel: loaded -> `[{', '.join(cls.__name__ for cls in self.kernels())}]`") def prepare_dataset( self, @@ -566,7 +590,7 @@ def quantize_old( min_calibration_dataset_input_ids_avg_length = 256 if len(calibration_dataset) < min_calibration_dataset_size: - logger.warning(f"Calibration dataset size should be more than {min_calibration_dataset_size}. " + log.warn(f"Calibration dataset size should be more than {min_calibration_dataset_size}. " f"Current: {len(calibration_dataset)}.") if self.quantize_config.format == FORMAT.BITBLAS: @@ -599,7 +623,7 @@ def quantize_old( avg = total_input_ids_length / len(calibration_dataset) if avg < min_calibration_dataset_input_ids_avg_length: - logger.warning(f"The average length of input_ids of calibration_dataset should be greater than " + log.warn(f"The average length of input_ids of calibration_dataset should be greater than " f"{min_calibration_dataset_input_ids_avg_length}: actual avg: {avg}.") if isinstance(self.quantize_config, AutoRoundQuantizeConfig): @@ -820,7 +844,7 @@ def store_input_hook(_, args, kwargs): quantizers = {} layer_count = len(layers) - quant_modules_pb = logger.pb(range(layer_count + 1 if self.quantize_config.lm_head else layer_count)).manual() + quant_modules_pb = log.pb(range(layer_count + 1 if self.quantize_config.lm_head else layer_count)).manual() gpu_memorys = [] cpu_memorys = [] durations = [] @@ -881,7 +905,7 @@ def store_input_hook(_, args, kwargs): layer_name = self.lm_head if is_lm_head_module else f"{self.layers_node}.{module_index}.{name}" if self.quantize_config.dynamic_get(layer_name=layer_name) == False: # noqa: E712 - logger.info(f"skip module: {layer_name}") + log.info(f"skip module: {layer_name}") skipped_modules.append(name) continue @@ -903,7 +927,7 @@ def store_input_hook(_, args, kwargs): # deepseek has massive # of sub-modules per layer, causing vram pressure # buffered mode is slower due to gpu<->cpu movement if buffered_fwd: # TODO tweak this number for masive MoE - logger.info(f"Experimental: enabling fwd buffered mode for: `{name}`") + log.info(f"Experimental: enabling fwd buffered mode for: `{name}`") tmp.fwd_inputs_buffered = True tmp.quantizer.configure( @@ -1016,7 +1040,7 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor): stat["dynamic"] = self.quantize_config.dynamic_get(layer_name=layer_name) self.quant_log.append(stat) - logger.info(stat) + log.info(stat) quantizers[layer_name] = ( gptq[name].quantizer.to(CPU), @@ -1079,9 +1103,9 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor): if auto_gc: torch_empty_cache() - logger.info(f"Quantization summary:\n{self.quant_log}") + log.info(f"Quantization summary:\n{self.quant_log}") for module_log in self.quant_log: - logger.info(module_log) + log.info(module_log) if task is not None: x = list(range(layer_count)) gpu_fig = create_plotly(x=x, y=gpu_memorys, xaxis_title="layer", yaxis_title="GPU usage (GB)") @@ -1153,7 +1177,7 @@ def push_to_hub(self, exists_ok: bool = False, # set to true if repo already exists token: Optional[str] = None): - logger.error("`push_to_hub()` api cannot be used on the model instance. Please use `GPTQModel.push_to_hub()` static api instead.") + log.error("`push_to_hub()` api cannot be used on the model instance. 
Please use `GPTQModel.push_to_hub()` static api instead.") def save( self, @@ -1199,31 +1223,31 @@ def kernels(self) -> List[Type[BaseQuantLinear]]: return list(loaded_kernels) def compile(self, backend: str = "inductor", mode: str = None, fullgraph: bool = False): - logger.warn("Deprecation: `model.compile()` is deprecated. Please use `model.optimize()` instead.") + log.warn("Deprecation: `model.compile()` is deprecated. Please use `model.optimize()` instead.") return self.optimize(backend=backend, mode=mode, fullgraph=fullgraph) def optimize(self, backend: str = "inductor", mode: str = None, fullgraph: bool = False): if not self.quantized: - logger.warning("model is not quantized, skip compiling...") + log.warn("model is not quantized, skip compiling...") return self if Version(torch.__version__) < PYTORCH_MIN_VERSION_WITH_COMPILE: self.compiled = False - logger.warning(f"To use compile(), you need to have torch version >= {TORCH_MIN_VERSION_STR}, please " + log.warn(f"To use compile(), you need to have torch version >= {TORCH_MIN_VERSION_STR}, please " f"upgrade it by `pip install torch -U`") return self # needed by eora # torch._dynamo.config.capture_scalar_outputs = True - logger.info(f"Compiling qlinear modules with backend: `{backend}`, mode: `{mode}`") + log.info(f"Compiling qlinear modules with backend: `{backend}`, mode: `{mode}`") modules = find_modules(self.model, layers=[BaseQuantLinear]) for name in modules.keys(): modules[name].optimize(fullgraph=False, backend=backend, mode=mode) # supress errors until PyTorch fixed: https://github.com/pytorch/pytorch/issues/132635 # torch._dynamo.config.suppress_errors = True - logger.info(f"Compiling model with backend: `{backend}`, mode: `{mode}`") + log.info(f"Compiling model with backend: `{backend}`, mode: `{mode}`") self.model = torch_compile(self.model, fullgraph=fullgraph, backend=backend, mode=mode) diff --git a/gptqmodel/models/definitions/gemma2.py b/gptqmodel/models/definitions/gemma2.py index c7b983402..c528e688c 100644 --- a/gptqmodel/models/definitions/gemma2.py +++ b/gptqmodel/models/definitions/gemma2.py @@ -18,7 +18,8 @@ from ...utils.logger import setup_logger from ..base import BaseGPTQModel -logger = setup_logger() + +log = setup_logger() SUPPORT_ERR = "Currently, only vLLM/SGLang with flashinfer enabled can correctly inference a quantized Gemma2-27B model. Pre-quantized model with sample vLLM code: https://huggingface.co/ModelCloud/gemma-2-27b-it-gptq-4bit ." @@ -44,7 +45,7 @@ def __init__(self, *args, **kwargs): # The gemma-2 model 9b has 42 hidden layers, while the gemma-2 model 27b has 46 hidden layers. if num_hidden_layers > 42: if not self.quantized: - logger.warning(SUPPORT_ERR) + log.warn(SUPPORT_ERR) return # quantized gemma-2 27b model only support vLLM/SGLang load. 
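The recurring change across these hunks is the rename of the module-level `logger` handle to `log`, together with the move from `logger.warning(...)` / `logger.warning_once(...)` to `log.warn(...)` / `log.warn.once(...)`. The sketch below is illustrative only and is not part of the patch: it collects the call pattern the diff converges on, assuming `setup_logger()` hands back the same shared LogBar instance that the example scripts obtain directly via `LogBar.shared()`. Every call used here appears somewhere in the hunks; nothing further about the LogBar API is implied.

from logbar import LogBar

# Module-level handle, now named `log` instead of `logger` (assumed equivalent to
# gptqmodel.utils.logger.setup_logger() for the purpose of this sketch).
log = LogBar.shared()

log.info("Kernel: loaded")
log.warn("Model: can't get model's sequence length from model config, will set to 4096.")
log.info.once("Optimize: compilation triggered.")    # de-duplicated variant replacing logger.warning_once(...)
log.warn.once("Kernel: Marlin FP16 mode is activated with reduced accuracy.")

# Progress bars hang off the same handle (pattern mirrored from the mlx.py hunk below).
pb = log.pb(["layer.0", "layer.1", "layer.2"]).title("Format: Converting to mlx ->").manual()
for name in pb:
    pb.subtitle(name).draw()  # manual mode: the loop body updates and redraws the bar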
diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 6742d267b..6fc430aa5 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -23,6 +23,7 @@ import torch import transformers + if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']: try: from modelscope import snapshot_download @@ -47,12 +48,23 @@ from ..utils.importer import auto_select_device, normalize_device_device_map, select_quant_linear from ..utils.logger import setup_logger from ..utils.marlin import _validate_marlin_compatibility, _validate_marlin_device_support -from ..utils.model import (auto_dtype, convert_gptq_v1_to_v2_format, find_modules, get_checkpoints, - get_moe_layer_modules, gptqmodel_post_init, load_checkpoint_in_model_then_tie_weights, - make_quant, simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes) +from ..utils.model import ( + auto_dtype, + convert_gptq_v1_to_v2_format, + find_modules, + get_checkpoints, + get_moe_layer_modules, + gptqmodel_post_init, + load_checkpoint_in_model_then_tie_weights, + make_quant, + simple_dispatch_model, + verify_model_hash, + verify_sharded_model_hashes, +) from ._const import DEVICE, SUPPORTED_MODELS, normalize_device -logger = setup_logger() + +log = setup_logger() ATTN_IMPLEMENTATION = "attn_implementation" USE_FLASH_ATTENTION_2 = "use_flash_attention_2" @@ -191,7 +203,7 @@ def skip(*args, **kwargs): model.seqlen = model_config[key] break else: - logger.warning("Model: can't get model's sequence length from model config, will set to 4096.") + log.warn("Model: can't get model's sequence length from model config, will set to 4096.") model.seqlen = 4096 model.eval() @@ -395,7 +407,7 @@ def from_quantized( verfieid = verify_model_hash(model_save_name, verify_hash) if not verfieid: raise ValueError(f"Hash verification failed for {model_save_name}") - logger.info(f"Hash verification succeeded for {model_save_name}") + log.info(f"Hash verification succeeded for {model_save_name}") # == step2: convert model to gptq-model (replace Linear with QuantLinear) == # def skip(*args, **kwargs): @@ -432,7 +444,7 @@ def skip(*args, **kwargs): elif is_flash_attn_2_available() and not has_attn_implementation: args = {USE_FLASH_ATTENTION_2: True} - logger.info("Optimize: Auto enabling flash attention2") + log.info("Optimize: Auto enabling flash attention2") model = cls.loader.from_config( config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, **args @@ -457,7 +469,7 @@ def skip(*args, **kwargs): ): # log non-lm-head quantized modules only if name is not cls.lm_head: - logger.info(f"The layer {name} is not quantized.") + log.info(f"The layer {name} is not quantized.") del modules[name] preload_qlinear_kernel = make_quant( @@ -571,7 +583,7 @@ def skip(*args, **kwargs): model.seqlen = model_config[key] break else: - logger.warning("can't get model's sequence length from model config, will set to 4096.") + log.warn("can't get model's sequence length from model config, will set to 4096.") model.seqlen = 4096 # Any post-initialization that require device information, for example buffers initialization on device. 
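The other mechanical change running through the diff is import layout: standard-library imports first, third-party imports (`datasets`, `torch`, `transformers`) next, first-party `gptqmodel` imports last, long parenthesized import lists split one name per line with a trailing comma, and two blank lines between the import block and the first statement. A small illustration of that target layout follows; the module shown is hypothetical, and the specific names are taken from imports that already appear elsewhere in this diff, purely to demonstrate the grouping.

import os

import torch
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.utils.logger import setup_logger
from gptqmodel.utils.model import (
    convert_gptq_v2_to_v1_format,
    find_modules,
    make_quant,
)


log = setup_logger()  # two blank lines separate the import block from module-level code

Keeping first-party imports last makes each file's local dependencies easy to scan, and the layout matches what isort-style tooling produces when `gptqmodel` is declared as the first-party package; the exact formatter and configuration are not shown in this diff.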
diff --git a/gptqmodel/models/writer.py b/gptqmodel/models/writer.py index 7100812d8..392859ade 100644 --- a/gptqmodel/models/writer.py +++ b/gptqmodel/models/writer.py @@ -37,19 +37,37 @@ from ..adapter.adapter import HF_ADAPTER_FILE_NAME, HF_ADAPTER_WEIGHT_KEY_PREFIX, Lora from ..adapter.peft import LoraConfig -from ..quantization.config import (FORMAT, META_FIELD_DAMP_AUTO_INCREMENT, META_FIELD_DAMP_PERCENT, META_FIELD_MSE, - META_FIELD_QUANTIZER, META_FIELD_STATIC_GROUPS, META_FIELD_TRUE_SEQUENTIAL, - META_FIELD_URI, META_QUANTIZER_GPTQMODEL, META_VALUE_URI, MIN_VERSION_WITH_V2) +from ..quantization.config import ( + FORMAT, + META_FIELD_DAMP_AUTO_INCREMENT, + META_FIELD_DAMP_PERCENT, + META_FIELD_MSE, + META_FIELD_QUANTIZER, + META_FIELD_STATIC_GROUPS, + META_FIELD_TRUE_SEQUENTIAL, + META_FIELD_URI, + META_QUANTIZER_GPTQMODEL, + META_VALUE_URI, + MIN_VERSION_WITH_V2, +) from ..utils.backend import BACKEND from ..utils.logger import setup_logger -from ..utils.model import (convert_gptq_v2_to_v1_format, copy_py_files, find_modules, - get_model_files_size, get_moe_layer_modules, get_state_dict_for_save, - load_checkpoint_in_model_then_tie_weights, make_quant) +from ..utils.model import ( + convert_gptq_v2_to_v1_format, + copy_py_files, + find_modules, + get_model_files_size, + get_moe_layer_modules, + get_state_dict_for_save, + load_checkpoint_in_model_then_tie_weights, + make_quant, +) from ..utils.torch import torch_empty_cache from ..version import __version__ from ._const import CPU, DEFAULT_MAX_SHARD_SIZE -logger = setup_logger() + +log = setup_logger() PROCESS_LOG_NAME = "process" PROCESS_LOG_LAYER = "layer" @@ -67,7 +85,7 @@ def save_pretrained( save_dir: str, **kwargs, ): - logger.warning("You are using save_pretrained, which will re-direct to save_quantized.") + log.warn("You are using save_pretrained, which will re-direct to save_quantized.") self.save_quantized(save_dir=save_dir, **kwargs) cls.save_pretrained = save_pretrained @@ -96,7 +114,7 @@ def _eora_save(self, save_dir: str, model_save_dir: str): for lora_key, lora_weight in d.items(): assert isinstance(lora_weight, torch.Tensor) weights[f"{key}.{lora_key}"] = lora_weight - logger.info(f"Adapter: EoRA weights found -> `{key}.{lora_key}`, rank = `{lora_rank}`") + log.info(f"Adapter: EoRA weights found -> `{key}.{lora_key}`, rank = `{lora_rank}`") weight_file_path = f"{save_dir.removesuffix('/')}/{HF_ADAPTER_FILE_NAME}" @@ -112,7 +130,7 @@ def _eora_save(self, save_dir: str, model_save_dir: str): rank_pattern=rank_pattern) lora_cfg.save_pretrained(save_dir=save_dir) - logger.info(f"Adapter: Saving EoRA weights to -> `{save_dir}`") + log.info(f"Adapter: Saving EoRA weights to -> `{save_dir}`") os.makedirs(os.path.dirname(save_dir), exist_ok=True) save_file(tensors=weights, filename=weight_file_path, metadata={"format": "pt"}) @@ -146,7 +164,7 @@ def save_quantized( if len(meta_quantizer.split(":")) == 2: quantizers.append(meta_quantizer.replace(" ","")) else: - logger.warning(f"meta_quantizer: '{meta_quantizer}' format is invalid, expected: 'quantizer_name:version'") + log.warn(f"meta_quantizer: '{meta_quantizer}' format is invalid, expected: 'quantizer_name:version'") # write gptqmodel tooling fingerprint to config self.quantize_config.meta_set_versionable( @@ -192,7 +210,7 @@ def save_quantized( raise ValueError("Save aborted as model is not quantized. 
Please call `quantize()` first.") if quantize_config.format == FORMAT.GPTQ_V2: - logger.warning( + log.warn( f"Using 'format = {FORMAT.GPTQ_V2}': the serialized model is only supported by GPTQModel version >= {MIN_VERSION_WITH_V2}." ) @@ -273,7 +291,7 @@ def debug_saved_config(path): model_save_name = model_base_name + ".safetensors" if not self.qlinear_kernel.SUPPORTS_SHARDS and max_shard_size is not None: - logger.warning("Sharding is not supported for this quant. Disabling sharding.") + log.warn("Sharding is not supported for this quant. Disabling sharding.") max_shard_size = None if max_shard_size is None: @@ -282,7 +300,7 @@ def debug_saved_config(path): elif not isinstance(safetensors_metadata, dict): raise TypeError("safetensors_metadata must be a dictionary.") else: - logger.debug(f"Received safetensors_metadata: {safetensors_metadata}") + log.debug(f"Received safetensors_metadata: {safetensors_metadata}") new_safetensors_metadata = {} converted_keys = False for key, value in safetensors_metadata.items(): @@ -296,13 +314,13 @@ def debug_saved_config(path): f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}" ) if new_key in new_safetensors_metadata: - logger.warning( + log.warn( f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting." ) new_safetensors_metadata[new_key] = new_value safetensors_metadata = new_safetensors_metadata if converted_keys: - logger.debug( + log.debug( f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}" ) @@ -342,7 +360,7 @@ def debug_saved_config(path): elif not isinstance(safetensors_metadata, dict): raise TypeError("safetensors_metadata must be a dictionary.") else: - logger.debug(f"Received safetensors_metadata: {safetensors_metadata}") + log.debug(f"Received safetensors_metadata: {safetensors_metadata}") new_safetensors_metadata = {} converted_keys = False for key, value in safetensors_metadata.items(): @@ -355,12 +373,12 @@ def debug_saved_config(path): raise TypeError( f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}") if new_key in new_safetensors_metadata: - logger.warning( + log.warn( f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting.") new_safetensors_metadata[new_key] = new_value safetensors_metadata = new_safetensors_metadata if converted_keys: - logger.debug( + log.debug( f"One or more safetensors_metadata keys or values had to be converted to str(). 
Final safetensors_metadata: {safetensors_metadata}") # Format is required to enable Accelerate to load the metadata @@ -394,9 +412,9 @@ def debug_saved_config(path): size_diff_mb = pre_quantized_size_mb - total_size_mb size_diff_gb = size_diff_mb / 1024 percent_diff = (size_diff_mb / pre_quantized_size_mb) * 100 - logger.info(f"Pre-Quantized model size: {pre_quantized_size_mb:.2f}MB, {pre_quantized_size_gb:.2f}GB") - logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB") - logger.info(f"Size difference: {size_diff_mb:.2f}MB, {size_diff_gb:.2f}GB - {percent_diff:.2f}%") + log.info(f"Pre-Quantized model size: {pre_quantized_size_mb:.2f}MB, {pre_quantized_size_gb:.2f}GB") + log.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB") + log.info(f"Size difference: {size_diff_mb:.2f}MB, {size_diff_gb:.2f}GB - {percent_diff:.2f}%") # need to copy .py files for model/tokenizers not yet merged to HF transformers if self.trust_remote_code: @@ -457,7 +475,7 @@ def skip(*args, **kwargs): ): # log non-lm-head quantizerd modules only if name is not self.lm_head: - logger.info(f"The layer {name} is not quantized.") + log.info(f"The layer {name} is not quantized.") del modules[name] make_quant( diff --git a/gptqmodel/nn_modules/qlinear/__init__.py b/gptqmodel/nn_modules/qlinear/__init__.py index 6705e8594..60778766f 100644 --- a/gptqmodel/nn_modules/qlinear/__init__.py +++ b/gptqmodel/nn_modules/qlinear/__init__.py @@ -28,7 +28,8 @@ from ...utils.backend import BACKEND from ...utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() class BaseQuantLinear(nn.Module): SUPPORTS_BITS: List[int] = None @@ -346,7 +347,7 @@ def validate_device(cls, device: DEVICE): # override me, to perform any torch.compile logic on the kernel pre forward def optimize(self, backend: str = "inductor", mode: str = None, fullgraph: bool = False): self.optimized = True - logger.info.once(f"Optimize: `{self.__class__.__name__}` compilation triggered.") + log.info.once(f"Optimize: `{self.__class__.__name__}` compilation triggered.") pass class PackableQuantLinear(BaseQuantLinear): diff --git a/gptqmodel/nn_modules/qlinear/bitblas.py b/gptqmodel/nn_modules/qlinear/bitblas.py index 18aeef7b3..f924c36ca 100644 --- a/gptqmodel/nn_modules/qlinear/bitblas.py +++ b/gptqmodel/nn_modules/qlinear/bitblas.py @@ -30,7 +30,8 @@ from ...utils import BACKEND from ...utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() BITBLAS_TARGET = None BITBLAS_DATABASE_PATH = None @@ -253,13 +254,13 @@ def _get_or_create_bitblas_operator(self, config, enable_tuning): global_operator_cache.save_into_database( BITBLAS_DATABASE_PATH, BITBLAS_TARGET ) - logger.info( + log.info( "BitBLAS Tuning done, appended operator to global_operator_cache." ) else: - logger.info("BitBLAS Operator created.") + log.info("BitBLAS Operator created.") else: - logger.info("BitBLAS Operator found in global_operator_cache.") + log.info("BitBLAS Operator found in global_operator_cache.") return bitblas_matmul def reset_parameters(self): diff --git a/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py b/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py index 2f689846e..f3de0dd8c 100644 --- a/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py +++ b/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py @@ -23,7 +23,8 @@ from ...utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() TARGET_MISSING_ERROR = ( "TVM target not found. 
Please set the TVM target environment variable using `export TVM_TARGET=`, " @@ -44,10 +45,10 @@ def check_target(best, default): if check_target(best_match, "cuda") == best_match: match = best_match if score >= MATCH_THRESHOLD else "cuda" - logger.info(f"found best match: {match}") + log.info(f"found best match: {match}") return match else: - logger.warning(TARGET_MISSING_ERROR) + log.warn(TARGET_MISSING_ERROR) return "cuda" diff --git a/gptqmodel/nn_modules/qlinear/dynamic_cuda.py b/gptqmodel/nn_modules/qlinear/dynamic_cuda.py index ec7bb1166..e909159dc 100644 --- a/gptqmodel/nn_modules/qlinear/dynamic_cuda.py +++ b/gptqmodel/nn_modules/qlinear/dynamic_cuda.py @@ -24,7 +24,8 @@ from ...utils.backend import BACKEND from ...utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() gptqmodel_cuda_import_exception = None diff --git a/gptqmodel/nn_modules/qlinear/exllama.py b/gptqmodel/nn_modules/qlinear/exllama.py index ee4beb18f..29e446dfe 100644 --- a/gptqmodel/nn_modules/qlinear/exllama.py +++ b/gptqmodel/nn_modules/qlinear/exllama.py @@ -26,6 +26,7 @@ from ...nn_modules.qlinear import PackableQuantLinear from ...utils.backend import BACKEND + exllama_import_exception = None try: from gptqmodel_exllama_kernels import make_q4, q4_matmul @@ -169,7 +170,7 @@ def forward(self, x: torch.Tensor): x_dtype = x.dtype if x_dtype != torch.float16: - logger.warning_once( + logger.warn.once( f"Exllama kernel requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model." ) diff --git a/gptqmodel/nn_modules/qlinear/exllama_eora.py b/gptqmodel/nn_modules/qlinear/exllama_eora.py index e957df188..6c084cafd 100644 --- a/gptqmodel/nn_modules/qlinear/exllama_eora.py +++ b/gptqmodel/nn_modules/qlinear/exllama_eora.py @@ -25,6 +25,7 @@ from ...nn_modules.qlinear import BaseQuantLinear from ...utils.logger import setup_logger + exllama_v2v_import_exception = None try: diff --git a/gptqmodel/nn_modules/qlinear/exllamav2.py b/gptqmodel/nn_modules/qlinear/exllamav2.py index d08d2b266..788fb372a 100644 --- a/gptqmodel/nn_modules/qlinear/exllamav2.py +++ b/gptqmodel/nn_modules/qlinear/exllamav2.py @@ -26,13 +26,14 @@ from ...utils.backend import BACKEND from ...utils.logger import setup_logger + exllama_v2_import_exception = None try: from gptqmodel_exllamav2_kernels import gemm_half_q_half, make_q_matrix except ImportError as e: exllama_v2_import_exception = e -logger = setup_logger() +log = setup_logger() @@ -225,7 +226,7 @@ def forward(self, x: torch.Tensor, force_cuda=False): x_dtype = x.dtype if x_dtype != torch.float16: - logger.warning_once( + log.warn.once( f"Exllama v2 kernel requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model." ) diff --git a/gptqmodel/nn_modules/qlinear/ipex.py b/gptqmodel/nn_modules/qlinear/ipex.py index 870f89639..390dee27c 100644 --- a/gptqmodel/nn_modules/qlinear/ipex.py +++ b/gptqmodel/nn_modules/qlinear/ipex.py @@ -25,7 +25,8 @@ from ...utils.torch import torch_compile from . 
import PackableQuantLinear -logger = setup_logger() + +log = setup_logger() BITS_DTYPE_MAPPING = { 4: "int4_clip", diff --git a/gptqmodel/nn_modules/qlinear/marlin.py b/gptqmodel/nn_modules/qlinear/marlin.py index 69e084b35..142170be8 100644 --- a/gptqmodel/nn_modules/qlinear/marlin.py +++ b/gptqmodel/nn_modules/qlinear/marlin.py @@ -30,13 +30,14 @@ from ...utils.logger import setup_logger from ...utils.rocm import IS_ROCM + marlin_import_exception = None try: import gptqmodel_marlin_kernels except ImportError as e: marlin_import_exception = e -logger = setup_logger() +log = setup_logger() GPTQ_MARLIN_TILE = 16 GPTQ_MARLIN_MIN_THREAD_N = 64 @@ -225,7 +226,7 @@ def __init__( self.fp32 = True if self.backend in [BACKEND.MARLIN, BACKEND.AUTO] else False if not self.fp32: - logger.warn.once("Kernel: Marlin FP16 mode is activated with reduced accuracy. Use default Marlin model for improved inference quality.") + log.warn.once("Kernel: Marlin FP16 mode is activated with reduced accuracy. Use default Marlin model for improved inference quality.") # Determine sharding if marlin_repeat_scales_on_all_ranks(desc_act, diff --git a/gptqmodel/nn_modules/qlinear/torch.py b/gptqmodel/nn_modules/qlinear/torch.py index 255e18fa8..3bcfea203 100644 --- a/gptqmodel/nn_modules/qlinear/torch.py +++ b/gptqmodel/nn_modules/qlinear/torch.py @@ -26,7 +26,8 @@ from ...utils.logger import setup_logger from ...utils.torch import torch_compile -logger = setup_logger() + +log = setup_logger() class TorchQuantLinear(PackableQuantLinear): SUPPORTS_BITS = [2, 3, 4, 8] diff --git a/gptqmodel/nn_modules/qlinear/tritonv2.py b/gptqmodel/nn_modules/qlinear/tritonv2.py index 4484c6e6c..bc75029f6 100644 --- a/gptqmodel/nn_modules/qlinear/tritonv2.py +++ b/gptqmodel/nn_modules/qlinear/tritonv2.py @@ -25,6 +25,7 @@ from ...utils.logger import setup_logger from . import PackableQuantLinear + try: import triton import triton.language as tl @@ -43,7 +44,7 @@ class TritonModuleMixin: TRITON_INSTALL_HINT = "Trying to use the triton backend, but it could not be imported. Please install triton by 'pip install gptqmodel[triton] --no-build-isolation'" TRITON_XPU_INSTALL_HINT = "Trying to use the triton backend and xpu device, but it could not be imported. Please install triton by [intel-xpu-backend-for-triton](https://github.com/intel/intel-xpu-backend-for-triton)" -logger = setup_logger() +log = setup_logger() class TritonV2QuantLinear(PackableQuantLinear, TritonModuleMixin): diff --git a/gptqmodel/nn_modules/triton_utils/custom_autotune.py b/gptqmodel/nn_modules/triton_utils/custom_autotune.py index 72a9eedbe..9bce135cc 100644 --- a/gptqmodel/nn_modules/triton_utils/custom_autotune.py +++ b/gptqmodel/nn_modules/triton_utils/custom_autotune.py @@ -21,6 +21,7 @@ import triton + # code based https://github.com/fpgaminer/GPTQ-triton """ Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100. diff --git a/gptqmodel/nn_modules/triton_utils/kernels.py b/gptqmodel/nn_modules/triton_utils/kernels.py index 27ebfdffd..9d59fbb9a 100644 --- a/gptqmodel/nn_modules/triton_utils/kernels.py +++ b/gptqmodel/nn_modules/triton_utils/kernels.py @@ -22,7 +22,8 @@ from ...utils.logger import setup_logger from . 
import custom_autotune -logger = setup_logger() + +log = setup_logger() # code based https://github.com/fpgaminer/GPTQ-triton diff --git a/gptqmodel/quantization/__init__.py b/gptqmodel/quantization/__init__.py index ba5c5d889..7386923aa 100644 --- a/gptqmodel/quantization/__init__.py +++ b/gptqmodel/quantization/__init__.py @@ -14,7 +14,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_JSON, QUANT_CONFIG_FILENAME, - QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) +from .config import ( + FORMAT, + FORMAT_FIELD_CODE, + FORMAT_FIELD_JSON, + QUANT_CONFIG_FILENAME, + QUANT_METHOD, + QUANT_METHOD_FIELD, + BaseQuantizeConfig, + QuantizeConfig, +) from .gptq import GPTQ from .quantizer import Quantizer, quantize diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py index f2b9734e2..b15d5aa0f 100644 --- a/gptqmodel/quantization/config.py +++ b/gptqmodel/quantization/config.py @@ -29,7 +29,8 @@ from ..adapter.adapter import Lora, normalize_adapter from ..utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() FORMAT_FIELD_CODE = "format" FORMAT_FIELD_JSON = "checkpoint_format" @@ -138,7 +139,7 @@ def dynamic_get(dynamic: Dict[str, Dict[str, Union[int, bool]]], module_name: st if isinstance(sub_value, Dict): return sub_value.get(sub_key, default) else: - logger.info(f"QuantConfig: Dynamic `sub_key`: `{sub_key}` failed extraction from `sub_value`: `{sub_value}`") + log.info(f"QuantConfig: Dynamic `sub_key`: `{sub_key}` failed extraction from `sub_value`: `{sub_value}`") else: return overrides.get(key, default) return default @@ -333,7 +334,7 @@ def save_pretrained(self, save_dir: str, **kwargs): with open(join(save_dir, QUANT_CONFIG_FILENAME), "w", encoding="utf-8") as f: d = self.to_dict() json_str = json.dumps(d, indent=2) - logger.info(f"Saved Quantize Config: \n{json_str}") + log.info(f"Saved Quantize Config: \n{json_str}") f.write(json_str) @classmethod @@ -386,17 +387,17 @@ def from_quant_config(cls, quantize_cfg, format: str = None): elif key in field_names: normalized[key] = val else: - logger.info(f"QuantizeConfig: Ignoring unknown parameter in the quantization configuration: {key}.") + log.info(f"QuantizeConfig: Ignoring unknown parameter in the quantization configuration: {key}.") if format_auto_inferred: - logger.info(f"QuantizeConfig: `{FORMAT_FIELD_JSON}` is missing from the quantization configuration and is automatically inferred to {normalized[FORMAT_FIELD_CODE]}") + log.info(f"QuantizeConfig: `{FORMAT_FIELD_JSON}` is missing from the quantization configuration and is automatically inferred to {normalized[FORMAT_FIELD_CODE]}") if normalized[FORMAT_FIELD_CODE] in {FORMAT.BITBLAS}: # AWQ and Marlin do not reorder the rows. normalized["desc_act"] = False if "sym" not in normalized: - logger.warning( + log.warn( "QuantizeConfig: config does not contain `sym` (symmetric quantization). This may result in silent errors. Defaulting to `sym=True`." 
) @@ -483,7 +484,7 @@ def calculate_bits_per_weight(self): else: # there is only one scale int32 + one qzero int32 per entire module so overall it contributes to close to 0 bpw bpw = self.bits - logger.info(f"Estimated Quantization BPW (bits per weight): {bpw} bpw, based on [bits: {self.bits}, group_size: {self.group_size}]") + log.info(f"Estimated Quantization BPW (bits per weight): {bpw} bpw, based on [bits: {self.bits}, group_size: {self.group_size}]") @dataclass class AutoRoundQuantizeConfig(QuantizeConfig): @@ -549,4 +550,4 @@ def to_dict(self): class BaseQuantizeConfig(QuantizeConfig): def __init__(self, **kwargs): super().__init__(**kwargs) - logger.warning("QuantizeConfig: BaseQuantizeConfig is re-named and pending deprecation. Please use `QuantizeConfig` instead.") + log.warn("QuantizeConfig: BaseQuantizeConfig is re-named and pending deprecation. Please use `QuantizeConfig` instead.") diff --git a/gptqmodel/quantization/gptq.py b/gptqmodel/quantization/gptq.py index 45fe11dff..d4e0941f2 100644 --- a/gptqmodel/quantization/gptq.py +++ b/gptqmodel/quantization/gptq.py @@ -32,7 +32,8 @@ from ..utils.torch import torch_sync from .quantizer import HF_OPTIMUM, Quantizer -logger = setup_logger() + +log = setup_logger() torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False @@ -229,10 +230,10 @@ def quantize( break except torch._C._LinAlgError as e: if self.qcfg.damp_auto_increment != 0: - logger.warning(f"Quantization: Current `damp_percent = {damp_percent:.5f}` is too low, auto-incrementing by `{ self.qcfg.damp_auto_increment:.5f}`") + log.warn(f"Quantization: Current `damp_percent = {damp_percent:.5f}` is too low, auto-incrementing by `{ self.qcfg.damp_auto_increment:.5f}`") damp_percent += self.qcfg.damp_auto_increment else: - logger.warning("Quantization: Please increase damp or nsamples for calibration data to avoid the following quant error: current damp_percent=`{damp_percent:.5f}`") + log.warn("Quantization: Please increase damp or nsamples for calibration data to avoid the following quant error: current damp_percent=`{damp_percent:.5f}`") raise e if not (0 < damp_percent < 1): diff --git a/gptqmodel/quantization/quantizer.py b/gptqmodel/quantization/quantizer.py index 985870a1b..993228278 100644 --- a/gptqmodel/quantization/quantizer.py +++ b/gptqmodel/quantization/quantizer.py @@ -22,7 +22,8 @@ from ..quantization import QuantizeConfig from ..utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() HF_OPTIMUM = "hf_optimum" diff --git a/gptqmodel/utils/bitblas.py b/gptqmodel/utils/bitblas.py index c8250838f..10eeef894 100644 --- a/gptqmodel/utils/bitblas.py +++ b/gptqmodel/utils/bitblas.py @@ -25,7 +25,8 @@ from .model import load_checkpoint_in_model_then_tie_weights from .torch import torch_empty_cache -logger = setup_logger() + +log = setup_logger() def prepare_model_for_bitblas_load( model, @@ -41,7 +42,7 @@ def prepare_model_for_bitblas_load( # The model (e.g. model.safetensors) is already serialized in the BitBLAS format, load it directly. if qcfg.format == FORMAT.BITBLAS: # if the checkpoint is already in bitblas format, we can load it directly. 
- logger.info(f"Loading a GPTQ model, detected BitBLAS serialized format at {model_save_name}.") + log.info(f"Loading a GPTQ model, detected BitBLAS serialized format at {model_save_name}.") model = convert_to_bitblas(model, quant_linear_class, qcfg, sym, desc_act, repack=False) load_checkpoint_in_model_then_tie_weights( model, @@ -91,7 +92,7 @@ def convert_to_bitblas(model, model_quantlinear, qcfg: QuantizeConfig, sym: bool # Note that due to tvm compilation of per layer modules shapes, the first layer loop is # relatively much slower if caching is not available. estimate time remaining is highly inaccurate - for name, module in logger.pb(list(model.named_modules())).title(message): + for name, module in log.pb(list(model.named_modules())).title(message): if not isinstance(module, model_quantlinear): continue diff --git a/gptqmodel/utils/hf.py b/gptqmodel/utils/hf.py index 8a7fc4d8e..b2d9c431b 100644 --- a/gptqmodel/utils/hf.py +++ b/gptqmodel/utils/hf.py @@ -2,26 +2,27 @@ from ..utils.logger import setup_logger -logger = setup_logger() + +log = setup_logger() # TODO FIXME! Pre-quantized use AutoModelForCausalLM.from_pretrained() but post-quantized use AutoModelForCausalLM.from_config() def autofix_hf_model_config(model: PreTrainedModel, path: str = None): if model.can_generate(): # sync config first if path: - logger.info(f"Model: Loaded `generation_config`: {model.generation_config}") + log.info(f"Model: Loaded `generation_config`: {model.generation_config}") try: cfg = GenerationConfig.from_pretrained(pretrained_model_name=path) if cfg != model.generation_config: model.generation_config = cfg - logger.info( + log.info( "Model: Auto-fixed `generation_config` mismatch between model and `generation_config.json`.") - logger.info(f"Model: Updated `generation_config`: {model.generation_config}") + log.info(f"Model: Updated `generation_config`: {model.generation_config}") else: pass # logger.info(f"Model: loaded `generation_config` matching `generation_config.json`.") except Exception: - logger.info("Model: `generation_config.json` not found. Skipped checking.") + log.info("Model: `generation_config.json` not found. 
Skipped checking.") # print(f"Before autofix_hf_model_config: {model.generation_config}") autofix_hf_generation_config(model.generation_config) @@ -51,5 +52,5 @@ def autofix_hf_generation_config(cfg: GenerationConfig): # fix wrong do_sample if errors > 0: cfg.do_sample = True - logger.info("Model: Auto-Fixed `generation_config` by setting `do_sample=True`.") + log.info("Model: Auto-Fixed `generation_config` by setting `do_sample=True`.") diff --git a/gptqmodel/utils/importer.py b/gptqmodel/utils/importer.py index ea7b7aca6..8db2bacae 100644 --- a/gptqmodel/utils/importer.py +++ b/gptqmodel/utils/importer.py @@ -19,6 +19,7 @@ from typing import Dict, List, Optional, Type, Union import torch + from gptqmodel.adapter.adapter import Adapter from ..models._const import DEVICE, normalize_device @@ -38,8 +39,9 @@ from .rocm import IS_ROCM from .torch import HAS_CUDA, HAS_MPS, HAS_XPU + message_logged = False -logger = setup_logger() +log = setup_logger() AUTO_SELECT_BACKEND_ORDER = OrderedDict({ BACKEND.MARLIN: MarlinQuantLinear, # optimized for bs > 1 @@ -197,7 +199,7 @@ def select_quant_linear( adapter=adapter, ) if os.environ.get("DEBUG") and not validate: - logger.info(f"skip {k} for {str(err)}") + log.info(f"skip {k} for {str(err)}") if validate: if pack: check_pack_func = issubclass(cls, PackableQuantLinear) @@ -205,7 +207,7 @@ def select_quant_linear( #if not message_logged: # logger.info(f"Auto pick kernel based on compatibility: {cls}") # message_logged = True - logger.info(f"Kernel: Auto-selection: adding candidate `{cls.__name__}`") + log.info(f"Kernel: Auto-selection: adding candidate `{cls.__name__}`") validated_qlinears.append(cls) if not multi_select: return cls @@ -213,7 +215,7 @@ def select_quant_linear( #if not message_logged: # logger.info(f"Auto pick kernel based on compatibility: {cls}") # message_logged = True - logger.info(f"Kernel: Auto-selection: adding candidate `{cls.__name__}`") + log.info(f"Kernel: Auto-selection: adding candidate `{cls.__name__}`") validated_qlinears.append(cls) if not multi_select: return cls @@ -249,7 +251,7 @@ def select_quant_linear( cpu_vendor = Device("cpu").vendor if cpu_vendor != "intel": - logger.warning(f"Kernel: IPEX on cpu is only validated and optimized for Intel cpu with AVX512, AMX, or XMX. Current cpu vendor: `{cpu_vendor}`.") + log.warn(f"Kernel: IPEX on cpu is only validated and optimized for Intel cpu with AVX512, AMX, or XMX. 
Current cpu vendor: `{cpu_vendor}`.") qlinear = IPEXQuantLinear elif backend == BACKEND.TORCH: diff --git a/gptqmodel/utils/marlin.py b/gptqmodel/utils/marlin.py index cd1dc3c4c..4fbdfdf57 100644 --- a/gptqmodel/utils/marlin.py +++ b/gptqmodel/utils/marlin.py @@ -21,7 +21,8 @@ from ..utils.logger import setup_logger from .rocm import IS_ROCM -logger = setup_logger() + +log = setup_logger() # Validate marlin support def _validate_marlin_device_support() -> bool: diff --git a/gptqmodel/utils/mlx.py b/gptqmodel/utils/mlx.py index 24541400d..3a82a8aa1 100644 --- a/gptqmodel/utils/mlx.py +++ b/gptqmodel/utils/mlx.py @@ -6,9 +6,10 @@ from ..models import BaseGPTQModel from ..nn_modules.qlinear.torch import TorchQuantLinear from ..quantization import FORMAT, QuantizeConfig -from .logger import setup_logger +from .log import setup_logger from .torch import torch_empty_cache + try: import mlx.core as mx from mlx_lm import generate @@ -17,7 +18,7 @@ except ImportError: MLX_AVAILABLE = False -logger = setup_logger() +log = setup_logger() def convert_gptq_to_mlx_weights(model_id_or_path: str, model: Union[PreTrainedModel, BaseGPTQModel], gptq_config: QuantizeConfig, lm_head_name: str): if not MLX_AVAILABLE: @@ -48,7 +49,7 @@ def convert_gptq_to_mlx_weights(model_id_or_path: str, model: Union[PreTrainedMo # Convert weights weights = {} n = 1 - pb = logger.pb(model.named_modules()).title("Format: Converting to mlx ->").manual() + pb = log.pb(model.named_modules()).title("Format: Converting to mlx ->").manual() for name, module in pb: pb.subtitle(f"{name}").draw() if isinstance(module, TorchQuantLinear): @@ -85,11 +86,11 @@ def convert_gptq_to_mlx_weights(model_id_or_path: str, model: Union[PreTrainedMo mlx_model = model_class(model_args_class.from_dict(config)) # Load and quantize weights - logger.info("Starting MLX quantization...") + log.info("Starting MLX quantization...") mlx_model.load_weights(list(weights.items())) weights, mlx_config = quantize_model(mlx_model, config, q_group_size=gptq_config["group_size"], q_bits=gptq_config["bits"]) - logger.info("MLX quantization completed") + log.info("MLX quantization completed") return weights, mlx_config diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py index 80137274a..d9861f4de 100644 --- a/gptqmodel/utils/model.py +++ b/gptqmodel/utils/model.py @@ -34,18 +34,25 @@ import torch import torch.nn as nn import transformers -from gptqmodel.nn_modules.qlinear.exllama_eora import ExllamaEoraQuantLinear -from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear from huggingface_hub import HfApi, hf_hub_download from packaging import version from transformers import AutoConfig, PretrainedConfig from transformers.pytorch_utils import id_tensor_storage from transformers.utils.hub import cached_file +from gptqmodel.nn_modules.qlinear.exllama_eora import ExllamaEoraQuantLinear +from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear + from ..adapter.adapter import Adapter from ..looper.named_module import NamedModule -from ..models._const import (CPU, DEVICE, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, - EXPERT_INDEX_PLACEHOLDER, SUPPORTED_MODELS, SUPPORTS_MODULE_TYPES) +from ..models._const import ( + CPU, + DEVICE, + EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, + EXPERT_INDEX_PLACEHOLDER, + SUPPORTED_MODELS, + SUPPORTS_MODULE_TYPES, +) from ..nn_modules.qlinear import BaseQuantLinear from ..nn_modules.qlinear.exllama import ExllamaQuantLinear from ..nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear @@ -54,10 +61,11 @@ from ..quantization.config 
import FORMAT_FIELD_JSON, dynamic_get from .backend import BACKEND from .importer import select_quant_linear -from .logger import setup_logger +from .log import setup_logger from .torch import torch_empty_cache, torch_new_stream_ctx -logger = setup_logger() + +log = setup_logger() def recurse_getattr(obj, attr: str): """ @@ -203,7 +211,7 @@ def make_quant( adapter=extension, ) - logger.info(f"Kernel: candidates -> `[{', '.join(cls.__name__ for cls in quant_linear_candidates)}]`") + log.info(f"Kernel: candidates -> `[{', '.join(cls.__name__ for cls in quant_linear_candidates)}]`") # loop over actual QLinear init, catch errors and use fallbacks if applicable for cls in quant_linear_candidates: @@ -228,10 +236,10 @@ def make_quant( backend=backend, adapter=qcfg.adapter, ) - logger.info(f"Kernel: selected -> `{linear_cls.__name__}`.") + log.info(f"Kernel: selected -> `{linear_cls.__name__}`.") return linear_cls except NotImplementedError as e: - logger.info(f"Kernel: skipped -> `{cls}`.") + log.info(f"Kernel: skipped -> `{cls}`.") # only fallback to other quant linears when backend is auto. if backend not in [BACKEND.AUTO, BACKEND.AUTO_TRAINABLE]: @@ -373,7 +381,7 @@ def convert_gptq_v1_to_v2_format( # Limit thread usage to avoid auto-parallizataion regression with tctl.threadpool_limits(limits=1): t = time.time() - logger.info( + log.info( f"Format: Converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to internal `{FORMAT.GPTQ_V2}`.") for _, submodule in model.named_modules(): @@ -454,7 +462,7 @@ def convert_gptq_v1_to_v2_format( else: raise NotImplementedError("Only 2,3,4,8 bits are supported.") - logger.info(f"Format: Conversion complete: {time.time() - t}s") + log.info(f"Format: Conversion complete: {time.time() - t}s") return model @@ -556,7 +564,7 @@ def pack_model( model.to(CPU) - logger.info("Packing model...") + log.info("Packing model...") modules = find_modules(model) @@ -582,7 +590,7 @@ def pack_model( max_workers = 1 with ThreadPoolExecutor(max_workers=max_workers) as executor: - with logger.pb(names).manual() as pb: + with log.pb(names).manual() as pb: def wrapper(name): # TODO FIX, thread pool executor does not advance iterator pb.next() @@ -592,7 +600,7 @@ def wrapper(name): for _ in executor.map(wrapper, names): pass - logger.info("Model packed.") + log.info("Model packed.") return quant_linear_cls @@ -623,7 +631,7 @@ def verify_sharded_model_hashes(jsonPath: str, verify_hash: List[str]): for shard_file, expected_hash in zip(shard_files, verify_hash): if not verify_model_hash(shard_file, expected_hash): - logger.info(f"Hash verification failed for {shard_file}") + log.info(f"Hash verification failed for {shard_file}") return False return True @@ -721,7 +729,7 @@ def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeCon max_input_len = max_input_length else: if max_input_length is not None: - logger.info( + log.info( "Using exllama backend without act-order, the parameter max_input_length was set although not needed, it will be ignored." ) max_input_len = 1 @@ -1019,7 +1027,7 @@ def get_state_dict_for_save(model: nn.Module) -> Dict: del state_dict[name] warn_names.add(name) if len(warn_names) > 0: - logger.warning_once( + log.warn.once( f"Removed shared tensor {warn_names} while saving. 
This should be OK, but check by verifying that you don't receive any warning while reloading", ) return state_dict diff --git a/gptqmodel/utils/openai_server.py b/gptqmodel/utils/openai_server.py index fa9b52177..dce41b413 100644 --- a/gptqmodel/utils/openai_server.py +++ b/gptqmodel/utils/openai_server.py @@ -20,6 +20,7 @@ import torch + try: import uvicorn from fastapi import FastAPI, HTTPException diff --git a/gptqmodel/utils/perplexity.py b/gptqmodel/utils/perplexity.py index 22de74157..f03d64ef7 100644 --- a/gptqmodel/utils/perplexity.py +++ b/gptqmodel/utils/perplexity.py @@ -21,6 +21,7 @@ from datasets import load_dataset, load_from_disk from logbar import LogBar + logger = LogBar.shared() class Perplexity: diff --git a/gptqmodel/utils/rocm.py b/gptqmodel/utils/rocm.py index 4bef3edbd..93da34dcb 100644 --- a/gptqmodel/utils/rocm.py +++ b/gptqmodel/utils/rocm.py @@ -16,4 +16,5 @@ import torch + IS_ROCM = torch.version.hip is not None diff --git a/gptqmodel/utils/sglang.py b/gptqmodel/utils/sglang.py index 3067994b5..7b655cc86 100644 --- a/gptqmodel/utils/sglang.py +++ b/gptqmodel/utils/sglang.py @@ -19,6 +19,7 @@ import torch from transformers import AutoConfig + try: import sglang as sgl SGLANG_AVAILABLE = True diff --git a/gptqmodel/utils/torch.py b/gptqmodel/utils/torch.py index b725a1c57..2bced525a 100644 --- a/gptqmodel/utils/torch.py +++ b/gptqmodel/utils/torch.py @@ -22,6 +22,7 @@ from ..utils.logger import setup_logger + HAS_CUDA = False HAS_XPU = False HAS_MPS = False @@ -29,7 +30,7 @@ STREAM = None # cache -logger = setup_logger() +log = setup_logger() # reset dynamo cache on each model load since during ci loop model inference may exhuast cache torch._dynamo.reset() @@ -62,7 +63,7 @@ def torch_compile(module: Union[torch.nn.Module, Callable], backend:str ="induct try: return torch.compile(module, backend=backend, mode=mode, fullgraph=fullgraph) except BaseException: - logger.warning(f"Failed to compile `{module}`") + log.warn(f"Failed to compile `{module}`") return module def torch_new_stream(): diff --git a/gptqmodel/utils/vllm.py b/gptqmodel/utils/vllm.py index a2ccc092d..ee41f5f14 100644 --- a/gptqmodel/utils/vllm.py +++ b/gptqmodel/utils/vllm.py @@ -18,6 +18,7 @@ import torch + try: from vllm import LLM, SamplingParams diff --git a/setup.py b/setup.py index 5752a3041..9e70703a9 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ import torch from setuptools import find_packages, setup + try: from setuptools.command.bdist_wheel import bdist_wheel as _bdist_wheel except BaseException: diff --git a/tests/benchmark/benchmark.py b/tests/benchmark/benchmark.py index 5aeb3f276..41f4ee9a5 100644 --- a/tests/benchmark/benchmark.py +++ b/tests/benchmark/benchmark.py @@ -15,9 +15,10 @@ # limitations under the License. 
from benchmark_test import BenchmarkTest -from gptqmodel import BACKEND from parameterized import parameterized # noqa: E402 +from gptqmodel import BACKEND + class TestInference(BenchmarkTest): @parameterized.expand( diff --git a/tests/benchmark/benchmark_test.py b/tests/benchmark/benchmark_test.py index 71c30da5e..cd8ca2bc8 100644 --- a/tests/benchmark/benchmark_test.py +++ b/tests/benchmark/benchmark_test.py @@ -17,13 +17,16 @@ import os import time + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import unittest # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 from logbar import LogBar +from gptqmodel import GPTQModel # noqa: E402 + + logger = LogBar.shared() class BenchmarkTest(unittest.TestCase): diff --git a/tests/inference_speed.py b/tests/inference_speed.py index 1e6d5102c..b5601e052 100644 --- a/tests/inference_speed.py +++ b/tests/inference_speed.py @@ -19,15 +19,18 @@ from gptqmodel.utils.torch import torch_empty_cache + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import unittest -from gptqmodel import GPTQModel from logbar import LogBar from transformers import AutoTokenizer +from gptqmodel import GPTQModel + + logger = LogBar.shared() class InferenceSpeed(unittest.TestCase): diff --git a/tests/models/model_test.py b/tests/models/model_test.py index c7f29b9d0..fb220604f 100644 --- a/tests/models/model_test.py +++ b/tests/models/model_test.py @@ -20,14 +20,17 @@ from typing import Dict, List from device_smi import Device + from gptqmodel.models._const import CUDA_0 + if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch from pathlib import Path # noqa: E402 + sys.path.insert(0, f"{str(Path(__file__).resolve().parent.parent)}/models") # noqa: E402 import contextlib # noqa: E402 import shutil # noqa: E402 @@ -37,6 +40,10 @@ import torch.cuda # noqa: E402 import transformers # noqa: E402 from datasets import load_dataset # noqa: E402 +from ovis.image_to_test_dataset import get_calib_dataset # noqa: E402 +from packaging.version import Version # noqa: E402 +from transformers import AutoProcessor, AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 from gptqmodel.quantization import FORMAT # noqa: E402 @@ -44,9 +51,7 @@ from gptqmodel.utils.eval import EVAL # noqa: E402 from gptqmodel.utils.model import MODALITY # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from ovis.image_to_test_dataset import get_calib_dataset # noqa: E402 -from packaging.version import Version # noqa: E402 -from transformers import AutoProcessor, AutoTokenizer # noqa: E402 + RAND_SEED = 898 diff --git a/tests/models/test_gptbigcode.py b/tests/models/test_gptbigcode.py index 78aa52276..bc465ffbb 100644 --- a/tests/models/test_gptbigcode.py +++ b/tests/models/test_gptbigcode.py @@ -17,6 +17,7 @@ import importlib.util import os + # TODO: find how ipex registered it jit interpreter # if intel_extension_for_pytorch was installed, @torch.jit.script in transformers/models/gpt_bigcode/modeling_gpt_bigcode.py will try to use ipex as torchScript interpreter. 
# However, in quantization, tensor were on gpu, which will throw RuntimeError: itensor_view_from_dense expects CPU tensor input diff --git a/tests/models/test_opt.py b/tests/models/test_opt.py index 3467ffd20..cf0303516 100644 --- a/tests/models/test_opt.py +++ b/tests/models/test_opt.py @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from model_test import ModelTest + from gptqmodel import BACKEND from gptqmodel.utils.importer import AUTO_SELECT_BACKEND_ORDER -from model_test import ModelTest class TestOpt(ModelTest): diff --git a/tests/models/test_qwen2_vl.py b/tests/models/test_qwen2_vl.py index a6b50c1c0..65ecf05c7 100644 --- a/tests/models/test_qwen2_vl.py +++ b/tests/models/test_qwen2_vl.py @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from gptqmodel.models.definitions.qwen2_vl import Qwen2VLGPTQ from model_test import ModelTest +from gptqmodel.models.definitions.qwen2_vl import Qwen2VLGPTQ + class TestQwen2_VL(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen2-VL-2B-Instruct" diff --git a/tests/models/test_qwen_15_moe.py b/tests/models/test_qwen_15_moe.py index a95fc4610..1ff0da2d6 100644 --- a/tests/models/test_qwen_15_moe.py +++ b/tests/models/test_qwen_15_moe.py @@ -1,6 +1,7 @@ import unittest import torch + from gptqmodel import BACKEND, GPTQModel diff --git a/tests/tasks/mmlu/_generate_configs.py b/tests/tasks/mmlu/_generate_configs.py index f613f7cd4..28b94616d 100644 --- a/tests/tasks/mmlu/_generate_configs.py +++ b/tests/tasks/mmlu/_generate_configs.py @@ -9,6 +9,7 @@ import yaml from tqdm import tqdm + eval_logger = logging.getLogger("lm-eval") diff --git a/tests/test_adapter_config.py b/tests/test_adapter_config.py index 6c09017e4..dc635087a 100644 --- a/tests/test_adapter_config.py +++ b/tests/test_adapter_config.py @@ -19,11 +19,13 @@ from gptqmodel import QuantizeConfig from gptqmodel.adapter.adapter import Lora, normalize_adapter + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 + lora = "lora" class TestExtensionConfig(unittest.TestCase): diff --git a/tests/test_asym_gptq_v1.py b/tests/test_asym_gptq_v1.py index b115dfd1f..2c9a2176b 100644 --- a/tests/test_asym_gptq_v1.py +++ b/tests/test_asym_gptq_v1.py @@ -17,11 +17,13 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -from gptqmodel.quantization import FORMAT # noqa: E402 # -- end do not touch from models.model_test import ModelTest # noqa: E402 +from gptqmodel.quantization import FORMAT # noqa: E402 + class Test(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" diff --git a/tests/test_bits.py b/tests/test_bits.py index 097f6ca04..c8ee6c022 100644 --- a/tests/test_bits.py +++ b/tests/test_bits.py @@ -19,6 +19,7 @@ from gptqmodel.nn_modules.qlinear.exllama_eora import ExllamaEoraQuantLinear + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import logging # noqa: E402 @@ -26,6 +27,9 @@ import traceback # noqa: E402 import unittest # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear # noqa: E402 @@ -36,8 +40,7 @@ 
from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.utils.eval import EVAL # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 + logger = logging.getLogger(__name__) diff --git a/tests/test_bits_new.py b/tests/test_bits_new.py index 125169453..818387340 100644 --- a/tests/test_bits_new.py +++ b/tests/test_bits_new.py @@ -16,6 +16,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -23,13 +24,14 @@ from typing import Optional # noqa: E402 from datasets import load_dataset # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from tabulate import tabulate # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.adapter.adapter import Lora # noqa: E402 from gptqmodel.utils.eval import EVAL # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from tabulate import tabulate # noqa: E402 def bench(path: str, backend: BACKEND, adapter: Optional[Lora]): diff --git a/tests/test_dynamic.py b/tests/test_dynamic.py index 3e5874507..537e67b1a 100644 --- a/tests/test_dynamic.py +++ b/tests/test_dynamic.py @@ -19,20 +19,22 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import json # noqa: E402 import tempfile # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from parameterized import parameterized # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.quantization import QuantizeConfig # noqa: E402 from gptqmodel.utils import Perplexity, safetensor # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from parameterized import parameterized # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 class TestDynamic(ModelTest): diff --git a/tests/test_estimate_vram.py b/tests/test_estimate_vram.py index ba9b76343..ca9dd5be7 100644 --- a/tests/test_estimate_vram.py +++ b/tests/test_estimate_vram.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 diff --git a/tests/test_eval.py b/tests/test_eval.py index c4d71ba7b..63bc89f19 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -16,18 +16,22 @@ import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import tempfile # noqa: E402 -from typing import Type # noqa: E402 -from typing import Union # noqa: E402 +from typing import ( + Type, # noqa: E402 + Union, # noqa: E402 +) -from gptqmodel import GPTQModel # noqa: E402 -from gptqmodel.utils.eval import EVAL # noqa: E402 from lm_eval.tasks import TaskManager # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 +from gptqmodel.utils.eval import EVAL # noqa: E402 + class TestEval(ModelTest): 
@classmethod diff --git a/tests/test_evalplus.py b/tests/test_evalplus.py index 13d7251b7..775a41240 100644 --- a/tests/test_evalplus.py +++ b/tests/test_evalplus.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch diff --git a/tests/test_flash_attention.py b/tests/test_flash_attention.py index b56a0eecc..e61cd96f1 100644 --- a/tests/test_flash_attention.py +++ b/tests/test_flash_attention.py @@ -17,14 +17,16 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch -from gptqmodel import GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 + class Test(ModelTest): diff --git a/tests/test_group_size.py b/tests/test_group_size.py index 719866080..47900066d 100644 --- a/tests/test_group_size.py +++ b/tests/test_group_size.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import logging # noqa: E402 @@ -24,6 +25,9 @@ import traceback # noqa: E402 import unittest # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear # noqa: E402 @@ -34,8 +38,7 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.utils.eval import EVAL # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 + logger = logging.getLogger(__name__) diff --git a/tests/test_inference_speed.py b/tests/test_inference_speed.py index ed9955b3f..4b97c85a7 100644 --- a/tests/test_inference_speed.py +++ b/tests/test_inference_speed.py @@ -17,12 +17,15 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -from gptqmodel.utils import BACKEND # noqa: E402 # -- end do not touch from inference_speed import InferenceSpeed # noqa: E402 from parameterized import parameterized # noqa: E402 +from gptqmodel.utils import BACKEND # noqa: E402 + + ''' NATIVE_MODEL_ID = /monster/data/model/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortext-v1 BITBLAS_NATIVE_MODEL_ID = /monster/data/model/opt-125M-autoround-lm_head-false-symTrue diff --git a/tests/test_inference_speed_ipex.py b/tests/test_inference_speed_ipex.py index 08cf088b9..0cd974eb1 100644 --- a/tests/test_inference_speed_ipex.py +++ b/tests/test_inference_speed_ipex.py @@ -17,13 +17,15 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch -from gptqmodel.utils import BACKEND from inference_speed import InferenceSpeed from parameterized import parameterized +from gptqmodel.utils import BACKEND + class TestInferenceSpeedIpex(InferenceSpeed): @parameterized.expand( diff --git a/tests/test_ipex_xpu.py b/tests/test_ipex_xpu.py index 50fb9b85c..ab235fdf6 100644 --- a/tests/test_ipex_xpu.py +++ b/tests/test_ipex_xpu.py @@ -17,14 +17,16 @@ # -- do not touch import os + os.environ["CUDA_VISIBLE_DEVICES"] = "" # -- end do not touch import tempfile # noqa: E402 +from models.model_test import ModelTest # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # 
noqa: E402 from gptqmodel.models._const import DEVICE # noqa: E402 -from models.model_test import ModelTest # noqa: E402 class TestsIPEX(ModelTest): diff --git a/tests/test_kernel_output.py b/tests/test_kernel_output.py index be94531ce..125836ffd 100644 --- a/tests/test_kernel_output.py +++ b/tests/test_kernel_output.py @@ -1,6 +1,10 @@ import unittest import torch +from logbar import LogBar +from parameterized import parameterized +from torch import Tensor + from gptqmodel import BACKEND, GPTQModel from gptqmodel.adapter.adapter import Adapter, AdapterCache, Lora from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear @@ -11,9 +15,7 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear from gptqmodel.utils.model import find_modules -from logbar import LogBar -from parameterized import parameterized -from torch import Tensor + log = LogBar.shared() diff --git a/tests/test_lm_eval.py b/tests/test_lm_eval.py index 1ceaffaf1..99364b919 100644 --- a/tests/test_lm_eval.py +++ b/tests/test_lm_eval.py @@ -17,14 +17,16 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: E402 import unittest # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 + from gptqmodel import BACKEND, GPTQModel from gptqmodel.utils.eval import EVAL # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 class TestLmEval(unittest.TestCase): diff --git a/tests/test_lm_head.py b/tests/test_lm_head.py index c5d39bacf..134362790 100644 --- a/tests/test_lm_head.py +++ b/tests/test_lm_head.py @@ -20,12 +20,14 @@ from datasets import load_dataset + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 -from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 # -- end do not touch from models.model_test import ModelTest # noqa: E402 +from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 +from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 + class TestLmHeadLoad(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse" # "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse" diff --git a/tests/test_lora.py b/tests/test_lora.py index 0e50794fb..4b1727fea 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -16,14 +16,16 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch -from gptqmodel import BACKEND, GPTQModel # noqa: E402 -from gptqmodel.adapter.adapter import Lora # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 +from gptqmodel.adapter.adapter import Lora # noqa: E402 + class Test(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128" diff --git a/tests/test_mlx.py b/tests/test_mlx.py index 32ca4125f..d3fa1137b 100644 --- a/tests/test_mlx.py +++ b/tests/test_mlx.py @@ -1,6 +1,7 @@ import os import sys + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" if sys.platform == "darwin": @@ -8,11 +9,12 @@ import tempfile # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 from mlx_lm import generate, load # noqa: E402 from models.model_test import ModelTest # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import GPTQModel # 
noqa: E402 + class TestExport(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen2.5-0.5B-Instruct/gptq_4bits_01-07_14-18-11_maxlen1024_ns1024_descFalse_damp0.1/" diff --git a/tests/test_mlx_generate.py b/tests/test_mlx_generate.py index f3484bfe1..f8581101b 100644 --- a/tests/test_mlx_generate.py +++ b/tests/test_mlx_generate.py @@ -1,14 +1,17 @@ import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import sys # noqa: E402 + if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + class TestMlxGenerate(ModelTest): @classmethod diff --git a/tests/test_modelscope.py b/tests/test_modelscope.py index 22fcf2663..7214a86b4 100644 --- a/tests/test_modelscope.py +++ b/tests/test_modelscope.py @@ -1,9 +1,11 @@ import os + os.environ["GPTQMODEL_USE_MODELSCOPE"] = "True" -from gptqmodel import GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 + class TestLoadModelscope(ModelTest): diff --git a/tests/test_openai_server.py b/tests/test_openai_server.py index 4b2e4f8c3..777ed650c 100644 --- a/tests/test_openai_server.py +++ b/tests/test_openai_server.py @@ -18,8 +18,10 @@ import unittest import openai + from gptqmodel import GPTQModel + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" class TestOpeniServer(unittest.TestCase): diff --git a/tests/test_packable.py b/tests/test_packable.py index 53eff32ee..27f593029 100644 --- a/tests/test_packable.py +++ b/tests/test_packable.py @@ -3,6 +3,9 @@ from typing import Dict import torch +from parameterized import parameterized +from safetensors.torch import load_file + from gptqmodel import BACKEND, GPTQModel from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402 @@ -12,8 +15,6 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.utils.model import convert_gptq_v2_to_v1_format, find_modules -from parameterized import parameterized -from safetensors.torch import load_file class TestPackable(unittest.TestCase): diff --git a/tests/test_packing.py b/tests/test_packing.py index 7b08099a4..b8a6970b4 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -17,17 +17,20 @@ # -- do not touch import os +from parameterized import parameterized + from gptqmodel import BACKEND from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear from gptqmodel.nn_modules.qlinear.ipex import IPEXQuantLinear -from parameterized import parameterized + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 + # isort: off import torch # noqa: E402 import torch.nn as nn # noqa: E402 diff --git a/tests/test_packing_speed.py b/tests/test_packing_speed.py index d6e0f699d..11eff2a62 100644 --- a/tests/test_packing_speed.py +++ b/tests/test_packing_speed.py @@ -19,6 +19,7 @@ from gptqmodel import BACKEND + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -28,6 +29,7 @@ import threadpoolctl # noqa: E402 from parameterized import parameterized # noqa: E402 + # isort: off import torch # noqa: E402 import torch.nn as nn # noqa: E402 diff --git 
a/tests/test_parameter_count.py b/tests/test_parameter_count.py index 599c5823a..260ac2541 100644 --- a/tests/test_parameter_count.py +++ b/tests/test_parameter_count.py @@ -2,11 +2,12 @@ import tempfile import torch.cuda -from gptqmodel import GPTQModel, QuantizeConfig -from gptqmodel.utils.tensor import tensor_parameters from models.model_test import ModelTest from safetensors.torch import load_file +from gptqmodel import GPTQModel, QuantizeConfig +from gptqmodel.utils.tensor import tensor_parameters + class TestsParameterCount(ModelTest): LLAMA_3_2_1B_PARAMETER_COUNT = 1235814400 @@ -19,11 +20,12 @@ class TestsParameterCount(ModelTest): def test_parameter_count(self): import os.path - from gptqmodel import QuantizeConfig - from gptqmodel.utils.tensor import tensor_parameters from huggingface_hub import hf_hub_download from safetensors.torch import load_file + from gptqmodel import QuantizeConfig + from gptqmodel.utils.tensor import tensor_parameters + model_id = "/monster/data/model/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1" if os.path.isdir(model_id): file_path = os.path.join(model_id, "model.safetensors") diff --git a/tests/test_perplexity.py b/tests/test_perplexity.py index a30e81d3c..98454aa26 100644 --- a/tests/test_perplexity.py +++ b/tests/test_perplexity.py @@ -18,6 +18,7 @@ import os import time + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,13 +26,14 @@ import unittest # noqa: E402 from datasets import load_dataset # noqa: E402 +from parameterized import parameterized # noqa: E402 +from transformers import AutoModelForCausalLM, AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.quantization.config import FORMAT, QUANT_METHOD, AutoRoundQuantizeConfig, QuantizeConfig # noqa: E402 from gptqmodel.utils import Perplexity # noqa: E402 from gptqmodel.utils.rocm import IS_ROCM # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from parameterized import parameterized # noqa: E402 -from transformers import AutoModelForCausalLM, AutoTokenizer # noqa: E402 class TestPerplexity(unittest.TestCase): diff --git a/tests/test_post_quant_eora.py b/tests/test_post_quant_eora.py index 152d8c410..c281c27a6 100644 --- a/tests/test_post_quant_eora.py +++ b/tests/test_post_quant_eora.py @@ -16,19 +16,21 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: E402 from typing import Optional # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from tabulate import tabulate # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.adapter.adapter import Lora # noqa: E402 from gptqmodel.utils.eval import EVAL # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from tabulate import tabulate # noqa: E402 def bench(path: str, backend: BACKEND, adapter: Optional[Lora]): diff --git a/tests/test_q4_bitblas.py b/tests/test_q4_bitblas.py index ee819ec39..0d04505b1 100644 --- a/tests/test_q4_bitblas.py +++ b/tests/test_q4_bitblas.py @@ -17,15 +17,17 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 import torch # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from 
gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 class TestQ4BitBLAS(unittest.TestCase): diff --git a/tests/test_q4_cuda.py b/tests/test_q4_cuda.py index 51af7c270..31d45fcf0 100644 --- a/tests/test_q4_cuda.py +++ b/tests/test_q4_cuda.py @@ -17,16 +17,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + class TestsQ4CUDA(ModelTest): diff --git a/tests/test_q4_exllama_v1.py b/tests/test_q4_exllama_v1.py index b6135e75a..30353270a 100644 --- a/tests/test_q4_exllama_v1.py +++ b/tests/test_q4_exllama_v1.py @@ -17,20 +17,23 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 +from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, exllama_set_max_input_length # noqa: E402 from gptqmodel.models._const import EXLLAMA_DEFAULT_MAX_INPUT_LENGTH # noqa: E402 from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402 from gptqmodel.quantization import FORMAT # noqa: E402 from gptqmodel.utils.importer import select_quant_linear # noqa: E402 from gptqmodel.utils.model import gptqmodel_post_init # noqa: E402 -from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 + REFERENCE = torch.Tensor( [ diff --git a/tests/test_q4_exllama_v2.py b/tests/test_q4_exllama_v2.py index cf3ecac42..45c2a1202 100644 --- a/tests/test_q4_exllama_v2.py +++ b/tests/test_q4_exllama_v2.py @@ -17,19 +17,22 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 import torch # noqa: E402 +from test_q4_exllama_v1 import REFERENCE, get_diff # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear # noqa: E402 from gptqmodel.quantization import FORMAT # noqa: E402 from gptqmodel.utils.importer import select_quant_linear # noqa: E402 from gptqmodel.utils.model import gptqmodel_post_init # noqa: E402 -from test_q4_exllama_v1 import REFERENCE, get_diff # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 + GENERATE_EVAL_SIZE = 100 diff --git a/tests/test_q4_ipex.py b/tests/test_q4_ipex.py index efdb3d0ca..1e78fff95 100644 --- a/tests/test_q4_ipex.py +++ b/tests/test_q4_ipex.py @@ -18,13 +18,15 @@ import os import sys + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 -from gptqmodel import BACKEND # noqa: E402 from models.model_test import ModelTest # noqa: E402 +from gptqmodel import BACKEND # noqa: E402 + class TestsIPEX(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" # "bigscience/bloom-560m" diff --git a/tests/test_q4_marlin.py b/tests/test_q4_marlin.py index 044f1dfa4..9b8bbdf56 100644 --- 
a/tests/test_q4_marlin.py +++ b/tests/test_q4_marlin.py @@ -17,16 +17,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 -from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 +from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402 + class TestQ4Marlin(ModelTest): diff --git a/tests/test_q4_torch.py b/tests/test_q4_torch.py index 0b4884147..3a9f68db7 100644 --- a/tests/test_q4_torch.py +++ b/tests/test_q4_torch.py @@ -17,14 +17,16 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + class TestsQ4Torch(ModelTest): GENERATE_EVAL_SIZE_MIN = 20 diff --git a/tests/test_q4_torch_apple.py b/tests/test_q4_torch_apple.py index e51fe5ba8..e9318100d 100644 --- a/tests/test_q4_torch_apple.py +++ b/tests/test_q4_torch_apple.py @@ -17,11 +17,12 @@ import sys # noqa: E402 import torch # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + class TestsQ4Torch(ModelTest): GENERATE_EVAL_SIZE_MIN = 5 diff --git a/tests/test_q4_triton.py b/tests/test_q4_triton.py index c0a7e9a2e..0da3238f8 100644 --- a/tests/test_q4_triton.py +++ b/tests/test_q4_triton.py @@ -17,15 +17,17 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 -from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from models.model_test import ModelTest # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 +from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 + class TestsQ4Triton(ModelTest): model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" diff --git a/tests/test_quant_and_eora.py b/tests/test_quant_and_eora.py index 6a907d4df..813cdfb57 100644 --- a/tests/test_quant_and_eora.py +++ b/tests/test_quant_and_eora.py @@ -16,6 +16,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -23,13 +24,14 @@ from typing import Optional # noqa: E402 from datasets import load_dataset # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from tabulate import tabulate # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.adapter.adapter import Lora # noqa: E402 from gptqmodel.utils.eval import EVAL # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from tabulate import tabulate # noqa: E402 class 
Test(ModelTest): diff --git a/tests/test_quant_and_eora_transformers.py b/tests/test_quant_and_eora_transformers.py index 8071cac46..40086fd9a 100644 --- a/tests/test_quant_and_eora_transformers.py +++ b/tests/test_quant_and_eora_transformers.py @@ -21,6 +21,7 @@ from safetensors.torch import load_file from transformers import AutoModelForCausalLM, AutoTokenizer + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -28,15 +29,17 @@ from typing import Optional # noqa: E402 from datasets import load_dataset # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 -from gptqmodel.adapter.adapter import HF_ADAPTER_FILE_NAME, HF_ADAPTER_WEIGHT_KEY_PREFIX, Lora # noqa: E402 -from gptqmodel.utils.eval import EVAL # noqa: E402 -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 from lm_eval.utils import make_table # noqa: E402 from logbar import LogBar from models.model_test import ModelTest # noqa: E402 from tabulate import tabulate # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 +from gptqmodel.adapter.adapter import HF_ADAPTER_FILE_NAME, HF_ADAPTER_WEIGHT_KEY_PREFIX, Lora # noqa: E402 +from gptqmodel.utils.eval import EVAL # noqa: E402 +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 + + log = LogBar.shared() diff --git a/tests/test_quant_batch.py b/tests/test_quant_batch.py index eace9e815..6ae851594 100644 --- a/tests/test_quant_batch.py +++ b/tests/test_quant_batch.py @@ -17,16 +17,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import GPTQModel # noqa: E402 from gptqmodel.quantization import QuantizeConfig # noqa: E402 from gptqmodel.utils import Perplexity # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 class TestQuantBatch(ModelTest): diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index 9e6736859..910720900 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,15 +26,20 @@ import tempfile # noqa: E402 from datasets import load_dataset # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 -from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 -from gptqmodel.quantization.config import (META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, # noqa: E402 - AutoRoundQuantizeConfig, QuantizeConfig) -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 +from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 +from gptqmodel.quantization.config import ( # noqa: E402 + META_FIELD_QUANTIZER, + META_QUANTIZER_GPTQMODEL, + AutoRoundQuantizeConfig, + QuantizeConfig, +) +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 + class TestQuantization(ModelTest): diff --git a/tests/test_quant_formats_auto_round.py b/tests/test_quant_formats_auto_round.py index 3b99d47e0..03f79bf4e 
100644 --- a/tests/test_quant_formats_auto_round.py +++ b/tests/test_quant_formats_auto_round.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,15 +26,20 @@ import tempfile # noqa: E402 from datasets import load_dataset # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 -from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 -from gptqmodel.quantization.config import (META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, # noqa: E402 - AutoRoundQuantizeConfig, QuantizeConfig) -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 +from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 +from gptqmodel.quantization.config import ( # noqa: E402 + META_FIELD_QUANTIZER, + META_QUANTIZER_GPTQMODEL, + AutoRoundQuantizeConfig, + QuantizeConfig, +) +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 + class TestQuantization(ModelTest): diff --git a/tests/test_quant_formats_ipex.py b/tests/test_quant_formats_ipex.py index a2774d8ad..14780643e 100644 --- a/tests/test_quant_formats_ipex.py +++ b/tests/test_quant_formats_ipex.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,15 +26,20 @@ import tempfile # noqa: E402 from datasets import load_dataset # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 -from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 -from gptqmodel.quantization.config import (META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, # noqa: E402 - AutoRoundQuantizeConfig, QuantizeConfig) -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 +from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 +from gptqmodel.quantization.config import ( # noqa: E402 + META_FIELD_QUANTIZER, + META_QUANTIZER_GPTQMODEL, + AutoRoundQuantizeConfig, + QuantizeConfig, +) +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 + class TestQuantization(ModelTest): diff --git a/tests/test_quant_time.py b/tests/test_quant_time.py index b925a9c0b..ef3c1e2e0 100644 --- a/tests/test_quant_time.py +++ b/tests/test_quant_time.py @@ -16,13 +16,15 @@ import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import time # noqa: E402 +from models.model_test import ModelTest # noqa: E402 + from gptqmodel import GPTQModel # noqa: E402 from gptqmodel.quantization.config import QuantizeConfig # noqa: E402 -from models.model_test import ModelTest # noqa: E402 class TestQuantTime(ModelTest): diff --git a/tests/test_quant_trust_remote.py b/tests/test_quant_trust_remote.py index 312800420..7437e42c7 100644 --- a/tests/test_quant_trust_remote.py +++ b/tests/test_quant_trust_remote.py @@ -17,18 +17,20 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: 
E402 import transformers # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 -from gptqmodel.quantization import FORMAT, QuantizeConfig # noqa: E402 from models.model_test import ModelTest # noqa: E402 from packaging.version import Version # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 +from gptqmodel.quantization import FORMAT, QuantizeConfig # noqa: E402 + class TestQuantWithTrustRemoteTrue(ModelTest): @classmethod diff --git a/tests/test_save_loaded_quantized_model.py b/tests/test_save_loaded_quantized_model.py index 6f85bd14f..75ea967dc 100644 --- a/tests/test_save_loaded_quantized_model.py +++ b/tests/test_save_loaded_quantized_model.py @@ -17,15 +17,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: E402 import unittest # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402 + + MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" class TestSave(unittest.TestCase): diff --git a/tests/test_save_loaded_quantized_model_ipex.py b/tests/test_save_loaded_quantized_model_ipex.py index 70a6e526a..92aef288e 100644 --- a/tests/test_save_loaded_quantized_model_ipex.py +++ b/tests/test_save_loaded_quantized_model_ipex.py @@ -17,15 +17,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: E402 import unittest # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402 + + MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" class TestSave(unittest.TestCase): diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 8610e4af0..2df43e218 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch diff --git a/tests/test_sglang.py b/tests/test_sglang.py index cbc8e6344..9883d09b0 100644 --- a/tests/test_sglang.py +++ b/tests/test_sglang.py @@ -16,14 +16,16 @@ import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import importlib.util # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from models.model_test import ModelTest # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + class TestLoadSglang(ModelTest): diff --git a/tests/test_sharded.py b/tests/test_sharded.py index fa57c045a..d5524fed4 100644 --- a/tests/test_sharded.py +++ b/tests/test_sharded.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,9 +26,10 @@ import unittest # noqa: E402 import torch # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 + class TestSharded(unittest.TestCase): MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" diff --git a/tests/test_tgi.py b/tests/test_tgi.py index 55136f35d..c8be3e9b4 100644 --- a/tests/test_tgi.py +++ b/tests/test_tgi.py @@ -17,6 +17,7 @@ # -- 
do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import json # noqa: E402 diff --git a/tests/test_tokenicer.py b/tests/test_tokenicer.py index 243359367..a9dad21ad 100644 --- a/tests/test_tokenicer.py +++ b/tests/test_tokenicer.py @@ -16,13 +16,15 @@ import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import unittest # noqa: E402 -from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 from parameterized import parameterized # noqa: E402 +from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 + class TestTokenicer(unittest.TestCase): diff --git a/tests/test_transformers.py b/tests/test_transformers.py index a35a5c32a..3c287549c 100644 --- a/tests/test_transformers.py +++ b/tests/test_transformers.py @@ -15,15 +15,17 @@ # limitations under the License. import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" import tempfile # noqa: E402 import unittest # noqa: E402 import transformers # noqa: E402 -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 from packaging.version import Version # noqa: E402 from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig # noqa: E402 +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 + class TestTransformersIntegration(unittest.TestCase): INFERENCE_PROMPT = "Which city is the capital of France? The city name is " diff --git a/tests/test_triton.py b/tests/test_triton.py index cce0c09d1..2050ab6b6 100644 --- a/tests/test_triton.py +++ b/tests/test_triton.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -25,9 +26,11 @@ import torch # noqa: E402 import torch.utils.benchmark as benchmark # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + + MODEL_ID = "/monster/data/model/Llama-7B-GPTQ" DATASET_ID = "timdettmers/openassistant-guanaco" LEARNING_RATE = 3e-5 diff --git a/tests/test_triton_xpu.py b/tests/test_triton_xpu.py index 110bea6bc..cf61879ad 100644 --- a/tests/test_triton_xpu.py +++ b/tests/test_triton_xpu.py @@ -17,14 +17,16 @@ # -- do not touch import os + os.environ["CUDA_VISIBLE_DEVICES"] = "" # -- end do not touch import tempfile # noqa: E402 +from models.model_test import ModelTest # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.models._const import DEVICE # noqa: E402 -from models.model_test import ModelTest # noqa: E402 class TestTritonXPU(ModelTest): diff --git a/tests/test_verify_hash.py b/tests/test_verify_hash.py index e65f7af3e..1bc22f3c1 100644 --- a/tests/test_verify_hash.py +++ b/tests/test_verify_hash.py @@ -17,6 +17,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch diff --git a/tests/test_vllm.py b/tests/test_vllm.py index 16534b9cb..f84d133cb 100644 --- a/tests/test_vllm.py +++ b/tests/test_vllm.py @@ -17,17 +17,19 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import importlib.util # noqa: E402 import tempfile # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from models.model_test import ModelTest # 
noqa: E402 -from transformers import AutoTokenizer # noqa: E402 class TestLoadVLLM(ModelTest):