4 changes: 3 additions & 1 deletion examples/benchmark/generation_speed.py
@@ -23,11 +23,13 @@

import torch
from datasets import Dataset, load_dataset
from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from logbar import LogBar
from transformers import AutoTokenizer, GenerationConfig
from transformers.generation.logits_process import LogitsProcessor

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig


logger = LogBar.shared()

random.seed(0)
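The example-script changes above (and the similar ones below) all follow the same pattern: the first-party `gptqmodel` imports are moved out of the third-party block into their own group, separated by two blank lines. A minimal sketch of the layout the scripts converge on, assuming the usual stdlib / third-party / first-party grouping (the group labels are editorial, not part of the diff):

```python
# Import layout after the change (grouping labels added for illustration).

# third-party packages
import torch
from datasets import Dataset, load_dataset
from logbar import LogBar
from transformers import AutoTokenizer, GenerationConfig
from transformers.generation.logits_process import LogitsProcessor

# first-party package, now in its own group below the third-party imports
from gptqmodel import BACKEND, GPTQModel, QuantizeConfig


logger = LogBar.shared()
```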
2 changes: 2 additions & 0 deletions examples/benchmark/ipex.py
@@ -20,6 +20,7 @@
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


try:
from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf
bind_cores_for_best_perf()
@@ -29,6 +30,7 @@

import argparse


parser = argparse.ArgumentParser(description="Benchmark IPEX vs HF on a pre-trained model.")
parser.add_argument("--model", type=str, required=True, help="Path or name of the pre-trained model.")
parser.add_argument("--cores", type=int, default=8, help="Number of CPU cores to use.")
4 changes: 3 additions & 1 deletion examples/benchmark/perplexity.py
@@ -17,9 +17,11 @@
import argparse
import os

from gptqmodel.utils import Perplexity
from transformers import AutoTokenizer

from gptqmodel.utils import Perplexity


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

if __name__ == "__main__":
4 changes: 3 additions & 1 deletion examples/evaluation/run_language_modeling_task.py
@@ -18,10 +18,12 @@

import datasets
import torch
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import LanguageModelingTask
from gptqmodel.utils.torch import torch_empty_cache
from transformers import AutoTokenizer


DATASET = "tatsu-lab/alpaca"
WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n"
4 changes: 3 additions & 1 deletion examples/evaluation/run_sequence_classification_task.py
@@ -19,10 +19,12 @@

import datasets
import torch
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import SequenceClassificationTask
from gptqmodel.utils.torch import torch_empty_cache
from transformers import AutoTokenizer


DATASET = "cardiffnlp/tweet_sentiment_multilingual"
TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"
4 changes: 3 additions & 1 deletion examples/evaluation/run_text_summarization_task.py
@@ -19,10 +19,12 @@

import datasets
import torch
from transformers import AutoTokenizer, GenerationConfig

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import TextSummarizationTask
from gptqmodel.utils.torch import torch_empty_cache
from transformers import AutoTokenizer, GenerationConfig


os.system("pip install py7zr")

1 change: 1 addition & 0 deletions examples/inference/run_transformers.py
@@ -16,6 +16,7 @@

from transformers import AutoModelForCausalLM, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))
4 changes: 3 additions & 1 deletion examples/inference/run_with_different_backends.py
@@ -19,9 +19,11 @@
import sys
from argparse import ArgumentParser

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
pretrained_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "./TinyLlama/TinyLlama-1.1B-Chat-v1.0-4bit-128g"
4 changes: 3 additions & 1 deletion examples/quantization/basic_usage.py
@@ -16,9 +16,11 @@

import os

from gptqmodel import GPTQModel, QuantizeConfig, get_best_device
from transformers import AutoTokenizer

from gptqmodel import GPTQModel, QuantizeConfig, get_best_device


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
4 changes: 3 additions & 1 deletion examples/quantization/basic_usage_autoround.py
@@ -15,9 +15,11 @@
# limitations under the License.

import torch
from transformers import AutoTokenizer

from gptqmodel import GPTQModel
from gptqmodel.quantization.config import AutoRoundQuantizeConfig # noqa: E402
from transformers import AutoTokenizer


pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "./autoround/TinyLlama-1.1B-Chat-v1.0-4bit-128g"
4 changes: 3 additions & 1 deletion examples/quantization/basic_usage_wikitext2.py
@@ -16,9 +16,11 @@

import torch
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer

from gptqmodel import GPTQModel, QuantizeConfig


pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g"

1 change: 1 addition & 0 deletions examples/quantization/transformers_usage.py
@@ -16,6 +16,7 @@

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig


model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
1 change: 1 addition & 0 deletions gptqmodel/__init__.py
@@ -22,6 +22,7 @@
from .utils.exllama import exllama_set_max_input_length
from .version import __version__


if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']:
try:
from modelscope.utils.hf_util.patcher import patch_hub
12 changes: 6 additions & 6 deletions gptqmodel/adapter/adapter.py
@@ -10,7 +10,7 @@
from .peft import LoraConfig
from .remote import resolve_path

logger = setup_logger()
log = setup_logger()
LORA_MERGED_WEIGHT_PATHS = [None, ""]
HF_ADAPTER_FILE_NAME = "adapter_model.safetensors"
HF_ADAPTER_CONFIG_FILE_NAME = "adapter_config.json"
@@ -30,7 +30,7 @@ def get(cls, path: str) -> Optional[Tuple[LoraConfig, Dict[str, torch.Tensor]]]:

@classmethod
def reset(cls):
logger.info("Adapter Cache: Resetting cache")
log.info("Adapter Cache: Resetting cache")
cls.cache = {}

@classmethod
@@ -181,10 +181,10 @@ def post_init(self, weight_key: str, device:torch.device, lora_A: torch.Tensor=N
# we have consumed all modules
if len(lora_weights) == 0:
AdapterCache.remove(self.path)
logger.info("Adapter: Consumed all Lora weights")
log.info("Adapter: Consumed all Lora weights")

else:
logger.warn(f"Adapter: Lora weights not found for `{weight_key}`")
log.warn(f"Adapter: Lora weights not found for `{weight_key}`")

assert lora_A is not None and lora_B is not None, f"Adapter: `lora_A` and `lora_B` must both be present in the weights: actual = `{lora_A}` and `{lora_B}`"

@@ -198,7 +198,7 @@ def post_init(self, weight_key: str, device:torch.device, lora_A: torch.Tensor=N
# print(f"Adapter: {self.name()}, loaded lora_A shape: {lora_A.shape}")
# print(f"Adapter: {self.name()}, loaded lora_B shape: {lora_B.shape}")
if lora_A.dtype != torch.float16 or lora_A.dtype != torch.float16:
logger.warn(f"Adapter: `lora_A` and `lora_B` tensors should be of dtype = `torch.float16`: actual = `[{lora_A.dtype}, {lora_A.dtype}]`.")
log.warn(f"Adapter: `lora_A` and `lora_B` tensors should be of dtype = `torch.float16`: actual = `[{lora_A.dtype}, {lora_A.dtype}]`.")

self.lora_A = lora_A.to(device=device, dtype=torch.float16)
self.lora_B = lora_B.to(device=device, dtype=torch.float16)
@@ -216,7 +216,7 @@ def dynamic_rank_override(self, lora_cfg: LoraConfig, weight_key: str) -> bool:
# first do string full match, then suffix match, then regex match
if weight_key == k or k.endswith(weight_key) or re.match(k, weight_key):
self.rank = v
logger.info(f"Adapter: Base Lora `rank` = `{self.rank}` has been overridden by `{k}` due to dynamic `LoraConfig.rank_pattern` control.")
log.info(f"Adapter: Base Lora `rank` = `{self.rank}` has been overridden by `{k}` due to dynamic `LoraConfig.rank_pattern` control.")
return True

return False
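The remaining changes in the gptqmodel package rename the module-level logger handle from `logger` to `log` while keeping `setup_logger()` as the factory, so call sites read `log.info(...)` / `log.warn(...)`. A minimal sketch of that convention as it appears after this PR; the helper function is purely illustrative, and this assumes `setup_logger()` returns an object exposing `info` and `warn`, as the call sites in this diff do:

```python
# Inside a gptqmodel submodule: one shared module-level handle named `log`.
from ..utils.logger import setup_logger

log = setup_logger()


def reset_cache(cache: dict) -> None:
    # Hypothetical helper, shown only to illustrate the logging convention.
    log.info("Adapter Cache: Resetting cache")
    cache.clear()
```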
6 changes: 3 additions & 3 deletions gptqmodel/eora/eora.py
@@ -22,7 +22,7 @@
from ..looper.named_module import NamedModule
from ..utils.logger import setup_logger

logger = setup_logger()
log = setup_logger()

def eora_process_input(input: Tensor, name: str, eigen_scaling_diag_matrix: Dict[str, torch.dtype], sample_size: int):
inp = input[0].to(dtype=torch.float32)
@@ -54,7 +54,7 @@ def eora_compute_lora(
L, Q = torch.linalg.eigh(raw_scaling_diag_matrix)
if (L < 0).any():
## When expanding the calibration data size for EoRA, I suggest maintaining the balance by allocating 50% to general input (C4) and the remaining 50% to downstream task data.
logger.warn(f"Found negative eigenvalues in `{module.name}`. Please increase your calibration data set for EoRA.")
log.warn(f"Found negative eigenvalues in `{module.name}`. Please increase your calibration data set for EoRA.")
minimum = torch.min(L[L > 0])
L[L < 0] = minimum

@@ -64,7 +64,7 @@
try:
scaling_matrix_inv = torch.linalg.inv(scaling_diag_matrix)
except Exception:
logger.warn("`scaling_diag_matrix` is not full rank!") # TODO: assert?
log.warn("`scaling_diag_matrix` is not full rank!") # TODO: assert?
scaling_diag_matrix += 1e-6 * torch.eye(scaling_diag_matrix.shape[0]).to(device)
scaling_matrix_inv = torch.linalg.inv(scaling_diag_matrix)

2 changes: 1 addition & 1 deletion gptqmodel/looper/dequantize_processor.py
@@ -22,7 +22,7 @@
from ..utils.logger import setup_logger
from ..utils.torch import torch_compile

logger = setup_logger()
log = setup_logger()

class DequantizeProcessor(LoopProcessor):
def __init__(self, quantized_modules: Dict[str, TorchQuantLinear]):
4 changes: 2 additions & 2 deletions gptqmodel/looper/eora_processor.py
@@ -34,7 +34,7 @@
from ..utils.model import move_to
from ..utils.torch import torch_compile, torch_sync

logger = setup_logger()
log = setup_logger()


class EoraProcessor(LoopProcessor):
@@ -182,7 +182,7 @@ def process(self, module: NamedModule):
stat["dynamic"] = self.qcfg.dynamic_get(layer_name=module.full_name)

self.log.append(stat)
logger.info(stat)
log.info(stat)

# logger.info(f"Quantizing module END: {name}, {gptq[name].shape()}")
self.result_save(module.full_name, {
6 changes: 3 additions & 3 deletions gptqmodel/looper/gptq_processor.py
@@ -32,7 +32,7 @@
from ..utils.model import move_to, pack_model
from ..utils.torch import torch_sync

logger = setup_logger()
log = setup_logger()

class GPTQProcessor(LoopProcessor):
def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare_dataset_func,
@@ -90,7 +90,7 @@ def preprocess(self, module: NamedModule, buffered_fwd: bool):
# deepseek has massive # of sub-modules per layer, causing vram pressure
# buffered mode is slower due to gpu<->cpu movement
if buffered_fwd: # TODO tweak this number for masive MoE
logger.info(f"Experimental: enabling fwd buffered mode for: `{module.name}`")
log.info(f"Experimental: enabling fwd buffered mode for: `{module.name}`")
tmp.fwd_inputs_buffered = True

tmp.quantizer.configure(
@@ -160,7 +160,7 @@ def process(self, module: NamedModule):
stat["dynamic"] = self.qcfg.dynamic_get(layer_name=module.full_name)

self.log.append(stat)
logger.info(stat)
log.info(stat)

self.result_save(module.full_name, {
"scale": move_to(scale, device=CPU, stream=self.stream),
6 changes: 3 additions & 3 deletions gptqmodel/looper/loop_processor.py
@@ -27,7 +27,7 @@
from ..utils.device import get_cpu_usage_memory, get_gpu_usage_memory
from ..utils.logger import setup_logger

logger = setup_logger()
log = setup_logger()


# LoopProcessor is a singleton(), not per module instance
@@ -91,7 +91,7 @@ def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare
min_calibration_dataset_size = 256
min_calibration_dataset_input_ids_avg_length = 256
if len(calibration_dataset) < min_calibration_dataset_size:
logger.warning(f"Calibration dataset size should be more than {min_calibration_dataset_size}. "
log.warn(f"Calibration dataset size should be more than {min_calibration_dataset_size}. "
f"Current: {len(calibration_dataset)}.")

calibration_dataset = prepare_dataset_func(calibration_dataset=calibration_dataset,
@@ -119,7 +119,7 @@
avg = total_input_ids_length / len(calibration_dataset)

if avg < min_calibration_dataset_input_ids_avg_length:
logger.warning(f"The average length of input_ids of calibration_dataset should be greater than "
log.warn(f"The average length of input_ids of calibration_dataset should be greater than "
f"{min_calibration_dataset_input_ids_avg_length}: actual avg: {avg}.")

self.num_batches = len(calibration_dataset)
8 changes: 4 additions & 4 deletions gptqmodel/looper/module_looper.py
@@ -34,7 +34,7 @@
get_moe_layer_modules, move_to, nested_move_to)
from ..utils.torch import torch_empty_cache

logger = setup_logger()
log = setup_logger()

class ModuleLooper():
def __init__(self, model: BaseGPTQModel, processors: List[LoopProcessor]):
@@ -192,7 +192,7 @@ def loop(self, auto_gc=True, calibration_enable_gpu_cache=True, buffered_fwd=Fal
num_experts=num_experts)

layer_count = len(layers)
quant_modules_pb = (logger.pb(range(layer_count + 1 if self.gptq_model.quantize_config.lm_head else layer_count))
quant_modules_pb = (log.pb(range(layer_count + 1 if self.gptq_model.quantize_config.lm_head else layer_count))
.manual()
.set(left_steps_offset=1))

@@ -419,15 +419,15 @@ def loop(self, auto_gc=True, calibration_enable_gpu_cache=True, buffered_fwd=Fal
# ignore log
pass
else:
logger.info(f"{reverse_p.name()} summary:\n{reverse_p.log}")
log.info(f"{reverse_p.name()} summary:\n{reverse_p.log}")

processor_name = reverse_p.name()
total_log[processor_name] = reverse_p.log
if processor_name == "gptq":
self.gptq_model.quant_log = reverse_p.log

for module_log in reverse_p.log:
logger.info(module_log)
log.info(module_log)
reverse_p.log_plotly()

reverse_p.finalize(model=self.gptq_model, **kwargs)
1 change: 1 addition & 0 deletions gptqmodel/models/_const.py
@@ -25,6 +25,7 @@
from ..utils.rocm import IS_ROCM
from ..utils.torch import HAS_CUDA, HAS_MPS, HAS_XPU


CPU = device("cpu")
CUDA = device("cuda")
CUDA_0 = device("cuda:0")