Merged
65 commits
c5f9351
fix type hint
CSY-ModelCloud Feb 18, 2025
73e7349
update warning msg
CSY-ModelCloud Feb 18, 2025
b5ed67c
update eora license to apache and attribute nvidia/arxiv
Qubitium Feb 18, 2025
498d426
remove early eora test files
Qubitium Feb 18, 2025
b7d627d
ipex doesn't need to pass register_buffers to Torch
CSY-ModelCloud Feb 18, 2025
e361076
refactor ipex
Qubitium Feb 18, 2025
b6d606d
refactor ipex2
Qubitium Feb 18, 2025
c18828e
fix typo
Qubitium Feb 18, 2025
313acca
make ipex packable & add missing register_buffers
CSY-ModelCloud Feb 18, 2025
74c0895
cleanup ipex, add lora + bias check
Qubitium Feb 18, 2025
6f45930
remove duplicated codes
CSY-ModelCloud Feb 18, 2025
4f426d7
ignore two folders for pytest
CSY-ModelCloud Feb 18, 2025
f89a60a
fix test lora. fix wrong tokenizer type
CSY-ModelCloud Feb 18, 2025
cec95c2
compile adapter
Qubitium Feb 18, 2025
48318ac
Fix `generation_config.json` not auto-saved (#1292)
Qubitium Feb 18, 2025
4ee9b24
[CI] update ci for requirements installation
CSY-ModelCloud Feb 18, 2025
b2cdc70
[CI] don't update intel_extension_for_pytorch for now
CSY-ModelCloud Feb 18, 2025
07115df
[CI] remove ipex
CSY-ModelCloud Feb 18, 2025
7131d3b
correct name backend to exllama_eora
Qubitium Feb 18, 2025
2062fac
use hf save hack to fix config saves
Qubitium Feb 18, 2025
f325d44
fix param name changed
CSY-ModelCloud Feb 18, 2025
0f8269a
[SAVE] Save config files with empty state dict (#1293)
ZX-ModelCloud Feb 18, 2025
3670778
print lora adapter loaded count vs total number of quantized modules
Qubitium Feb 18, 2025
a72b581
print lora adapter loaded count vs total number of quantized modules
Qubitium Feb 18, 2025
362b2de
fix wrong model.save
Qubitium Feb 18, 2025
4a53ac4
Test GSM8K
Qubitium Feb 18, 2025
56f5ea3
patch __repr__ for evalplus
CSY-ModelCloud Feb 18, 2025
b51497b
Save processor related config files. For example: preprocessor_config…
ZX-ModelCloud Feb 18, 2025
c88e08c
Fix adapter/eora for ipex kernel
Qubitium Feb 18, 2025
38cb121
Fix eora for ipex/marlin
Qubitium Feb 18, 2025
b735fc0
Clean eora for exllama v1/v2
Qubitium Feb 18, 2025
25f66a6
fix shape does not match in Backend.Marlin
ZX-ModelCloud Feb 18, 2025
bd76832
add comment
ZX-ModelCloud Feb 18, 2025
680de1c
type hint use torch.dtype instead of torch.float32
ZX-ModelCloud Feb 18, 2025
1f6c342
get _supports_flash_attn_2 from transformers
CSY-ModelCloud Feb 18, 2025
de792ef
fix prepare_dataset() error
ZX-ModelCloud Feb 18, 2025
09157ec
add color to logs
Qubitium Feb 18, 2025
3950b01
fix ci: lm_head test
Qubitium Feb 18, 2025
40075a0
fix pb and logging conflicting on output
Qubitium Feb 18, 2025
7019f32
refactor logging/pb
Qubitium Feb 18, 2025
fdc783c
move wf_ buffer to post_init
Qubitium Feb 18, 2025
100f3b7
fix logger + pb compat
Qubitium Feb 18, 2025
36080bd
rename pb.set_description to pb.info
Qubitium Feb 18, 2025
781a6f2
fix progressbar padding so cli ui width is stable
Qubitium Feb 18, 2025
b492284
add progressbar test
Qubitium Feb 18, 2025
cb3ba26
fix progressbar display at close()/end
Qubitium Feb 18, 2025
3b8408c
todo fixme for pb
Qubitium Feb 18, 2025
a67da25
fix pb display at end of iterable
Qubitium Feb 18, 2025
53ce9bc
fix pb: reserve 1 char for cursor and remove external dependency
Qubitium Feb 18, 2025
1e3e892
fix pb: render end
Qubitium Feb 18, 2025
52e19e2
fix minicpm layer_modules error
ZX-ModelCloud Feb 19, 2025
11eb046
fix sharded models were deleted
CSY-ModelCloud Feb 19, 2025
4aa3520
fix wrong order of config save causing sharded tensors to be removed …
Qubitium Feb 19, 2025
b40d43d
sync with main..fix save
Qubitium Feb 19, 2025
9a0c41b
clean logs
Qubitium Feb 19, 2025
db1db30
[CI] install color log
CSY-ModelCloud Feb 19, 2025
14b8a0b
fix hf is doing config validation on save which cause model save failure
Qubitium Feb 19, 2025
3b03131
[FIX] not pack when group_size=-1 (#1298)
ZX-ModelCloud Feb 19, 2025
45625be
disable eora kernel until validated
Qubitium Feb 19, 2025
cb90ddb
[CI] clean evalplus cache
CSY-ModelCloud Feb 19, 2025
cffc753
[CI] fix colorlog for xpu
CSY-ModelCloud Feb 19, 2025
9102a85
Merge branch 'main' into eora-main
Qubitium Feb 19, 2025
9624168
fix merge error
ZX-ModelCloud Feb 19, 2025
27018ab
ruff
Qubitium Feb 19, 2025
7d5ae1d
Merge branch 'eora' into eora-main
CSY-ModelCloud Feb 19, 2025
48 changes: 30 additions & 18 deletions .github/workflows/unit_tests.yml
@@ -61,8 +61,7 @@ env:
PYTORCH_CUDA_ALLOC_CONF: 'expandable_segments:True'
MAX_JOBS: 8
RUNNER: 10.0.13.31
TRANSFORMERS_DIFF_TESTS: "models/test_internlm.py,models/test_internlm2_5.py,models/test_xverse.py"
TORCH_2_5_TESTS: "test_evalplus.py,test_perplexity.py,test_q4_ipex.py,test_ipex_xpu.py,test_save_loaded_quantized_model.py,test_quant_formats.py,models/test_hymba.py"
LEGACY_TESTS: "models/test_internlm.py,models/test_internlm2_5.py,models/test_xverse.py"
IGNORED_TEST_FILES: "test_tgi.py,test_gptneox.py,models/test_mixtral.py,models/test_phi_3_moe.py"
GPTQMODEL_FORCE_BUILD: 1
repo: ${{ github.event.inputs.repo || github.repository }}
@@ -139,15 +138,15 @@ jobs:
import os
import re

TRANSFORMERS_DIFF_TESTS = '${TRANSFORMERS_DIFF_TESTS}'
LEGACY_TESTS = '${LEGACY_TESTS}'
IGNORED_TEST_FILES = '${IGNORED_TEST_FILES}'

TEST_NAMES='${{ github.event.inputs.test_names }}'
TEST_REGEX='${{ github.event.inputs.test_regex }}'

input_test_files_list = [f.strip().removesuffix('.py') for f in TEST_NAMES.split(',') if f.strip()]

transformers_test_files = [f.strip().removesuffix('.py') for f in f'{TRANSFORMERS_DIFF_TESTS}'.split(',') if f.strip()]
transformers_test_files = [f.strip().removesuffix('.py') for f in f'{LEGACY_TESTS}'.split(',') if f.strip()]
transformers_test_files = [f for f in transformers_test_files if not input_test_files_list or f in input_test_files_list]

all_tests = [f.removesuffix('.py') for f in os.listdir('tests/') if f.startswith('test_') and f.endswith('.py') and f.strip().removesuffix('py') not in f'{IGNORED_TEST_FILES}']
@@ -190,8 +189,8 @@ jobs:

echo "Conditions:"
echo "will build run: ${{ github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.torch-files != '[]' && needs.list-test-files.outputs.transformers-files != '[]' && !(needs.list-test-files.outputs.m4-files == '[]' && needs.list-test-files.outputs.m4-files == '[]') }}"
echo "will transformers_diff run: ${{ (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.transformers-files != '[]' }}"
echo "will torch2_5 run: ${{ (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.torch-files != '[]' }}"
echo "will legacy run: ${{ (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.transformers-files != '[]' }}"
echo "will torch run: ${{ (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.torch-files != '[]' }}"
echo "will m4 run: ${{ (github.event.inputs.test_names == '' || contains(github.event.inputs.test_names, 'apple') || contains(github.event.inputs.test_names, 'mlx') ) && (needs.list-test-files.outputs.m4-files != '' || needs.list-test-files.outputs.m4-files != '[]') }}"

build:
@@ -202,6 +201,12 @@ jobs:
if: github.event.inputs.m4-only != 'true' && (needs.list-test-files.outputs.torch-files != '[]' || needs.list-test-files.outputs.transformers-files != '[]')
container:
image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v5
options: --device /dev/dri --ipc=host --runtime=nvidia --gpus all
volumes:
- /dev/dri/by-path:/dev/dri/by-path
- /home/ci/models:/monster/data/model
- /home/ci/models/huggingface:/github/home/.cache/huggingface

steps:
- name: Checkout Codes
uses: actions/checkout@v4
@@ -286,7 +291,7 @@ jobs:
if: always()
run: pip cache purge && uv cache clean && rm -rf ./* ./.*

transformers_diff:
legacy:
needs:
- build
- list-test-files
@@ -383,6 +388,7 @@ jobs:

- name: Install wheel
run: |
uv pip install colorlog
uv pip install git+https://github.com/ModelCloud/Tokenicer -U
echo "===== install optimum bitblas parameterized uvicorn ====="
uv pip install optimum bitblas==0.0.1.dev13 parameterized uvicorn -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} --extra-index-url https://pypi.org/simple
@@ -441,7 +447,7 @@ jobs:
if: always()
run: pip cache purge && uv cache clean && rm -rf ./* ./.*

torch2_5:
torch:
needs:
- build
- list-test-files
@@ -541,22 +547,26 @@ jobs:

- name: Install wheel
run: |
if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then
echo "===== install auto_round ====="
uv pip install auto_round -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} --extra-index-url https://pypi.org/simple
fi
if [ "${{ matrix.test_script }}" == "models/test_cohere2" ] || [ "${{ matrix.test_script }}" == "models/test_gemma" ]; then
echo "===== install transformers from git ====="
uv pip install -U git+https://github.com/huggingface/transformers.git -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} --extra-index-url https://pypi.org/simple
uv pip install colorlog
echo "===== updateing latest transformers ====="
uv pip install -U transformers

if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ] || [ "${{ matrix.test_script }}" == "test_q4_bitblas" ]; then
echo "===== install auto_round bitblas==0.0.1.dev13 ====="
uv pip install auto_round bitblas==0.0.1.dev13 -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} --extra-index-url https://pypi.org/simple
fi

if [[ "${{ matrix.test_script }}" == *xpu* ]]; then
source /etc/profile.d/pyenv.sh && pyenv activate xpu
uv pip install colorlog
fi

if [[ "${{ matrix.test_script }}" == *"mlx"* ]]; then
uv pip install mlx_lm --no-build-isolation -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} --extra-index-url https://pypi.org/simple
fi

if [[ "${{ matrix.test_script }}" == "test_modelscope" ]]; then
echo "===== installing modelscope ====="
uv pip install modelscope --no-build-isolation -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} --extra-index-url https://pypi.org/simple
fi

@@ -622,16 +632,18 @@ jobs:

- name: Clean cache
if: always()
run: pip cache purge && uv cache clean && rm -rf ./* ./.*
run: |
rm ~/.cache/evalplus/*pkl || true
pip cache purge && uv cache clean && rm -rf ./* ./.*

show-statistics:
runs-on: [ self-hosted, xeon5 ]
if: github.event.inputs.exclusive-gpu != 'true'
container:
image: modelcloud/gptqmodel:alpine-ci-v1
needs:
- transformers_diff
- torch2_5
- legacy
- torch
steps:
- name: Print statistics
run: curl "http://10.0.14.248/gpu/get_vram_logs?id=${{ github.run_id }}"
6 changes: 3 additions & 3 deletions examples/benchmark/generation_speed.py
@@ -195,8 +195,8 @@ def load_model_tokenizer(
def benchmark_generation_speed(model, tokenizer, examples, generation_config):
generation_time_list = []
num_generated_tokens_list = []
progress_bar = ProgressBar(examples)
for example in progress_bar:
pb = ProgressBar(examples)
for example in pb:
input_ids = example["input_ids"].to(model.device)

start = time.time()
@@ -217,7 +217,7 @@ def benchmark_generation_speed(model, tokenizer, examples, generation_config):
)
num_generated_tokens_list.append(num_generated_tokens)

progress_bar.set_postfix(
pb.set_postfix(
num_tokens=num_generated_tokens_list[-1],
time=generation_time_list[-1],
speed=f"{num_generated_tokens_list[-1] / generation_time_list[-1]:.3f} tokens/s",
3 changes: 0 additions & 3 deletions examples/quantization/basic_usage_wikitext2.py
@@ -68,9 +68,6 @@ def main():
# with value under torch.LongTensor type.
model.quantize(traindataset)

# save quantized model
model.save(quantized_model_id)

# save quantized model using safetensors
model.save(quantized_model_id)

3 changes: 2 additions & 1 deletion gptqmodel/__init__.py
@@ -14,13 +14,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from .models import GPTQModel, get_best_device
from .quantization import BaseQuantizeConfig, QuantizeConfig
from .utils import BACKEND
from .utils.exllama import exllama_set_max_input_length
from .version import __version__

import os
if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']:
try:
from modelscope.utils.hf_util.patcher import patch_hub
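
Side note on the ModelScope toggle above — a minimal usage sketch, assuming the env check runs at import time as in the diff and that `GPTQModel.load()` is the loading entry point (the model id below is purely illustrative):

import os

# Opt in to ModelScope downloads *before* importing gptqmodel,
# since patch_hub() is applied when the package is imported.
os.environ["GPTQMODEL_USE_MODELSCOPE"] = "true"

from gptqmodel import GPTQModel

# Illustrative model id; any ModelScope-hosted GPTQ checkpoint loads the same way.
model = GPTQModel.load("Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4")
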
13 changes: 8 additions & 5 deletions gptqmodel/adapter/adapter.py
@@ -28,7 +28,7 @@ def validate_path(self, local_only=False):
raise ValueError(f"Adapter: `path` str in this context must be a local os path: actual = `{self.path}`.")

# override me
def apply(self, x: torch.Tensor, out: torch.Tensor):
def apply(self, x: torch.Tensor, out: torch.Tensor) -> torch.Tensor:
pass

# override me
@@ -67,15 +67,18 @@ def parameter_keys(cls) -> List[str]:
return ["lora_A", "lora_B"]

def optimize(self, backend: str = "inductor", mode: str = None, fullgraph: bool = False):
print("Lora compile")
self.apply = torch_compile(self.apply, backend=backend, mode=mode, fullgraph=fullgraph)
pass
#logger.info("Adapter: optimize (compile)")
#self.apply = torch_compile(self.apply, backend=backend, mode=mode, fullgraph=fullgraph)

def apply(self, x: torch.Tensor, out: torch.Tensor):
def apply(self, x: torch.Tensor, out: torch.Tensor) -> torch.Tensor:
# original code
# out = out + ((x @ self.lora_A) @ self.lora_B)

# fix batch for lora
if out.shape[0] > 1:
# Some kernels do not reshape x, such as marlin / exllama / exllamav2.
# out.dim() > x.dim() is used to exclude these kernels without additional processing
if out.dim() > x.dim() and out.shape[0] > 1:
out_orgi_shape = out.shape
out = out.view(-1, out.shape[-1])
out.add_((x @ self.lora_A) @ self.lora_B)
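
To make the shape handling above concrete, a self-contained sketch of the same reshape logic (tensor sizes and the rank are invented for illustration; this mirrors the code path only, not the kernel integration):

import torch

batch, seq, in_f, out_f, rank = 2, 4, 8, 8, 2

x = torch.randn(batch * seq, in_f)      # some kernels hand apply() a pre-flattened 2D x
out = torch.randn(batch, seq, out_f)    # while the output keeps its batch dimension
lora_A = torch.randn(in_f, rank)
lora_B = torch.randn(rank, out_f)

if out.dim() > x.dim() and out.shape[0] > 1:
    orig_shape = out.shape
    out2d = out.view(-1, out.shape[-1])   # (batch*seq, out_features), shares storage with out
    out2d.add_((x @ lora_A) @ lora_B)     # in-place add, so the 3D out is updated too
    out = out2d.view(orig_shape)
else:
    out = out + (x @ lora_A) @ lora_B     # shapes already line up

The in-place view/add keeps the hot path allocation-light, which is presumably why the batched case is folded back into the original tensor rather than materializing a new one.
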
8 changes: 4 additions & 4 deletions gptqmodel/eora/eora.py
@@ -1,4 +1,4 @@
# Copyright 2024-2025 NVIDIA
# Copyright 2024-2025 NVIDIA CORPORATION
# EoRA arXiv: https://arxiv.org/abs/2410.21271

# Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,7 +22,7 @@

logger = setup_logger()

def eora_process_input(input: Tensor, name: str, eigen_scaling_diag_matrix: Dict[str, torch.float32], sample_size: int):
def eora_process_input(input: Tensor, name: str, eigen_scaling_diag_matrix: Dict[str, torch.dtype], sample_size: int):
inp = input[0].to(dtype=torch.float32)
if inp.dim() == 2:
inp = inp.unsqueeze(0)
@@ -38,9 +38,9 @@ def eora_process_input(input: Tensor, name: str, eigen_scaling_diag_matrix: Dict

def eora_compute_lora(
device: torch.device,
w_wq_delta: Tensor, # need the w (original weight) and wq (quantized qeight) delta in float32
w_wq_delta: Tensor, # need the w (original weight) and wq (quantized qweight) delta in float32
module: NamedModule,
eigen_scaling_diag_matrix: torch.float32,
eigen_scaling_diag_matrix: torch.dtype,
rank: int) -> Tuple[Tensor, Tensor]:

assert w_wq_delta.dtype == torch.float32
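
For orientation, a sketch of the generic rank-`r` step this function performs on the `w - wq` delta, using a plain truncated SVD. The real EoRA computation (arXiv 2410.21271) additionally projects through the activation-derived `eigen_scaling_diag_matrix` gathered by `eora_process_input`, which is omitted here; the `(out_features, in_features)` weight layout is an assumption chosen so the factors match how `Lora.apply()` consumes `lora_A`/`lora_B`:

import torch

def lowrank_delta_sketch(w: torch.Tensor, wq: torch.Tensor, rank: int):
    """Illustration only: factor (w - wq) so that x @ lora_A @ lora_B ~= x @ (w - wq).T."""
    delta = (w - wq).to(torch.float32).T            # (in_features, out_features)
    U, S, Vh = torch.linalg.svd(delta, full_matrices=False)
    lora_A = U[:, :rank] * S[:rank]                 # (in_features, rank)
    lora_B = Vh[:rank, :]                           # (rank, out_features)
    return lora_A, lora_B
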
3 changes: 2 additions & 1 deletion gptqmodel/looper/dequantize_processor.py
@@ -26,7 +26,8 @@

class DequantizeProcessor(LoopProcessor):
def __init__(self, quantized_modules: Dict[str, TorchQuantLinear]):
super().__init__(tokenizer=None, qcfg=None, calibration_dataset=None, calibration_dataset_concat_size=None, batch_size=1,
super().__init__(tokenizer=None, qcfg=None, calibration_dataset=None, calibration_dataset_concat_size=None,
prepare_dataset_func=None, batch_size=1,
logger_board="", require_fwd=True)

self.quantized_modules = quantized_modules
10 changes: 6 additions & 4 deletions gptqmodel/looper/eora_processor.py
@@ -30,18 +30,20 @@
from gptqmodel.quantization.gptq import CPU
from gptqmodel.utils.logger import setup_logger
from gptqmodel.utils.model import move_to
from gptqmodel.utils.torch import torch_sync, torch_compile
from gptqmodel.utils.torch import torch_compile, torch_sync
from torch.nn import Module

logger = setup_logger()


class EoraProcessor(LoopProcessor):
def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset,
def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare_dataset_func,
calibration_dataset_concat_size: Optional[int], batch_size: int,
logger_board: str = "", require_fwd: bool = True,
):
super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration_dataset=calibration_dataset, calibration_dataset_concat_size=calibration_dataset_concat_size, batch_size=batch_size,
super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration_dataset=calibration_dataset,
calibration_dataset_concat_size=calibration_dataset_concat_size,
prepare_dataset_func=prepare_dataset_func, batch_size=batch_size,
logger_board=logger_board, require_fwd=require_fwd)

# dict: key is module name, value is the accumulated eigen_scaling_diag_matrix
@@ -113,7 +115,7 @@ def tmp(_, input: Tuple[torch.Tensor, ...], output: torch.Tensor):
def process(self, module: NamedModule):
assert isinstance(module.adapter_cfg, Lora)

self.pb.set_description(f"EoRA gen: {module.name} in layer {module.layer_index} of {self.layer_count - 1}")
self.pb.info(f"EoRA gen: {module.name} in layer {module.layer_index} of {self.layer_count - 1}")

start = time.time()

8 changes: 5 additions & 3 deletions gptqmodel/looper/gptq_processor.py
@@ -34,11 +34,13 @@
logger = setup_logger()

class GPTQProcessor(LoopProcessor):
def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset,
def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare_dataset_func,
calibration_dataset_concat_size: Optional[int], batch_size: int,
logger_board: str = "", require_fwd: bool = True, retain_w: bool = False):

super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration_dataset=calibration_dataset, calibration_dataset_concat_size=calibration_dataset_concat_size, batch_size=batch_size,
super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration_dataset=calibration_dataset,
calibration_dataset_concat_size=calibration_dataset_concat_size,
prepare_dataset_func=prepare_dataset_func, batch_size=batch_size,
logger_board=logger_board, require_fwd=require_fwd)

self.retain_w = retain_w
@@ -111,7 +113,7 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor):
return tmp

def process(self, module: NamedModule):
self.pb.set_description(f"Quantizing {module.name} in layer {module.layer_index} of {self.layer_count - 1}")
self.pb.info(f"Quantizing {module.name} in layer {module.layer_index} of {self.layer_count - 1}")
gptq = self.tasks

# logger.info(f"Quantizing module START: {name}, {gptq[name].shape()}")