Merged
65 commits
c5f9351
fix type hint
CSY-ModelCloud Feb 18, 2025
73e7349
update warning msg
CSY-ModelCloud Feb 18, 2025
b5ed67c
update eora license to apache and attribute nvidia/arxiv
Qubitium Feb 18, 2025
498d426
remove early eora test files
Qubitium Feb 18, 2025
b7d627d
ipex doesn't need to pass register_buffers to Torch
CSY-ModelCloud Feb 18, 2025
e361076
refactor ipex
Qubitium Feb 18, 2025
b6d606d
refactor ipex2
Qubitium Feb 18, 2025
c18828e
fix typo
Qubitium Feb 18, 2025
313acca
make ipex packable & add missing register_buffers
CSY-ModelCloud Feb 18, 2025
74c0895
cleanup ipex, add lora + bias check
Qubitium Feb 18, 2025
6f45930
remove duplicated codes
CSY-ModelCloud Feb 18, 2025
4f426d7
ignore two folders for pytest
CSY-ModelCloud Feb 18, 2025
f89a60a
fix test lora. fix wrong tokenizer type
CSY-ModelCloud Feb 18, 2025
cec95c2
compile adapter
Qubitium Feb 18, 2025
48318ac
Fix `generation_config.json` not auto-saved (#1292)
Qubitium Feb 18, 2025
4ee9b24
[CI] update ci for requirements installation
CSY-ModelCloud Feb 18, 2025
b2cdc70
[CI] don't update intel_extension_for_pytorch for now
CSY-ModelCloud Feb 18, 2025
07115df
[CI] remove ipex
CSY-ModelCloud Feb 18, 2025
7131d3b
correct name backend to exllama_eora
Qubitium Feb 18, 2025
2062fac
use hf save hack to fix config saves
Qubitium Feb 18, 2025
f325d44
fix param name changed
CSY-ModelCloud Feb 18, 2025
0f8269a
[SAVE] Save config files with empty state dict (#1293)
ZX-ModelCloud Feb 18, 2025
3670778
print lora adapter loaded count vs total number of quantized modules
Qubitium Feb 18, 2025
a72b581
print lora adapter loaded count vs total number of quantized modules
Qubitium Feb 18, 2025
362b2de
fix wrong model.save
Qubitium Feb 18, 2025
4a53ac4
Test GSM8K
Qubitium Feb 18, 2025
56f5ea3
patch __repr__ for evalplus
CSY-ModelCloud Feb 18, 2025
b51497b
Save processor related config files. For example: preprocessor_config…
ZX-ModelCloud Feb 18, 2025
c88e08c
Fix adapter/eora for ipex kernel
Qubitium Feb 18, 2025
38cb121
Fix eora for ipex/marlin
Qubitium Feb 18, 2025
b735fc0
Clean eora for exllama v1/v2
Qubitium Feb 18, 2025
25f66a6
fix shape does not match in Backend.Marlin
ZX-ModelCloud Feb 18, 2025
bd76832
add comment
ZX-ModelCloud Feb 18, 2025
680de1c
type hint use torch.dtype instead of torch.float32
ZX-ModelCloud Feb 18, 2025
1f6c342
get _supports_flash_attn_2 from transformers
CSY-ModelCloud Feb 18, 2025
de792ef
fix prepare_dataset() error
ZX-ModelCloud Feb 18, 2025
09157ec
add color to logs
Qubitium Feb 18, 2025
3950b01
fix ci: lm_head test
Qubitium Feb 18, 2025
40075a0
fix pb and logging conflicting on output
Qubitium Feb 18, 2025
7019f32
refactor logging/pb
Qubitium Feb 18, 2025
fdc783c
move wf_ buffer to post_init
Qubitium Feb 18, 2025
100f3b7
fix logger + pb compat
Qubitium Feb 18, 2025
36080bd
rename pb.set_description to pb.info
Qubitium Feb 18, 2025
781a6f2
fix progressbar padding so cli ui width is stable
Qubitium Feb 18, 2025
b492284
add progressbar test
Qubitium Feb 18, 2025
cb3ba26
fix progressbar display at close()/end
Qubitium Feb 18, 2025
3b8408c
todo fixme for pb
Qubitium Feb 18, 2025
a67da25
fix pb display at end of iterable
Qubitium Feb 18, 2025
53ce9bc
fix pb: reserve 1 char for cursor and remove external dependency
Qubitium Feb 18, 2025
1e3e892
fix pb: render end
Qubitium Feb 18, 2025
52e19e2
fix minicpm layer_modules error
ZX-ModelCloud Feb 19, 2025
11eb046
fix sharded models were deleted
CSY-ModelCloud Feb 19, 2025
4aa3520
fix wrong order of config save causing sharded tensors to be removed …
Qubitium Feb 19, 2025
b40d43d
sync with main..fix save
Qubitium Feb 19, 2025
9a0c41b
clean logs
Qubitium Feb 19, 2025
db1db30
[CI] install color log
CSY-ModelCloud Feb 19, 2025
14b8a0b
fix hf is doing config validation on save which cause model save failure
Qubitium Feb 19, 2025
3b03131
[FIX] not pack when group_size=-1 (#1298)
ZX-ModelCloud Feb 19, 2025
45625be
disable eora kernel until validated
Qubitium Feb 19, 2025
cb90ddb
[CI] clean evalplus cache
CSY-ModelCloud Feb 19, 2025
cffc753
[CI] fix colorlog for xpu
CSY-ModelCloud Feb 19, 2025
9102a85
Merge branch 'main' into eora-main
Qubitium Feb 19, 2025
9624168
fix merge error
ZX-ModelCloud Feb 19, 2025
27018ab
ruff
Qubitium Feb 19, 2025
7d5ae1d
Merge branch 'eora' into eora-main
CSY-ModelCloud Feb 19, 2025
48 changes: 30 additions & 18 deletions .github/workflows/unit_tests.yml
@@ -61,8 +61,7 @@ env:
PYTORCH_CUDA_ALLOC_CONF: 'expandable_segments:True'
MAX_JOBS: 8
RUNNER: 10.0.13.31
TRANSFORMERS_DIFF_TESTS: "models/test_internlm.py,models/test_internlm2_5.py,models/test_xverse.py"
TORCH_2_5_TESTS: "test_evalplus.py,test_perplexity.py,test_q4_ipex.py,test_ipex_xpu.py,test_save_loaded_quantized_model.py,test_quant_formats.py,models/test_hymba.py"
LEGACY_TESTS: "models/test_internlm.py,models/test_internlm2_5.py,models/test_xverse.py"
IGNORED_TEST_FILES: "test_tgi.py,test_gptneox.py,models/test_mixtral.py,models/test_phi_3_moe.py"
GPTQMODEL_FORCE_BUILD: 1
repo: ${{ github.event.inputs.repo || github.repository }}
@@ -139,15 +138,15 @@ jobs:
import os
import re

TRANSFORMERS_DIFF_TESTS = '${TRANSFORMERS_DIFF_TESTS}'
LEGACY_TESTS = '${LEGACY_TESTS}'
IGNORED_TEST_FILES = '${IGNORED_TEST_FILES}'

TEST_NAMES='${{ github.event.inputs.test_names }}'
TEST_REGEX='${{ github.event.inputs.test_regex }}'

input_test_files_list = [f.strip().removesuffix('.py') for f in TEST_NAMES.split(',') if f.strip()]

transformers_test_files = [f.strip().removesuffix('.py') for f in f'{TRANSFORMERS_DIFF_TESTS}'.split(',') if f.strip()]
transformers_test_files = [f.strip().removesuffix('.py') for f in f'{LEGACY_TESTS}'.split(',') if f.strip()]
transformers_test_files = [f for f in transformers_test_files if not input_test_files_list or f in input_test_files_list]

all_tests = [f.removesuffix('.py') for f in os.listdir('tests/') if f.startswith('test_') and f.endswith('.py') and f.strip().removesuffix('py') not in f'{IGNORED_TEST_FILES}']
@@ -190,8 +189,8 @@ jobs:

echo "Conditions:"
echo "will build run: ${{ github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.torch-files != '[]' && needs.list-test-files.outputs.transformers-files != '[]' && !(needs.list-test-files.outputs.m4-files == '[]' && needs.list-test-files.outputs.m4-files == '[]') }}"
echo "will transformers_diff run: ${{ (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.transformers-files != '[]' }}"
echo "will torch2_5 run: ${{ (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.torch-files != '[]' }}"
echo "will legacy run: ${{ (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.transformers-files != '[]' }}"
echo "will torch run: ${{ (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.torch-files != '[]' }}"
echo "will m4 run: ${{ (github.event.inputs.test_names == '' || contains(github.event.inputs.test_names, 'apple') || contains(github.event.inputs.test_names, 'mlx') ) && (needs.list-test-files.outputs.m4-files != '' || needs.list-test-files.outputs.m4-files != '[]') }}"

build:
@@ -202,6 +201,12 @@ jobs:
if: github.event.inputs.m4-only != 'true' && (needs.list-test-files.outputs.torch-files != '[]' || needs.list-test-files.outputs.transformers-files != '[]')
container:
image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v5
options: --device /dev/dri --ipc=host --runtime=nvidia --gpus all
volumes:
- /dev/dri/by-path:/dev/dri/by-path
- /home/ci/models:/monster/data/model
- /home/ci/models/huggingface:/github/home/.cache/huggingface

steps:
- name: Checkout Codes
uses: actions/checkout@v4
@@ -286,7 +291,7 @@ jobs:
if: always()
run: pip cache purge && uv cache clean && rm -rf ./* ./.*

transformers_diff:
legacy:
needs:
- build
- list-test-files
@@ -383,6 +388,7 @@ jobs:

- name: Install wheel
run: |
uv pip install colorlog
uv pip install git+https://github.com/ModelCloud/Tokenicer -U
echo "===== install optimum bitblas parameterized uvicorn ====="
uv pip install optimum bitblas==0.0.1.dev13 parameterized uvicorn -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} --extra-index-url https://pypi.org/simple
@@ -441,7 +447,7 @@ jobs:
if: always()
run: pip cache purge && uv cache clean && rm -rf ./* ./.*

torch2_5:
torch:
needs:
- build
- list-test-files
@@ -541,22 +547,26 @@ jobs:

- name: Install wheel
run: |
if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then
echo "===== install auto_round ====="
uv pip install auto_round -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} --extra-index-url https://pypi.org/simple
fi
if [ "${{ matrix.test_script }}" == "models/test_cohere2" ] || [ "${{ matrix.test_script }}" == "models/test_gemma" ]; then
echo "===== install transformers from git ====="
uv pip install -U git+https://github.com/huggingface/transformers.git -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} --extra-index-url https://pypi.org/simple
uv pip install colorlog
echo "===== updateing latest transformers ====="
uv pip install -U transformers

if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ] || [ "${{ matrix.test_script }}" == "test_q4_bitblas" ]; then
echo "===== install auto_round bitblas==0.0.1.dev13 ====="
uv pip install auto_round bitblas==0.0.1.dev13 -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} --extra-index-url https://pypi.org/simple
fi

if [[ "${{ matrix.test_script }}" == *xpu* ]]; then
source /etc/profile.d/pyenv.sh && pyenv activate xpu
uv pip install colorlog
fi

if [[ "${{ matrix.test_script }}" == *"mlx"* ]]; then
uv pip install mlx_lm --no-build-isolation -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} --extra-index-url https://pypi.org/simple
fi

if [[ "${{ matrix.test_script }}" == "test_modelscope" ]]; then
echo "===== installing modelscope ====="
uv pip install modelscope --no-build-isolation -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} --extra-index-url https://pypi.org/simple
fi

@@ -622,16 +632,18 @@ jobs:

- name: Clean cache
if: always()
run: pip cache purge && uv cache clean && rm -rf ./* ./.*
run: |
rm ~/.cache/evalplus/*pkl || true
pip cache purge && uv cache clean && rm -rf ./* ./.*

show-statistics:
runs-on: [ self-hosted, xeon5 ]
if: github.event.inputs.exclusive-gpu != 'true'
container:
image: modelcloud/gptqmodel:alpine-ci-v1
needs:
- transformers_diff
- torch2_5
- legacy
- torch
steps:
- name: Print statistics
run: curl "http://10.0.14.248/gpu/get_vram_logs?id=${{ github.run_id }}"
6 changes: 3 additions & 3 deletions examples/benchmark/generation_speed.py
@@ -195,8 +195,8 @@ def load_model_tokenizer(
def benchmark_generation_speed(model, tokenizer, examples, generation_config):
generation_time_list = []
num_generated_tokens_list = []
progress_bar = ProgressBar(examples)
for example in progress_bar:
pb = ProgressBar(examples)
for example in pb:
input_ids = example["input_ids"].to(model.device)

start = time.time()
@@ -217,7 +217,7 @@ def benchmark_generation_speed(model, tokenizer, examples, generation_config):
)
num_generated_tokens_list.append(num_generated_tokens)

progress_bar.set_postfix(
pb.set_postfix(
num_tokens=num_generated_tokens_list[-1],
time=generation_time_list[-1],
speed=f"{num_generated_tokens_list[-1] / generation_time_list[-1]:.3f} tokens/s",
3 changes: 0 additions & 3 deletions examples/quantization/basic_usage_wikitext2.py
@@ -68,9 +68,6 @@ def main():
# with value under torch.LongTensor type.
model.quantize(traindataset)

# save quantized model
model.save(quantized_model_id)

# save quantized model using safetensors
model.save(quantized_model_id)

3 changes: 2 additions & 1 deletion gptqmodel/__init__.py
@@ -14,13 +14,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from .models import GPTQModel, get_best_device
from .quantization import BaseQuantizeConfig, QuantizeConfig
from .utils import BACKEND
from .utils.exllama import exllama_set_max_input_length
from .version import __version__

import os
if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']:
try:
from modelscope.utils.hf_util.patcher import patch_hub
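
Side note on the ModelScope toggle above — a minimal usage sketch, assuming the env check runs at import time as in the diff and that `GPTQModel.load()` is the loading entry point (the model id below is purely illustrative):

import os

# Opt in to ModelScope downloads *before* importing gptqmodel,
# since patch_hub() is applied when the package is imported.
os.environ["GPTQMODEL_USE_MODELSCOPE"] = "true"

from gptqmodel import GPTQModel

# Illustrative model id; any ModelScope-hosted GPTQ checkpoint loads the same way.
model = GPTQModel.load("Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4")
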
13 changes: 8 additions & 5 deletions gptqmodel/adapter/adapter.py
@@ -28,7 +28,7 @@ def validate_path(self, local_only=False):
raise ValueError(f"Adapter: `path` str in this context must be a local os path: actual = `{self.path}`.")

# override me
def apply(self, x: torch.Tensor, out: torch.Tensor):
def apply(self, x: torch.Tensor, out: torch.Tensor) -> torch.Tensor:
pass

# override me
@@ -67,15 +67,18 @@ def parameter_keys(cls) -> List[str]:
return ["lora_A", "lora_B"]

def optimize(self, backend: str = "inductor", mode: str = None, fullgraph: bool = False):
print("Lora compile")
self.apply = torch_compile(self.apply, backend=backend, mode=mode, fullgraph=fullgraph)
pass
#logger.info("Adapter: optimize (compile)")
#self.apply = torch_compile(self.apply, backend=backend, mode=mode, fullgraph=fullgraph)

def apply(self, x: torch.Tensor, out: torch.Tensor):
def apply(self, x: torch.Tensor, out: torch.Tensor) -> torch.Tensor:
# original code
# out = out + ((x @ self.lora_A) @ self.lora_B)

# fix batch for lora
if out.shape[0] > 1:
# Some kernels do not reshape x, such as marlin / exllama / exllamav2.
# out.dim() > x.dim() is used to exclude these kernels without additional processing
if out.dim() > x.dim() and out.shape[0] > 1:
out_orgi_shape = out.shape
out = out.view(-1, out.shape[-1])
out.add_((x @ self.lora_A) @ self.lora_B)
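
To make the shape handling above concrete, a self-contained sketch of the same reshape logic (tensor sizes and the rank are invented for illustration; this mirrors the code path only, not the kernel integration):

import torch

batch, seq, in_f, out_f, rank = 2, 4, 8, 8, 2

x = torch.randn(batch * seq, in_f)      # some kernels hand apply() a pre-flattened 2D x
out = torch.randn(batch, seq, out_f)    # while the output keeps its batch dimension
lora_A = torch.randn(in_f, rank)
lora_B = torch.randn(rank, out_f)

if out.dim() > x.dim() and out.shape[0] > 1:
    orig_shape = out.shape
    out2d = out.view(-1, out.shape[-1])   # (batch*seq, out_features), shares storage with out
    out2d.add_((x @ lora_A) @ lora_B)     # in-place add, so the 3D out is updated too
    out = out2d.view(orig_shape)
else:
    out = out + (x @ lora_A) @ lora_B     # shapes already line up

The in-place view/add keeps the hot path allocation-light, which is presumably why the batched case is folded back into the original tensor rather than materializing a new one.
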
8 changes: 4 additions & 4 deletions gptqmodel/eora/eora.py
@@ -1,4 +1,4 @@
# Copyright 2024-2025 NVIDIA
# Copyright 2024-2025 NVIDIA CORPORATION
# EoRA arXiv: https://arxiv.org/abs/2410.21271

# Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,7 +22,7 @@

logger = setup_logger()

def eora_process_input(input: Tensor, name: str, eigen_scaling_diag_matrix: Dict[str, torch.float32], sample_size: int):
def eora_process_input(input: Tensor, name: str, eigen_scaling_diag_matrix: Dict[str, torch.dtype], sample_size: int):
inp = input[0].to(dtype=torch.float32)
if inp.dim() == 2:
inp = inp.unsqueeze(0)
@@ -38,9 +38,9 @@ def eora_process_input(input: Tensor, name: str, eigen_scaling_diag_matrix: Dict

def eora_compute_lora(
device: torch.device,
w_wq_delta: Tensor, # need the w (original weight) and wq (quantized qeight) delta in float32
w_wq_delta: Tensor, # need the w (original weight) and wq (quantized qweight) delta in float32
module: NamedModule,
eigen_scaling_diag_matrix: torch.float32,
eigen_scaling_diag_matrix: torch.dtype,
rank: int) -> Tuple[Tensor, Tensor]:

assert w_wq_delta.dtype == torch.float32
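
For orientation, a sketch of the generic rank-`r` step this function performs on the `w - wq` delta, using a plain truncated SVD. The real EoRA computation (arXiv 2410.21271) additionally projects through the activation-derived `eigen_scaling_diag_matrix` gathered by `eora_process_input`, which is omitted here; the `(out_features, in_features)` weight layout is an assumption chosen so the factors match how `Lora.apply()` consumes `lora_A`/`lora_B`:

import torch

def lowrank_delta_sketch(w: torch.Tensor, wq: torch.Tensor, rank: int):
    """Illustration only: factor (w - wq) so that x @ lora_A @ lora_B ~= x @ (w - wq).T."""
    delta = (w - wq).to(torch.float32).T            # (in_features, out_features)
    U, S, Vh = torch.linalg.svd(delta, full_matrices=False)
    lora_A = U[:, :rank] * S[:rank]                 # (in_features, rank)
    lora_B = Vh[:rank, :]                           # (rank, out_features)
    return lora_A, lora_B
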
3 changes: 2 additions & 1 deletion gptqmodel/looper/dequantize_processor.py
@@ -26,7 +26,8 @@

class DequantizeProcessor(LoopProcessor):
def __init__(self, quantized_modules: Dict[str, TorchQuantLinear]):
super().__init__(tokenizer=None, qcfg=None, calibration_dataset=None, calibration_dataset_concat_size=None, batch_size=1,
super().__init__(tokenizer=None, qcfg=None, calibration_dataset=None, calibration_dataset_concat_size=None,
prepare_dataset_func=None, batch_size=1,
logger_board="", require_fwd=True)

self.quantized_modules = quantized_modules
10 changes: 6 additions & 4 deletions gptqmodel/looper/eora_processor.py
@@ -30,18 +30,20 @@
from gptqmodel.quantization.gptq import CPU
from gptqmodel.utils.logger import setup_logger
from gptqmodel.utils.model import move_to
from gptqmodel.utils.torch import torch_sync, torch_compile
from gptqmodel.utils.torch import torch_compile, torch_sync
from torch.nn import Module

logger = setup_logger()


class EoraProcessor(LoopProcessor):
def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset,
def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare_dataset_func,
calibration_dataset_concat_size: Optional[int], batch_size: int,
logger_board: str = "", require_fwd: bool = True,
):
super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration_dataset=calibration_dataset, calibration_dataset_concat_size=calibration_dataset_concat_size, batch_size=batch_size,
super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration_dataset=calibration_dataset,
calibration_dataset_concat_size=calibration_dataset_concat_size,
prepare_dataset_func=prepare_dataset_func, batch_size=batch_size,
logger_board=logger_board, require_fwd=require_fwd)

# dict: key is module name, value is the accumulated eigen_scaling_diag_matrix
@@ -113,7 +115,7 @@ def tmp(_, input: Tuple[torch.Tensor, ...], output: torch.Tensor):
def process(self, module: NamedModule):
assert isinstance(module.adapter_cfg, Lora)

self.pb.set_description(f"EoRA gen: {module.name} in layer {module.layer_index} of {self.layer_count - 1}")
self.pb.info(f"EoRA gen: {module.name} in layer {module.layer_index} of {self.layer_count - 1}")

start = time.time()

8 changes: 5 additions & 3 deletions gptqmodel/looper/gptq_processor.py
@@ -34,11 +34,13 @@
logger = setup_logger()

class GPTQProcessor(LoopProcessor):
def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset,
def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration_dataset, prepare_dataset_func,
calibration_dataset_concat_size: Optional[int], batch_size: int,
logger_board: str = "", require_fwd: bool = True, retain_w: bool = False):

super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration_dataset=calibration_dataset, calibration_dataset_concat_size=calibration_dataset_concat_size, batch_size=batch_size,
super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration_dataset=calibration_dataset,
calibration_dataset_concat_size=calibration_dataset_concat_size,
prepare_dataset_func=prepare_dataset_func, batch_size=batch_size,
logger_board=logger_board, require_fwd=require_fwd)

self.retain_w = retain_w
@@ -111,7 +113,7 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor):
return tmp

def process(self, module: NamedModule):
self.pb.set_description(f"Quantizing {module.name} in layer {module.layer_index} of {self.layer_count - 1}")
self.pb.info(f"Quantizing {module.name} in layer {module.layer_index} of {self.layer_count - 1}")
gptq = self.tasks

# logger.info(f"Quantizing module START: {name}, {gptq[name].shape()}")