From 22dbba9b47da02985e5086379dcd206e311487d9 Mon Sep 17 00:00:00 2001
From: Qubitium
Date: Mon, 3 Mar 2025 12:00:36 +0000
Subject: [PATCH 1/2] update

Signed-off-by: Qubitium
---
 README.md                   |  7 ++++++-
 gptqmodel/utils/importer.py |  4 ++--
 gptqmodel/version.py        |  2 +-
 tests/test_lora.py          | 10 +++++-----
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 13de0baa3..098807078 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,12 @@

 ## News
-* 2/22/2025 2.0.0-dev: 🎉 `GPTQ` quantization internals are now broken into multiple stages (processes) for feature expansion. Synced `Marlin` kernel inference quality fix from upstream. Added `MARLIN_FP16`, lower-quality but faster, backend. `ModelScope` support added. Logging and cli progress bar output has been revamped with sticky bottom progress. Fixed `generation_config.json` save and load. Fix Transformers v4.49.0 compat. Fixed compat of models without `bos`. Fixed `group_size=-1` and `bits=3` packing regression. Added CI tests to track regression in kernel inference quality and sweep all bits/group_sizes. Delegate loggin/progressbar to [LogBar](https://github.com/modelcloud/logbar) pkg.
+* 03/03/2025 [2.0.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v2.0.0): 🎉 `GPTQ` quantization internals are now broken into multiple stages (processes) for feature expansion.
+Synced `Marlin` kernel inference quality fix from upstream. Added `MARLIN_FP16`, lower-quality but faster, backend.
+`ModelScope` support added. Logging and CLI progress bar output have been revamped with sticky bottom progress.
+Fixed `generation_config.json` save and load. Fixed Transformers v4.49.0 compat. Fixed compat of models without `bos`. Fixed `group_size=-1` and `bits=3` packing regression.
+Fixed Qwen 2.5 MoE regressions.
+Added CI tests to track regression in kernel inference quality and sweep all bits/group_sizes. Delegated logging/progressbar to [LogBar](https://github.com/modelcloud/logbar) pkg.
 * 02/12/2025 [1.9.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.9.0): ⚡ Offload `tokenizer` fixes to [Toke(n)icer](https://github.com/modelcloud/tokenicer) pkg. Optimized `lm_head` quant time and vram usage. Optimized `DeepSeek v3/R1` model quant vram usage. Fixed `Optimum` compat regression in `v1.8.1`. 3x speed-up for `Torch` kernel when using PyTorch >= 2.5.0 with `model.optimize()`. New `calibration_dataset_concat_size` option to enable calibration data `concat` mode to mimic the original GPTQ data packing strategy, which may improve quant speed and accuracy for datasets like `wikitext2`.
 * 02/08/2025 [1.8.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.8.1): ⚡ `DeepSeek v3/R1` model support. New flexible weight `packing`: allow quantized weights to be packed to `[int32, int16, int8]` dtypes.
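The 2.0.0 entry above mentions the new `MARLIN_FP16` backend but not how to opt into it. Below is a minimal, hedged sketch of selecting it explicitly at load time; the checkpoint path is the local test model from `tests/test_lora.py` in this patch, and the `GPTQModel.load(..., backend=...)` pattern mirrors that test, so treat the exact argument names as assumptions rather than a confirmed 2.0.0 API.

```python
# Hedged sketch: explicitly selecting the MARLIN_FP16 kernel mentioned in the
# 2.0.0 release notes. Import names and keywords mirror tests/test_lora.py and
# are assumptions, not verified API documentation.
from gptqmodel import BACKEND, GPTQModel

# Local checkpoint path copied from tests/test_lora.py; substitute any
# GPTQ-quantized model id available to you.
MODEL_ID = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128"

model = GPTQModel.load(
    MODEL_ID,
    backend=BACKEND.MARLIN_FP16,  # faster, lower-quality alternative to BACKEND.MARLIN
)

# Quick smoke test of the loaded kernel.
tokens = model.generate("Capital of France is")[0]
print(model.tokenizer.decode(tokens))
```
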
diff --git a/gptqmodel/utils/importer.py b/gptqmodel/utils/importer.py
index 8db2bacae..956099def 100644
--- a/gptqmodel/utils/importer.py
+++ b/gptqmodel/utils/importer.py
@@ -45,7 +45,7 @@
 AUTO_SELECT_BACKEND_ORDER = OrderedDict({
     BACKEND.MARLIN: MarlinQuantLinear, # optimized for bs > 1
-    BACKEND.EXLLAMA_EORA: ExllamaEoraQuantLinear, #
+    # BACKEND.EXLLAMA_EORA: ExllamaEoraQuantLinear, #
     BACKEND.EXLLAMA_V2: ExllamaV2QuantLinear, # optimized for bs > 1
     BACKEND.EXLLAMA_V1: ExllamaQuantLinear, # optimized for bs == 1
     BACKEND.TRITON: TritonV2QuantLinear, # good all around kernel that JIT compiles
@@ -56,7 +56,7 @@
 })

 FORMAT_DICT = {
-    FORMAT.GPTQ: [BACKEND.MARLIN, BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.EXLLAMA_EORA, BACKEND.TRITON, BACKEND.CUDA, BACKEND.IPEX, BACKEND.TORCH, BACKEND.MARLIN_FP16],
+    FORMAT.GPTQ: [BACKEND.MARLIN, BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, BACKEND.IPEX, BACKEND.TORCH, BACKEND.MARLIN_FP16, BACKEND.EXLLAMA_EORA],
     FORMAT.GPTQ_V2: [BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, BACKEND.TORCH],
     FORMAT.MARLIN: [BACKEND.MARLIN, BACKEND.MARLIN_FP16],
     FORMAT.BITBLAS: [BACKEND.BITBLAS],
diff --git a/gptqmodel/version.py b/gptqmodel/version.py
index 7e85f6946..5bf199c76 100644
--- a/gptqmodel/version.py
+++ b/gptqmodel/version.py
@@ -14,4 +14,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "2.0.0-dev"
+__version__ = "2.0.0"
diff --git a/tests/test_lora.py b/tests/test_lora.py
index 4b1727fea..36dd8c07a 100644
--- a/tests/test_lora.py
+++ b/tests/test_lora.py
@@ -29,7 +29,7 @@ class Test(ModelTest):
     NATIVE_MODEL_ID = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128"

-    lora_path = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128/llama3.2-1b-4bit-group128-eora-rank128-arc/adapter_model.safetensors" #"sliuau/llama3.2-1b-4bit-group128-eora_test-rank128-arc/blob/main/adapter_model.safetensors" #"sliuau/llama3.2-1b-4bit-group128-eora_test-rank128-arc"
+    lora_path = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128/llama3.2-1b-4bit-group128-eora-rank128-arc" #"sliuau/llama3.2-1b-4bit-group128-eora_test-rank128-arc/blob/main/adapter_model.safetensors" #"sliuau/llama3.2-1b-4bit-group128-eora_test-rank128-arc"

     NATIVE_ARC_CHALLENGE_ACC = 0.3567
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3805
@@ -45,8 +45,8 @@ def setUpClass(cls):
             # BACKEND.CUDA,
             # BACKEND.TRITON,
             # BACKEND.EXLLAMA_V1,
-            BACKEND.EXLLAMA_V2,
-            # BACKEND.MARLIN,
+            # BACKEND.EXLLAMA_V2,
+            BACKEND.MARLIN,
             # # (BACKEND.IPEX), <-- not tested yet
             # # (BACKEND.BITBLAS, <-- not tested yet
         ])
@@ -65,10 +65,10 @@ def test_load(self, backend: BACKEND):
         self.assertIn("paris", result.lower())

     @parameterized.expand([
-        BACKEND.EXLLAMA_V2,
+        BACKEND.MARLIN,
     ])
     def test_download(self, backend: BACKEND):
-        adapter = Lora(path="https://huggingface.co/sliuau/llama3.2-1b-4bit-group128-eora-rank128-arc/blob/main/adapter_model.safetensors", rank=128)
+        adapter = Lora(path="sliuau/llama3.2-1b-4bit-group128-eora-rank128-arc", rank=128)

         model = GPTQModel.load(
             self.NATIVE_MODEL_ID,

From 93ed8492667efa7cad56c330593c87ebf7191058 Mon Sep 17 00:00:00 2001
From: Qubitium
Date: Mon, 3 Mar 2025 12:03:55 +0000
Subject: [PATCH 2/2] update

Signed-off-by: Qubitium
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 098807078..48819d9ea 100644
--- a/README.md
+++ b/README.md
@@ -16,11 +16,12 @@

 ## News
 * 03/03/2025 [2.0.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v2.0.0): 🎉 `GPTQ` quantization internals are now broken into multiple stages (processes) for feature expansion.
-Synced `Marlin` kernel inference quality fix from upstream. Added `MARLIN_FP16`, lower-quality but faster, backend.
+Synced `Marlin` kernel inference quality fix from upstream. Added `MARLIN_FP16`, lower-quality but faster backend.
 `ModelScope` support added. Logging and CLI progress bar output have been revamped with sticky bottom progress.
 Fixed `generation_config.json` save and load. Fixed Transformers v4.49.0 compat. Fixed compat of models without `bos`. Fixed `group_size=-1` and `bits=3` packing regression.
 Fixed Qwen 2.5 MoE regressions.
 Added CI tests to track regression in kernel inference quality and sweep all bits/group_sizes. Delegated logging/progressbar to [LogBar](https://github.com/modelcloud/logbar) pkg.
+Fixed ROCm version auto-detection in `setup` install.
 * 02/12/2025 [1.9.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.9.0): ⚡ Offload `tokenizer` fixes to [Toke(n)icer](https://github.com/modelcloud/tokenicer) pkg. Optimized `lm_head` quant time and vram usage. Optimized `DeepSeek v3/R1` model quant vram usage. Fixed `Optimum` compat regression in `v1.8.1`. 3x speed-up for `Torch` kernel when using PyTorch >= 2.5.0 with `model.optimize()`. New `calibration_dataset_concat_size` option to enable calibration data `concat` mode to mimic the original GPTQ data packing strategy, which may improve quant speed and accuracy for datasets like `wikitext2`.
 * 02/08/2025 [1.8.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.8.1): ⚡ `DeepSeek v3/R1` model support. New flexible weight `packing`: allow quantized weights to be packed to `[int32, int16, int8]` dtypes.
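Taken together, the two patches re-point the Lora/EoRA test at the Hugging Face adapter repo and make `MARLIN` the backend the test parameterizes on. A minimal sketch of that flow follows; the `Lora` import path and the `adapter=` keyword are inferred from `tests/test_lora.py` and the 2.0.0 feature set, so treat them as assumptions rather than documented API.

```python
# Hedged sketch of the adapter flow exercised by tests/test_lora.py after this
# patch: load a GPTQ checkpoint with an EoRA/Lora adapter on the MARLIN backend.
# The Lora import path and the adapter= keyword are assumptions based on the test.
from gptqmodel import BACKEND, GPTQModel
from gptqmodel.adapter.adapter import Lora  # assumed import path

# Values copied from tests/test_lora.py in this diff.
MODEL_ID = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128"
adapter = Lora(path="sliuau/llama3.2-1b-4bit-group128-eora-rank128-arc", rank=128)

model = GPTQModel.load(
    MODEL_ID,
    adapter=adapter,         # assumed keyword for attaching the EoRA adapter
    backend=BACKEND.MARLIN,  # backend the updated test now runs against
)

# Same kind of smoke check the test performs: the adapter-augmented model
# should still answer a basic factual prompt.
tokens = model.generate("Capital of France is")[0]
print(model.tokenizer.decode(tokens))
```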