Skip to content
Merged

update #1370

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,13 @@
</p>

## News
* 2/22/2025 2.0.0-dev: 🎉 `GPTQ` quantization internals are now broken into multiple stages (processes) for feature expansion. Synced `Marlin` kernel inference quality fix from upstream. Added `MARLIN_FP16`, lower-quality but faster, backend. `ModelScope` support added. Logging and cli progress bar output has been revamped with sticky bottom progress. Fixed `generation_config.json` save and load. Fix Transformers v4.49.0 compat. Fixed compat of models without `bos`. Fixed `group_size=-1` and `bits=3` packing regression. Added CI tests to track regression in kernel inference quality and sweep all bits/group_sizes. Delegate logging/progressbar to [LogBar](https://github.com/modelcloud/logbar) pkg.
* 03/03/2025 [2.0.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v2.0.0): 🎉 `GPTQ` quantization internals are now broken into multiple stages (processes) for feature expansion.
Synced `Marlin` kernel inference quality fix from upstream. Added `MARLIN_FP16`, lower-quality but faster backend.
`ModelScope` support added. Logging and cli progress bar output has been revamped with sticky bottom progress.
Fixed `generation_config.json` save and load. Fixed Transformers v4.49.0 compat. Fixed compat of models without `bos`. Fixed `group_size=-1` and `bits=3` packing regression.
Fixed Qwen 2.5 MoE regressions.
Added CI tests to track regression in kernel inference quality and sweep all bits/group_sizes. Delegated logging/progressbar to [LogBar](https://github.com/modelcloud/logbar) pkg.
Fixed ROCm version auto-detection in `setup` install.
* 02/12/2025 [1.9.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.9.0): ⚡ Offload `tokenizer` fixes to [Toke(n)icer](https://github.com/modelcloud/tokenicer) pkg. Optimized `lm_head` quant time and vram usage.
Optimized `DeepSeek v3/R1` model quant vram usage. Fixed `Optimum` compat regression in `v1.8.1`. 3x speed-up for `Torch` kernel when using PyTorch >= 2.5.0 with `model.optimize()`. New `calibration_dataset_concat_size` option to enable calibration data `concat` mode to mimic original GPTQ data packing strategy which may improve quant speed and accuracy for datasets like `wikitext2`.
* 02/08/2025 [1.8.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.8.1): ⚡ `DeepSeek v3/R1` model support. New flexible weight `packing`: allow quantized weights to be packed to `[int32, int16, int8]` dtypes.
Expand Down
4 changes: 2 additions & 2 deletions gptqmodel/utils/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@

AUTO_SELECT_BACKEND_ORDER = OrderedDict({
BACKEND.MARLIN: MarlinQuantLinear, # optimized for bs > 1
BACKEND.EXLLAMA_EORA: ExllamaEoraQuantLinear, #
# BACKEND.EXLLAMA_EORA: ExllamaEoraQuantLinear, #
BACKEND.EXLLAMA_V2: ExllamaV2QuantLinear, # optimized for bs > 1
BACKEND.EXLLAMA_V1: ExllamaQuantLinear, # optimized for bs == 1
BACKEND.TRITON: TritonV2QuantLinear, # good all around kernel that JIT compiles
Expand All @@ -56,7 +56,7 @@
})

FORMAT_DICT = {
FORMAT.GPTQ: [BACKEND.MARLIN, BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.EXLLAMA_EORA, BACKEND.TRITON, BACKEND.CUDA, BACKEND.IPEX, BACKEND.TORCH, BACKEND.MARLIN_FP16],
FORMAT.GPTQ: [BACKEND.MARLIN, BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, BACKEND.IPEX, BACKEND.TORCH, BACKEND.MARLIN_FP16, BACKEND.EXLLAMA_EORA],
FORMAT.GPTQ_V2: [BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, BACKEND.TORCH],
FORMAT.MARLIN: [BACKEND.MARLIN, BACKEND.MARLIN_FP16],
FORMAT.BITBLAS: [BACKEND.BITBLAS],
Expand Down
2 changes: 1 addition & 1 deletion gptqmodel/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "2.0.0-dev"
__version__ = "2.0.0"
10 changes: 5 additions & 5 deletions tests/test_lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

class Test(ModelTest):
NATIVE_MODEL_ID = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128"
lora_path = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128/llama3.2-1b-4bit-group128-eora-rank128-arc/adapter_model.safetensors" #"sliuau/llama3.2-1b-4bit-group128-eora_test-rank128-arc/blob/main/adapter_model.safetensors" #"sliuau/llama3.2-1b-4bit-group128-eora_test-rank128-arc"
lora_path = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128/llama3.2-1b-4bit-group128-eora-rank128-arc" #"sliuau/llama3.2-1b-4bit-group128-eora_test-rank128-arc/blob/main/adapter_model.safetensors" #"sliuau/llama3.2-1b-4bit-group128-eora_test-rank128-arc"

NATIVE_ARC_CHALLENGE_ACC = 0.3567
NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3805
Expand All @@ -45,8 +45,8 @@ def setUpClass(cls):
# BACKEND.CUDA,
# BACKEND.TRITON,
# BACKEND.EXLLAMA_V1,
BACKEND.EXLLAMA_V2,
# BACKEND.MARLIN,
# BACKEND.EXLLAMA_V2,
BACKEND.MARLIN,
# # (BACKEND.IPEX), <-- not tested yet
# # (BACKEND.BITBLAS, <-- not tested yet
])
Expand All @@ -65,10 +65,10 @@ def test_load(self, backend: BACKEND):
self.assertIn("paris", result.lower())

@parameterized.expand([
BACKEND.EXLLAMA_V2,
BACKEND.MARLIN,
])
def test_download(self, backend: BACKEND):
adapter = Lora(path="https://huggingface.co/sliuau/llama3.2-1b-4bit-group128-eora-rank128-arc/blob/main/adapter_model.safetensors", rank=128)
adapter = Lora(path="sliuau/llama3.2-1b-4bit-group128-eora-rank128-arc", rank=128)

model = GPTQModel.load(
self.NATIVE_MODEL_ID,
Expand Down