From bf6b36aa7d466847e4ea23ed43ce4067cdd7b29e Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud <csy@modelcloud.ai>
Date: Wed, 26 Feb 2025 09:08:50 +0800
Subject: [PATCH 1/4] [CI] update UI

---
 .github/workflows/unit_tests.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 7b0e7251c..0f36f1919 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -43,13 +43,12 @@ on:
         required: false
         default: '20'
       exclusive-gpu:
-        description: 'one test, one gpu. for collecting statistics'
+        description: 'one test per gpu'
         type: boolean
         required: false
         default: true
       server:
-        description: 'Choose server (zen4 or xeon5)'
-        required: true
+        description: 'build server'
         type: choice
         options:
           - '["self-hosted", "zen4"]'

From 27fd33d6f20e1b380026909b462c8a8c6db1c550 Mon Sep 17 00:00:00 2001
From: Qubitium-ModelCloud <qubitium@modelcloud.ai>
Date: Wed, 26 Feb 2025 09:12:04 +0800
Subject: [PATCH 2/4] Update unit_tests.yml

---
 .github/workflows/unit_tests.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 0f36f1919..b1e8fb9d5 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -39,16 +39,16 @@ on:
         required: false
         default: ''
       max-parallel:
-        description: 'max parallel jobs'
+        description: 'Parallel jobs'
         required: false
         default: '20'
       exclusive-gpu:
-        description: 'one test per gpu'
+        description: 'One Test Per GPU'
         type: boolean
         required: false
         default: true
       server:
-        description: 'build server'
+        description: 'Wheel Build Server'
         type: choice
         options:
           - '["self-hosted", "zen4"]'

From 1bc2ef8feb7575d321137c7746e9638261c5cb9d Mon Sep 17 00:00:00 2001
From: Qubitium-ModelCloud <qubitium@modelcloud.ai>
Date: Wed, 26 Feb 2025 09:12:30 +0800
Subject: [PATCH 3/4] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 93a8c5ec7..22a0e9133 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
 </p>
   
 ## News
-* 2/22/2025 2.0.0-dev: 🎉 `GPTQ` quantization internals are now broken into multiple stages (processes) for feature expansion. Synced `Marlin` kernel inference quality fix from upstream. Added `MRLIN_FP16`, lower-quality but faster, backend. `ModelScope` support added. Logging and cli progress bar output has been revamped with sticky bottom progress. Fixed `generation_config.json` save and load. Fix Transformers v4.49.0 compat. Fixed compat of models without `bos`. Fixed `group_size=-1` and `bits=3` packing regression. Added CI tests to track regression in kernel inference quality and sweep all bits/group_sizes.
+* 2/22/2025 2.0.0-dev: 🎉 `GPTQ` quantization internals are now broken into multiple stages (processes) for feature expansion. Synced `Marlin` kernel inference quality fix from upstream. Added `MARLIN_FP16`, lower-quality but faster, backend. `ModelScope` support added. Logging and cli progress bar output has been revamped with sticky bottom progress. Fixed `generation_config.json` save and load. Fix Transformers v4.49.0 compat. Fixed compat of models without `bos`. Fixed `group_size=-1` and `bits=3` packing regression. Added CI tests to track regression in kernel inference quality and sweep all bits/group_sizes.
 * 02/12/2025 [1.9.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.9.0): ⚡ Offload `tokenizer` fixes to [Toke(n)icer](https://github.com/modelcloud/tokenicer) pkg. Optimized `lm_head` quant time and vram usage.
   Optimized `DeepSeek v3/R1` model quant vram usage. Fixed `Optimum` compat regresion in `v1.8.1`. 3x speed-up for `Torch` kernel when using Pytorch >= 2.5.0 with `model.optimize()`. New `calibration_dataset_concat_size` option to enable calibration data `concat` mode to mimic original GPTQ data packing strategy which may improve quant speed and accuracy for datasets like `wikitext2`. 
 * 02/08/2025 [1.8.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.8.1): ⚡ `DeepSeek v3/R1` model support. New flexible weight `packing`: allow quantized weights to be packed to `[int32, int16, int8]` dtypes. 

From 7f57aa9947603c58dca381bbd7d5216fcd6cdf9d Mon Sep 17 00:00:00 2001
From: Qubitium-ModelCloud <qubitium@modelcloud.ai>
Date: Wed, 26 Feb 2025 09:14:00 +0800
Subject: [PATCH 4/4] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 22a0e9133..248b288f0 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
 </p>
   
 ## News
-* 2/22/2025 2.0.0-dev: 🎉 `GPTQ` quantization internals are now broken into multiple stages (processes) for feature expansion. Synced `Marlin` kernel inference quality fix from upstream. Added `MARLIN_FP16`, lower-quality but faster, backend. `ModelScope` support added. Logging and cli progress bar output has been revamped with sticky bottom progress. Fixed `generation_config.json` save and load. Fix Transformers v4.49.0 compat. Fixed compat of models without `bos`. Fixed `group_size=-1` and `bits=3` packing regression. Added CI tests to track regression in kernel inference quality and sweep all bits/group_sizes.
+* 2/22/2025 2.0.0-dev: 🎉 `GPTQ` quantization internals are now broken into multiple stages (processes) for feature expansion. Synced `Marlin` kernel inference quality fix from upstream. Added `MARLIN_FP16`, lower-quality but faster, backend. `ModelScope` support added. Logging and cli progress bar output has been revamped with sticky bottom progress. Fixed `generation_config.json` save and load. Fix Transformers v4.49.0 compat. Fixed compat of models without `bos`. Fixed `group_size=-1` and `bits=3` packing regression. Added CI tests to track regression in kernel inference quality and sweep all bits/group_sizes. Delegate loggin/progressbar to [LogBar](https://github.com/modelcloud/logbar) pkg.
 * 02/12/2025 [1.9.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.9.0): ⚡ Offload `tokenizer` fixes to [Toke(n)icer](https://github.com/modelcloud/tokenicer) pkg. Optimized `lm_head` quant time and vram usage.
   Optimized `DeepSeek v3/R1` model quant vram usage. Fixed `Optimum` compat regresion in `v1.8.1`. 3x speed-up for `Torch` kernel when using Pytorch >= 2.5.0 with `model.optimize()`. New `calibration_dataset_concat_size` option to enable calibration data `concat` mode to mimic original GPTQ data packing strategy which may improve quant speed and accuracy for datasets like `wikitext2`. 
 * 02/08/2025 [1.8.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.8.1): ⚡ `DeepSeek v3/R1` model support. New flexible weight `packing`: allow quantized weights to be packed to `[int32, int16, int8]` dtypes.