From 7a5b6cb9eb178814d988e6b861df4c03b5c17ca0 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Fri, 28 Feb 2025 16:33:07 +0000 Subject: [PATCH 1/7] add gptqmodel doc --- .../source/features/quantization/gptqmodel.md | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 docs/source/features/quantization/gptqmodel.md diff --git a/docs/source/features/quantization/gptqmodel.md b/docs/source/features/quantization/gptqmodel.md new file mode 100644 index 000000000000..2e1f423f5d07 --- /dev/null +++ b/docs/source/features/quantization/gptqmodel.md @@ -0,0 +1,80 @@ +(gptqmodel)= + +# GPTQModel + +To create a new [2, 3, 4, 8]-bit quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel). + +Quantizing reduces the model's precision from BF16 (16-bits) to 2-to-8-bits range which can significantly reduces the +model memory footprint size while increasing inference performance. + +Compatible GPTQModel quantized models can leverage the `Marlin` and `Machete` kernels to maximize batching +transactions-per-second `tps` and token-latency performance for both Ampere (A100+) and Hopper (H100+) Nvidia GPUs. +These two kernels are highly optimized by vLLM teams to allow word-class inference performance of quantized GPTQModel +models. + + +You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?sort=trending&search=gptq). + +```console +pip install -U gptqmodel --no-build-isolation -v +``` + +After installing GPTQModel, you are ready to quantize a model. Please refer to the [GPTQModel readme](https://github.com/ModelCloud/GPTQModel/?tab=readme-ov-file#quantization) for further details. + +Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: + +```python +from datasets import load_dataset +from gptqmodel import GPTQModel, QuantizeConfig + +model_id = "meta-llama/Llama-3.2-1B-Instruct" +quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit" + +calibration_dataset = load_dataset( + "allenai/c4", + data_files="en/c4-train.00001-of-01024.json.gz", + split="train" + ).select(range(1024))["text"] + +quant_config = QuantizeConfig(bits=4, group_size=128) + +model = GPTQModel.load(model_id, quant_config) + +# increase `batch_size` to match gpu/vram specs to speed up quantization +model.quantize(calibration_dataset, batch_size=2) + +model.save(quant_path) +``` + +To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command: + +```console +python examples/offline_inference/llm_engine_example.py --model DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 --quantization gptq +``` + +AWQ models are also supported directly through the LLM entrypoint: + +```python +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.6, top_p=0.9) + +# Create an LLM. +llm = LLM(model="DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2", quantization="gptq") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. 
+outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` From 0debd84825ff365e71b8ba80a5d1ccc0d7c49caa Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Sat, 1 Mar 2025 00:45:26 +0800 Subject: [PATCH 2/7] Update gptqmodel.md --- docs/source/features/quantization/gptqmodel.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/source/features/quantization/gptqmodel.md b/docs/source/features/quantization/gptqmodel.md index 2e1f423f5d07..2f8ce648205c 100644 --- a/docs/source/features/quantization/gptqmodel.md +++ b/docs/source/features/quantization/gptqmodel.md @@ -2,16 +2,20 @@ # GPTQModel -To create a new [2, 3, 4, 8]-bit quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel). +To create a new [2, 3, 4, 8]-bit GPTQ quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel). -Quantizing reduces the model's precision from BF16 (16-bits) to 2-to-8-bits range which can significantly reduces the -model memory footprint size while increasing inference performance. +Quantizing reduces the model's precision from BF16 (16-bits) to 2-to-8-bits range which can significantly reduce the +model memory footprint while at-the-same-time increasing inference performance. -Compatible GPTQModel quantized models can leverage the `Marlin` and `Machete` kernels to maximize batching +Compatible GPTQModel quantized models can leverage the `Marlin` and `Machete` vLLM custom kernels to maximize batching transactions-per-second `tps` and token-latency performance for both Ampere (A100+) and Hopper (H100+) Nvidia GPUs. -These two kernels are highly optimized by vLLM teams to allow word-class inference performance of quantized GPTQModel +These two kernels are highly optimized by vLLM and NeuralMagic (now part of Redhat) to allow word-class inference performance of quantized GPTQ models. +GPTQModel is one of the few quantization toolkits in the world that allows `Dynamic` per-module quantization where different layers and/or modules within a llm model can be further optimized with custom quantization parameters. `Dynamic` quantization +is fully integrated into vLLM is backed up by support from the ModelCloud.AI team. Please refer to [GPTQModel readme](https://github.com/ModelCloud/GPTQModel?tab=readme-ov-file#dynamic-quantization-per-module-quantizeconfig-override) +for more details on this and other advanced features. + You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?sort=trending&search=gptq). 
@@ -52,7 +56,7 @@ To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill- python examples/offline_inference/llm_engine_example.py --model DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 --quantization gptq ``` -AWQ models are also supported directly through the LLM entrypoint: +GPTQModel quantized models are also supported directly through the LLM entrypoint: ```python from vllm import LLM, SamplingParams From f778c8b3f4ff1d78b286fb59664e2d8fdf389532 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Sat, 1 Mar 2025 00:49:01 +0800 Subject: [PATCH 3/7] Update gptqmodel.md --- docs/source/features/quantization/gptqmodel.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/features/quantization/gptqmodel.md b/docs/source/features/quantization/gptqmodel.md index 2f8ce648205c..ed976c1bc238 100644 --- a/docs/source/features/quantization/gptqmodel.md +++ b/docs/source/features/quantization/gptqmodel.md @@ -2,7 +2,7 @@ # GPTQModel -To create a new [2, 3, 4, 8]-bit GPTQ quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel). +To create a new [2, 3, 4, 8]-bit GPTQ quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel) from ModelCloud.AI. Quantizing reduces the model's precision from BF16 (16-bits) to 2-to-8-bits range which can significantly reduce the model memory footprint while at-the-same-time increasing inference performance. From 311efab34c6900e62c7e6e3921dbdf24066ebc51 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Sat, 1 Mar 2025 01:25:44 +0800 Subject: [PATCH 4/7] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20gptqmodel.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/features/quantization/gptqmodel.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/features/quantization/gptqmodel.md b/docs/source/features/quantization/gptqmodel.md index ed976c1bc238..8790f6772a94 100644 --- a/docs/source/features/quantization/gptqmodel.md +++ b/docs/source/features/quantization/gptqmodel.md @@ -13,7 +13,7 @@ These two kernels are highly optimized by vLLM and NeuralMagic (now part of Redh models. GPTQModel is one of the few quantization toolkits in the world that allows `Dynamic` per-module quantization where different layers and/or modules within a llm model can be further optimized with custom quantization parameters. `Dynamic` quantization -is fully integrated into vLLM is backed up by support from the ModelCloud.AI team. Please refer to [GPTQModel readme](https://github.com/ModelCloud/GPTQModel?tab=readme-ov-file#dynamic-quantization-per-module-quantizeconfig-override) +is fully integrated into vLLM and backed up by support from the ModelCloud.AI team. Please refer to [GPTQModel readme](https://github.com/ModelCloud/GPTQModel?tab=readme-ov-file#dynamic-quantization-per-module-quantizeconfig-override) for more details on this and other advanced features. 
From 5ba6e09aac2bcb6ff67f2bfe0c8d48a393aa1c42 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Sat, 1 Mar 2025 08:36:56 +0800 Subject: [PATCH 5/7] Update gptqmodel.md --- docs/source/features/quantization/gptqmodel.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/features/quantization/gptqmodel.md b/docs/source/features/quantization/gptqmodel.md index 8790f6772a94..874de817f0ed 100644 --- a/docs/source/features/quantization/gptqmodel.md +++ b/docs/source/features/quantization/gptqmodel.md @@ -2,10 +2,10 @@ # GPTQModel -To create a new [2, 3, 4, 8]-bit GPTQ quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel) from ModelCloud.AI. +To create a new 4-bit or 8-bit GPTQ quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel) from ModelCloud.AI. -Quantizing reduces the model's precision from BF16 (16-bits) to 2-to-8-bits range which can significantly reduce the -model memory footprint while at-the-same-time increasing inference performance. +Quantization reduces the model's precision from BF16/FP16 (16-bits) to 4-bits or 8-bits which significantly reduce the +total model memory footprint while at-the-same-time increasing inference performance. Compatible GPTQModel quantized models can leverage the `Marlin` and `Machete` vLLM custom kernels to maximize batching transactions-per-second `tps` and token-latency performance for both Ampere (A100+) and Hopper (H100+) Nvidia GPUs. @@ -53,7 +53,7 @@ model.save(quant_path) To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command: ```console -python examples/offline_inference/llm_engine_example.py --model DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 --quantization gptq +python examples/offline_inference/llm_engine_example.py --model DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 ``` GPTQModel quantized models are also supported directly through the LLM entrypoint: @@ -72,7 +72,7 @@ prompts = [ sampling_params = SamplingParams(temperature=0.6, top_p=0.9) # Create an LLM. -llm = LLM(model="DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2", quantization="gptq") +llm = LLM(model="DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) From c37bfbb93e6a9107e27a23f30a9c2883313652f0 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Sat, 1 Mar 2025 00:49:58 +0000 Subject: [PATCH 6/7] update --- docs/source/features/quantization/auto_awq.md | 2 +- docs/source/features/quantization/gptqmodel.md | 2 +- docs/source/features/quantization/index.md | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index 7001ec91467f..b703d0195319 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -3,7 +3,7 @@ # AutoAWQ To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). -Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. 
+Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint. The main benefits are lower latency and memory usage. You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq). diff --git a/docs/source/features/quantization/gptqmodel.md b/docs/source/features/quantization/gptqmodel.md index 874de817f0ed..9991a3d7aa6b 100644 --- a/docs/source/features/quantization/gptqmodel.md +++ b/docs/source/features/quantization/gptqmodel.md @@ -4,7 +4,7 @@ To create a new 4-bit or 8-bit GPTQ quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel) from ModelCloud.AI. -Quantization reduces the model's precision from BF16/FP16 (16-bits) to 4-bits or 8-bits which significantly reduce the +Quantization reduces the model's precision from BF16/FP16 (16-bits) to IN4 (4-bits) or INT8 (8-bits) which significantly reduces the total model memory footprint while at-the-same-time increasing inference performance. Compatible GPTQModel quantized models can leverage the `Marlin` and `Machete` vLLM custom kernels to maximize batching diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md index 1c98620aa214..65f438f599f1 100644 --- a/docs/source/features/quantization/index.md +++ b/docs/source/features/quantization/index.md @@ -12,6 +12,7 @@ supported_hardware auto_awq bnb gguf +gptqmodel int4 int8 fp8 From c1cc3d99f3454c99334cd27c19d72ba6fd6616e7 Mon Sep 17 00:00:00 2001 From: mgoin Date: Mon, 3 Mar 2025 21:18:37 +0000 Subject: [PATCH 7/7] Fix Signed-off-by: mgoin --- .../source/features/quantization/gptqmodel.md | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/docs/source/features/quantization/gptqmodel.md b/docs/source/features/quantization/gptqmodel.md index 9991a3d7aa6b..34adf6512b7e 100644 --- a/docs/source/features/quantization/gptqmodel.md +++ b/docs/source/features/quantization/gptqmodel.md @@ -4,18 +4,17 @@ To create a new 4-bit or 8-bit GPTQ quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel) from ModelCloud.AI. -Quantization reduces the model's precision from BF16/FP16 (16-bits) to IN4 (4-bits) or INT8 (8-bits) which significantly reduces the +Quantization reduces the model's precision from BF16/FP16 (16-bits) to INT4 (4-bits) or INT8 (8-bits) which significantly reduces the total model memory footprint while at-the-same-time increasing inference performance. -Compatible GPTQModel quantized models can leverage the `Marlin` and `Machete` vLLM custom kernels to maximize batching -transactions-per-second `tps` and token-latency performance for both Ampere (A100+) and Hopper (H100+) Nvidia GPUs. -These two kernels are highly optimized by vLLM and NeuralMagic (now part of Redhat) to allow word-class inference performance of quantized GPTQ -models. - -GPTQModel is one of the few quantization toolkits in the world that allows `Dynamic` per-module quantization where different layers and/or modules within a llm model can be further optimized with custom quantization parameters. `Dynamic` quantization -is fully integrated into vLLM and backed up by support from the ModelCloud.AI team. Please refer to [GPTQModel readme](https://github.com/ModelCloud/GPTQModel?tab=readme-ov-file#dynamic-quantization-per-module-quantizeconfig-override) -for more details on this and other advanced features. 
+Compatible GPTQModel quantized models can leverage the `Marlin` and `Machete` vLLM custom kernels to maximize batching +transactions-per-second `tps` and token-latency performance for both Ampere (A100+) and Hopper (H100+) Nvidia GPUs. +These two kernels are highly optimized by vLLM and NeuralMagic (now part of Redhat) to allow world-class inference performance of quantized GPTQ +models. +GPTQModel is one of the few quantization toolkits in the world that allows `Dynamic` per-module quantization where different layers and/or modules within a llm model can be further optimized with custom quantization parameters. `Dynamic` quantization +is fully integrated into vLLM and backed up by support from the ModelCloud.AI team. Please refer to [GPTQModel readme](https://github.com/ModelCloud/GPTQModel?tab=readme-ov-file#dynamic-quantization-per-module-quantizeconfig-override) +for more details on this and other advanced features. You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?sort=trending&search=gptq). @@ -23,7 +22,7 @@ You can quantize your own models by installing [GPTQModel](https://github.com/Mo pip install -U gptqmodel --no-build-isolation -v ``` -After installing GPTQModel, you are ready to quantize a model. Please refer to the [GPTQModel readme](https://github.com/ModelCloud/GPTQModel/?tab=readme-ov-file#quantization) for further details. +After installing GPTQModel, you are ready to quantize a model. Please refer to the [GPTQModel readme](https://github.com/ModelCloud/GPTQModel/?tab=readme-ov-file#quantization) for further details. Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
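
A note for readers of this series: the doc text added above repeatedly mentions GPTQModel's `Dynamic` per-module quantization but the patched page never shows it in code. The sketch below is a minimal illustration of what such a config could look like, assuming the regex-keyed `dynamic` override on `QuantizeConfig` and the `+:`/`-:` match prefixes described in the linked GPTQModel readme; the module patterns, bit-widths, group sizes, and layer index are illustrative only and would need to be adapted to the target model.

```python
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig

model_id = "meta-llama/Llama-3.2-1B-Instruct"
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit-dynamic"

calibration_dataset = load_dataset(
    "allenai/c4",
    data_files="en/c4-train.00001-of-01024.json.gz",
    split="train"
).select(range(1024))["text"]

# Per-module overrides, keyed by regex patterns matched against module names.
# Assumed syntax (see the GPTQModel readme): a "+:" (or no) prefix overrides the
# base config for matching modules; a "-:" prefix skips quantization for them.
dynamic = {
    # illustrative: quantize MLP down_proj modules at 8-bit with a smaller group size
    r"+:.*\.mlp\.down_proj.*": {"bits": 8, "group_size": 64},
    # illustrative: leave the last decoder layer (index 15 for Llama-3.2-1B) unquantized
    r"-:.*\.layers\.15\..*": {},
}

quant_config = QuantizeConfig(bits=4, group_size=128, dynamic=dynamic)

model = GPTQModel.load(model_id, quant_config)

# increase `batch_size` to match gpu/vram specs to speed up quantization
model.quantize(calibration_dataset, batch_size=2)

model.save(quant_path)
```

A checkpoint produced this way should load through the same `LLM(model=...)` path shown in the patched doc, since the per-module settings are recorded in the saved quantization config.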