2 changes: 2 additions & 0 deletions CHANGELOG.rst
@@ -14,6 +14,8 @@ Model Optimizer Changelog (Linux)
- Add MoE (e.g. Qwen3-30B-A3B, gpt-oss-20b) pruning support for ``num_moe_experts``, ``moe_ffn_hidden_size`` and ``moe_shared_expert_intermediate_size`` parameters in Minitron pruning (``mcore_minitron``).
- Add ``specdec_bench`` example to benchmark speculative decoding performance. See `examples/specdec_bench/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/specdec_bench#speculative-decoding-benchmark>`_ for more details.
- Add FP8/NVFP4 KV cache quantization support for Megatron Core models.
- Add KL Divergence loss based auto_quantize method. See `auto_quantize API docs <https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize>`_ for more details.
- Add support for saving and resuming auto_quantize search state. This speeds up the auto_quantize process by skipping the score estimation step if the search state is provided.
- Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag already existing in the quantization workflow.
- Add support for PyTorch Geometric quantization.
- Add per tensor and per channel MSE calibrator support.
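The two auto_quantize entries above surface as new `method` and `checkpoint` arguments on `mtq.auto_quantize`, matching the call rewritten in `examples/llm_eval/quantization_utils.py` further down this diff. A minimal sketch of the intended usage; the model, dataloader, and quantization format names are placeholders, not values taken from this PR:

import modelopt.torch.quantization as mtq

# Sketch only: `model` and `calib_loader` are assumed to already exist.
model, search_state = mtq.auto_quantize(
    model,
    constraints={"effective_bits": 4.8},
    quantization_formats=[mtq.NVFP4_DEFAULT_CFG, mtq.FP8_DEFAULT_CFG],  # illustrative formats
    data_loader=calib_loader,
    # The kl_div method scores sensitivity from the divergence between original and
    # quantized logits, so forward_step returns logits and no labels/loss_func are needed.
    forward_step=lambda m, batch: m(**batch).logits,
    loss_func=None,
    num_calib_steps=len(calib_loader),
    num_score_steps=32,
    method="kl_div",
    # Search state (sensitivity scores, costs, ...) is written to this path and restored
    # on a later run, skipping the expensive score-estimation step.
    checkpoint="autoq_state.pth",
    verbose=True,
)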
35 changes: 34 additions & 1 deletion examples/llm_eval/gen_model_answer.py
@@ -201,8 +201,11 @@ def get_model_answers(
tokenizer,
args.calib_batch_size,
args.calib_size,
args.auto_quantize_bits,
test_generated=False,
auto_quantize_bits=args.auto_quantize_bits,
auto_quantize_method=args.auto_quantize_method,
auto_quantize_score_size=args.auto_quantize_score_size,
auto_quantize_checkpoint=args.auto_quantize_checkpoint,
)

for question in tqdm(questions):
@@ -450,6 +453,36 @@ def reorg_answer_file(answer_file):
"regular quantization without auto_quantize search will be applied."
),
)
parser.add_argument(
"--auto_quantize_method",
type=str,
default="gradient",
choices=["gradient", "kl_div"],
help=(
"Method for auto_quantize sensitivity analysis. 'gradient' uses gradient-based method "
"(requires labels in dataset). 'kl_div' uses KL divergence between original and "
"quantized model outputs (no labels required). Default: 'gradient'"
),
)
parser.add_argument(
"--auto_quantize_score_size",
type=int,
default=128,
help=(
"Number of samples to use for auto_quantize scoring. Most of auto_quantize time is spent on "
"sensitivity score estimation, so reducing this speeds it up while only minimally affecting "
"final model accuracy compared to lowering --calib_size (the number of samples used for calibration)."
),
)
parser.add_argument(
"--auto_quantize_checkpoint",
type=str,
default=None,
help=(
"Path to checkpoint file for saving/restoring auto_quantize search state "
"(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified."
),
)
parser.add_argument(
"--trust_remote_code",
help="Set trust_remote_code for Huggingface models and tokenizers",
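As a quick sanity check on the new flags: the number of scoring batches is derived from --auto_quantize_score_size the same way quantization_utils.py computes num_score_steps below. A worked example with assumed values, assuming the calibration dataloader yields calib_size // calib_batch_size batches:

calib_size = 512        # --calib_size: total calibration samples
calib_batch_size = 4    # --calib_batch_size (assumed value)
score_size = 128        # --auto_quantize_score_size (default)

num_calib_steps = calib_size // calib_batch_size                                # 128 batches calibrated
num_score_steps = min(num_calib_steps, max(score_size // calib_batch_size, 1))  # 32 batches scored
print(num_calib_steps, num_score_steps)  # 128 32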
39 changes: 37 additions & 2 deletions examples/llm_eval/lm_eval_hf.py
@@ -53,6 +53,9 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |

quant_cfg = arg_dict.pop("quant_cfg", None)
auto_quantize_bits = arg_dict.pop("auto_quantize_bits", None)
auto_quantize_method = arg_dict.pop("auto_quantize_method", "gradient")
auto_quantize_score_size = arg_dict.pop("auto_quantize_score_size", 128)
auto_quantize_checkpoint = arg_dict.pop("auto_quantize_checkpoint", None)
calib_batch_size = arg_dict.pop("calib_batch_size", None)
calib_size = arg_dict.pop("calib_size", 512)
compress = arg_dict.pop("compress", False)
@@ -81,8 +84,11 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
batch_size=calib_batch_size,
calib_size=calib_size,
auto_quantize_bits=auto_quantize_bits,
auto_quantize_method=auto_quantize_method,
auto_quantize_score_size=auto_quantize_score_size,
test_generated=False,
compress=compress,
auto_quantize_checkpoint=auto_quantize_checkpoint,
)

return model_obj
@@ -101,6 +107,12 @@ def setup_parser_with_modelopt_args():
"comma-separated list of quantization quantization formats that will be searched by `auto_quantize`"
),
)
parser.add_argument(
"--calib_batch_size", type=int, help="Batch size for quantization calibration"
)
parser.add_argument(
"--calib_size", type=int, help="Calibration size for quantization", default=512
)
parser.add_argument(
"--auto_quantize_bits",
type=float,
@@ -110,10 +122,30 @@
),
)
parser.add_argument(
"--calib_batch_size", type=int, help="Batch size for quantization calibration"
"--auto_quantize_method",
type=str,
default="gradient",
choices=["gradient", "kl_div"],
help=(
"Method for auto_quantize sensitivity analysis. 'gradient' uses gradient-based method "
"(requires labels in dataset). 'kl_div' uses KL divergence between original and "
"quantized model outputs (no labels required). Default: 'gradient'"
),
)
parser.add_argument(
"--calib_size", type=int, help="Calibration size for quantization", default=512
"--auto_quantize_score_size",
type=int,
default=128,
help=(
"Number of samples to use for auto_quantize scoring. Most of auto_quantize time is spent on "
"sensitivity score estimation, so reducing this speeds it up while only minimally affecting "
"final model accuracy compared to lowering --calib_size (the number of samples used for calibration)."
),
)
parser.add_argument(
"--auto_quantize_checkpoint",
type=str,
help=("Path to checkpoint file for saving/restoring auto_quantize search state. "),
)
parser.add_argument(
"--compress",
@@ -139,6 +171,9 @@ def setup_parser_with_modelopt_args():
{
"quant_cfg": args.quant_cfg,
"auto_quantize_bits": args.auto_quantize_bits,
"auto_quantize_method": args.auto_quantize_method,
"auto_quantize_score_size": args.auto_quantize_score_size,
"auto_quantize_checkpoint": args.auto_quantize_checkpoint,
"calib_batch_size": args.calib_batch_size,
"calib_size": args.calib_size,
"compress": args.compress,
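Since create_from_arg_obj reads the new keys with dict.pop defaults, model_args that omit them keep the previous gradient-based behaviour. A minimal illustration with a hypothetical arg_dict:

# Hypothetical arg_dict; only auto_quantize-related keys are shown.
arg_dict = {"quant_cfg": "NVFP4_DEFAULT_CFG,FP8_DEFAULT_CFG", "auto_quantize_bits": 4.8}

method = arg_dict.pop("auto_quantize_method", "gradient")    # -> "gradient" (unchanged default)
score_size = arg_dict.pop("auto_quantize_score_size", 128)   # -> 128
checkpoint = arg_dict.pop("auto_quantize_checkpoint", None)  # -> None (no save/resume)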
6 changes: 6 additions & 0 deletions examples/llm_eval/mmlu.py
@@ -227,6 +227,9 @@ def main(
batch_size: int = 0,
calib_size: int = 512,
dtype: str = "bfloat16",
auto_quantize_method: str = "gradient",
auto_quantize_score_size: int = 128,
auto_quantize_checkpoint: str | None = None,
**kwargs,
):
random.seed(RAND_SEED)
@@ -281,6 +284,9 @@ def main(
batch_size=batch_size,
calib_size=calib_size,
auto_quantize_bits=auto_quantize_bits,
auto_quantize_method=auto_quantize_method,
auto_quantize_score_size=auto_quantize_score_size,
auto_quantize_checkpoint=auto_quantize_checkpoint,
)

for subject in tqdm(subjects):
65 changes: 52 additions & 13 deletions examples/llm_eval/quantization_utils.py
@@ -66,8 +66,11 @@ def _quantize_model_with_dataset(
quant_cfg: str | list[str],
calib_dataset,
auto_quantize_bits=None,
auto_quantize_method="gradient",
auto_quantize_score_size=128,
batch_size=1,
compress=False,
auto_quantize_checkpoint=None,
):
if hasattr(lm, "gpt2"):
net = lm.gpt2
@@ -81,23 +84,42 @@
getattr(mtq, quant_fmt) for quant_fmt in quant_cfg if quant_fmt != "NONE"
]

def loss_func(output, data):
# For transformers AutoModelForCausalLM models, the outputs are wrapped in `CausalLMOutputWithPast`
# which contains the loss attribute.
return output.loss
# Configure forward_step and loss_func based on method
if auto_quantize_method == "gradient":
# For gradient-based method, return full output with loss
def forward_step(model, batch):
return model(**batch)

def loss_func(output, data):
# For transformers AutoModelForCausalLM models, the outputs are wrapped in `CausalLMOutputWithPast`
# which contains the loss attribute.
return output.loss
elif auto_quantize_method == "kl_div":
# For KL divergence method, return only logits
def forward_step(model, batch):
return model(**batch).logits

loss_func = None # KL divergence doesn't need a custom loss function
else:
raise ValueError(
f"Invalid auto_quantize_method: {auto_quantize_method}. "
"Must be 'gradient' or 'kl_div'"
)

net, _ = mtq.auto_quantize(
net,
constraints={"effective_bits": auto_quantize_bits},
quantization_formats=quant_cfg_for_search,
data_loader=calib_dataset,
forward_step=lambda model, batch: model(**batch),
forward_step=forward_step,
loss_func=loss_func,
num_calib_steps=len(calib_dataset),
num_score_steps=min(
len(calib_dataset), 128 // batch_size
), # Limit the number of score steps to avoid long calibration time
# Most time is spent on score estimation; fewer samples speed it up with little accuracy impact.
num_score_steps=min(len(calib_dataset), max(auto_quantize_score_size // batch_size, 1)),
verbose=True,
method=auto_quantize_method,
# disabled_layers=["*lm_head*", "*mlp.gate.*"],
checkpoint=auto_quantize_checkpoint,
)
else:
mtq_cfg = CUSTOM_CONFIG.get(quant_cfg) # type: ignore [arg-type]
@@ -141,10 +163,13 @@ def quantize_model(
tokenizer,
batch_size,
calib_size,
auto_quantize_bits=None,
data="cnn_dailymail",
test_generated=True,
compress=False,
auto_quantize_bits=None,
auto_quantize_method="gradient",
auto_quantize_score_size=128,
auto_quantize_checkpoint=None,
):
"""Quantizes the model with the provided calibration dataset.

@@ -155,10 +180,14 @@
tokenizer: the tokenizer.
batch_size: the calibration batch size for each calibration inference run.
calib_size: the total calibration dataset size.
auto_quantize_bits: The effective bits constraint for auto_quantize.
data: the name of the calibration dataset.
test_generated: If ``True``, test the generated text before and after quantization.
compress: If ``True``, compress the model after quantization.
auto_quantize_bits: The effective bits constraint for auto_quantize.
auto_quantize_method: The method for auto_quantize ('gradient' or 'kl_div').
auto_quantize_score_size: Number of samples used for auto_quantize scoring.
auto_quantize_checkpoint: Path to checkpoint file for saving/restoring auto_quantize search state
(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified.
"""
if "AWQ" in quant_cfg:
print(
@@ -170,8 +199,10 @@
if hasattr(model, "model"):
device = model.model.device

is_gradient_based = auto_quantize_bits is not None and auto_quantize_method == "gradient"

if batch_size == 0:
if auto_quantize_bits is not None or torch.distributed.is_initialized():
if is_gradient_based or torch.distributed.is_initialized():
raise ValueError("We dont support automatic batch size inference for this case.")

net = model.gpt2 if hasattr(model, "gpt2") else model.model
@@ -186,15 +217,23 @@
batch_size=batch_size,
num_samples=calib_size,
device=device,
include_labels=auto_quantize_bits is not None,
include_labels=is_gradient_based,
)

if test_generated:
input_str = tokenizer.decode(next(iter(calib_dataloader))["input_ids"][0])
generated_str_before_ptq = model.run(input_str)

_quantize_model_with_dataset(
model, quant_cfg, calib_dataloader, auto_quantize_bits, batch_size, compress
model,
quant_cfg,
calib_dataloader,
auto_quantize_bits,
auto_quantize_method,
auto_quantize_score_size,
batch_size,
compress,
auto_quantize_checkpoint,
)

if test_generated:
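For intuition about the new 'kl_div' path above: the sensitivity score is, conceptually, the KL divergence between the original and the quantized model's output distributions, which is why forward_step returns logits and no labels or custom loss_func are required. A conceptual sketch, not ModelOpt's internal implementation:

import torch
import torch.nn.functional as F

def kl_div_score(orig_logits: torch.Tensor, quant_logits: torch.Tensor) -> torch.Tensor:
    # KL(original || quantized) over the vocabulary dimension.
    return F.kl_div(
        F.log_softmax(quant_logits, dim=-1),
        F.softmax(orig_logits, dim=-1),
        reduction="batchmean",
    )

# Identical logits give (near) zero divergence.
logits = torch.randn(2, 8, 32000)
print(kl_div_score(logits, logits).item())  # ~0.0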