2 changes: 2 additions & 0 deletions CHANGELOG.rst
@@ -14,6 +14,8 @@ Model Optimizer Changelog (Linux)
- Add MoE (e.g. Qwen3-30B-A3B, gpt-oss-20b) pruning support for ``num_moe_experts``, ``moe_ffn_hidden_size`` and ``moe_shared_expert_intermediate_size`` parameters in Minitron pruning (``mcore_minitron``).
- Add ``specdec_bench`` example to benchmark speculative decoding performance. See `examples/specdec_bench/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/specdec_bench#speculative-decoding-benchmark>`_ for more details.
- Add FP8/NVFP4 KV cache quantization support for Megatron Core models.
- Add KL Divergence loss based auto_quantize method. See `auto_quantize API docs <https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize>`_ for more details.
- Add support for saving and resuming auto_quantize search state. This speeds up the auto_quantize process by skipping the score estimation step if the search state is provided.
- Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag already existing in the quantization workflow.
- Add support for PyTorch Geometric quantization.
- Add per tensor and per channel MSE calibrator support.
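The two auto_quantize entries above surface as new `method` and `checkpoint` arguments on `mtq.auto_quantize`, matching the call rewritten in `examples/llm_eval/quantization_utils.py` further down this diff. A minimal sketch of the intended usage; the model, dataloader, and quantization format names are placeholders, not values taken from this PR:

import modelopt.torch.quantization as mtq

# Sketch only: `model` and `calib_loader` are assumed to already exist.
model, search_state = mtq.auto_quantize(
    model,
    constraints={"effective_bits": 4.8},
    quantization_formats=[mtq.NVFP4_DEFAULT_CFG, mtq.FP8_DEFAULT_CFG],  # illustrative formats
    data_loader=calib_loader,
    # The kl_div method scores sensitivity from the divergence between original and
    # quantized logits, so forward_step returns logits and no labels/loss_func are needed.
    forward_step=lambda m, batch: m(**batch).logits,
    loss_func=None,
    num_calib_steps=len(calib_loader),
    num_score_steps=32,
    method="kl_div",
    # Search state (sensitivity scores, costs, ...) is written to this path and restored
    # on a later run, skipping the expensive score-estimation step.
    checkpoint="autoq_state.pth",
    verbose=True,
)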
35 changes: 34 additions & 1 deletion examples/llm_eval/gen_model_answer.py
@@ -201,8 +201,11 @@ def get_model_answers(
tokenizer,
args.calib_batch_size,
args.calib_size,
args.auto_quantize_bits,
test_generated=False,
auto_quantize_bits=args.auto_quantize_bits,
auto_quantize_method=args.auto_quantize_method,
auto_quantize_score_size=args.auto_quantize_score_size,
auto_quantize_checkpoint=args.auto_quantize_checkpoint,
)

for question in tqdm(questions):
@@ -450,6 +453,36 @@ def reorg_answer_file(answer_file):
"regular quantization without auto_quantize search will be applied."
),
)
parser.add_argument(
"--auto_quantize_method",
type=str,
default="gradient",
choices=["gradient", "kl_div"],
help=(
"Method for auto_quantize sensitivity analysis. 'gradient' uses gradient-based method "
"(requires labels in dataset). 'kl_div' uses KL divergence between original and "
"quantized model outputs (no labels required). Default: 'gradient'"
),
)
parser.add_argument(
"--auto_quantize_score_size",
type=int,
default=128,
help=(
"Number of samples to use for auto_quantize scoring. Most of auto_quantize time is spent on "
"sensitivity score estimation, so reducing this speeds it up while only minimally affecting "
"final model accuracy compared to lowering --calib_size (the number of samples used for calibration)."
),
)
parser.add_argument(
"--auto_quantize_checkpoint",
type=str,
default=None,
help=(
"Path to checkpoint file for saving/restoring auto_quantize search state "
"(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified."
),
)
parser.add_argument(
"--trust_remote_code",
help="Set trust_remote_code for Huggingface models and tokenizers",
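As a quick sanity check on the new flags: the number of scoring batches is derived from --auto_quantize_score_size the same way quantization_utils.py computes num_score_steps below. A worked example with assumed values, assuming the calibration dataloader yields calib_size // calib_batch_size batches:

calib_size = 512        # --calib_size: total calibration samples
calib_batch_size = 4    # --calib_batch_size (assumed value)
score_size = 128        # --auto_quantize_score_size (default)

num_calib_steps = calib_size // calib_batch_size                                # 128 batches calibrated
num_score_steps = min(num_calib_steps, max(score_size // calib_batch_size, 1))  # 32 batches scored
print(num_calib_steps, num_score_steps)  # 128 32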
39 changes: 37 additions & 2 deletions examples/llm_eval/lm_eval_hf.py
@@ -53,6 +53,9 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |

quant_cfg = arg_dict.pop("quant_cfg", None)
auto_quantize_bits = arg_dict.pop("auto_quantize_bits", None)
auto_quantize_method = arg_dict.pop("auto_quantize_method", "gradient")
auto_quantize_score_size = arg_dict.pop("auto_quantize_score_size", 128)
auto_quantize_checkpoint = arg_dict.pop("auto_quantize_checkpoint", None)
calib_batch_size = arg_dict.pop("calib_batch_size", None)
calib_size = arg_dict.pop("calib_size", 512)
compress = arg_dict.pop("compress", False)
@@ -81,8 +84,11 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
batch_size=calib_batch_size,
calib_size=calib_size,
auto_quantize_bits=auto_quantize_bits,
auto_quantize_method=auto_quantize_method,
auto_quantize_score_size=auto_quantize_score_size,
test_generated=False,
compress=compress,
auto_quantize_checkpoint=auto_quantize_checkpoint,
)

return model_obj
@@ -101,6 +107,12 @@ def setup_parser_with_modelopt_args():
"comma-separated list of quantization quantization formats that will be searched by `auto_quantize`"
),
)
parser.add_argument(
"--calib_batch_size", type=int, help="Batch size for quantization calibration"
)
parser.add_argument(
"--calib_size", type=int, help="Calibration size for quantization", default=512
)
parser.add_argument(
"--auto_quantize_bits",
type=float,
@@ -110,10 +122,30 @@
),
)
parser.add_argument(
"--calib_batch_size", type=int, help="Batch size for quantization calibration"
"--auto_quantize_method",
type=str,
default="gradient",
choices=["gradient", "kl_div"],
help=(
"Method for auto_quantize sensitivity analysis. 'gradient' uses gradient-based method "
"(requires labels in dataset). 'kl_div' uses KL divergence between original and "
"quantized model outputs (no labels required). Default: 'gradient'"
),
)
parser.add_argument(
"--calib_size", type=int, help="Calibration size for quantization", default=512
"--auto_quantize_score_size",
type=int,
default=128,
help=(
"Number of samples to use for auto_quantize scoring. Most of auto_quantize time is spent on "
"sensitivity score estimation, so reducing this speeds it up while only minimally affecting "
"final model accuracy compared to lowering --calib_size (the number of samples used for calibration)."
),
)
parser.add_argument(
"--auto_quantize_checkpoint",
type=str,
help=("Path to checkpoint file for saving/restoring auto_quantize search state. "),
)
parser.add_argument(
"--compress",
@@ -139,6 +171,9 @@ def setup_parser_with_modelopt_args():
{
"quant_cfg": args.quant_cfg,
"auto_quantize_bits": args.auto_quantize_bits,
"auto_quantize_method": args.auto_quantize_method,
"auto_quantize_score_size": args.auto_quantize_score_size,
"auto_quantize_checkpoint": args.auto_quantize_checkpoint,
"calib_batch_size": args.calib_batch_size,
"calib_size": args.calib_size,
"compress": args.compress,
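Since create_from_arg_obj reads the new keys with dict.pop defaults, model_args that omit them keep the previous gradient-based behaviour. A minimal illustration with a hypothetical arg_dict:

# Hypothetical arg_dict; only auto_quantize-related keys are shown.
arg_dict = {"quant_cfg": "NVFP4_DEFAULT_CFG,FP8_DEFAULT_CFG", "auto_quantize_bits": 4.8}

method = arg_dict.pop("auto_quantize_method", "gradient")    # -> "gradient" (unchanged default)
score_size = arg_dict.pop("auto_quantize_score_size", 128)   # -> 128
checkpoint = arg_dict.pop("auto_quantize_checkpoint", None)  # -> None (no save/resume)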
6 changes: 6 additions & 0 deletions examples/llm_eval/mmlu.py
@@ -227,6 +227,9 @@ def main(
batch_size: int = 0,
calib_size: int = 512,
dtype: str = "bfloat16",
auto_quantize_method: str = "gradient",
auto_quantize_score_size: int = 128,
auto_quantize_checkpoint: str | None = None,
**kwargs,
):
random.seed(RAND_SEED)
@@ -281,6 +284,9 @@ def main(
batch_size=batch_size,
calib_size=calib_size,
auto_quantize_bits=auto_quantize_bits,
auto_quantize_method=auto_quantize_method,
auto_quantize_score_size=auto_quantize_score_size,
auto_quantize_checkpoint=auto_quantize_checkpoint,
)

for subject in tqdm(subjects):
65 changes: 52 additions & 13 deletions examples/llm_eval/quantization_utils.py
@@ -66,8 +66,11 @@ def _quantize_model_with_dataset(
quant_cfg: str | list[str],
calib_dataset,
auto_quantize_bits=None,
auto_quantize_method="gradient",
auto_quantize_score_size=128,
batch_size=1,
compress=False,
auto_quantize_checkpoint=None,
):
if hasattr(lm, "gpt2"):
net = lm.gpt2
@@ -81,23 +84,42 @@
getattr(mtq, quant_fmt) for quant_fmt in quant_cfg if quant_fmt != "NONE"
]

def loss_func(output, data):
# For transformers AutoModelForCausalLM models, the outputs are wrapped in `CausalLMOutputWithPast`
# which contains the loss attribute.
return output.loss
# Configure forward_step and loss_func based on method
if auto_quantize_method == "gradient":
# For gradient-based method, return full output with loss
def forward_step(model, batch):
return model(**batch)

def loss_func(output, data):
# For transformers AutoModelForCausalLM models, the outputs are wrapped in `CausalLMOutputWithPast`
# which contains the loss attribute.
return output.loss
elif auto_quantize_method == "kl_div":
# For KL divergence method, return only logits
def forward_step(model, batch):
return model(**batch).logits

loss_func = None # KL divergence doesn't need a custom loss function
else:
raise ValueError(
f"Invalid auto_quantize_method: {auto_quantize_method}. "
"Must be 'gradient' or 'kl_div'"
)

net, _ = mtq.auto_quantize(
net,
constraints={"effective_bits": auto_quantize_bits},
quantization_formats=quant_cfg_for_search,
data_loader=calib_dataset,
forward_step=lambda model, batch: model(**batch),
forward_step=forward_step,
loss_func=loss_func,
num_calib_steps=len(calib_dataset),
num_score_steps=min(
len(calib_dataset), 128 // batch_size
), # Limit the number of score steps to avoid long calibration time
# Most time is spent on score estimation; fewer samples speed it up with little accuracy impact.
num_score_steps=min(len(calib_dataset), max(auto_quantize_score_size // batch_size, 1)),
verbose=True,
method=auto_quantize_method,
# disabled_layers=["*lm_head*", "*mlp.gate.*"],
checkpoint=auto_quantize_checkpoint,
)
else:
mtq_cfg = CUSTOM_CONFIG.get(quant_cfg) # type: ignore [arg-type]
@@ -141,10 +163,13 @@ def quantize_model(
tokenizer,
batch_size,
calib_size,
auto_quantize_bits=None,
data="cnn_dailymail",
test_generated=True,
compress=False,
auto_quantize_bits=None,
auto_quantize_method="gradient",
auto_quantize_score_size=128,
auto_quantize_checkpoint=None,
):
"""Quantizes the model with the provided calibration dataset.

@@ -155,10 +180,14 @@
tokenizer: the tokenizer.
batch_size: the calibration batch size for each calibration inference run.
calib_size: the total calibration dataset size.
auto_quantize_bits: The effective bits constraint for auto_quantize.
data: the name of the calibration dataset.
test_generated: If ``True``, test the generated text before and after quantization.
compress: If ``True``, compress the model after quantization.
auto_quantize_bits: The effective bits constraint for auto_quantize.
auto_quantize_method: The method for auto_quantize ('gradient' or 'kl_div').
auto_quantize_score_size: Number of samples used for auto_quantize scoring.
auto_quantize_checkpoint: Path to checkpoint file for saving/restoring auto_quantize search state
(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified.
"""
if "AWQ" in quant_cfg:
print(
@@ -170,8 +199,10 @@
if hasattr(model, "model"):
device = model.model.device

is_gradient_based = auto_quantize_bits is not None and auto_quantize_method == "gradient"

if batch_size == 0:
if auto_quantize_bits is not None or torch.distributed.is_initialized():
if is_gradient_based or torch.distributed.is_initialized():
raise ValueError("We dont support automatic batch size inference for this case.")

net = model.gpt2 if hasattr(model, "gpt2") else model.model
@@ -186,15 +217,23 @@
batch_size=batch_size,
num_samples=calib_size,
device=device,
include_labels=auto_quantize_bits is not None,
include_labels=is_gradient_based,
)

if test_generated:
input_str = tokenizer.decode(next(iter(calib_dataloader))["input_ids"][0])
generated_str_before_ptq = model.run(input_str)

_quantize_model_with_dataset(
model, quant_cfg, calib_dataloader, auto_quantize_bits, batch_size, compress
model,
quant_cfg,
calib_dataloader,
auto_quantize_bits,
auto_quantize_method,
auto_quantize_score_size,
batch_size,
compress,
auto_quantize_checkpoint,
)

if test_generated:
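For intuition about the new 'kl_div' path above: the sensitivity score is, conceptually, the KL divergence between the original and the quantized model's output distributions, which is why forward_step returns logits and no labels or custom loss_func are required. A conceptual sketch, not ModelOpt's internal implementation:

import torch
import torch.nn.functional as F

def kl_div_score(orig_logits: torch.Tensor, quant_logits: torch.Tensor) -> torch.Tensor:
    # KL(original || quantized) over the vocabulary dimension.
    return F.kl_div(
        F.log_softmax(quant_logits, dim=-1),
        F.softmax(orig_logits, dim=-1),
        reduction="batchmean",
    )

# Identical logits give (near) zero divergence.
logits = torch.randn(2, 8, 32000)
print(kl_div_score(logits, logits).item())  # ~0.0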