updated docs; code clean up

realAsma · realAsma · commit e38a5516101d · 2025-11-21T08:36:29.000-08:00
Signed-off-by: realAsma <akuriparambi@nvidia.com>

clean ups

Signed-off-by: realAsma <akuriparambi@nvidia.com>
diff --git a/modelopt/torch/opt/hparam.py b/modelopt/torch/opt/hparam.py
@@ -48,7 +48,7 @@ def __eq__(self, other) -> bool:
 class Hparam:
     """A base hyperparameter of a DynamicModule.
 
-    An example of such a Hparam could be an hparam with identity dependencies.
+    Keeps track of hyperparameter values and their importance, which can be used for search algorithms.
     """
 
     Importance = Union[torch.Tensor, None]  # noqa: UP007
diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py
@@ -170,12 +170,12 @@ def fold_pqs_to_weights(model):
 class QuantRecipeHparam(Hparam):
     """An Hparam for quantization recipes.
 
-    In addition, this Hparam also:
+    See :class:`Hparam <modelopt.torch.opt.hparam.Hparam>` for more details. In addition, this Hparam also:
 
-    * Keeps a link to its quant_modules and score_modules and sets the quantizers for the
-      quant_modules based on the active recipe.
+    * Keeps a link to its ``quant_modules`` and ``score_modules`` and sets the quantizers for the
+      ``quant_modules`` based on the active recipe.
     * Keeps track of the importance of each recipe in a dict instead of a tensor.
-    * Registers itself with each score_module via the _hparams_for_scoring attribute.
+    * Registers itself with each ``score_module`` via the ``_hparams_for_scoring`` attribute.
     """
 
     def __init__(
@@ -271,8 +271,14 @@ def attrs(self) -> list[str]:
         """Return the attributes of the hparam for repr."""
         return ["name", *super().attrs]
 
+
 class _AutoQuantizeBaseSearcher(BaseSearcher, ABC):
-    """A base searcher for AutoQuantize algorithm."""
+    """Base searcher for AutoQuantize algorithm."""
+
+    # This searcher finds optimal per-layer quantization by searching across quantization formats
+    # for each quantizable module (quant module). Optionally, quant grouping rules can restrict
+    # certain modules to share the same format. Sensitivity scores are computed from perturbations
+    # at score modules. See AutoQuantizeGradientSearcher for detailed documentation.
 
     candidate_stats: dict[str, dict[str, list[float]]]
     best: dict[str, Any]
@@ -383,25 +389,26 @@ def _apply_score_group_rule(self, name: str, rule) -> str | None:
         return None
 
     def _get_score_module_from_name(
-        self, model: nn.Module, score_module_name: str, fallback_module: nn.Module
+        self, model: nn.Module, score_module_name: str, quant_module: nn.Module
     ) -> nn.Module:
         """Get the actual score module object from its name.
 
         Args:
             model: The model containing all modules
             score_module_name: The name of the score module to retrieve
-            fallback_module: The fallback module to use if score_module_name doesn't exist (typically the quant module)
+            quant_module: The quantized module for which the score is estimated
 
         Returns:
-            The score module object, or fallback_module if not found
+            The score module object, or the quantized module itself if the score module is not found
         """
         try:
             score_module = model.get_submodule(score_module_name)
             return score_module
         except AttributeError:
-            # If score module doesn't exist, fall back to the provided fallback module
-            # This shouldn't happen with valid rules, but provide a safe fallback
-            return fallback_module
+            warnings.warn(
+                f"Score module '{score_module_name}' not found. Score will be estimated from the quantized module."
+            )
+            return quant_module
 
     def insert_hparams_after_merge_rules(self, model, quant_recipes, disabled_layers=None):
         """Restrict the search space using the merge rules and insert the hparams for the model."""
@@ -459,20 +466,12 @@ def insert_hparams_after_merge_rules(self, model, quant_recipes, disabled_layers
             disabled = any(disabled for _, _, disabled, _ in module_info_list)
             score_modules = [score_module for _, _, _, score_module in module_info_list]
 
-            hparam = (
-                QuantRecipeHparam(
-                    None,
-                    quant_modules=quant_modules,
-                    score_modules=score_modules,
-                    name=str(group_key),
-                )
-                if disabled
-                else QuantRecipeHparam(
-                    quant_recipes,
-                    quant_modules=quant_modules,
-                    score_modules=score_modules,
-                    name=str(group_key),
-                )
+            group_recipes = None if disabled else quant_recipes
+            hparam = QuantRecipeHparam(
+                group_recipes,
+                quant_modules=quant_modules,
+                score_modules=score_modules,
+                name=str(group_key),
             )
 
             for module in quant_modules:
@@ -495,8 +494,8 @@ def _verify_constraint(self, search_recipes):
         )
 
     @abstractmethod
-    def estimate_sensitivity_scores(self):
-        """Estimate the sensitivity scores for the model."""
+    def estimate_sensitivity_scores(self) -> None:
+        """Estimate sensitivity scores and track them with Hparam."""
 
     def _run_func(self, func, num_iters=1, desc=""):
         for i, data in tqdm(
@@ -656,8 +655,6 @@ def run_search(self):
         QuantRecipe.fold_pqs_to_weights(self.model)
 
 
-
-
 @torch.compile
 def _get_auto_quantize_score(grad_output, output_diff):
     return ((grad_output.float() ** 2) * (output_diff.float() ** 2)).sum()
@@ -675,13 +672,29 @@ class AutoQuantizeGradientSearcher(_AutoQuantizeBaseSearcher):
     scores while meeting the specified constraint. AutoQuantize uses Linear Programming Solver to find the
     optimal quantization configuration.
 
-    The auto_quantize score for a layer quantization configuration is an approximation of model loss change change due
+    The auto_quantize score for a layer quantization configuration is an approximation of model loss change due
     to quantizing the particular layer with the particular configuration.
     The approximation is based on taylor expansion of the loss function wrt to the quantized output of the layer and
     substitution of Fisher information for Hessian.
     This approximation is mathematically correct for models where the loss
     is a log likelihood loss such as BERT, GPT, etc. However, the auto_quantize score can still be used as a proxy
     for other models such as ResNet.
+
+    **Quant Modules:**
+
+    This searcher operates on quantizable modules (quant modules), which are typically Linear or Conv layers
+    that support quantization. Optionally, grouping rules can be applied to ensure certain layers share the same
+    quantization format (e.g., Q, K, V projections in the same attention layer). For details on quant_grouping_rules
+    and customization, see the :meth:`auto_quantize <modelopt.torch.quantization.model_quant.auto_quantize>`
+    API documentation.
+
+    **Score Modules:**
+
+    By default, for each quant module, its sensitivity score is estimated using that module's output perturbation.
+    However, the sensitivity can also be estimated by looking at perturbation at a separate point in the neural
+    network (score module). This is helpful in some cases such as MoEs for speed and lower memory consumption.
+    Since all experts are already restricted to the same quant format by quant grouping rules, their sensitivity
+    can be estimated together at a single point (e.g., the MLP output level).
     """
 
     score_module_rules = [
@@ -872,8 +885,8 @@ def cleanup_params_after_score_estimation(name, param, params_metadata):
         del params_metadata
         gc.collect()
 
-    def estimate_sensitivity_scores(self):
-        """Estimate the sensitivity scores for the model."""
+    def estimate_sensitivity_scores(self) -> None:
+        """Estimate sensitivity scores using Hessian approximation."""
         self.model.eval()
 
         def _default_is_param_grad_enabled(pname, model):