
Commit 51b9a06

Author: Seppo Enarvi (committed)
The user can customize WeightAveraging updates by overriding the should_update() method
1 parent 075bfcf commit 51b9a06
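
With this change, update scheduling moves from constructor arguments to a subclass override. A minimal sketch of the new usage (the import path follows the file location in this commit; the class name and epoch numbers are only illustrative, mirroring the test callback below):

    from typing import Optional

    from lightning.pytorch.callbacks.weight_averaging import WeightAveraging


    class EpochAveraging(WeightAveraging):
        """Update the average model only after selected epochs."""

        def should_update(self, step_idx: Optional[int] = None, epoch_idx: Optional[int] = None) -> bool:
            # epoch_idx is set when called at the epoch end; step_idx is set
            # when called after an optimizer step.
            return epoch_idx in (3, 5, 7)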

File tree

2 files changed: +54 −50 lines

src/lightning/pytorch/callbacks/weight_averaging.py

Lines changed: 50 additions & 46 deletions
@@ -30,21 +30,11 @@
 from lightning.pytorch.utilities.types import STEP_OUTPUT
 
 
-def _return_true(x: int) -> bool:
-    return True
-
-
-def _return_false(x: int) -> bool:
-    return False
-
-
 class WeightAveraging(Callback):
     r"""A callback that updates an averaged model for Stochastic Weight Averaging (SWA) or Exponential Moving Average
     (EMA) after each training step.
 
-    The user should provide either `update_on_step` or `update_on_epoch`, a function that determines when the average
-    model should be updated. If neither function is provided, the average model will be updated after every optimizer
-    step.
+    The user can customize when the average model is updated by overriding the ``should_update()`` method.
 
     During validation and after the training finishes, the current model parameters will be replaced with the averaged
     values.
@@ -55,40 +45,44 @@ class WeightAveraging(Callback):
         avg_fn: The averaging function used to update the parameters. The function must take in an
             :class:`AveragedModel` parameter, a current model parameter, and the number of models already averaged. If
             ``None``, an equally weighted average will be used.
-        update_on_step: A function that takes the number of optimizer steps taken, and returns ``True`` if the average
-            model should be updated.
-        update_on_epoch: A function that takes the zero-based epoch number, and returns ``True`` if the average model
-            should be updated.
 
     """
 
     def __init__(
         self,
         device: Optional[Union[torch.device, int]] = torch.device("cpu"),
         avg_fn: Optional[Callable[[Tensor, Tensor, Union[Tensor, int]], Tensor]] = None,
-        update_on_step: Optional[Callable[[int], bool]] = None,
-        update_on_epoch: Optional[Callable[[int], bool]] = None,
     ):
         self._device = device
         self._avg_fn = avg_fn
-
-        if (update_on_step is None) and (update_on_epoch is None):
-            self._update_on_step: Callable[[int], bool] = _return_true
-            self._update_on_epoch: Callable[[int], bool] = _return_false
-        else:
-            self._update_on_step = _return_false if update_on_step is None else update_on_step
-            self._update_on_epoch = _return_false if update_on_epoch is None else update_on_epoch
-
         self._average_model: Optional[AveragedModel] = None
 
         # Number of optimizer steps taken, when the average model was last updated. Initializing this with zero ensures
-        # that the average model will be first updated after the first optimizer step, which takes place after N batches
-        # when using accumulate_grad_batches=N.
+        # that self.should_update() will be first called after the first optimizer step, which takes place after N
+        # batches when using accumulate_grad_batches=N.
         self._latest_update_step = 0
         # The epoch after which the average model was last updated. The first epoch is 0, so initializing this to a
-        # negative value means that if update_on_step(0) returns True, the first update is after the first epoch.
+        # negative value means that if self.should_update(epoch_idx=0) returns True, the first update is after the first
+        # epoch.
         self._latest_update_epoch = -1
 
+    def should_update(self, step_idx: Optional[int] = None, epoch_idx: Optional[int] = None) -> bool:
+        """Called after every optimizer step and after every training epoch to check whether the average model should
+        be updated.
+
+        One of the arguments is set to the zero-based index of the last training step or epoch. The user can customize
+        when the average model gets updated by overriding this method.
+
+        Args:
+            step_idx: Index of the last optimizer step, or ``None`` when called at the epoch end.
+            epoch_idx: Index of the last epoch, or ``None`` when called after an optimizer step.
+
+        Returns:
+            ``True`` if the average model should be updated and ``False`` if not.
+
+        """
+        return step_idx is not None
+
     def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None:
         """Called when fit, validate, test, predict, or tune begins.
 
@@ -109,7 +103,7 @@ def on_train_batch_end(
     ) -> None:
         """Called when a training batch ends.
 
-        Updates the :class:`AveragedModel` parameters, if requested by ``update_on_step()``.
+        Updates the :class:`AveragedModel` parameters, if requested by ``self.should_update()``.
 
         Args:
             trainer: The current :class:`~lightning.pytorch.trainer.trainer.Trainer` instance.
@@ -119,22 +113,25 @@ def on_train_batch_end(
             batch_idx: Index of the training batch.
 
         """
-        if self._update_on_step(trainer.global_step) and (trainer.global_step > self._latest_update_step):
+        # trainer.global_step is the number of optimizer steps taken so far, i.e. 1 after the first optimizer step. To
+        # make step_idx consistent with epoch_idx, we'll pass a zero-based index.
+        step_idx = trainer.global_step - 1
+        if (trainer.global_step > self._latest_update_step) and self.should_update(step_idx=step_idx):
             assert self._average_model is not None
             self._average_model.update_parameters(pl_module)
             self._latest_update_step = trainer.global_step
 
     def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
         """Called when a training epoch ends.
 
-        Updates the :class:`AveragedModel` parameters, if requested by ``update_on_epoch()``.
+        Updates the :class:`AveragedModel` parameters, if requested by ``self.should_update()``.
 
         Args:
             trainer: The current :class:`~lightning.pytorch.trainer.trainer.Trainer` instance.
             pl_module: The current :class:`~lightning.pytorch.core.LightningModule` instance.
 
         """
-        if self._update_on_epoch(trainer.current_epoch) and (trainer.current_epoch > self._latest_update_epoch):
+        if (trainer.current_epoch > self._latest_update_epoch) and self.should_update(epoch_idx=trainer.current_epoch):
             assert self._average_model is not None
             self._average_model.update_parameters(pl_module)
             self._latest_update_epoch = trainer.current_epoch
@@ -218,17 +215,21 @@ def on_save_checkpoint(
 
         """
         if self._average_model is None:
-            raise Exception("Trying to save a checkpoint, but no average model (outside fit). Don't know what to do.")
-
-        rank_zero_info("The average model parameters will be saved to the state_dict in the checkpoint.")
-        average_model_state = self._average_model.state_dict()
-        checkpoint["current_model_state"] = checkpoint["state_dict"]
-        checkpoint["state_dict"] = {
-            name[7:]: value for name, value in average_model_state.items() if name.startswith("module.")
-        }
-        checkpoint["averaging_state"] = {
-            name: value for name, value in average_model_state.items() if not name.startswith("module.")
-        }
+            rank_zero_info(
+                "You're using the WeightAveraging callback, but saving a checkpoint outside the 'fit' stage. The state "
+                "of the WeightAveraging callback won't be saved in the checkpoint. If training has finished, the "
+                "average model parameters will be saved to the state_dict in the checkpoint."
+            )
+        else:
+            rank_zero_info("The average model parameters will be saved to the state_dict in the checkpoint.")
+            average_model_state = self._average_model.state_dict()
+            checkpoint["current_model_state"] = checkpoint["state_dict"]
+            checkpoint["state_dict"] = {
+                name[7:]: value for name, value in average_model_state.items() if name.startswith("module.")
+            }
+            checkpoint["averaging_state"] = {
+                name: value for name, value in average_model_state.items() if not name.startswith("module.")
+            }
 
     def on_load_checkpoint(
         self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", checkpoint: dict[str, Any]
@@ -244,9 +245,12 @@ def on_load_checkpoint(
 
         """
         if self._average_model is None:
-            raise Exception("Trying to load a checkpoint, but no average model (outside fit). Don't know what to do.")
-
-        if ("current_model_state" in checkpoint) and ("averaging_state" in checkpoint):
+            rank_zero_warn(
+                "You're using the WeightAveraging callback, but loading a checkpoint outside the 'fit' stage. The "
+                "WeightAveraging state cannot be restored. If you're using the checkpoint for prediction or testing, "
+                "you can ignore this warning. To disable the warning, remove the WeightAveraging callback."
+            )
+        elif ("current_model_state" in checkpoint) and ("averaging_state" in checkpoint):
             rank_zero_info("Found current_model_state in the checkpoint. This will be used to initialize the model.")
             average_model_state = {"module." + name: value for name, value in checkpoint["state_dict"].items()}
             average_model_state |= checkpoint["averaging_state"]
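
Because on_train_batch_end now passes a zero-based step_idx (trainer.global_step - 1), a subclass that averages every N optimizer steps keys off that index. A short sketch, assuming an arbitrary interval of 10 steps and a hypothetical class name, neither of which is part of this commit:

    from typing import Optional

    from lightning.pytorch.callbacks.weight_averaging import WeightAveraging


    class IntervalAveraging(WeightAveraging):
        """Update the average model on every tenth optimizer step."""

        def should_update(self, step_idx: Optional[int] = None, epoch_idx: Optional[int] = None) -> bool:
            # step_idx is zero-based, so indices 9, 19, 29, ... correspond to
            # the 10th, 20th, 30th optimizer steps.
            return step_idx is not None and (step_idx + 1) % 10 == 0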

tests/tests_pytorch/callbacks/test_weight_averaging.py

Lines changed: 4 additions & 4 deletions
@@ -119,15 +119,15 @@ def on_train_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
 
 class SWATestCallback(WeightAveraging):
     def __init__(self, **kwargs: Any) -> None:
-        avg_fn = get_swa_avg_fn()
-        update_on_epoch = lambda x: x in (3, 5, 7)
-        super().__init__(avg_fn=avg_fn, update_on_epoch=update_on_epoch, **kwargs)
-
+        super().__init__(avg_fn=get_swa_avg_fn(), **kwargs)
         self.swap_calls = 0
         self.copy_calls = 0
         # Record the first epoch, as if we are resuming from a checkpoint this may not be equal to 0.
         self.first_epoch: Optional[int] = None
 
+    def should_update(self, step_idx: Optional[int] = None, epoch_idx: Optional[int] = None) -> bool:
+        return epoch_idx in (3, 5, 7)
+
     def _swap_models(self, *args: Any, **kwargs: Any):
         self.swap_calls += 1
         return super()._swap_models(*args, **kwargs)
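
The test mirrors how user code adopts the new hook: subclass WeightAveraging, override should_update(), and attach the callback to the Trainer as before. A usage sketch, where model and datamodule stand in for the user's own objects:

    import lightning.pytorch as pl

    from lightning.pytorch.callbacks.weight_averaging import WeightAveraging

    # The default should_update() returns True whenever step_idx is given,
    # i.e. the average model is updated after every optimizer step.
    trainer = pl.Trainer(max_epochs=10, callbacks=[WeightAveraging()])
    # trainer.fit(model, datamodule=datamodule)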
