Commit 42d91cd

Author: Seppo Enarvi
Test that stopping and resuming won't make a difference in the final model
Parent: 51b9a06

File tree

2 files changed: +84 −53 lines


src/lightning/pytorch/callbacks/weight_averaging.py

Lines changed: 19 additions & 2 deletions
@@ -23,6 +23,7 @@
 import torch
 from torch import Tensor
 from torch.optim.swa_utils import AveragedModel
+from typing_extensions import override
 
 import lightning.pytorch as pl
 from lightning.pytorch.callbacks.callback import Callback
@@ -50,10 +51,15 @@ class WeightAveraging(Callback):
 
     def __init__(
         self,
-        device: Optional[Union[torch.device, int]] = torch.device("cpu"),
+        device: Optional[Union[torch.device, str, int]] = "cpu",
         avg_fn: Optional[Callable[[Tensor, Tensor, Union[Tensor, int]], Tensor]] = None,
     ):
-        self._device = device
+        # The default value is a string so that jsonargparse knows how to serialize it.
+        if isinstance(device, str):
+            self._device: Optional[Union[torch.device, int]] = torch.device(device)
+        else:
+            self._device = device
+
         self._avg_fn = avg_fn
         self._average_model: Optional[AveragedModel] = None
 
@@ -83,6 +89,7 @@ def should_update(self, step_idx: Optional[int] = None, epoch_idx: Optional[int]
         """
         return step_idx is not None
 
+    @override
     def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None:
         """Called when fit, validate, test, predict, or tune begins.
 
@@ -98,6 +105,7 @@ def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: s
         device = self._device or pl_module.device
         self._average_model = AveragedModel(model=pl_module, device=device, avg_fn=self._avg_fn, use_buffers=True)
 
+    @override
     def on_train_batch_end(
         self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", outputs: STEP_OUTPUT, batch: Any, batch_idx: int
     ) -> None:
@@ -121,6 +129,7 @@ def on_train_batch_end(
             self._average_model.update_parameters(pl_module)
             self._latest_update_step = trainer.global_step
 
+    @override
     def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
         """Called when a training epoch ends.
 
@@ -136,6 +145,7 @@ def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModu
             self._average_model.update_parameters(pl_module)
             self._latest_update_epoch = trainer.current_epoch
 
+    @override
     def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
         """Called when training ends.
 
@@ -147,8 +157,10 @@ def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -
 
         """
         assert self._average_model is not None
+        rank_zero_info("Loading the average model parameters to the final model.")
         self._copy_average_to_current(pl_module)
 
+    @override
     def on_validation_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
         """Called when a validation epoch begins.
 
@@ -163,6 +175,7 @@ def on_validation_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.Lightn
            rank_zero_info("Loading the average model parameters for validation.")
            self._swap_models(pl_module)
 
+    @override
     def on_validation_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
         """Called when a validation epoch ends.
 
@@ -177,6 +190,7 @@ def on_validation_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.Lightnin
            rank_zero_info("Recovering the current model parameters after validation.")
            self._swap_models(pl_module)
 
+    @override
     def state_dict(self) -> dict[str, Any]:
         """Called when saving a checkpoint.
 
@@ -188,6 +202,7 @@ def state_dict(self) -> dict[str, Any]:
         """
         return {"latest_update_step": self._latest_update_step}
 
+    @override
     def load_state_dict(self, state_dict: dict[str, Any]) -> None:
         """Called when loading a checkpoint.
 
@@ -199,6 +214,7 @@ def load_state_dict(self, state_dict: dict[str, Any]) -> None:
         """
         self._latest_update_step = state_dict["latest_update_step"]
 
+    @override
     def on_save_checkpoint(
         self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", checkpoint: dict[str, Any]
     ) -> None:
@@ -231,6 +247,7 @@ def on_save_checkpoint(
            name: value for name, value in average_model_state.items() if not name.startswith("module.")
        }
 
+    @override
     def on_load_checkpoint(
         self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", checkpoint: dict[str, Any]
     ) -> None:
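
Two things change throughout this file: the `device` argument now also accepts a plain string, which `__init__` normalizes to `torch.device` so that jsonargparse can serialize the default, and every hook is marked with `@override` from `typing_extensions` so type checkers can flag signatures that drift from the `Callback` base class. A minimal usage sketch (not part of the commit; it reuses the Boring* demo classes that the tests below import):

```python
from torch.optim.swa_utils import get_swa_avg_fn
from torch.utils.data import DataLoader

from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import WeightAveraging
from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset

# The device can now be given as a string; __init__ converts it to torch.device.
callback = WeightAveraging(device="cpu", avg_fn=get_swa_avg_fn())
trainer = Trainer(max_epochs=2, callbacks=[callback], logger=False)
trainer.fit(BoringModel(), DataLoader(RandomDataset(32, 32), batch_size=4))
```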

tests/tests_pytorch/callbacks/test_weight_averaging.py

Lines changed: 65 additions & 51 deletions
@@ -12,43 +12,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+from copy import deepcopy
 from pathlib import Path
 from typing import Any, Optional
 
 import pytest
 import torch
 from torch import Tensor, nn
 from torch.optim.swa_utils import get_swa_avg_fn
-from torch.utils.data import DataLoader
+from torch.utils.data import DataLoader, Dataset
 
 from lightning.pytorch import LightningModule, Trainer
 from lightning.pytorch.callbacks import WeightAveraging
 from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset
 from tests_pytorch.helpers.runif import RunIf
 
 
-class WeightAveragingTestModel(BoringModel):
-    def __init__(
-        self, batch_norm: bool = True, iterable_dataset: bool = False, crash_on_epoch: Optional[int] = None
-    ) -> None:
+class TestModel(BoringModel):
+    def __init__(self, batch_norm: bool = True) -> None:
         super().__init__()
         layers = [nn.Linear(32, 32)]
         if batch_norm:
             layers.append(nn.BatchNorm1d(32))
         layers += [nn.ReLU(), nn.Linear(32, 2)]
         self.layer = nn.Sequential(*layers)
-        self.iterable_dataset = iterable_dataset
-        self.crash_on_epoch = crash_on_epoch
+        self.crash_on_epoch = None
 
     def training_step(self, batch: Tensor, batch_idx: int) -> None:
         if self.crash_on_epoch and self.trainer.current_epoch >= self.crash_on_epoch:
-            raise Exception("CRASH TEST")
+            raise Exception("CRASH")
         return super().training_step(batch, batch_idx)
 
-    def train_dataloader(self) -> None:
-        dataset_class = RandomIterableDataset if self.iterable_dataset else RandomDataset
-        return DataLoader(dataset_class(32, 32), batch_size=4)
-
     def configure_optimizers(self) -> None:
         return torch.optim.SGD(self.layer.parameters(), lr=0.1)
 
@@ -194,95 +188,115 @@ def setup(self, trainer, pl_module, stage) -> None:
 @pytest.mark.parametrize("batch_norm", [True, False])
 @pytest.mark.parametrize("iterable_dataset", [True, False])
 def test_ema(tmp_path, batch_norm: bool, iterable_dataset: bool):
-    _train(tmp_path, EMATestCallback(), batch_norm=batch_norm, iterable_dataset=iterable_dataset)
+    model = TestModel(batch_norm=batch_norm)
+    dataset = RandomIterableDataset(32, 32) if iterable_dataset else RandomDataset(32, 32)
+    _train(model, dataset, tmp_path, EMATestCallback())
 
 
 @pytest.mark.parametrize(
     "accelerator", [pytest.param("gpu", marks=RunIf(min_cuda_gpus=1)), pytest.param("mps", marks=RunIf(mps=True))]
 )
 def test_ema_accelerator(tmp_path, accelerator):
-    _train(tmp_path, EMATestCallback(), accelerator=accelerator, devices=1)
+    model = TestModel()
+    dataset = RandomDataset(32, 32)
+    _train(model, dataset, tmp_path, EMATestCallback(), accelerator=accelerator, devices=1)
 
 
 @RunIf(min_cuda_gpus=2, standalone=True)
 def test_ema_ddp(tmp_path):
-    _train(tmp_path, EMATestCallback(devices=2), strategy="ddp", accelerator="gpu", devices=2)
+    model = TestModel()
+    dataset = RandomDataset(32, 32)
+    _train(model, dataset, tmp_path, EMATestCallback(devices=2), strategy="ddp", accelerator="gpu", devices=2)
 
 
 @RunIf(min_cuda_gpus=2)
 def test_ema_ddp_spawn(tmp_path):
-    _train(tmp_path, EMATestCallback(devices=2), strategy="ddp_spawn", accelerator="gpu", devices=2)
+    model = TestModel()
+    dataset = RandomDataset(32, 32)
+    _train(model, dataset, tmp_path, EMATestCallback(devices=2), strategy="ddp_spawn", accelerator="gpu", devices=2)
 
 
 @RunIf(skip_windows=True)
 def test_ema_ddp_spawn_cpu(tmp_path):
-    _train(tmp_path, EMATestCallback(devices=2), strategy="ddp_spawn", accelerator="cpu", devices=2)
+    model = TestModel()
+    dataset = RandomDataset(32, 32)
+    _train(model, dataset, tmp_path, EMATestCallback(devices=2), strategy="ddp_spawn", accelerator="cpu", devices=2)
 
 
-@pytest.mark.parametrize("crash_on_epoch", [1, 3])
+@pytest.mark.parametrize("crash_on_epoch", [1, 3, 5])
 def test_ema_resume(tmp_path, crash_on_epoch):
-    _train_and_resume(tmp_path, crash_on_epoch=crash_on_epoch)
+    dataset = RandomDataset(32, 32)
+    model1 = TestModel()
+    model2 = deepcopy(model1)
+
+    _train(model1, dataset, tmp_path, EMATestCallback())
+
+    model2.crash_on_epoch = crash_on_epoch
+    model2 = _train_and_resume(model2, dataset, tmp_path)
+
+    for param1, param2 in zip(model1.parameters(), model2.parameters()):
+        assert torch.allclose(param1, param2, atol=0.001)
 
 
 @RunIf(skip_windows=True)
 def test_ema_resume_ddp(tmp_path):
-    _train_and_resume(tmp_path, crash_on_epoch=3, use_ddp=True)
+    model = TestModel()
+    model.crash_on_epoch = 3
+    dataset = RandomDataset(32, 32)
+    _train_and_resume(model, dataset, tmp_path, strategy="ddp_spawn", devices=2)
 
 
 def test_swa(tmp_path):
-    _train(tmp_path, SWATestCallback())
+    model = TestModel()
+    dataset = RandomDataset(32, 32)
+    _train(model, dataset, tmp_path, SWATestCallback())
 
 
 def _train(
+    model: TestModel,
+    dataset: Dataset,
     tmp_path: str,
     callback: WeightAveraging,
-    batch_norm: bool = True,
     strategy: str = "auto",
     accelerator: str = "cpu",
     devices: int = 1,
-    iterable_dataset: bool = False,
     checkpoint_path: Optional[str] = None,
-    crash_on_epoch: Optional[int] = None,
-) -> None:
+    will_crash: bool = False,
+) -> TestModel:
+    deterministic = accelerator == "cpu"
     trainer = Trainer(
-        default_root_dir=tmp_path,
-        enable_progress_bar=False,
-        enable_model_summary=False,
+        accelerator=accelerator,
+        strategy=strategy,
+        devices=devices,
         logger=False,
+        callbacks=callback,
        max_epochs=8,
        num_sanity_val_steps=0,
-        callbacks=callback,
+        enable_checkpointing=will_crash,
+        enable_progress_bar=False,
+        enable_model_summary=False,
        accumulate_grad_batches=2,
-        strategy=strategy,
-        accelerator=accelerator,
-        devices=devices,
-    )
-    model = WeightAveragingTestModel(
-        batch_norm=batch_norm, iterable_dataset=iterable_dataset, crash_on_epoch=crash_on_epoch
+        deterministic=deterministic,
+        default_root_dir=tmp_path,
     )
-
-    if crash_on_epoch is None:
-        trainer.fit(model, ckpt_path=checkpoint_path)
+    dataloader = DataLoader(dataset, batch_size=4, shuffle=False)
+    if will_crash:
+        with pytest.raises(Exception, match="CRASH"):
+            trainer.fit(model, dataloader, ckpt_path=checkpoint_path)
     else:
-        with pytest.raises(Exception, match="CRASH TEST"):
-            trainer.fit(model, ckpt_path=checkpoint_path)
-
+        trainer.fit(model, dataloader, ckpt_path=checkpoint_path)
    assert trainer.lightning_module == model
 
 
-def _train_and_resume(tmp_path: str, crash_on_epoch: int, use_ddp: bool = False) -> None:
-    strategy = "ddp_spawn" if use_ddp else "auto"
-    devices = 2 if use_ddp else 1
-
-    _train(
-        tmp_path, EMATestCallback(devices=devices), strategy=strategy, devices=devices, crash_on_epoch=crash_on_epoch
-    )
+def _train_and_resume(model: TestModel, dataset: Dataset, tmp_path: str, devices: int = 1, **kwargs) -> TestModel:
+    _train(model, dataset, tmp_path, EMATestCallback(devices=devices), devices=devices, will_crash=True, **kwargs)
 
     checkpoint_dir = Path(tmp_path) / "checkpoints"
     checkpoint_names = os.listdir(checkpoint_dir)
     assert len(checkpoint_names) == 1
     checkpoint_path = str(checkpoint_dir / checkpoint_names[0])
 
-    _train(
-        tmp_path, EMATestCallback(devices=devices), strategy=strategy, devices=devices, checkpoint_path=checkpoint_path
-    )
+    model = TestModel.load_from_checkpoint(checkpoint_path)
+    callback = EMATestCallback(devices=devices)
+    _train(model, dataset, tmp_path, callback, devices=devices, checkpoint_path=checkpoint_path, **kwargs)
+    return model
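
The reworked `test_ema_resume` carries the point of the commit: two models start from identical weights via `deepcopy`, one trains straight through while the other crashes mid-training and resumes from its checkpoint, and their final parameters must agree elementwise. A self-contained sketch of that comparison idiom (the helper name below is illustrative, not from the test file):

```python
from copy import deepcopy

import torch
from torch import nn


def assert_params_close(reference: nn.Module, candidate: nn.Module, atol: float = 1e-3) -> None:
    # Mirrors the test's final loop: every corresponding parameter pair
    # must agree within the tolerance, otherwise resuming changed the result.
    for p1, p2 in zip(reference.parameters(), candidate.parameters()):
        assert torch.allclose(p1, p2, atol=atol)


model1 = nn.Linear(32, 2)
model2 = deepcopy(model1)  # identical starting weights, as in the test
assert_params_close(model1, model2)  # passes: no divergent training yet
```

The comparison is only meaningful because both runs are reproducible: `_train` sets `deterministic=True` on CPU, disables dataloader shuffling, and enables checkpointing only for the run that is expected to crash.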
