
Commit 42c7f27

awaelchli, pre-commit-ci[bot], and carmocca authored
refactor checkpoint loading for training type plugins (#7928)
* plugin loading logic
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* integrate loading for test
* fix
* fix
* unused iport

* Update pytorch_lightning/trainer/connectors/checkpoint_connector.py

Co-authored-by: Carlos Mocholí <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Carlos Mocholí <[email protected]>
1 parent ac4eb0a commit 42c7f27

File tree: 4 files changed (+58, -72 lines changed)

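Taken together, the diff replaces the single `restore_model_state_from_ckpt_path` hook with three narrower hooks on the training type plugin: `load_checkpoint_file`, `load_model_state_dict`, and `load_optimizer_state_dict`. The following is a minimal sketch of the resulting call order, condensed from the changed files below; it is not the actual Lightning source, and the real CheckpointConnector performs additional bookkeeping that is omitted here.

# Condensed sketch of the resume flow after this refactor.
# `plugin` stands for trainer.training_type_plugin, `module` for the LightningModule.
def sketch_resume(plugin, module, checkpoint_path):
    # resume_start(): the plugin decides how the raw checkpoint file is read
    checkpoint = plugin.load_checkpoint_file(checkpoint_path)

    # restore_model_state(): user hooks still see the loaded dict first ...
    module.on_load_checkpoint(checkpoint)

    # ... then the plugin decides how the model weights are restored
    plugin.load_model_state_dict(checkpoint)

    # optimizer state restoration can likewise be delegated through
    # plugin.load_optimizer_state_dict(checkpoint) (a no-op for DeepSpeed)
    return checkpoint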

pytorch_lightning/plugins/training_type/deepspeed.py

Lines changed: 26 additions & 29 deletions
@@ -18,7 +18,7 @@
 from collections import OrderedDict
 from pathlib import Path
 from types import SimpleNamespace
-from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, Tuple, Union

 import torch

@@ -524,37 +524,34 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None:
         else:
             super().save_checkpoint(checkpoint, filepath)

-    def restore_model_state_from_ckpt_path(
-        self,
-        ckpt_path: str,
-        map_location: Callable = lambda storage, loc: storage,
-    ) -> Tuple[Dict, bool]:
-        if not self.save_full_weights and self.world_size > 1:
-            # Rely on deepspeed to load the checkpoint and necessary information
-            from pytorch_lightning.trainer.states import TrainerFn
-            is_fitting = self.lightning_module.trainer.state.fn == TrainerFn.FITTING
-            save_dir = self._filepath_to_dir(ckpt_path)
-
-            if self.zero_stage_3:
-                # TODO: Currently required as this call is missing within the deepspeed engine.
-                self.deepspeed_engine.optimizer._partition_all_parameters()
-
-            _, client_state = self.deepspeed_engine.load_checkpoint(
-                save_dir, load_optimizer_states=is_fitting, load_lr_scheduler_states=is_fitting
-            )
+    def load_checkpoint_file(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]:
+        if self.save_full_weights or self.world_size == 1:
+            # Broadcast to ensure we load from the rank 0 checkpoint
+            # This doesn't have to be the case when using deepspeed sharded checkpointing
+            checkpoint_path = self.broadcast(checkpoint_path)
+            return super().load_checkpoint_file(checkpoint_path)
+
+        # Rely on deepspeed to load the checkpoint and necessary information
+        from pytorch_lightning.trainer.states import TrainerFn
+        is_fitting = self.lightning_module.trainer.state.fn == TrainerFn.FITTING
+        save_dir = self._filepath_to_dir(checkpoint_path)

-            # restore datamodule states
-            if self.lightning_module.trainer.datamodule is not None:
-                self.lightning_module.trainer.datamodule.on_load_checkpoint(client_state)
+        if self.zero_stage_3:
+            # TODO: Currently required as this call is missing within the deepspeed engine.
+            self.deepspeed_engine.optimizer._partition_all_parameters()
+
+        _, client_state = self.deepspeed_engine.load_checkpoint(
+            save_dir, load_optimizer_states=is_fitting, load_lr_scheduler_states=is_fitting
+        )
+        return client_state

-            # hook: give user access to checkpoint if needed.
-            self.lightning_module.on_load_checkpoint(client_state)
-            return client_state, False
+    def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None:
+        # override to do nothing, deepspeed engine already loaded the weights in `load_checkpoint_file()`
+        pass

-        # Broadcast to ensure we load from the rank 0 checkpoint
-        # This doesn't have to be the case when using deepspeed sharded checkpointing
-        ckpt_path = self.broadcast(ckpt_path)
-        return super().restore_model_state_from_ckpt_path(ckpt_path, map_location=map_location)
+    def load_optimizer_state_dict(self, checkpoint: Mapping[str, Any]) -> None:
+        # override to do nothing, deepspeed engine already loaded the states in `load_checkpoint_file()`
+        pass

     def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int:
         if self._original_accumulate_grad_batches is None:
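In short, the DeepSpeed plugin now only overrides what it handles differently: consolidated ("full weights") or single-process checkpoints still go through the base-class path after a rank-0 broadcast, while sharded checkpoints are handed to the DeepSpeed engine, and the model/optimizer state hooks become no-ops because the engine has already restored those states. Below is a hedged usage sketch, assuming the DeepSpeedPlugin arguments and Trainer flags available around Lightning 1.3; the path and settings are placeholders.

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin

# save_full_weights=False keeps per-rank shards, so resuming takes the
# deepspeed_engine.load_checkpoint() branch shown in the diff above;
# save_full_weights=True would take the broadcast + super() branch instead.
trainer = Trainer(
    gpus=2,
    precision=16,
    plugins=DeepSpeedPlugin(stage=3, save_full_weights=False),
    resume_from_checkpoint="checkpoints/last.ckpt",  # placeholder path
)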

pytorch_lightning/plugins/training_type/training_type_plugin.py

Lines changed: 13 additions & 28 deletions
@@ -13,7 +13,8 @@
 # limitations under the License.
 import contextlib
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generator, Iterable, Optional, Tuple, TypeVar, Union
+from pathlib import Path
+from typing import Any, Callable, Dict, Generator, Iterable, Mapping, Optional, TypeVar, Union

 import torch
 from torch import Tensor

@@ -148,6 +149,17 @@ def results(self) -> Optional[Union[_EVALUATE_OUTPUT, _PREDICT_OUTPUT]]:
     def rpc_enabled(self) -> bool:
         return False

+    def load_checkpoint_file(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]:
+        return pl_load(checkpoint_path, map_location=(lambda storage, loc: storage))
+
+    def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None:
+        self.lightning_module.load_state_dict(checkpoint["state_dict"])
+
+    def load_optimizer_state_dict(self, checkpoint: Mapping[str, Any]) -> None:
+        optimizer_states = checkpoint["optimizer_states"]
+        for optimizer, opt_state in zip(self.lightning_module.trainer.accelerator.optimizers, optimizer_states):
+            optimizer.load_state_dict(opt_state)
+
     def start_training(self, trainer: 'pl.Trainer') -> None:
         # double dispatch to initiate the training loop
         self._results = trainer.run_stage()

@@ -227,33 +239,6 @@ def setup_optimizers_in_pre_dispatch(self) -> bool:
         """
         return False

-    def restore_model_state_from_ckpt_path(
-        self,
-        ckpt_path: str,
-        map_location: Callable = lambda storage, loc: storage,
-    ) -> Tuple[Dict, bool]:
-        """
-        This function is used to load and restore the model state.
-
-        Args:
-            ckpt_path: Path to a checkpoint
-            map_location: lambda function to map checkpoint location
-
-        Return
-            checkpoint: Return loaded checkpoint
-            bool: Wether to load optimizer / lr_schedulers states from checkpoint
-
-        """
-        ckpt = pl_load(ckpt_path, map_location=map_location)
-        # restore datamodule states
-        if self.lightning_module.trainer.datamodule is not None:
-            self.lightning_module.trainer.datamodule.on_load_checkpoint(ckpt)
-
-        # hook: give user access to checkpoint if needed.
-        self.lightning_module.on_load_checkpoint(ckpt)
-        self.lightning_module.load_state_dict(ckpt['state_dict'])
-        return ckpt, True
-
     def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int:
         """
         Provide a hook to count optimizer step calls.
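Because the base class now splits file reading, weight restoration, and optimizer restoration into separate hooks, a custom plugin can override just the step it needs. A minimal, hypothetical sketch follows; the class is not part of this commit, and `_fetch_decrypted` is an assumed helper, not a Lightning API.

from pathlib import Path
from typing import Any, Dict, Union

import torch

from pytorch_lightning.plugins.training_type import SingleDevicePlugin


class EncryptedCheckpointPlugin(SingleDevicePlugin):
    """Hypothetical plugin that only customises how the checkpoint file is read."""

    def load_checkpoint_file(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]:
        # assumed helper that downloads/decrypts the file and returns a local path
        local_path = self._fetch_decrypted(checkpoint_path)
        return torch.load(local_path, map_location=(lambda storage, loc: storage))

    # load_model_state_dict() and load_optimizer_state_dict() are inherited from
    # TrainingTypePlugin, so "state_dict" and "optimizer_states" are restored as usual.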

pytorch_lightning/trainer/connectors/checkpoint_connector.py

Lines changed: 18 additions & 12 deletions
@@ -38,10 +38,7 @@ def __init__(self, trainer, resume_from_checkpoint: Optional[Union[str, Path]] =
         self.resume_checkpoint_path = resume_from_checkpoint
         # used to validate checkpointing logic
         self.has_trained = False
-
         self._loaded_checkpoint = dict()
-        # FIXME: remove in https://github.com/PyTorchLightning/pytorch-lightning/pull/7652
-        self._load_optimizer_states = True

     @property
     def hpc_resume_path(self) -> Optional[str]:

@@ -76,11 +73,7 @@ def resume_start(self) -> None:
             raise FileNotFoundError(f"Checkpoint at {checkpoint_path} not found. Aborting training.")

         rank_zero_info(f"Restoring states from the checkpoint file at {checkpoint_path}")
-        checkpoint, load_optimizer_states = self.trainer.training_type_plugin.restore_model_state_from_ckpt_path(
-            checkpoint_path, map_location=lambda storage, loc: storage
-        )
-        self._loaded_checkpoint = checkpoint
-        self._load_optimizer_states = load_optimizer_states
+        self._loaded_checkpoint = self.trainer.training_type_plugin.load_checkpoint_file(checkpoint_path)

     def resume_end(self) -> None:
         """ Signal the connector that all states have resumed and memory for the checkpoint object can be released. """

@@ -110,6 +103,8 @@ def restore(self, checkpoint_path: Optional[Union[Path, str]] = None) -> bool:
         self.resume_start()
         model = self.trainer.lightning_module

+        self.restore_model_state(model, self._loaded_checkpoint)
+
         if self.trainer._device_type == DeviceType.GPU:
             model.cuda(self.trainer.root_gpu)

@@ -124,6 +119,8 @@ def restore_model_state(self, model: LightningModule, checkpoint) -> None:
         """
         Restore model states from a 'PyTorch-Lightning checkpoint' dictionary object
         """
+        if not checkpoint:
+            return

         # restore datamodule states
         if self.trainer.datamodule is not None:

@@ -133,7 +130,16 @@ def restore_model_state(self, model: LightningModule, checkpoint) -> None:
         model.on_load_checkpoint(checkpoint)

         # restore model state_dict
-        model.load_state_dict(checkpoint['state_dict'])
+        self.trainer.training_type_plugin.load_model_state_dict(checkpoint)
+
+    def restore_model_weights(self, checkpoint_path: Optional[Union[str, Path]]) -> None:
+        """ Restore only the model weights. """
+        checkpoint = self._loaded_checkpoint
+        if checkpoint_path is not None:
+            checkpoint = self.trainer.training_type_plugin.load_checkpoint_file(checkpoint_path)
+
+        self.trainer.lightning_module.on_load_checkpoint(checkpoint)
+        self.trainer.training_type_plugin.load_model_state_dict(checkpoint)

     def restore_training_state(self, checkpoint: Dict[str, Any]) -> None:
         """

@@ -199,7 +205,7 @@ def restore_progress(self) -> None:

     def restore_optimizers_and_schedulers(self) -> None:
         """ Restores the optimizers and learning rate scheduler states from the pre-loaded checkpoint. """
-        if not self._load_optimizer_states or not self._loaded_checkpoint:
+        if not self._loaded_checkpoint:
             return

         # validation

@@ -213,7 +219,7 @@ def restore_optimizers_and_schedulers(self) -> None:

     def restore_optimizers(self) -> None:
         """ Restores the optimizer states from the pre-loaded checkpoint. """
-        if not self._load_optimizer_states or not self._loaded_checkpoint:
+        if not self._loaded_checkpoint:
             return

         # restore the optimizers

@@ -231,7 +237,7 @@ def restore_optimizers(self) -> None:

     def restore_lr_schedulers(self) -> None:
         """ Restores the learning rate scheduler states from the pre-loaded checkpoint. """
-        if not self._load_optimizer_states or not self._loaded_checkpoint:
+        if not self._loaded_checkpoint:
             return

         # restore the lr schedulers
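Note that the user-facing hooks are unchanged by the refactor: `restore_model_state` still calls the datamodule and LightningModule `on_load_checkpoint` hooks before delegating the actual weight loading to the plugin. A minimal sketch of where user code still plugs in (the module below is illustrative only, not part of the commit):

import pytorch_lightning as pl


class ResumeAwareModule(pl.LightningModule):
    # Illustrative only: on_load_checkpoint still receives the full checkpoint
    # dict before the plugin's load_model_state_dict() restores the weights.
    def on_load_checkpoint(self, checkpoint):
        self.restored_epoch = checkpoint.get("epoch")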

pytorch_lightning/trainer/trainer.py

Lines changed: 1 addition & 3 deletions
@@ -1154,9 +1154,7 @@ def __load_ckpt_weights(self, ckpt_path: Optional[str]) -> Optional[str]:
         if not self._device_type == DeviceType.TPU:
             self.training_type_plugin.barrier()

-        self.training_type_plugin.restore_model_state_from_ckpt_path(
-            ckpt_path, map_location=lambda storage, loc: storage
-        )
+        self.checkpoint_connector.restore_model_weights(ckpt_path)
         return ckpt_path

     def _call_setup_hook(self, model: LightningModule) -> None:
