
Commit fc6d402

fix logger creating directory structure too early in DDP (#6380)

* fix
* add simple test
* fix imports
* add changelog
* tighter test with on_fit_start hook closer to the dispatch call
* move class inside test function
* add a comment

1 parent 75c6486 commit fc6d402
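Context for the fix: with DDP (and especially ddp_spawn), worker processes are created only when training is dispatched; before that point no process knows its rank, so any logger write performed during Trainer.fit() setup runs in the parent process and creates log directories prematurely. A minimal sketch of that ordering constraint, using plain torch.multiprocessing rather than Lightning code (the worker function and paths here are illustrative, not from the commit):

    import os
    import tempfile
    import torch.multiprocessing as mp

    def worker(rank: int, log_dir: str) -> None:
        # only inside the spawned worker is the rank known,
        # so only here can rank 0 safely create the log directory
        if rank == 0:
            os.makedirs(log_dir, exist_ok=True)

    if __name__ == "__main__":
        log_dir = os.path.join(tempfile.mkdtemp(), "lightning_logs", "version_0")
        # before mp.spawn() there is no rank yet: a logger write at this point
        # would run in the parent process and create directories too early
        mp.spawn(worker, args=(log_dir,), nprocs=2)
        print(os.path.isdir(log_dir))  # True: created by rank 0, after spawn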

File tree

3 files changed: +50 -19 lines

CHANGELOG.md

Lines changed: 4 additions & 1 deletion

@@ -107,7 +107,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed `ModelPruning(make_pruning_permanent=True)` pruning buffers getting removed when saved during training ([#6073](https://github.com/PyTorchLightning/pytorch-lightning/pull/6073))


-- Fixed `trainer.test` from `best_path` hangs after calling `trainer.fit` ([#6272](https://github.com/PyTorchLightning/pytorch-lightning/pull/6272))
+- Fixed `trainer.test` from `best_path` hangs after calling `trainer.fit` ([#6272](https://github.com/PyTorchLightning/pytorch-lightning/pull/6272))


 - Fixed duplicate logs appearing in console when using the python logging module ([#5509](https://github.com/PyTorchLightning/pytorch-lightning/pull/5509), [#6275](https://github.com/PyTorchLightning/pytorch-lightning/pull/6275))

@@ -134,6 +134,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed an issue where the tuner would not tune the learning rate if also tuning the batch size ([#4688](https://github.com/PyTorchLightning/pytorch-lightning/pull/4688))


+- Fixed logger creating directory structure too early in DDP ([#6380](https://github.com/PyTorchLightning/pytorch-lightning/pull/6380))
+
+
 ## [1.2.2] - 2021-03-02

 ### Added

pytorch_lightning/trainer/trainer.py

Lines changed: 7 additions & 16 deletions

@@ -381,21 +381,6 @@ def __init__(
         # Callback system
         self.on_init_end()

-    def setup_trainer(self, model: LightningModule):
-        """
-        Sanity check a few things before starting actual training or testing.
-
-        Args:
-            model: The model to run sanity test on.
-        """
-
-        # log hyper-parameters
-        if self.logger is not None:
-            # save exp to get started (this is where the first experiment logs are written)
-            self.logger.log_hyperparams(model.hparams_initial)
-            self.logger.log_graph(model)
-            self.logger.save()
-
     def fit(
         self,
         model: LightningModule,

@@ -444,7 +429,6 @@ def fit(
         self.call_setup_hook(model)
         self.call_hook("on_before_accelerator_backend_setup", model)
         self.accelerator.setup(self, model)  # note: this sets up self.lightning_module
-        self.setup_trainer(model)

         # ----------------------------
         # INSPECT THE CORE LOOPS

@@ -509,6 +493,13 @@ def fit(
     def pre_dispatch(self):
         self.accelerator.pre_dispatch()

+        # log hyper-parameters
+        if self.logger is not None:
+            # save exp to get started (this is where the first experiment logs are written)
+            self.logger.log_hyperparams(self.lightning_module.hparams_initial)
+            self.logger.log_graph(self.lightning_module)
+            self.logger.save()
+
     def post_dispatch(self):
         self.accelerator.post_dispatch()
         self.accelerator.teardown()
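The diff above removes setup_trainer() and defers the first logger writes (log_hyperparams, log_graph, save) to pre_dispatch(), which runs after accelerator setup, once each process knows its rank. The change relies on constructing a logger being side-effect free: directories appear only on the first write. A small illustrative sketch of that property (LazyDirLogger is a hypothetical stand-in, not a Lightning class):

    import os
    import tempfile

    class LazyDirLogger:
        """Hypothetical logger: creates its directory only on the first write."""

        def __init__(self, save_dir: str) -> None:
            self.save_dir = save_dir  # nothing touches the disk yet

        def log_hyperparams(self, hparams: dict) -> None:
            # first real write: the directory structure appears only now
            os.makedirs(self.save_dir, exist_ok=True)
            with open(os.path.join(self.save_dir, "hparams.txt"), "w") as f:
                f.write(repr(hparams))

    save_dir = os.path.join(tempfile.mkdtemp(), "version_0")
    logger = LazyDirLogger(save_dir)
    assert not os.path.exists(save_dir)  # constructing the logger creates nothing
    logger.log_hyperparams({"lr": 0.1})  # the pre_dispatch-time call creates it
    assert os.path.exists(save_dir)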

tests/trainer/logging_/test_distributed_logging.py

Lines changed: 39 additions & 2 deletions

@@ -11,10 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import os
 from unittest import mock
+from unittest.mock import Mock

-from pytorch_lightning import Trainer
+from pytorch_lightning import Callback, Trainer
 from tests.helpers import BoringModel
 from tests.helpers.runif import RunIf

@@ -66,3 +67,39 @@ def test_global_zero_only_logging_ddp_spawn(tmpdir):
         weights_summary=None,
     )
     trainer.fit(model)
+
+
+def test_first_logger_call_in_subprocess(tmpdir):
+    """
+    Test that the Trainer does not call the logger too early. Only when the worker processes are initialized
+    do we have access to the rank and know which one is the main process.
+    """
+
+    class LoggerCallsObserver(Callback):
+
+        def on_fit_start(self, trainer, pl_module):
+            # this hook is executed directly before Trainer.pre_dispatch
+            # logger should not write any logs until this point
+            assert not trainer.logger.method_calls
+            assert not os.listdir(trainer.logger.save_dir)
+
+        def on_train_start(self, trainer, pl_module):
+            assert trainer.logger.method_calls
+            trainer.logger.log_hyperparams.assert_called_once()
+            trainer.logger.log_graph.assert_called_once()
+
+    logger = Mock()
+    logger.version = "0"
+    logger.name = "name"
+    logger.save_dir = tmpdir
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        max_epochs=1,
+        logger=logger,
+        callbacks=[LoggerCallsObserver()]
+    )
+    trainer.fit(model)
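The test works because unittest.mock.Mock records every call made on the mock and its attributes in method_calls, so an empty list inside on_fit_start proves the Trainer has not yet touched the logger. A standalone sketch of that mechanism (independent of the commit):

    from unittest.mock import Mock

    logger = Mock()
    assert logger.method_calls == []      # nothing has been recorded yet

    logger.log_hyperparams({"lr": 0.1})   # calls on attributes are captured
    logger.save()
    assert len(logger.method_calls) == 2  # both calls recorded, in order
    logger.log_hyperparams.assert_called_once()
    logger.save.assert_called_once()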
