Merge branch 'master' into better-err-message

carmocca · web-flow · commit 28fad41a8c9c · 2021-03-27T02:47:31.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,9 +9,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+
 - Added more explicit exception message when trying to execute `trainer.test()` or `trainer.validate()` with `fast_dev_run=True` ([#6667](https://github.com/PyTorchLightning/pytorch-lightning/pull/6667))
 
 
+- Added a warning when a non-metric logged value has not been reduced across processes in multi-process training ([#6417](https://github.com/PyTorchLightning/pytorch-lightning/pull/6417))
+
+
 - Added a way to print to terminal without breaking up the progress bar ([#5470](https://github.com/PyTorchLightning/pytorch-lightning/pull/5470))
 
 
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -82,7 +82,7 @@ jobs:
       displayName: 'Testing: standard'
 
     - bash: |
-        sh tests/special_tests.sh
+        bash tests/special_tests.sh
       displayName: 'Testing: special'
 
     - bash: |
diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 import time
 from typing import Type
 
@@ -21,113 +20,13 @@
 
 from pytorch_lightning import seed_everything, Trainer
 from pytorch_lightning.plugins import DDPSpawnShardedPlugin
-from tests.accelerators import DDPLauncher
 from tests.helpers.boring_model import BoringModel, RandomDataset
 from tests.helpers.runif import RunIf
 
 
-@RunIf(min_gpus=1, skip_windows=True, fairscale=True)
-def test_ddp_sharded_plugin_correctness_one_gpu():
-    plugin_parity_test(
-        gpus=1,
-        model_cls=SeedTrainLoaderModel,
-    )
-
-
-@RunIf(min_gpus=1, skip_windows=True, fairscale=True, amp_native=True)
-def test_ddp_sharded_plugin_correctness_amp_one_gpu():
-    plugin_parity_test(
-        gpus=1,
-        precision=16,
-        model_cls=SeedTrainLoaderModel,
-    )
-
-
-@pytest.mark.skip(reason="Not a critical test, skip till drone CI performance improves.")
-@RunIf(min_gpus=2, skip_windows=True, fairscale=True)
-def test_ddp_sharded_plugin_correctness_multi_gpu():
-    plugin_parity_test(
-        gpus=2,
-        model_cls=SeedTrainLoaderModel,
-        max_percent_speed_diff=0.25,  # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers
-    )
-
-
-@RunIf(min_gpus=2, skip_windows=True, fairscale=True, amp_native=True)
-def test_ddp_sharded_plugin_correctness_amp_multi_gpu():
-    plugin_parity_test(
-        gpus=2,
-        precision=16,
-        model_cls=SeedTrainLoaderModel,
-        max_percent_speed_diff=0.25,  # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers
-    )
-
-
-@RunIf(min_gpus=2, skip_windows=True, fairscale=True, amp_native=True)
-def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu():
-    plugin_parity_test(
-        gpus=2,
-        precision=16,
-        model_cls=SeedTrainLoaderModel,
-        max_percent_speed_diff=0.25,  # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers
-    )
-
-
-@RunIf(min_gpus=2, fairscale=True)
-@pytest.mark.skipif(
-    not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
-)
-@DDPLauncher.run("--accelerator ddp --gpus 2 --precision 32")
-def test_ddp_sharded_plugin_correctness_multi_gpu_ddp(tmpdir, args=None):
-    plugin_parity_test(
-        gpus=args.gpus,
-        precision=args.precision,
-        model_cls=SeedTrainLoaderModel,
-    )
-
-
-@RunIf(min_gpus=2, fairscale=True)
-@pytest.mark.skipif(
-    not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
-)
-@DDPLauncher.run("--accelerator ddp --gpus 2  --precision 16")
-def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None):
-    plugin_parity_test(
-        gpus=args.gpus,
-        precision=args.precision,
-        model_cls=SeedTrainLoaderModel,
-    )
-
-
-@pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.")
-@RunIf(min_gpus=2, skip_windows=True, fairscale=True)
-def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim():
-    """
-        Ensures same results using multiple optimizers across multiple GPUs
-    """
-    plugin_parity_test(
-        gpus=2,
-        model_cls=SeedTrainLoaderMultipleOptimizersModel,
-        max_percent_speed_diff=0.25,  # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers
-    )
-
-
-@pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.")
-@RunIf(min_gpus=2, skip_windows=True, fairscale=True)
-def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir):
-    """
-        Ensures using multiple optimizers across multiple GPUs with manual optimization
-    """
-    plugin_parity_test(
-        gpus=2,
-        model_cls=SeedTrainLoaderManualModel,
-        max_percent_speed_diff=0.25,  # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers
-    )
-
-
 class SeedTrainLoaderModel(BoringModel):
     """
-        Overrides training loader to ensure we enforce the same seed for all DDP processes.
+    Overrides training loader to ensure we enforce the same seed for all DDP processes.
     """
 
     def train_dataloader(self):
@@ -177,7 +76,7 @@ class SeedTrainLoaderMultipleOptimizersModel(SeedTrainLoaderModel):
     def training_step(self, batch, batch_idx, optimizer_idx):
         output = self.layer(batch)
         loss = self.loss(batch, output)
-        return {"loss": loss}
+        return {'loss': loss}
 
     def training_epoch_end(self, outputs) -> None:
         # outputs should be an array with an entry per optimizer
@@ -279,11 +178,48 @@ def plugin_parity_test(
     # Assert speed parity by ensuring percentage difference between custom/ddp is below threshold
     percent_diff = (custom_model_time - ddp_time) / custom_model_time
 
-    assert percent_diff <= max_percent_speed_diff, \
-        f'Custom DDP plugin was too slow compared to DDP, Custom Plugin Time: {custom_model_time}, DDP Time: {ddp_time}'
+    assert (
+        percent_diff <= max_percent_speed_diff
+    ), f'Custom DDP plugin was too slow compared to DDP, Custom Plugin Time: {custom_model_time}, DDP Time: {ddp_time}'
 
     if use_cuda:
         # Assert CUDA memory parity
-        assert max_memory_custom <= max_memory_ddp, \
-            f'Custom plugin used too much memory compared to DDP,' \
+        assert max_memory_custom <= max_memory_ddp, (
+            'Custom plugin used too much memory compared to DDP, '
             f'Custom Mem: {max_memory_custom}, DDP Mem: {max_memory_ddp}'
+        )
+
+
+@RunIf(skip_windows=True, fairscale=True)
+@pytest.mark.parametrize(
+    'kwargs',
+    [
+        pytest.param(dict(gpus=1, model_cls=SeedTrainLoaderModel), marks=RunIf(min_gpus=1)),
+        pytest.param(
+            dict(gpus=1, precision=16, model_cls=SeedTrainLoaderModel), marks=RunIf(min_gpus=1, amp_native=True)
+        ),
+        pytest.param(dict(gpus=2, model_cls=SeedTrainLoaderModel), marks=RunIf(min_gpus=2)),
+        pytest.param(
+            dict(gpus=2, precision=16, model_cls=SeedTrainLoaderModel), marks=RunIf(min_gpus=2, amp_native=True)
+        ),
+        pytest.param(
+            dict(gpus=2, model_cls=SeedTrainLoaderMultipleOptimizersModel),
+            marks=[
+                RunIf(min_gpus=2),
+                pytest.mark.skip(reason='TODO: Current issue with multiple optimizers and FairScale.'),
+            ],
+        ),
+        pytest.param(
+            dict(gpus=2, model_cls=SeedTrainLoaderManualModel),
+            marks=[
+                RunIf(min_gpus=2),
+                pytest.mark.skip(reason='TODO: Current issue with multiple optimizers and FairScale.'),
+            ],
+        ),
+    ],
+)
+def test_ddp_spawn_sharded_plugin(kwargs):
+    if kwargs['gpus'] > 1:
+        # TODO: decrease speed diff since only 2 GPUs sharding 2 optimizers
+        kwargs['max_percent_speed_diff'] = 0.25
+    plugin_parity_test(**kwargs)
diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py
@@ -633,6 +633,12 @@ def rename_keys(self, map_dict: dict):
             meta[dest] = meta[source]
             del meta[source]
 
+    def get_non_metrics_keys(self):
+        """
+        Return the keys of this result whose associated value isn't a ``Metric`` instance
+        """
+        return [k for k, v in self.items() if not isinstance(v, Metric)]
+
 
 def choose_last(x):
     if isinstance(x, (torch.Tensor, list)):
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
@@ -148,7 +148,9 @@ def describe(self) -> None:
         # so to avoid them, we open and close the files within this function
         # by calling `_prepare_streams` and `teardown`
         self._prepare_streams()
-        self._write_stream(self.summary())
+        summary = self.summary()
+        if summary:
+            self._write_stream(summary)
         if self._output_file is not None:
             self._output_file.flush()
         self.teardown(stage=self._stage)
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
@@ -11,8 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 from weakref import proxy
 
 import torch
@@ -21,6 +22,19 @@
 from pytorch_lightning.core.step_result import Result
 from pytorch_lightning.trainer.states import TrainerState
 from pytorch_lightning.utilities import DistributedType, LightningEnum
+from pytorch_lightning.utilities.warnings import WarningCache
+
+log = logging.getLogger(__name__)
+
+
+class MetricWarningCache(WarningCache):
+
+    def __init__(self):
+        super().__init__()
+        self.warned_metrics = []
+
+
+warning_cache = MetricWarningCache()
 
 
 class ResultStoreType(LightningEnum):
@@ -52,8 +66,10 @@ class HookResultStore:
     Those data structures enables us to reduce properly Result object when batch loop is finished.
     """
 
-    def __init__(self, fx_name: str) -> None:
+    def __init__(self, fx_name: str, all_gather_fn: Callable, should_warn: bool) -> None:
         self._fx_name = fx_name
+        self._all_gather_fn = all_gather_fn
+        self._should_warn = should_warn
         self._internals = {}
         self._internals_reduced = {}
         self._internal_type = None
@@ -109,6 +125,20 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non
 
         func = getattr(opt_metric, func_name)
         metrics_to_log = func(*args, add_dataloader_idx=self.has_several_dataloaders, **kwargs)
+        if self._should_warn:
+            for non_metric_key in opt_metric.get_non_metrics_keys():
+                if non_metric_key in metrics_to_log and non_metric_key not in warning_cache.warned_metrics:
+                    metric = self._all_gather_fn(metrics_to_log[non_metric_key])
+                    if any(metric[0] != m for m in metric[1:]):
+                        warning_cache.warn(
+                            f"The value associated to the key {non_metric_key}: {metric.cpu().tolist()} "
+                            "doesn't appear to be the same accross all processes. "
+                            "HINT: One could either do: `self.log(..., sync_dist=True, sync_fn=torch.mean)`"
+                            " to force mean reduction across processes which can be inaccurate or implement"
+                            " a `torchmetrics.Metric`"
+                        )
+                    warning_cache.warned_metrics.append(non_metric_key)
+
         results.append(metrics_to_log)
 
     def get_epoch_from_func_name(self, func_name, *args, **kwargs) -> List[Dict]:
@@ -227,6 +257,12 @@ class EpochResultStore:
 
     def __init__(self, trainer: 'pl.Trainer') -> None:
         self.trainer = proxy(trainer)
+
+        # Add warning only for distributed (except rpc, as the main worker is running the code).
+        _should_warn = trainer.accelerator_connector.is_distributed
+        _should_warn &= not trainer.training_type_plugin.rpc_enabled
+        self._should_warn = _should_warn
+
         self.reset()
 
     def __getitem__(self, key: str) -> Any:
@@ -278,7 +314,8 @@ def cache_result(self) -> None:
             info = self.info
             fx_name = info["fx_name"]
 
-            self._internals.setdefault(fx_name, HookResultStore(fx_name))
+            all_gather_fn = self.trainer.lightning_module.all_gather
+            self._internals.setdefault(fx_name, HookResultStore(fx_name, all_gather_fn, self._should_warn))
 
             # attach capture batch_size
             Result.attach_batch_size(self._batch_size, hook_result)
diff --git a/tests/accelerators/__init__.py b/tests/accelerators/__init__.py
@@ -1,12 +0,0 @@
-try:
-    from dtrun.launcher import DDPLauncher
-except ImportError:
-
-    class DDPLauncher:
-
-        def run(cmd_line, **kwargs):
-
-            def inner(func):
-                pass
-
-            return inner
diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py
@@ -20,7 +20,7 @@
 import torch
 
 from pytorch_lightning import Trainer
-from tests.accelerators import ddp_model, DDPLauncher
+from tests.accelerators import ddp_model
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
 from tests.utilities.distributed import call_training_script
@@ -71,19 +71,6 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir):
         assert out['test_acc'] > 0.7
 
 
-@RunIf(min_gpus=2)
-@DDPLauncher.run(
-    "--max_epochs [max_epochs] --gpus 2 --accelerator [accelerator]",
-    max_epochs=["1"],
-    accelerator=["ddp", "ddp_spawn"]
-)
-def test_cli_to_pass(tmpdir, args=None):
-    """
-    This test verify we can call function using test_cli name
-    """
-    return '1'
-
-
 @RunIf(skip_windows=True)
 @pytest.mark.skipif(torch.cuda.is_available(), reason="test doesn't requires GPU machine")
 def test_torch_distributed_backend_env_variables(tmpdir):
diff --git a/tests/accelerators/test_multi_nodes_gpu.py b/tests/accelerators/test_multi_nodes_gpu.py
@@ -15,6 +15,7 @@
 import sys
 from unittest import mock
 
+import pytest
 import torch
 
 from tests.helpers.runif import RunIf
@@ -28,6 +29,9 @@
 from tests.helpers.boring_model import BoringModel  # noqa: E402
 
 
+# TODO(Borda): When multi-node tests are re-enabled (.github/workflows/ci_test-mnodes.yml)
+# use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)`
+@pytest.mark.skip("Multi-node testing is currently disabled")
 @RunIf(special=True)
 def test_logging_sync_dist_true_ddp(tmpdir):
     """
@@ -65,6 +69,9 @@ def validation_step(self, batch, batch_idx):
     assert trainer.logged_metrics['bar'] == fake_result
 
 
+# TODO(Borda): When multi-node tests are re-enabled (.github/workflows/ci_test-mnodes.yml)
+# use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)`
+@pytest.mark.skip("Multi-node testing is currently disabled")
 @RunIf(special=True)
 @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
 def test__validation_step__log(tmpdir):
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
diff --git a/tests/trainer/logging_/test_train_loop_logging_1_0.py b/tests/trainer/logging_/test_train_loop_logging_1_0.py
diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py