diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index 7b86aab64130a..a94f3a59399ae 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -240,12 +240,13 @@ def _validate_accelerator_and_devices(self) -> None:
             raise MisconfigurationException(
                 f"You passed `devices={self.devices}` but haven't specified"
                 " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping,"
-                f" got `accelerator={self.distributed_backend}`."
+                f" got `accelerator={self.distributed_backend!r}`."
             )
 
     def _validate_accelerator_type(self) -> None:
         if self._accelerator_type and self._accelerator_type != self._device_type:
-            raise MisconfigurationException(
+            # internal error: should not happen.
+            raise ValueError(
                 f"Mismatch between the requested accelerator type ({self._accelerator_type})"
                 f" and assigned device type ({self._device_type})."
             )
@@ -255,25 +256,16 @@ def _warn_if_devices_flag_ignored(self) -> None:
         if self.devices is None:
             return
         devices_warning = f"The flag `devices={self.devices}` will be ignored, as you have set"
-        if self.distributed_backend == "auto":
+        if self.distributed_backend in ("auto", DeviceType.TPU):
             if self.tpu_cores is not None:
                 rank_zero_warn(f"{devices_warning} `tpu_cores={self.tpu_cores}`")
-            elif self.ipus is not None:
-                rank_zero_warn(f"{devices_warning} `ipus={self.ipus}`")
-            elif self.gpus is not None:
-                rank_zero_warn(f"{devices_warning} `gpus={self.gpus}`")
-            elif self.num_processes != 1:
-                rank_zero_warn(f"{devices_warning} `num_processes={self.num_processes}`")
-        elif self.distributed_backend == DeviceType.TPU:
-            if self.tpu_cores is not None:
-                rank_zero_warn(f"{devices_warning} `tpu_cores={self.tpu_cores}`")
-        elif self.distributed_backend == DeviceType.IPU:
+        elif self.distributed_backend in ("auto", DeviceType.IPU):
             if self.ipus is not None:
                 rank_zero_warn(f"{devices_warning} `ipus={self.ipus}`")
-        elif self.distributed_backend == DeviceType.GPU:
+        elif self.distributed_backend in ("auto", DeviceType.GPU):
             if self.gpus is not None:
                 rank_zero_warn(f"{devices_warning} `gpus={self.gpus}`")
-        elif self.distributed_backend == DeviceType.CPU:
+        elif self.distributed_backend in ("auto", DeviceType.CPU):
             if self.num_processes != 1:
                 rank_zero_warn(f"{devices_warning} `num_processes={self.num_processes}`")
 
@@ -294,26 +286,27 @@ def _handle_accelerator_and_distributed_backend(
     ) -> None:
         if distributed_backend is not None:
             rank_zero_deprecation(
-                f"`Trainer(distributed_backend={distributed_backend})` has been deprecated and will be removed in v1.5."
-                f" Use `Trainer(strategy={distributed_backend})` instead."
+                f"`Trainer(distributed_backend={distributed_backend!r})` "
+                "has been deprecated and will be removed in v1.5."
+                f" Use `Trainer(strategy={distributed_backend!r})` instead."
             )
             if self.strategy is not None:
                 raise MisconfigurationException(
-                    f"You have passed `Trainer(strategy={self.strategy})` but have"
-                    f" also passed `Trainer(distributed_backend={distributed_backend})`."
-                    f"HINT: Use just `Trainer(strategy={self.strategy})` instead."
+                    f"You have passed `Trainer(strategy={self.strategy!r})` but have"
+                    f" also passed `Trainer(distributed_backend={distributed_backend!r})`."
+                    f" HINT: Use just `Trainer(strategy={self.strategy!r})` instead."
                 )
 
         if accelerator is not None and accelerator in list(DistributedType):
             rank_zero_deprecation(
-                f"Passing {accelerator} `strategy` to the `accelerator` flag in Trainer has been deprecated"
-                f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator})` instead."
+                f"Passing `Trainer(accelerator={accelerator!r})` has been deprecated"
+                f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator!r})` instead."
             )
             if self.strategy is not None:
                 raise MisconfigurationException(
-                    f"You have passed `Trainer(strategy={self.strategy})` but have"
-                    f" also passed `Trainer(accelerator={accelerator})`."
-                    f"HINT: Use just `Trainer(strategy={self.strategy})` instead."
+                    f"You have passed `Trainer(strategy={self.strategy!r})` but have"
+                    f" also passed `Trainer(accelerator={accelerator!r})`."
+                    f" HINT: Use just `Trainer(strategy={self.strategy!r})` instead."
                 )
 
     def _set_training_type_plugin(self) -> None:
@@ -329,7 +322,7 @@ def handle_given_plugins(self) -> None:
         for plug in self.plugins:
             if self.strategy is not None and self._is_plugin_training_type(plug):
                 raise MisconfigurationException(
-                    f"You have passed `Trainer(strategy={self.strategy})`"
+                    f"You have passed `Trainer(strategy={self.strategy!r})`"
                     f" and you can only specify one training type plugin, but you have passed {plug} as a plugin."
                 )
             if self._is_plugin_training_type(plug):
@@ -503,7 +496,7 @@ def _map_devices_to_accelerator(self, accelerator: str) -> bool:
         if accelerator == DeviceType.CPU:
             if not isinstance(self.devices, int):
                 raise MisconfigurationException(
-                    "The flag `devices` only supports integer for `accelerator='cpu'`,"
+                    "The flag `devices` must be an int with `accelerator='cpu'`,"
                     f" got `devices={self.devices}` instead."
                 )
             self.num_processes = self.devices
@@ -816,7 +809,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
             elif self.num_gpus > 1 and not _use_cpu:
                 rank_zero_warn(
                     "You requested multiple GPUs but did not specify a backend, e.g."
-                    ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`. Setting `accelerator="ddp_spawn"` for you.'
+                    ' `Trainer(strategy="dp"|"ddp"|"ddp2")`. Setting `strategy="ddp_spawn"` for you.'
                 )
                 self.distributed_backend = DistributedType.DDP_SPAWN
 
@@ -833,7 +826,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
                 self._distrib_type = DistributedType.DDP_SPAWN
             if self.num_gpus > 0:
                 rank_zero_warn(
-                    "You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs."
+                    "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs."
                 )
                 self.parallel_device_ids = None
             if self.num_processes is None:
@@ -859,7 +852,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
             if (self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1):
                 if self._distrib_type in (DistributedType.DP, DistributedType.DDP2):
                     rank_zero_warn(
-                        f"{self._distrib_type} is not supported on CPUs, hence setting the distributed type to `ddp`."
+                        f"{self._distrib_type.value!r} is not supported on CPUs, hence setting `strategy='ddp'`."
                     )
                     self._distrib_type = DistributedType.DDP
                 else:
@@ -887,8 +880,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
         if self.num_nodes > 1 and not using_valid_distributed:
             # throw error to force user to choose a supported distributed type such as ddp or ddp2
             raise MisconfigurationException(
-                "Your chosen distributed type does not support num_nodes > 1. "
" - "Please set accelerator=ddp or accelerator=ddp2." + "Your chosen strategy does not support `num_nodes > 1`. Please set `strategy=('ddp'|'ddp2')`." ) def _set_horovod_backend(self): @@ -910,7 +902,8 @@ def check_interactive_compatibility(self): if _IS_INTERACTIVE and self._distrib_type is not None and not self._distrib_type.is_interactive_compatible(): raise MisconfigurationException( - f"Selected distributed backend {self._distrib_type} is not compatible with an interactive" + f"`Trainer(strategy={self._distrib_type.value!r})` or" + f" `Trainer(accelerator={self._distrib_type.value!r})` is not compatible with an interactive" " environment. Run your code as a script, or choose one of the compatible backends:" f" {', '.join(DistributedType.interactive_compatible_types())}." " In case you are spawning processes yourself, make sure to include the Trainer" diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index aa165feb89e79..dcda2f9779c6d 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -447,10 +447,10 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("pytorch_lightning.utilities._IS_INTERACTIVE", return_value=True) @mock.patch("torch.cuda.device_count", return_value=2) def test_ipython_incompatible_backend_error(*_): - with pytest.raises(MisconfigurationException, match="backend ddp is not compatible"): + with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)`.*is not compatible"): Trainer(accelerator="ddp", gpus=2) - with pytest.raises(MisconfigurationException, match="backend ddp2 is not compatible"): + with pytest.raises(MisconfigurationException, match=r"strategy='ddp2'\)`.*is not compatible"): Trainer(accelerator="ddp2", gpus=2) @@ -615,14 +615,14 @@ def test_set_devices_if_none_gpu(): def test_devices_with_cpu_only_supports_integer(): - with pytest.raises(MisconfigurationException, match="The flag `devices` only supports integer"): + with pytest.raises(MisconfigurationException, match="The flag `devices` must be an int"): Trainer(accelerator="cpu", devices="1,3") @pytest.mark.parametrize("training_type", ["ddp2", "dp"]) def test_unsupported_distrib_types_on_cpu(training_type): - with pytest.warns(UserWarning, match="is not supported on CPUs, hence setting the distributed type to `ddp`."): + with pytest.warns(UserWarning, match="is not supported on CPUs, hence setting `strategy='ddp"): trainer = Trainer(accelerator=training_type, num_processes=2) assert trainer._distrib_type == DistributedType.DDP diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py index df5444ac776a6..622ead614b1c7 100644 --- a/tests/accelerators/test_tpu.py +++ b/tests/accelerators/test_tpu.py @@ -222,7 +222,6 @@ def on_train_end(self, trainer, pl_module): @RunIf(tpu=True) def test_ddp_cpu_not_supported_on_tpus(): - with pytest.raises(MisconfigurationException, match="`accelerator='ddp_cpu'` is not supported on TPU machines"): Trainer(accelerator="ddp_cpu")