|
12 | 12 | # See the License for the specific language governing permissions and
|
13 | 13 | # limitations under the License.
|
14 | 14 | import logging
|
15 |
| -import os |
16 |
| -import shutil |
17 |
| -import signal |
18 |
| -import tempfile |
19 |
| -import time |
20 | 15 | from datetime import timedelta
|
21 |
| -from pathlib import Path |
22 | 16 | from typing import Any, Callable, Dict, List, Optional, Union
|
23 | 17 |
|
24 | 18 | import torch
|
|
52 | 46 | from pytorch_lightning.strategies.strategy import TBroadcast
|
53 | 47 | from pytorch_lightning.trainer.states import TrainerFn
|
54 | 48 | from pytorch_lightning.utilities.distributed import register_ddp_comm_hook
|
55 |
| -from pytorch_lightning.utilities.exceptions import DeadlockDetectedException |
56 |
| -from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only, rank_zero_warn |
| 49 | +from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only |
57 | 50 | from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep
|
58 | 51 |
|
59 | 52 | if _FAIRSCALE_AVAILABLE:
|
@@ -101,9 +94,6 @@ def __init__(
|
101 | 94 | self._ddp_comm_wrapper = ddp_comm_wrapper
|
102 | 95 | self._model_averaging_period = model_averaging_period
|
103 | 96 | self._model_averager: Optional[ModelAverager] = None
|
104 |
| - self._pids: List[int] = [] |
105 |
| - self._sync_dir: Optional[str] = None |
106 |
| - self._rank_0_will_call_children_scripts: bool = False |
107 | 97 | self._process_group_backend: Optional[str] = process_group_backend
|
108 | 98 | self._timeout: Optional[timedelta] = timeout
|
109 | 99 |
|
@@ -145,18 +135,12 @@ def _configure_launcher(self) -> None:
|
145 | 135 | assert self.cluster_environment is not None
|
146 | 136 | if not self.cluster_environment.creates_processes_externally:
|
147 | 137 | self._launcher = _SubprocessScriptLauncher(self.cluster_environment, self.num_processes, self.num_nodes)
|
148 |
| - self._rank_0_will_call_children_scripts = True |
149 | 138 |
|
150 | 139 | def setup_environment(self) -> None:
|
151 | 140 | self.setup_distributed()
|
152 | 141 | super().setup_environment()
|
153 | 142 |
|
154 | 143 | def setup(self, trainer: "pl.Trainer") -> None:
|
155 |
| - # share ddp pids to all processes |
156 |
| - self._rank_0_will_call_children_scripts = bool(self.broadcast(self._rank_0_will_call_children_scripts)) |
157 |
| - if self._should_run_deadlock_detection(): |
158 |
| - self._share_information_to_prevent_deadlock() |
159 |
| - |
160 | 144 | assert self.accelerator is not None
|
161 | 145 | self.accelerator.setup(trainer)
|
162 | 146 |
|
@@ -391,73 +375,6 @@ def register_strategies(cls, strategy_registry: Dict) -> None:
|
391 | 375 | description=f"{cls.__class__.__name__}",
|
392 | 376 | )
|
393 | 377 |
|
394 |
| - def _should_run_deadlock_detection(self) -> bool: |
395 |
| - """Determines whether the plugin will perform process reconciliation in case of errors. |
396 |
| -
|
397 |
| - If the environment variable `PL_RECONCILE_PROCESS` is set, run detection regardless of the cluster environment. |
398 |
| - By default this is disabled. Otherwise, if the cluster environment creates the processes, allow the scheduler / |
399 |
| - parent process to perform the process termination, external to Lightning. |
400 |
| - """ |
401 |
| - return os.getenv("PL_RECONCILE_PROCESS", "0") == "1" or self._rank_0_will_call_children_scripts |
402 |
| - |
403 |
| - def _share_information_to_prevent_deadlock(self) -> None: |
404 |
| - self._share_pids() |
405 |
| - |
406 |
| - # there should be a unique sync_dir per nodes. |
407 |
| - if self.local_rank == 0: |
408 |
| - # create a temporary directory used to synchronize processes on deadlock. |
409 |
| - self._sync_dir = tempfile.mkdtemp() |
410 |
| - |
411 |
| - sync_dirs = [] |
412 |
| - global_node_rank_zero = 0 |
413 |
| - for _ in range(self.num_nodes): |
414 |
| - sync_dirs.append(self.broadcast(self._sync_dir, global_node_rank_zero)) |
415 |
| - global_node_rank_zero += self.world_size // self.num_nodes |
416 |
| - |
417 |
| - self._sync_dir = sync_dirs[self.node_rank] |
418 |
| - |
419 |
| - def _share_pids(self) -> None: |
420 |
| - """Make all DDP processes aware of all processes pids.""" |
421 |
| - self.barrier() |
422 |
| - pids = self.all_gather(torch.tensor(os.getpid(), device=self.root_device)) |
423 |
| - pids = pids.cpu().numpy().tolist() |
424 |
| - self._pids = pids if isinstance(pids, list) else [pids] |
425 |
| - |
426 |
| - def reconciliate_processes(self, trace: str) -> None: |
427 |
| - if self.world_size < 2: |
428 |
| - return |
429 |
| - |
430 |
| - if not self._should_run_deadlock_detection(): |
431 |
| - return |
432 |
| - |
433 |
| - sync_dir = self._sync_dir |
434 |
| - |
435 |
| - if not sync_dir: |
436 |
| - rank_zero_warn("Error handling mechanism for deadlock detection is uninitialized. Skipping check.") |
437 |
| - return |
438 |
| - |
439 |
| - # The cluster may be configured to periodically purge the `/tmp` |
440 |
| - # directory, in which case `sync_dir` may not exist anymore at this |
441 |
| - # point. Idempotently create it to ensure its existence. |
442 |
| - Path(sync_dir).mkdir(parents=True, exist_ok=True) |
443 |
| - |
444 |
| - # save a file locally. |
445 |
| - torch.save(True, os.path.join(sync_dir, f"{self.global_rank}.pl")) |
446 |
| - |
447 |
| - # sleep for a short time |
448 |
| - time.sleep(3) |
449 |
| - |
450 |
| - # return if all processes wrote a file in the `sync_dir`. |
451 |
| - # todo (tchaton) Add support for non-shared file-system which will fail. |
452 |
| - if len(os.listdir(sync_dir)) == (self.world_size // self.num_nodes): |
453 |
| - return |
454 |
| - |
455 |
| - for pid in self._pids: |
456 |
| - if pid != os.getpid(): |
457 |
| - os.kill(pid, signal.SIGKILL) |
458 |
| - shutil.rmtree(sync_dir) |
459 |
| - raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}") |
460 |
| - |
461 | 378 | def teardown(self) -> None:
|
462 | 379 | log.detail(f"{self.__class__.__name__}: tearing down strategy")
|
463 | 380 |
|
|
0 commit comments