Commit 77fbfd9

ananthsub and awaelchli authored and SeanNaren committed
[fix] Better support for rank_zero_only setting for SLURM and torchelastic (#6802)
Co-authored-by: Adrian Wälchli <[email protected]> (cherry picked from commit 86e1d9f)
1 parent bc25c23 commit 77fbfd9
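
Previously, `rank_zero_only.rank` defaulted to `int(os.environ.get('LOCAL_RANK', 0))`. `LOCAL_RANK` is node-local, so in a multi-node SLURM or torchelastic job the local-rank-0 process on every node looked like the global rank zero until the `Trainer` set the attribute. The new `_get_rank()` helper in the diff below also consults the global-rank variables `RANK` (set by torchelastic) and `SLURM_PROCID` (set by SLURM). A minimal sketch of the difference, using an illustrative environment rather than a real cluster:

# Illustrative environment for a worker on a second node of a SLURM job:
# global rank 4, but the first process on its own node. These values are
# assumptions for the sketch, not taken from the commit.
env = {"SLURM_PROCID": "4", "LOCAL_RANK": "0"}

# Old default: only LOCAL_RANK was consulted, so this process looked like rank zero.
old_rank = int(env.get("LOCAL_RANK", 0))  # -> 0 (wrong globally)

# New default, mirroring _get_rank() (which reads os.environ):
# try RANK, then SLURM_PROCID, then LOCAL_RANK.
rank_keys = ("RANK", "SLURM_PROCID", "LOCAL_RANK")
new_rank = next((int(env[k]) for k in rank_keys if k in env), 0)  # -> 4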

File tree: 3 files changed (+70 −1 lines)

CHANGELOG.md
pytorch_lightning/utilities/distributed.py
tests/utilities/test_distributed.py

CHANGELOG.md
Lines changed: 3 additions & 0 deletions

@@ -170,6 +170,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802/))
+
+
 - Sanitize `None` params during pruning ([#6836](https://github.com/PyTorchLightning/pytorch-lightning/pull/6836))
 
 

pytorch_lightning/utilities/distributed.py
Lines changed: 11 additions & 1 deletion

@@ -44,8 +44,18 @@ def wrapped_fn(*args, **kwargs):
     return wrapped_fn
 
 
+# TODO: this should be part of the cluster environment
+def _get_rank() -> int:
+    rank_keys = ('RANK', 'SLURM_PROCID', 'LOCAL_RANK')
+    for key in rank_keys:
+        rank = os.environ.get(key)
+        if rank is not None:
+            return int(rank)
+    return 0
+
+
 # add the attribute to the function but don't overwrite in case Trainer has already set it
-rank_zero_only.rank = getattr(rank_zero_only, 'rank', int(os.environ.get('LOCAL_RANK', 0)))
+rank_zero_only.rank = getattr(rank_zero_only, 'rank', _get_rank())
 
 
 def _warn(*args, **kwargs):
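
`_get_rank()` checks the candidate keys in priority order: torchelastic's `RANK` first, then SLURM's `SLURM_PROCID`, then the node-local `LOCAL_RANK`, falling back to 0. A quick sketch of that resolution under mocked environments (the numeric values are illustrative):

import os
from unittest import mock

from pytorch_lightning.utilities.distributed import _get_rank

# SLURM_PROCID is the job-wide task id, so a worker on another node now
# resolves to its true global rank instead of defaulting to 0.
with mock.patch.dict(os.environ, {"SLURM_PROCID": "4"}, clear=True):
    assert _get_rank() == 4

# torchelastic's RANK takes precedence when both are present.
with mock.patch.dict(os.environ, {"RANK": "2", "SLURM_PROCID": "4"}, clear=True):
    assert _get_rank() == 2

# With none of the keys set, the helper falls back to rank 0.
with mock.patch.dict(os.environ, {}, clear=True):
    assert _get_rank() == 0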

tests/utilities/test_distributed.py
Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Mapping
+from unittest import mock
+
+import pytest
+
+
+@pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"SLURM_PROCID": "0"}])
+def test_rank_zero_known_cluster_envs(env_vars: Mapping[str, str]):
+    """ Test that SLURM environment variables are properly checked for rank_zero_only. """
+    from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only
+    rank_zero_only.rank = _get_rank()
+
+    with mock.patch.dict(os.environ, env_vars):
+        from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only
+        rank_zero_only.rank = _get_rank()
+
+        @rank_zero_only
+        def foo():  # The return type is optional because on non-zero ranks it will not be called
+            return 1
+
+        x = foo()
+        assert x == 1
+
+
+@pytest.mark.parametrize("rank_key,rank", [
+    ("RANK", "1"),
+    ("SLURM_PROCID", "2"),
+    ("LOCAL_RANK", "3"),
+])
+def test_rank_zero_none_set(rank_key, rank):
+    """ Test that function is not called when rank environment variables are not global zero. """
+
+    with mock.patch.dict(os.environ, {rank_key: rank}):
+        from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only
+        rank_zero_only.rank = _get_rank()
+
+        @rank_zero_only
+        def foo():
+            return 1
+
+        x = foo()
+        assert x is None
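
The tests above pin down the decorator's contract: the wrapped function runs and returns its value only on the process whose `rank_zero_only.rank` is 0, and every other rank gets a no-op that returns `None`. A small usage sketch; `log_once` is a hypothetical function for illustration, not part of the codebase:

from pytorch_lightning.utilities.distributed import rank_zero_only

@rank_zero_only
def log_once(message: str):
    # Body executes only where the resolved global rank is 0;
    # on all other ranks the call returns None without printing.
    print(message)

log_once("training started")  # emitted once per job, not once per process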
