Skip to content

Commit aade4fb

Browse files
carmocca authored and pytorchmergebot committed
Expose the rendezvous keepalive arguments (pytorch#145228)
Enables support for this:

```python
from torch.distributed.launcher.api import LaunchConfig

config = LaunchConfig(
    ...,
    rdzv_configs={"keep_alive_interval": 1122, "heartbeat_timeout": 321, "keep_alive_max_attempt": 5},
)
```

These arguments are currently hard-coded inside torchrun. The default values are not suitable for jobs with thousands of ranks. Today, `rdzv_configs` only allows the keys `join_timeout`, `last_call_timeout`, and `close_timeout`.

Pull Request resolved: pytorch#145228
Approved by: https://github.com/wconstab
1 parent a929e11 commit aade4fb

File tree

1 file changed

+27
-3
lines changed

1 file changed

+27
-3
lines changed

torch/distributed/elastic/rendezvous/dynamic_rendezvous.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ class RendezvousTimeout:
132132
The time within which the rendezvous is expected to close after a
133133
call to :py:meth:`RendezvousHandler.set_closed` or
134134
:py:meth:`RendezvousHandler.shutdown`.
135-
keep_alive:
135+
heartbeat:
136136
The time within which a keep-alive heartbeat is expected to
137137
complete.
138138
"""
@@ -1017,6 +1017,8 @@ def from_backend(
10171017
max_nodes: int,
10181018
local_addr: Optional[str] = None,
10191019
timeout: Optional[RendezvousTimeout] = None,
1020+
keep_alive_interval: int = 5,
1021+
keep_alive_max_attempt: int = 3,
10201022
):
10211023
"""Create a new :py:class:`DynamicRendezvousHandler`.
10221024
@@ -1035,6 +1037,12 @@ def from_backend(
10351037
The local node address.
10361038
timeout:
10371039
The timeout configuration of the rendezvous.
1040+
keep_alive_interval:
1041+
The amount of time a node waits before sending a heartbeat to keep
1042+
it alive in the rendezvous.
1043+
keep_alive_max_attempt:
1044+
The maximum number of failed heartbeat attempts after which a node
1045+
is considered dead.
10381046
"""
10391047
# We associate each handler instance with a unique node descriptor.
10401048
node = cls._node_desc_generator.generate(local_addr)
@@ -1044,8 +1052,8 @@ def from_backend(
10441052
min_nodes,
10451053
max_nodes,
10461054
timeout or RendezvousTimeout(),
1047-
keep_alive_interval=timedelta(seconds=5),
1048-
keep_alive_max_attempt=3,
1055+
keep_alive_interval=timedelta(seconds=keep_alive_interval),
1056+
keep_alive_max_attempt=keep_alive_max_attempt,
10491057
)
10501058

10511059
state_holder = _BackendRendezvousStateHolder(backend, settings)
@@ -1405,13 +1413,27 @@ def create_handler(
14051413
| | :py:meth:`RendezvousHandler.shutdown`. Defaults to |
14061414
| | 30 seconds. |
14071415
+-------------------+------------------------------------------------------+
1416+
| heartbeat | The time, in seconds, within which a keep-alive |
1417+
| | heartbeat is expected to complete |
1418+
+-------------------+------------------------------------------------------+
14081419
"""
14091420
try:
14101421
timeout = RendezvousTimeout(
14111422
_get_timeout(params, "join"),
14121423
_get_timeout(params, "last_call"),
14131424
_get_timeout(params, "close"),
1425+
_get_timeout(params, "heartbeat"),
14141426
)
1427+
keep_alive_interval = params.get_as_int("keep_alive_interval", 5)
1428+
if keep_alive_interval is None:
1429+
raise TypeError(
1430+
"You passed 'keep_alive_interval=None' as a rendezvous configuration option"
1431+
)
1432+
keep_alive_max_attempt = params.get_as_int("keep_alive_max_attempt", 3)
1433+
if keep_alive_max_attempt is None:
1434+
raise TypeError(
1435+
"You passed 'keep_alive_max_attempt=None' as a rendezvous configuration option"
1436+
)
14151437

14161438
return DynamicRendezvousHandler.from_backend(
14171439
params.run_id,
@@ -1421,6 +1443,8 @@ def create_handler(
14211443
params.max_nodes,
14221444
params.local_addr,
14231445
timeout,
1446+
keep_alive_interval=keep_alive_interval,
1447+
keep_alive_max_attempt=keep_alive_max_attempt,
14241448
)
14251449
except Exception as e:
14261450
construct_and_record_rdzv_event(

0 commit comments

Comments
 (0)