 13 |  13 | import shutil
 14 |  14 | import textwrap
 15 |  15 | import time
    |  16 | +import traceback
 16 |  17 | import typing
 17 |  18 | from typing import Any, Dict, List, Optional, Set, Tuple, Union
 18 |  19 |
 21 |  22 | import psutil
 22 |  23 | from typing_extensions import Literal
 23 |  24 |
    |  25 | +import sky
 24 |  26 | from sky import backends
 25 |  27 | from sky import exceptions
 26 |  28 | from sky import global_user_state
 32 |  34 | from sky.skylet import constants
 33 |  35 | from sky.skylet import job_lib
 34 |  36 | from sky.skylet import log_lib
    |  37 | +from sky.usage import usage_lib
 35 |  38 | from sky.utils import common_utils
 36 |  39 | from sky.utils import log_utils
 37 |  40 | from sky.utils import rich_utils
 38 |  41 | from sky.utils import subprocess_utils
 39 |  42 | from sky.utils import ux_utils
 40 |  43 |
 41 |  44 | if typing.TYPE_CHECKING:
 42 |     | -    import sky
 43 |  45 |     from sky import dag as dag_lib
 44 |  46 |
 45 |  47 | logger = sky_logging.init_logger(__name__)

@@ -85,6 +87,43 @@ class UserSignal(enum.Enum):
 85 |  87 |
 86 |  88 |
 87 |  89 | # ====== internal functions ======
    |  90 | +def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
    |  91 | +    """Terminate the cluster."""
    |  92 | +    retry_cnt = 0
    |  93 | +    # In some cases, e.g. botocore.exceptions.NoCredentialsError due to AWS
    |  94 | +    # metadata service throttling, the failed sky.down attempt can take 10-11
    |  95 | +    # seconds. In this case, we need the backoff to significantly reduce the
    |  96 | +    # rate of requests - that is, significantly increase the time between
    |  97 | +    # requests. We set the initial backoff to 15 seconds, so that once it grows
    |  98 | +    # exponentially it will quickly dominate the 10-11 seconds that we already
    |  99 | +    # see between requests. We set the max backoff very high, since it's
    | 100 | +    # generally much more important to eventually succeed than to fail fast.
    | 101 | +    backoff = common_utils.Backoff(
    | 102 | +        initial_backoff=15,
    | 103 | +        # 1.6 ** 5 = 10.48576 < 20, so we won't hit this with default max_retry
    | 104 | +        max_backoff_factor=20)
    | 105 | +    while True:
    | 106 | +        try:
    | 107 | +            usage_lib.messages.usage.set_internal()
    | 108 | +            sky.down(cluster_name)
    | 109 | +            return
    | 110 | +        except exceptions.ClusterDoesNotExist:
    | 111 | +            # The cluster is already down.
    | 112 | +            logger.debug(f'The cluster {cluster_name} is already down.')
    | 113 | +            return
    | 114 | +        except Exception as e:  # pylint: disable=broad-except
    | 115 | +            retry_cnt += 1
    | 116 | +            if retry_cnt >= max_retry:
    | 117 | +                raise RuntimeError(
    | 118 | +                    f'Failed to terminate the cluster {cluster_name}.') from e
    | 119 | +            logger.error(
    | 120 | +                f'Failed to terminate the cluster {cluster_name}. Retrying. '
    | 121 | +                f'Details: {common_utils.format_exception(e)}')
    | 122 | +            with ux_utils.enable_traceback():
    | 123 | +                logger.error(f'  Traceback: {traceback.format_exc()}')
    | 124 | +            time.sleep(backoff.current_backoff())
    | 125 | +
    | 126 | +
 88 | 127 | def get_job_status(backend: 'backends.CloudVmRayBackend',
 89 | 128 |                    cluster_name: str) -> Optional['job_lib.JobStatus']:
 90 | 129 |     """Check the status of the job running on a managed job cluster.
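
Reviewer note: to make the backoff comment above concrete, here is a minimal standalone sketch of the sleep schedule it implies. It assumes, based only on the `1.6 ** 5 = 10.48576 < 20` comment, that `common_utils.Backoff` grows the delay by roughly 1.6x per attempt and caps the overall multiplier at `max_backoff_factor`; it does not call the real `Backoff` class, and `sketch_backoff_schedule` is a made-up name used purely for illustration.

```python
# Hypothetical illustration only: it re-derives the schedule from the constants
# in the diff (initial_backoff=15, growth ~1.6x, max_backoff_factor=20) instead
# of calling common_utils.Backoff, whose exact semantics are not shown here.
def sketch_backoff_schedule(initial: float = 15.0,
                            multiplier: float = 1.6,
                            max_factor: float = 20.0,
                            attempts: int = 6) -> list:
    """Return the assumed sleep (in seconds) before each retry attempt."""
    delays = []
    for attempt in range(attempts):
        # Assumed model: initial * multiplier**attempt, capped at initial * max_factor.
        factor = min(multiplier**attempt, max_factor)
        delays.append(round(initial * factor, 1))
    return delays


if __name__ == '__main__':
    # Expected output with the defaults: [15.0, 24.0, 38.4, 61.4, 98.3, 157.3]
    # Each wait quickly dominates the ~10 s a throttled sky.down attempt takes,
    # and the cap (15 * 20 = 300 s) is never reached within max_retry = 6.
    print(sketch_backoff_schedule())
```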

@@ -202,18 +241,9 @@ def update_managed_job_status(job_id: Optional[int] = None):
202 | 241 |         cluster_name = generate_managed_job_cluster_name(task_name, job_id_)
203 | 242 |         handle = global_user_state.get_handle_from_cluster_name(
204 | 243 |             cluster_name)
    | 244 | +        # If the cluster exists, terminate it.
205 | 245 |         if handle is not None:
206 |     | -            backend = backend_utils.get_backend_from_handle(handle)
207 |     | -            # TODO(cooperc): Add backoff
208 |     | -            max_retry = 3
209 |     | -            for retry_cnt in range(max_retry):
210 |     | -                try:
211 |     | -                    backend.teardown(handle, terminate=True)
212 |     | -                    break
213 |     | -                except RuntimeError:
214 |     | -                    logger.error('Failed to tear down the cluster '
215 |     | -                                 f'{cluster_name!r}. Retrying '
216 |     | -                                 f'[{retry_cnt}/{max_retry}].')
    | 246 | +            terminate_cluster(cluster_name)
217 | 247 |
218 | 248 |         # The controller process for this managed job is not running: it must
219 | 249 |         # have exited abnormally, and we should set the job status to
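
Reviewer note: the retry contract introduced by this change (swallow `ClusterDoesNotExist`, retry any other exception, raise `RuntimeError` once `max_retry` is exhausted) can be exercised end to end with a small standalone re-implementation of the loop and a flaky stand-in for `sky.down`. Everything in this sketch (`FlakyDown`, `_terminate_with_retry`) is hypothetical and exists only to illustrate the behavior; the backoff sleeps are omitted.

```python
# Standalone sketch; none of these names exist in SkyPilot.
class ClusterDoesNotExist(Exception):
    """Stand-in for sky.exceptions.ClusterDoesNotExist."""


class FlakyDown:
    """Simulates sky.down failing a few times (e.g. throttling) then succeeding."""

    def __init__(self, failures_before_success: int):
        self.remaining_failures = failures_before_success

    def __call__(self, cluster_name: str) -> None:
        if self.remaining_failures > 0:
            self.remaining_failures -= 1
            raise RuntimeError('simulated cloud API throttling')


def _terminate_with_retry(down, cluster_name: str, max_retry: int = 6) -> None:
    """Mirrors the loop in terminate_cluster, minus the backoff sleeps."""
    retry_cnt = 0
    while True:
        try:
            down(cluster_name)
            return
        except ClusterDoesNotExist:
            # Already gone: treated as success, just like the real helper.
            return
        except Exception as e:  # pylint: disable=broad-except
            retry_cnt += 1
            if retry_cnt >= max_retry:
                raise RuntimeError(
                    f'Failed to terminate the cluster {cluster_name}.') from e
            # The real helper sleeps backoff.current_backoff() here.


if __name__ == '__main__':
    # Succeeds on the third attempt.
    _terminate_with_retry(FlakyDown(failures_before_success=2), 'my-cluster')
    # Exhausts all retries and surfaces a RuntimeError to the caller.
    try:
        _terminate_with_retry(FlakyDown(failures_before_success=10), 'my-cluster')
    except RuntimeError as e:
        print(f'Gave up as expected: {e}')
```

Compared with the removed block, the new helper retries six times instead of three, backs off between attempts, retries on any exception rather than only `RuntimeError`, and explicitly treats an already-deleted cluster as success.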