
Commit 38ba39f

[jobs] backoff cluster teardown (#4562)
* move terminate_cluster into utils
* [jobs] backoff cluster teardown
* use terminate_cluster in update_managed_job_status
* fix unit test
* add details on backoff
1 parent f8494b5 commit 38ba39f

File tree

4 files changed: +52 -46 lines changed


sky/jobs/controller.py (+3 -3)

@@ -243,7 +243,7 @@ def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
             self._download_log_and_stream(task_id, handle)
             # Only clean up the cluster, not the storages, because tasks may
             # share storages.
-            recovery_strategy.terminate_cluster(cluster_name=cluster_name)
+            managed_job_utils.terminate_cluster(cluster_name=cluster_name)
             return True

             # For single-node jobs, non-terminated job_status indicates a
@@ -342,7 +342,7 @@ def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
             # those clusters again may fail.
             logger.info('Cleaning up the preempted or failed cluster'
                         '...')
-            recovery_strategy.terminate_cluster(cluster_name)
+            managed_job_utils.terminate_cluster(cluster_name)

             # Try to recover the managed jobs, when the cluster is preempted or
             # failed or the job status is failed to be fetched.
@@ -478,7 +478,7 @@ def _cleanup(job_id: int, dag_yaml: str):
         assert task.name is not None, task
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
             task.name, job_id)
-        recovery_strategy.terminate_cluster(cluster_name)
+        managed_job_utils.terminate_cluster(cluster_name)
         # Clean up Storages with persistent=False.
         # TODO(zhwu): this assumes the specific backend.
         backend = cloud_vm_ray_backend.CloudVmRayBackend()

sky/jobs/recovery_strategy.py (+4 -28)

@@ -43,30 +43,6 @@
 _AUTODOWN_MINUTES = 5


-def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
-    """Terminate the cluster."""
-    retry_cnt = 0
-    while True:
-        try:
-            usage_lib.messages.usage.set_internal()
-            sky.down(cluster_name)
-            return
-        except exceptions.ClusterDoesNotExist:
-            # The cluster is already down.
-            logger.debug(f'The cluster {cluster_name} is already down.')
-            return
-        except Exception as e:  # pylint: disable=broad-except
-            retry_cnt += 1
-            if retry_cnt >= max_retry:
-                raise RuntimeError(
-                    f'Failed to terminate the cluster {cluster_name}.') from e
-            logger.error(
-                f'Failed to terminate the cluster {cluster_name}. Retrying.'
-                f'Details: {common_utils.format_exception(e)}')
-            with ux_utils.enable_traceback():
-                logger.error(f' Traceback: {traceback.format_exc()}')
-
-
 class StrategyExecutor:
     """Handle the launching, recovery and termination of managed job clusters"""

@@ -193,7 +169,7 @@ def _try_cancel_all_jobs(self):
                 f'{common_utils.format_exception(e)}\n'
                 'Terminating the cluster explicitly to ensure no '
                 'remaining job process interferes with recovery.')
-            terminate_cluster(self.cluster_name)
+            managed_job_utils.terminate_cluster(self.cluster_name)

     def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -380,7 +356,7 @@ def _launch(self,

             # If we get here, the launch did not succeed. Tear down the
             # cluster and retry.
-            terminate_cluster(self.cluster_name)
+            managed_job_utils.terminate_cluster(self.cluster_name)
             if max_retry is not None and retry_cnt >= max_retry:
                 # Retry forever if max_retry is None.
                 if raise_on_failure:
@@ -473,7 +449,7 @@ def recover(self) -> float:
         # Step 2
         logger.debug('Terminating unhealthy cluster and reset cloud '
                      'region.')
-        terminate_cluster(self.cluster_name)
+        managed_job_utils.terminate_cluster(self.cluster_name)

         # Step 3
         logger.debug('Relaunch the cluster without constraining to prior '
@@ -531,7 +507,7 @@ def recover(self) -> float:

         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-        terminate_cluster(self.cluster_name)
+        managed_job_utils.terminate_cluster(self.cluster_name)

         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '

sky/jobs/utils.py (+42 -12)

@@ -13,6 +13,7 @@
 import shutil
 import textwrap
 import time
+import traceback
 import typing
 from typing import Any, Dict, List, Optional, Set, Tuple, Union

@@ -21,6 +22,7 @@
 import psutil
 from typing_extensions import Literal

+import sky
 from sky import backends
 from sky import exceptions
 from sky import global_user_state
@@ -32,14 +34,14 @@
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
+from sky.usage import usage_lib
 from sky.utils import common_utils
 from sky.utils import log_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils

 if typing.TYPE_CHECKING:
-    import sky
     from sky import dag as dag_lib

 logger = sky_logging.init_logger(__name__)
@@ -85,6 +87,43 @@ class UserSignal(enum.Enum):


 # ====== internal functions ======
+def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
+    """Terminate the cluster."""
+    retry_cnt = 0
+    # In some cases, e.g. botocore.exceptions.NoCredentialsError due to AWS
+    # metadata service throttling, the failed sky.down attempt can take 10-11
+    # seconds. In this case, we need the backoff to significantly reduce the
+    # rate of requests - that is, significantly increase the time between
+    # requests. We set the initial backoff to 15 seconds, so that once it grows
+    # exponentially it will quickly dominate the 10-11 seconds that we already
+    # see between requests. We set the max backoff very high, since it's
+    # generally much more important to eventually succeed than to fail fast.
+    backoff = common_utils.Backoff(
+        initial_backoff=15,
+        # 1.6 ** 5 = 10.48576 < 20, so we won't hit this with default max_retry
+        max_backoff_factor=20)
+    while True:
+        try:
+            usage_lib.messages.usage.set_internal()
+            sky.down(cluster_name)
+            return
+        except exceptions.ClusterDoesNotExist:
+            # The cluster is already down.
+            logger.debug(f'The cluster {cluster_name} is already down.')
+            return
+        except Exception as e:  # pylint: disable=broad-except
+            retry_cnt += 1
+            if retry_cnt >= max_retry:
+                raise RuntimeError(
+                    f'Failed to terminate the cluster {cluster_name}.') from e
+            logger.error(
+                f'Failed to terminate the cluster {cluster_name}. Retrying.'
+                f'Details: {common_utils.format_exception(e)}')
+            with ux_utils.enable_traceback():
+                logger.error(f' Traceback: {traceback.format_exc()}')
+            time.sleep(backoff.current_backoff())
+
+
 def get_job_status(backend: 'backends.CloudVmRayBackend',
                    cluster_name: str) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.
@@ -202,18 +241,9 @@ def update_managed_job_status(job_id: Optional[int] = None):
         cluster_name = generate_managed_job_cluster_name(task_name, job_id_)
         handle = global_user_state.get_handle_from_cluster_name(
             cluster_name)
+        # If the cluster exists, terminate it.
         if handle is not None:
-            backend = backend_utils.get_backend_from_handle(handle)
-            # TODO(cooperc): Add backoff
-            max_retry = 3
-            for retry_cnt in range(max_retry):
-                try:
-                    backend.teardown(handle, terminate=True)
-                    break
-                except RuntimeError:
-                    logger.error('Failed to tear down the cluster '
-                                 f'{cluster_name!r}. Retrying '
-                                 f'[{retry_cnt}/{max_retry}].')
+            terminate_cluster(cluster_name)

         # The controller process for this managed job is not running: it must
         # have exited abnormally, and we should set the job status to
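
The comments in the new terminate_cluster above explain why the backoff starts high, but not what retry schedule that produces. The standalone sketch below (not part of this commit) computes the delays under stated assumptions: common_utils.Backoff grows the delay by roughly 1.6x per call and caps it at initial_backoff * max_backoff_factor; the constant names and helper function are illustrative only.

# Standalone sketch of the retry schedule implied by the comments above.
# Assumption: the backoff grows the delay by ~1.6x per failed attempt and caps
# it at initial_backoff * max_backoff_factor; this helper only mirrors that
# behavior for illustration and is not SkyPilot code.
INITIAL_BACKOFF = 15  # seconds, matches the diff above
GROWTH_FACTOR = 1.6  # assumed per-retry multiplier
MAX_BACKOFF_FACTOR = 20  # cap multiplier, matches the diff above
MAX_RETRY = 6  # default max_retry of terminate_cluster


def sketch_backoff_schedule() -> list:
    """Return the assumed sleep durations between teardown attempts."""
    delays = []
    delay = float(INITIAL_BACKOFF)
    # With max_retry=6, attempts 1-5 sleep after failing; the 6th failure
    # raises RuntimeError instead of sleeping.
    for _ in range(MAX_RETRY - 1):
        delays.append(min(delay, INITIAL_BACKOFF * MAX_BACKOFF_FACTOR))
        delay *= GROWTH_FACTOR
    return delays


print(sketch_backoff_schedule())
# Roughly [15.0, 24.0, 38.4, 61.4, 98.3]: the 300-second cap is never reached,
# which is what the '1.6 ** 5 = 10.48576 < 20' comment in the diff points out.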

tests/unit_tests/test_recovery_strategy.py renamed to tests/unit_tests/test_jobs_utils.py (+3 -3)

@@ -1,7 +1,7 @@
 from unittest import mock

 from sky.exceptions import ClusterDoesNotExist
-from sky.jobs import recovery_strategy
+from sky.jobs import utils


 @mock.patch('sky.down')
@@ -16,7 +16,7 @@ def test_terminate_cluster_retry_on_value_error(mock_set_internal,
     ]

     # Call should succeed after retries
-    recovery_strategy.terminate_cluster('test-cluster')
+    utils.terminate_cluster('test-cluster')

     # Verify sky.down was called 3 times
     assert mock_sky_down.call_count == 3
@@ -38,7 +38,7 @@ def test_terminate_cluster_handles_nonexistent_cluster(mock_set_internal,
     mock_sky_down.side_effect = ClusterDoesNotExist('test-cluster')

     # Call should succeed silently
-    recovery_strategy.terminate_cluster('test-cluster')
+    utils.terminate_cluster('test-cluster')

     # Verify sky.down was called once
     assert mock_sky_down.call_count == 1
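
The renamed tests above cover the retry and already-down paths but not the new backoff sleep. Since terminate_cluster now waits 15+ seconds between attempts, a test of that path would want to stub out the delay; below is a hedged sketch in the style of the existing tests. The time.sleep patch, the test name, and the set_internal patch target are assumptions, not part of this commit.

from unittest import mock

from sky.jobs import utils


# Hypothetical test, not part of this commit: check that the new backoff path
# sleeps between failed teardown attempts without actually waiting.
@mock.patch('sky.down')
@mock.patch('sky.usage.usage_lib.messages.usage.set_internal')
@mock.patch('time.sleep')
def test_terminate_cluster_backs_off_between_retries(mock_sleep,
                                                     mock_set_internal,
                                                     mock_sky_down) -> None:
    # Fail twice, then succeed on the third attempt.
    mock_sky_down.side_effect = [ValueError('err'), ValueError('err'), None]

    utils.terminate_cluster('test-cluster')

    assert mock_sky_down.call_count == 3
    # One backoff sleep per failed attempt that is retried.
    assert mock_sleep.call_count == 2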
