|
36 | 36 | # 10 * JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 10 * 5 = 50 seconds
|
37 | 37 | MAX_JOB_CHECKING_RETRY = 10
|
38 | 38 |
|
| 39 | +# Idle minutes before a job cluster autodowns. This should be significantly |
| 40 | +# larger than managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing |
| 41 | +# down the cluster before the job controller can update its status. |
| 42 | +_AUTODOWN_MINUTES = 5 |
| 43 | + |
39 | 44 |
|
40 | 45 | def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
|
41 | 46 | """Terminate the cluster."""
|
@@ -302,11 +307,17 @@ def _launch(self,
|
302 | 307 | usage_lib.messages.usage.set_internal()
|
303 | 308 | # Detach setup, so that the setup failure can be detected
|
304 | 309 | # by the controller process (job_status -> FAILED_SETUP).
|
305 |
| - sky.launch(self.dag, |
306 |
| - cluster_name=self.cluster_name, |
307 |
| - detach_setup=True, |
308 |
| - detach_run=True, |
309 |
| - _is_launched_by_jobs_controller=True) |
| 310 | + sky.launch( |
| 311 | + self.dag, |
| 312 | + cluster_name=self.cluster_name, |
| 313 | + # We expect to tear down the cluster as soon as the job is |
| 314 | + # finished. However, in case the controller dies, set |
| 315 | + # autodown to try to avoid a resource leak. |
| 316 | + idle_minutes_to_autostop=_AUTODOWN_MINUTES, |
| 317 | + down=True, |
| 318 | + detach_setup=True, |
| 319 | + detach_run=True, |
| 320 | + _is_launched_by_jobs_controller=True) |
310 | 321 | logger.info('Managed job cluster launched.')
|
311 | 322 | except (exceptions.InvalidClusterNameError,
|
312 | 323 | exceptions.NoCloudAccessError,
|
|
0 commit comments