Skip to content

Commit f8f613d

Browse files
authored
[GCP] Add retry for transient error during launching GCP clusters (#2669)
* Add retry for flaky error during launching GCP clusters
* handle error
* format
* Do not log out stderr
* Add retry for gcloud crash
* fix retry return code
1 parent 491701c commit f8f613d

File tree

2 files changed

+45
-6
lines changed

2 files changed

+45
-6
lines changed

sky/backends/cloud_vm_ray_backend.py

+41-5
Original file line numberDiff line numberDiff line change
@@ -784,7 +784,16 @@ def _update_blocklist_on_gcp_error(
784784
else:
785785
# No such structured error response found.
786786
assert not exception_list, stderr
787-
if 'was not found' in stderr:
787+
if 'Head node fetch timed out' in stderr:
788+
# Example: click.exceptions.ClickException: Head node fetch
789+
# timed out. Failed to create head node.
790+
# This is a transient error, but we have retried in need_ray_up
791+
# and failed. So we skip this zone.
792+
logger.info('Got \'Head node fetch timed out\' in '
793+
f'{zone.name}.')
794+
self._blocked_resources.add(
795+
launchable_resources.copy(zone=zone.name))
796+
elif 'was not found' in stderr:
788797
# Example: The resource
789798
# 'projects/<id>/zones/zone/acceleratorTypes/nvidia-tesla-v100'
790799
# was not found.
@@ -891,7 +900,16 @@ def _update_blocklist_on_azure_error(
891900
in s.strip() or '(ReadOnlyDisabledSubscription)' in s.strip())
892901
]
893902
if not errors:
894-
if 'rsync: command not found' in stderr:
903+
if 'Head node fetch timed out' in stderr:
904+
# Example: click.exceptions.ClickException: Head node fetch
905+
# timed out. Failed to create head node.
906+
# This is a transient error, but we have retried in need_ray_up
907+
# and failed. So we skip this region.
908+
logger.info('Got \'Head node fetch timed out\' in '
909+
f'{region.name}.')
910+
self._blocked_resources.add(
911+
launchable_resources.copy(region=region.name))
912+
elif 'rsync: command not found' in stderr:
895913
with ux_utils.print_exception_no_traceback():
896914
raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
897915
logger.info('====== stdout ======')
@@ -1912,16 +1930,34 @@ def need_ray_up(
19121930
'Retrying due to list request rate limit exceeded.')
19131931
return True
19141932

1933+
# https://github.com/skypilot-org/skypilot/issues/2666
1934+
if ('Head node fetch timed out. Failed to create head node.'
1935+
in stderr):
1936+
logger.info(
1937+
'Retrying head node provisioning due to head fetching '
1938+
'timeout.')
1939+
return True
1940+
19151941
# https://github.com/skypilot-org/skypilot/issues/1797
19161942
# "The resource 'projects/xxx/zones/us-central1-b/instances/ray-yyy-head-<hash>-compute' was not found" # pylint: disable=line-too-long
19171943
pattern = (r'\'code\': \'RESOURCE_NOT_FOUND\'.*The resource'
19181944
r'.*instances\/.*-compute\' was not found')
19191945
result = re.search(pattern, stderr)
19201946
if result is not None:
19211947
# Retry. Unlikely to succeed if it's due to no capacity.
1922-
logger.info(
1923-
'Retrying due to the possibly flaky RESOURCE_NOT_FOUND '
1924-
'error.')
1948+
logger.info('Retrying due to the possibly transient '
1949+
'RESOURCE_NOT_FOUND error.')
1950+
logger.debug(f'-- Stderr --\n{stderr}\n ----')
1951+
return True
1952+
1953+
# "The resource 'projects/skypilot-375900/regions/us-central1/subnetworks/default' is not ready". Details: "[{'message': "The resource 'projects/xxx/regions/us-central1/subnetworks/default' is not ready", 'domain': 'global', 'reason': 'resourceNotReady'}]"> # pylint: disable=line-too-long
1954+
pattern = (r'is not ready(.*)\'reason\': \'resourceNotReady\'')
1955+
result = re.search(pattern, stderr)
1956+
if result is not None:
1957+
# Retry. Unlikely to succeed if it's due to no capacity.
1958+
logger.info('Retrying due to the possibly transient '
1959+
'resourceNotReady error.')
1960+
logger.debug(f'-- Stderr --\n{stderr}\n ----')
19251961
return True
19261962

19271963
if isinstance(to_provision_cloud, clouds.Lambda):

sky/clouds/gcp.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -694,7 +694,10 @@ def _list_reservations_for_instance_type(
694694
)
695695
returncode, stdout, stderr = subprocess_utils.run_with_retries(
696696
list_reservations_cmd,
697-
retry_returncode=[255],
697+
# 1: returned when the connection is aborted (the error message shows
698+
# OSError 22, but the actual return code is 1).
699+
# Example: ERROR: gcloud crashed (ConnectionError): ('Connection aborted.', OSError(22, 'Invalid argument')) # pylint: disable=line-too-long
700+
retry_returncode=[255, 1],
698701
)
699702
subprocess_utils.handle_returncode(
700703
returncode,

0 commit comments

Comments
 (0)