Skip to content

Commit 4e46cf4

Browse files
[k8s] Enable multiple kubernetes contexts for failover (#3968)
* wip * Fix * format * format * Fix context and namespace used * update * fix * Fix feasibility check * fix image for k8s * patch k8s tests * format * format * format * Fix tests * avoid -s * Fix acc detection * format * Update docs/source/reference/config.rst Co-authored-by: Romil Bhardwaj <[email protected]> * refactor a little * Add docs for k8s context update * Use all pods in a context * Add policy * Fix unsupported features and other kube calls * Add policies * Fix backward compatibility * Add smoke test * set * fix typing * Add check for local k8s cluster in smoke test * Add skypilot config * Fix smoke * Make logging log once * format * format --------- Co-authored-by: Romil Bhardwaj <[email protected]>
1 parent 4740ea8 commit 4e46cf4

File tree

21 files changed

+599
-165
lines changed

21 files changed

+599
-165
lines changed

.github/workflows/pytest.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,4 +57,4 @@ jobs:
5757
pip install pytest pytest-xdist pytest-env>=0.6 memory-profiler==0.61.0
5858
5959
- name: Run tests with pytest
60-
run: SKYPILOT_DISABLE_USAGE_COLLECTION=1 SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK=1 pytest -n 1 --dist no ${{ matrix.test-path }}
60+
run: SKYPILOT_DISABLE_USAGE_COLLECTION=1 SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK=1 pytest -n 0 --dist no ${{ matrix.test-path }}

docs/source/cloud-setup/policy.rst

+16
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Example usage:
1313
- :ref:`disable-public-ip-policy`
1414
- :ref:`use-spot-for-gpu-policy`
1515
- :ref:`enforce-autostop-policy`
16+
- :ref:`dynamic-kubernetes-contexts-update-policy`
1617

1718

1819
To implement and use an admin policy:
@@ -193,3 +194,18 @@ Enforce Autostop for all Tasks
193194
.. literalinclude:: ../../../examples/admin_policy/enforce_autostop.yaml
194195
:language: yaml
195196
:caption: `Config YAML for using EnforceAutostopPolicy <https://github.com/skypilot-org/skypilot/blob/master/examples/admin_policy/enforce_autostop.yaml>`_
197+
198+
199+
.. _dynamic-kubernetes-contexts-update-policy:
200+
201+
Dynamically Update Kubernetes Contexts to Use
202+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
203+
204+
.. literalinclude:: ../../../examples/admin_policy/example_policy/example_policy/skypilot_policy.py
205+
:language: python
206+
:pyobject: DynamicKubernetesContextsUpdatePolicy
207+
:caption: `DynamicKubernetesContextsUpdatePolicy <https://github.com/skypilot-org/skypilot/blob/master/examples/admin_policy/example_policy/example_policy/skypilot_policy.py>`_
208+
209+
.. literalinclude:: ../../../examples/admin_policy/dynamic_kubernetes_contexts_update.yaml
210+
:language: yaml
211+
:caption: `Config YAML for using DynamicKubernetesContextsUpdatePolicy <https://github.com/skypilot-org/skypilot/blob/master/examples/admin_policy/dynamic_kubernetes_contexts_update.yaml>`_

docs/source/reference/config.rst

+13
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,19 @@ Available fields and semantics:
495495
# Default: 'SERVICE_ACCOUNT'.
496496
remote_identity: my-k8s-service-account
497497
498+
# Allowed context names to use for Kubernetes clusters (optional).
499+
#
500+
# SkyPilot will try provisioning and fail over across Kubernetes contexts in the
501+
# same order as they are specified here. E.g., SkyPilot will try using
502+
# context1 first. If it is out of resources or unreachable, it will fail over
503+
# and try context2.
504+
#
505+
# If not specified, only the current active context is used for launching
506+
# new clusters.
507+
allowed_contexts:
508+
- context1
509+
- context2
510+
498511
# Attach custom metadata to Kubernetes objects created by SkyPilot
499512
#
500513
# Uses the same schema as Kubernetes metadata object: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.26/#objectmeta-v1-meta
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
admin_policy: example_policy.DynamicKubernetesContextsUpdatePolicy
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Example admin policy module and prebuilt policies."""
22
from example_policy.skypilot_policy import AddLabelsPolicy
33
from example_policy.skypilot_policy import DisablePublicIpPolicy
4+
from example_policy.skypilot_policy import DynamicKubernetesContextsUpdatePolicy
45
from example_policy.skypilot_policy import EnforceAutostopPolicy
56
from example_policy.skypilot_policy import RejectAllPolicy
67
from example_policy.skypilot_policy import UseSpotForGpuPolicy

examples/admin_policy/example_policy/example_policy/skypilot_policy.py

+46
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
"""Example prebuilt admin policies."""
2+
import subprocess
3+
24
import sky
35

46

@@ -119,3 +121,47 @@ def validate_and_mutate(
119121
return sky.MutatedUserRequest(
120122
task=user_request.task,
121123
skypilot_config=user_request.skypilot_config)
124+
125+
126+
def update_current_kubernetes_clusters_from_registry():
127+
"""Mock implementation of updating kubernetes clusters from registry."""
128+
# All cluster names can be fetched from an organization's internal API.
129+
NEW_CLUSTER_NAMES = ['my-cluster']
130+
for cluster_name in NEW_CLUSTER_NAMES:
131+
# Update the local kubeconfig with the new cluster credentials.
132+
subprocess.run(
133+
f'gcloud container clusters get-credentials {cluster_name} '
134+
'--region us-central1-c',
135+
shell=True,
136+
check=False)
137+
138+
139+
def get_allowed_contexts():
140+
"""Mock implementation of getting allowed kubernetes contexts."""
141+
from sky.provision.kubernetes import utils
142+
contexts = utils.get_all_kube_config_context_names()
143+
return contexts[:2]
144+
145+
146+
class DynamicKubernetesContextsUpdatePolicy(sky.AdminPolicy):
147+
"""Example policy: update the kubernetes context to use."""
148+
149+
@classmethod
150+
def validate_and_mutate(
151+
cls, user_request: sky.UserRequest) -> sky.MutatedUserRequest:
152+
"""Updates the kubernetes context to use."""
153+
# Append any new kubernetes clusters in local kubeconfig. An example
154+
# implementation of this method can be:
155+
# 1. Query an organization's internal Kubernetes cluster registry,
156+
# which can be some internal API, or a secret vault.
157+
# 2. Append the new credentials to the local kubeconfig.
158+
update_current_kubernetes_clusters_from_registry()
159+
# Get the allowed contexts for the user. Similarly, it can retrieve
160+
# the latest allowed contexts from an organization's internal API.
161+
allowed_contexts = get_allowed_contexts()
162+
163+
# Update the kubernetes allowed contexts in skypilot config.
164+
config = user_request.skypilot_config
165+
config.set_nested(('kubernetes', 'allowed_contexts'), allowed_contexts)
166+
return sky.MutatedUserRequest(task=user_request.task,
167+
skypilot_config=config)

sky/adaptors/kubernetes.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -75,15 +75,17 @@ def _load_config(context: Optional[str] = None):
7575
suffix += f' Error: {str(e)}'
7676
# Check if exception was due to no current-context
7777
if 'Expected key current-context' in str(e):
78-
err_str = ('Failed to load Kubernetes configuration. '
79-
'Kubeconfig does not contain any valid context(s).'
80-
f'{suffix}\n'
81-
' If you were running a local Kubernetes '
82-
'cluster, run `sky local up` to start the cluster.')
78+
err_str = (
79+
f'Failed to load Kubernetes configuration for {context!r}. '
80+
'Kubeconfig does not contain any valid context(s).'
81+
f'{suffix}\n'
82+
' If you were running a local Kubernetes '
83+
'cluster, run `sky local up` to start the cluster.')
8384
else:
84-
err_str = ('Failed to load Kubernetes configuration. '
85-
'Please check if your kubeconfig file exists at '
86-
f'~/.kube/config and is valid.{suffix}')
85+
err_str = (
86+
f'Failed to load Kubernetes configuration for {context!r}. '
87+
'Please check if your kubeconfig file exists at '
88+
f'~/.kube/config and is valid.{suffix}')
8789
err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
8890
with ux_utils.print_exception_no_traceback():
8991
raise ValueError(err_str) from None

sky/authentication.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -378,11 +378,11 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
378378
public_key_path = os.path.expanduser(PUBLIC_SSH_KEY_PATH)
379379
secret_name = clouds.Kubernetes.SKY_SSH_KEY_SECRET_NAME
380380
secret_field_name = clouds.Kubernetes().ssh_key_secret_field_name
381-
namespace = config['provider'].get(
382-
'namespace',
383-
kubernetes_utils.get_current_kube_config_context_namespace())
384381
context = config['provider'].get(
385382
'context', kubernetes_utils.get_current_kube_config_context_name())
383+
namespace = config['provider'].get(
384+
'namespace',
385+
kubernetes_utils.get_kube_config_context_namespace(context))
386386
k8s = kubernetes.kubernetes
387387
with open(public_key_path, 'r', encoding='utf-8') as f:
388388
public_key = f.read()
@@ -425,8 +425,8 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
425425
ssh_jump_name,
426426
nodeport_mode,
427427
private_key_path=private_key_path,
428-
namespace=namespace,
429-
context=context)
428+
context=context,
429+
namespace=namespace)
430430
elif network_mode == port_forward_mode:
431431
# Using `kubectl port-forward` creates a direct tunnel to the pod and
432432
# does not require a ssh jump pod.
@@ -441,7 +441,11 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
441441
# on GKE.
442442
ssh_target = config['cluster_name'] + '-head'
443443
ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
444-
ssh_target, port_forward_mode, private_key_path=private_key_path)
444+
ssh_target,
445+
port_forward_mode,
446+
private_key_path=private_key_path,
447+
context=context,
448+
namespace=namespace)
445449
else:
446450
# This should never happen because we check for this in from_str above.
447451
raise ValueError(f'Unsupported networking mode: {network_mode_str}')

sky/backends/cloud_vm_ray_backend.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -2082,7 +2082,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
20822082
"""
20832083
# Bump if any fields get added/removed/changed, and add backward
20842084
# compatibility logic in __setstate__.
2085-
_VERSION = 8
2085+
_VERSION = 9
20862086

20872087
def __init__(
20882088
self,
@@ -2516,6 +2516,19 @@ def __setstate__(self, state):
25162516
if version < 8:
25172517
self.cached_cluster_info = None
25182518

2519+
if version < 9:
2520+
# For backward compatibility, we should update the region of a
2521+
# SkyPilot cluster on Kubernetes to the actual context it is using.
2522+
# pylint: disable=import-outside-toplevel
2523+
launched_resources = state['launched_resources']
2524+
if isinstance(launched_resources.cloud, clouds.Kubernetes):
2525+
yaml_config = common_utils.read_yaml(
2526+
os.path.expanduser(state['_cluster_yaml']))
2527+
context = kubernetes_utils.get_context_from_config(
2528+
yaml_config['provider'])
2529+
state['launched_resources'] = launched_resources.copy(
2530+
region=context)
2531+
25192532
self.__dict__.update(state)
25202533

25212534
# Because the update_cluster_ips and update_ssh_ports

sky/cli.py

+10-10
Original file line numberDiff line numberDiff line change
@@ -3026,14 +3026,11 @@ def show_gpus(
30263026
kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
30273027
sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())
30283028

3029-
if cloud_is_kubernetes and region is not None:
3030-
raise click.UsageError(
3031-
'The --region flag cannot be set with --cloud kubernetes.')
3032-
30333029
def _list_to_str(lst):
30343030
return ', '.join([str(e) for e in lst])
30353031

30363032
def _get_kubernetes_realtime_gpu_table(
3033+
context: Optional[str] = None,
30373034
name_filter: Optional[str] = None,
30383035
quantity_filter: Optional[int] = None):
30393036
if quantity_filter:
@@ -3048,7 +3045,7 @@ def _get_kubernetes_realtime_gpu_table(
30483045
gpus_only=True,
30493046
clouds='kubernetes',
30503047
name_filter=name_filter,
3051-
region_filter=region,
3048+
region_filter=context,
30523049
quantity_filter=quantity_filter,
30533050
case_sensitive=False)
30543051
assert (set(counts.keys()) == set(capacity.keys()) == set(
@@ -3078,11 +3075,11 @@ def _get_kubernetes_realtime_gpu_table(
30783075
])
30793076
return realtime_gpu_table
30803077

3081-
def _get_kubernetes_node_info_table():
3078+
def _get_kubernetes_node_info_table(context: Optional[str]):
30823079
node_table = log_utils.create_table(
30833080
['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])
30843081

3085-
node_info_dict = kubernetes_utils.get_kubernetes_node_info()
3082+
node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
30863083
for node_name, node_info in node_info_dict.items():
30873084
node_table.add_row([
30883085
node_name, node_info.gpu_type,
@@ -3116,11 +3113,13 @@ def _output():
31163113
print_section_titles = False
31173114
# If cloud is kubernetes, we want to show real-time capacity
31183115
if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes):
3116+
context = region
31193117
try:
31203118
# If --cloud kubernetes is not specified, we want to catch
31213119
# the case where no GPUs are available on the cluster and
31223120
# print the warning at the end.
3123-
k8s_realtime_table = _get_kubernetes_realtime_gpu_table()
3121+
k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
3122+
context)
31243123
except ValueError as e:
31253124
if not cloud_is_kubernetes:
31263125
# Make it a note if cloud is not kubernetes
@@ -3129,9 +3128,10 @@ def _output():
31293128
else:
31303129
print_section_titles = True
31313130
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3132-
f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
3131+
f'Kubernetes GPUs (Context: {context})'
3132+
f'{colorama.Style.RESET_ALL}\n')
31333133
yield from k8s_realtime_table.get_string()
3134-
k8s_node_table = _get_kubernetes_node_info_table()
3134+
k8s_node_table = _get_kubernetes_node_info_table(context)
31353135
yield '\n\n'
31363136
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
31373137
f'Kubernetes per node GPU availability'

0 commit comments

Comments
 (0)