Skip to content

Commit c0f7d7f

Browse files
authored
Migrate from MCAD to AppWrapper v1beta2 (#521)
* rename mcad to appwrapper * remove dispatch_priority (not supported by v1beta2 AppWrapper) * remove instascale * remove priority/affinity from template -- not compatible with Kueue * make mocked objects easier to maintain by removing unnecessary metadata * port appwrapper status to v1beta2 names * prune mocked appwrappers * eliminate dependency on workload.codeflare.dev/appwrapper label * Finish converting AppWrappers to v1beta2 * fix incomplete rebase * rebase: remove instascale from new testcase * add e2e test for appwrapper containing a raycluster * Also must add local_queue label to appwrappers * user labels should also be added to ray cluster wrapped in appwrapper * fix more incorrect test cases that were assuming that appwrappers don't get a localqueue * sdk_user must have rbacs to create appwrappers for e2e test to succeed * elide AppWrappers from top-level documentation
1 parent 56b4478 commit c0f7d7f

17 files changed

+1166
-2112
lines changed

.github/workflows/e2e_tests.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ jobs:
114114
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
115115
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
116116
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
117+
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
118+
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
117119
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
118120
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
119121
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues

docs/cluster-configuration.md

+3-7
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,13 @@ cluster = Cluster(ClusterConfiguration(
1818
max_cpus=1, # Default 1
1919
min_memory=2, # Default 2
2020
max_memory=2, # Default 2
21-
mcad=True, # Default True
21+
num_gpus=0, # Default 0
2222
image="quay.io/project-codeflare/ray:latest-py39-cu118", # Mandatory Field
2323
machine_types=["m5.xlarge", "g4dn.xlarge"],
2424
labels={"exampleLabel": "example", "secondLabel": "example"},
2525
))
2626
```
2727

28-
Upon creating a cluster configuration with `mcad=True` an appwrapper will be created featuring the Ray Cluster and any Routes, Ingresses or Secrets that are needed to be created along side it.<br>
29-
From there a user can call `cluster.up()` and `cluster.down()` to create and remove the appwrapper thus creating and removing the Ray Cluster.
30-
31-
In cases where `mcad=False` a yaml file will be created with the individual Ray Cluster, Route/Ingress and Secret included.<br>
32-
The Ray Cluster and service will be created by KubeRay directly and the other components will be individually created.
33-
3428
The `labels={"exampleLabel": "example"}` parameter can be used to apply additional labels to the RayCluster resource.
29+
30+
After creating their`cluster`, a user can call `cluster.up()` and `cluster.down()` to respectively create or remove the Ray Cluster.

src/codeflare_sdk/cluster/awload.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def submit(self) -> None:
6262
api_instance = client.CustomObjectsApi(api_config_handler())
6363
api_instance.create_namespaced_custom_object(
6464
group="workload.codeflare.dev",
65-
version="v1beta1",
65+
version="v1beta2",
6666
namespace=self.namespace,
6767
plural="appwrappers",
6868
body=self.awyaml,
@@ -87,7 +87,7 @@ def remove(self) -> None:
8787
api_instance = client.CustomObjectsApi(api_config_handler())
8888
api_instance.delete_namespaced_custom_object(
8989
group="workload.codeflare.dev",
90-
version="v1beta1",
90+
version="v1beta2",
9191
namespace=self.namespace,
9292
plural="appwrappers",
9393
name=self.name,

src/codeflare_sdk/cluster/cluster.py

+28-72
Original file line numberDiff line numberDiff line change
@@ -103,26 +103,6 @@ def job_client(self):
103103
)
104104
return self._job_submission_client
105105

106-
def evaluate_dispatch_priority(self):
107-
priority_class = self.config.dispatch_priority
108-
109-
try:
110-
config_check()
111-
api_instance = client.CustomObjectsApi(api_config_handler())
112-
priority_classes = api_instance.list_cluster_custom_object(
113-
group="scheduling.k8s.io",
114-
version="v1",
115-
plural="priorityclasses",
116-
)
117-
except Exception as e: # pragma: no cover
118-
return _kube_api_error_handling(e)
119-
120-
for pc in priority_classes["items"]:
121-
if pc["metadata"]["name"] == priority_class:
122-
return pc["value"]
123-
print(f"Priority class {priority_class} is not available in the cluster")
124-
return None
125-
126106
def validate_image_config(self):
127107
"""
128108
Validates that the image configuration is not empty.
@@ -152,18 +132,6 @@ def create_app_wrapper(self):
152132
self.validate_image_config()
153133

154134
# Before attempting to create the cluster AW, let's evaluate the ClusterConfig
155-
if self.config.dispatch_priority:
156-
if not self.config.mcad:
157-
raise ValueError(
158-
"Invalid Cluster Configuration, cannot have dispatch priority without MCAD"
159-
)
160-
priority_val = self.evaluate_dispatch_priority()
161-
if priority_val == None:
162-
raise ValueError(
163-
"Invalid Cluster Configuration, AppWrapper not generated"
164-
)
165-
else:
166-
priority_val = None
167135

168136
name = self.config.name
169137
namespace = self.config.namespace
@@ -178,12 +146,10 @@ def create_app_wrapper(self):
178146
workers = self.config.num_workers
179147
template = self.config.template
180148
image = self.config.image
181-
instascale = self.config.instascale
182-
mcad = self.config.mcad
149+
appwrapper = self.config.appwrapper
183150
instance_types = self.config.machine_types
184151
env = self.config.envs
185152
image_pull_secrets = self.config.image_pull_secrets
186-
dispatch_priority = self.config.dispatch_priority
187153
write_to_file = self.config.write_to_file
188154
verify_tls = self.config.verify_tls
189155
local_queue = self.config.local_queue
@@ -202,13 +168,10 @@ def create_app_wrapper(self):
202168
workers=workers,
203169
template=template,
204170
image=image,
205-
instascale=instascale,
206-
mcad=mcad,
171+
appwrapper=appwrapper,
207172
instance_types=instance_types,
208173
env=env,
209174
image_pull_secrets=image_pull_secrets,
210-
dispatch_priority=dispatch_priority,
211-
priority_val=priority_val,
212175
write_to_file=write_to_file,
213176
verify_tls=verify_tls,
214177
local_queue=local_queue,
@@ -230,13 +193,13 @@ def up(self):
230193
try:
231194
config_check()
232195
api_instance = client.CustomObjectsApi(api_config_handler())
233-
if self.config.mcad:
196+
if self.config.appwrapper:
234197
if self.config.write_to_file:
235198
with open(self.app_wrapper_yaml) as f:
236199
aw = yaml.load(f, Loader=yaml.FullLoader)
237200
api_instance.create_namespaced_custom_object(
238201
group="workload.codeflare.dev",
239-
version="v1beta1",
202+
version="v1beta2",
240203
namespace=namespace,
241204
plural="appwrappers",
242205
body=aw,
@@ -245,7 +208,7 @@ def up(self):
245208
aw = yaml.safe_load(self.app_wrapper_yaml)
246209
api_instance.create_namespaced_custom_object(
247210
group="workload.codeflare.dev",
248-
version="v1beta1",
211+
version="v1beta2",
249212
namespace=namespace,
250213
plural="appwrappers",
251214
body=aw,
@@ -284,10 +247,10 @@ def down(self):
284247
try:
285248
config_check()
286249
api_instance = client.CustomObjectsApi(api_config_handler())
287-
if self.config.mcad:
250+
if self.config.appwrapper:
288251
api_instance.delete_namespaced_custom_object(
289252
group="workload.codeflare.dev",
290-
version="v1beta1",
253+
version="v1beta2",
291254
namespace=namespace,
292255
plural="appwrappers",
293256
name=self.app_wrapper_name,
@@ -306,30 +269,28 @@ def status(
306269
"""
307270
ready = False
308271
status = CodeFlareClusterStatus.UNKNOWN
309-
if self.config.mcad:
272+
if self.config.appwrapper:
310273
# check the app wrapper status
311274
appwrapper = _app_wrapper_status(self.config.name, self.config.namespace)
312275
if appwrapper:
313276
if appwrapper.status in [
314-
AppWrapperStatus.RUNNING,
315-
AppWrapperStatus.COMPLETED,
316-
AppWrapperStatus.RUNNING_HOLD_COMPLETION,
277+
AppWrapperStatus.RESUMING,
278+
AppWrapperStatus.RESETTING,
317279
]:
318280
ready = False
319281
status = CodeFlareClusterStatus.STARTING
320282
elif appwrapper.status in [
321283
AppWrapperStatus.FAILED,
322-
AppWrapperStatus.DELETED,
323284
]:
324285
ready = False
325286
status = CodeFlareClusterStatus.FAILED # should deleted be separate
326287
return status, ready # exit early, no need to check ray status
327288
elif appwrapper.status in [
328-
AppWrapperStatus.PENDING,
329-
AppWrapperStatus.QUEUEING,
289+
AppWrapperStatus.SUSPENDED,
290+
AppWrapperStatus.SUSPENDING,
330291
]:
331292
ready = False
332-
if appwrapper.status == AppWrapperStatus.PENDING:
293+
if appwrapper.status == AppWrapperStatus.SUSPENDED:
333294
status = CodeFlareClusterStatus.QUEUED
334295
else:
335296
status = CodeFlareClusterStatus.QUEUEING
@@ -501,7 +462,7 @@ def job_logs(self, job_id: str) -> str:
501462

502463
def from_k8_cluster_object(
503464
rc,
504-
mcad=True,
465+
appwrapper=True,
505466
write_to_file=False,
506467
verify_tls=True,
507468
):
@@ -534,11 +495,10 @@ def from_k8_cluster_object(
534495
"resources"
535496
]["limits"]["nvidia.com/gpu"]
536497
),
537-
instascale=True if machine_types else False,
538498
image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
539499
0
540500
]["image"],
541-
mcad=mcad,
501+
appwrapper=appwrapper,
542502
write_to_file=write_to_file,
543503
verify_tls=verify_tls,
544504
local_queue=rc["metadata"]
@@ -597,15 +557,15 @@ def list_all_clusters(namespace: str, print_to_console: bool = True):
597557
return clusters
598558

599559

600-
def list_all_queued(namespace: str, print_to_console: bool = True, mcad: bool = False):
560+
def list_all_queued(
561+
namespace: str, print_to_console: bool = True, appwrapper: bool = False
562+
):
601563
"""
602564
Returns (and prints by default) a list of all currently queued-up Ray Clusters
603565
in a given namespace.
604566
"""
605-
if mcad:
606-
resources = _get_app_wrappers(
607-
namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING]
608-
)
567+
if appwrapper:
568+
resources = _get_app_wrappers(namespace, filter=[AppWrapperStatus.SUSPENDED])
609569
if print_to_console:
610570
pretty_print.print_app_wrappers_status(resources)
611571
else:
@@ -675,10 +635,10 @@ def get_cluster(
675635

676636
for rc in rcs["items"]:
677637
if rc["metadata"]["name"] == cluster_name:
678-
mcad = _check_aw_exists(cluster_name, namespace)
638+
appwrapper = _check_aw_exists(cluster_name, namespace)
679639
return Cluster.from_k8_cluster_object(
680640
rc,
681-
mcad=mcad,
641+
appwrapper=appwrapper,
682642
write_to_file=write_to_file,
683643
verify_tls=verify_tls,
684644
)
@@ -721,7 +681,7 @@ def _check_aw_exists(name: str, namespace: str) -> bool:
721681
api_instance = client.CustomObjectsApi(api_config_handler())
722682
aws = api_instance.list_namespaced_custom_object(
723683
group="workload.codeflare.dev",
724-
version="v1beta1",
684+
version="v1beta2",
725685
namespace=namespace,
726686
plural="appwrappers",
727687
)
@@ -781,7 +741,7 @@ def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]:
781741
api_instance = client.CustomObjectsApi(api_config_handler())
782742
aws = api_instance.list_namespaced_custom_object(
783743
group="workload.codeflare.dev",
784-
version="v1beta1",
744+
version="v1beta2",
785745
namespace=namespace,
786746
plural="appwrappers",
787747
)
@@ -851,7 +811,7 @@ def _get_app_wrappers(
851811
api_instance = client.CustomObjectsApi(api_config_handler())
852812
aws = api_instance.list_namespaced_custom_object(
853813
group="workload.codeflare.dev",
854-
version="v1beta1",
814+
version="v1beta2",
855815
namespace=namespace,
856816
plural="appwrappers",
857817
)
@@ -945,18 +905,14 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
945905

946906

947907
def _map_to_app_wrapper(aw) -> AppWrapper:
948-
if "status" in aw and "canrun" in aw["status"]:
908+
if "status" in aw:
949909
return AppWrapper(
950910
name=aw["metadata"]["name"],
951-
status=AppWrapperStatus(aw["status"]["state"].lower()),
952-
can_run=aw["status"]["canrun"],
953-
job_state=aw["status"]["queuejobstate"],
911+
status=AppWrapperStatus(aw["status"]["phase"].lower()),
954912
)
955913
return AppWrapper(
956914
name=aw["metadata"]["name"],
957-
status=AppWrapperStatus("queueing"),
958-
can_run=False,
959-
job_state="Still adding to queue",
915+
status=AppWrapperStatus("suspended"),
960916
)
961917

962918

src/codeflare_sdk/cluster/config.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,10 @@ class ClusterConfiguration:
4646
max_memory: typing.Union[int, str] = 2
4747
num_gpus: int = 0
4848
template: str = f"{dir}/templates/base-template.yaml"
49-
instascale: bool = False
50-
mcad: bool = False
49+
appwrapper: bool = False
5150
envs: dict = field(default_factory=dict)
5251
image: str = ""
5352
image_pull_secrets: list = field(default_factory=list)
54-
dispatch_priority: str = None
5553
write_to_file: bool = False
5654
verify_tls: bool = True
5755
labels: dict = field(default_factory=dict)

src/codeflare_sdk/cluster/model.py

+7-8
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,17 @@ class RayClusterStatus(Enum):
3737

3838
class AppWrapperStatus(Enum):
3939
"""
40-
Defines the possible reportable states of an AppWrapper.
40+
Defines the possible reportable phases of an AppWrapper.
4141
"""
4242

43-
QUEUEING = "queueing"
44-
PENDING = "pending"
43+
SUSPENDED = "suspended"
44+
RESUMING = "resuming"
4545
RUNNING = "running"
46+
RESETTING = "resetting"
47+
SUSPENDING = "suspending"
48+
SUCCEEDED = "succeeded"
4649
FAILED = "failed"
47-
DELETED = "deleted"
48-
COMPLETED = "completed"
49-
RUNNING_HOLD_COMPLETION = "runningholdcompletion"
50+
TERMINATING = "terminating"
5051

5152

5253
class CodeFlareClusterStatus(Enum):
@@ -91,5 +92,3 @@ class AppWrapper:
9192

9293
name: str
9394
status: AppWrapperStatus
94-
can_run: bool
95-
job_state: str

0 commit comments

Comments
 (0)