@@ -131,52 +131,7 @@ def create_app_wrapper(self):
        # Validate image configuration
        self.validate_image_config()

-        # Before attempting to create the cluster AW, let's evaluate the ClusterConfig
-
-        name = self.config.name
-        namespace = self.config.namespace
-        head_cpus = self.config.head_cpus
-        head_memory = self.config.head_memory
-        head_gpus = self.config.head_gpus
-        min_cpu = self.config.min_cpus
-        max_cpu = self.config.max_cpus
-        min_memory = self.config.min_memory
-        max_memory = self.config.max_memory
-        gpu = self.config.num_gpus
-        workers = self.config.num_workers
-        template = self.config.template
-        image = self.config.image
-        appwrapper = self.config.appwrapper
-        instance_types = self.config.machine_types
-        env = self.config.envs
-        image_pull_secrets = self.config.image_pull_secrets
-        write_to_file = self.config.write_to_file
-        verify_tls = self.config.verify_tls
-        local_queue = self.config.local_queue
-        labels = self.config.labels
-        return generate_appwrapper(
-            name=name,
-            namespace=namespace,
-            head_cpus=head_cpus,
-            head_memory=head_memory,
-            head_gpus=head_gpus,
-            min_cpu=min_cpu,
-            max_cpu=max_cpu,
-            min_memory=min_memory,
-            max_memory=max_memory,
-            gpu=gpu,
-            workers=workers,
-            template=template,
-            image=image,
-            appwrapper=appwrapper,
-            instance_types=instance_types,
-            env=env,
-            image_pull_secrets=image_pull_secrets,
-            write_to_file=write_to_file,
-            verify_tls=verify_tls,
-            local_queue=local_queue,
-            labels=labels,
-        )
+        return generate_appwrapper(self)

    # creates a new cluster with the provided or default spec
    def up(self):
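
Note on the hunk above: the long keyword-argument hand-off is collapsed into a single call, so generate_appwrapper now receives the Cluster instance and reads the same fields off self.config itself. A minimal sketch of the receiving side, assuming the generator simply unpacks cluster.config (the body below is illustrative, not the actual generator):

    def generate_appwrapper(cluster):
        # Hypothetical sketch: read the values the deleted block used to
        # forward, straight from the ClusterConfiguration on the Cluster.
        config = cluster.config
        name, namespace = config.name, config.namespace
        workers, image = config.num_workers, config.image
        # ... build the AppWrapper/RayCluster resource from these fields ...
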
@@ -460,6 +415,29 @@ def job_logs(self, job_id: str) -> str:
        """
        return self.job_client.get_job_logs(job_id)

+    @staticmethod
+    def _head_worker_resources_from_rc_dict(rc: Dict) -> Tuple[dict, dict]:
+        head_custom_resources, worker_custom_resources = {}, {}
+        for resource in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+            "containers"
+        ][0]["resources"]["limits"].keys():
+            if resource in ["memory", "cpu"]:
+                continue
+            worker_custom_resources[resource] = rc["spec"]["workerGroupSpecs"][0][
+                "template"
+            ]["spec"]["containers"][0]["resources"]["limits"][resource]
+
+        for resource in rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
+            0
+        ]["resources"]["limits"].keys():
+            if resource in ["memory", "cpu"]:
+                continue
+            head_custom_resources[resource] = rc["spec"]["headGroupSpec"][
+                "template"
+            ]["spec"]["containers"][0]["resources"]["limits"][resource]
+
+        return head_custom_resources, worker_custom_resources
+
    def from_k8_cluster_object(
        rc,
        appwrapper=True,
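
The new helper walks the resources.limits of the first head and worker containers and keeps every key that is not cpu or memory, so nvidia.com/gpu and any other extended resource come back in the two dicts. A hand-built minimal RayCluster dict (not a real cluster object) shows the expected shape:

    head_limits = {"cpu": "2", "memory": "8G"}
    worker_limits = {"cpu": "4", "memory": "16G", "nvidia.com/gpu": "1"}
    rc = {
        "spec": {
            "headGroupSpec": {
                "template": {"spec": {"containers": [{"resources": {"limits": head_limits}}]}}
            },
            "workerGroupSpecs": [
                {"template": {"spec": {"containers": [{"resources": {"limits": worker_limits}}]}}}
            ],
        }
    }
    head, worker = Cluster._head_worker_resources_from_rc_dict(rc)
    # head == {}  (the head container only sets cpu/memory)
    # worker == {"nvidia.com/gpu": "1"}
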
@@ -473,6 +451,11 @@ def from_k8_cluster_object(
            else []
        )

+        (
+            head_custom_resources,
+            worker_custom_resources,
+        ) = Cluster._head_worker_resources_from_rc_dict(rc)
+
        cluster_config = ClusterConfiguration(
            name=rc["metadata"]["name"],
            namespace=rc["metadata"]["namespace"],
@@ -490,11 +473,8 @@ def from_k8_cluster_object(
            max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
                "containers"
            ][0]["resources"]["limits"]["memory"],
-            num_gpus=int(
-                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
-                    "resources"
-                ]["limits"]["nvidia.com/gpu"]
-            ),
+            worker_custom_resource_requests=worker_custom_resources,
+            head_custom_resource_requests=head_custom_resources,
            image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
                0
            ]["image"],
@@ -875,6 +855,11 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
        protocol = "https"
    dashboard_url = f"{protocol}://{ingress.spec.rules[0].host}"

+    (
+        head_custom_resources,
+        worker_custom_resources,
+    ) = Cluster._head_worker_resources_from_rc_dict(rc)
+
    return RayCluster(
        name=rc["metadata"]["name"],
        status=status,
@@ -889,17 +874,15 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
        worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
            0
        ]["resources"]["limits"]["cpu"],
-        worker_gpu=0,  # hard to detect currently how many gpus, can override it with what the user asked for
+        worker_custom_resources=worker_custom_resources,
        namespace=rc["metadata"]["namespace"],
        head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
            "resources"
        ]["limits"]["cpu"],
        head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
            "resources"
        ]["limits"]["memory"],
-        head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
-            "resources"
-        ]["limits"]["nvidia.com/gpu"],
+        head_custom_resources=head_custom_resources,
        dashboard=dashboard_url,
    )
@@ -924,12 +907,12 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
        worker_mem_min=cluster.config.min_memory,
        worker_mem_max=cluster.config.max_memory,
        worker_cpu=cluster.config.min_cpus,
-        worker_gpu=cluster.config.num_gpus,
+        worker_custom_resources=cluster.config.worker_custom_resource_requests,
        namespace=cluster.config.namespace,
        dashboard=cluster.cluster_dashboard_uri(),
        head_cpus=cluster.config.head_cpus,
        head_mem=cluster.config.head_memory,
-        head_gpu=cluster.config.head_gpus,
+        head_custom_resources=cluster.config.head_custom_resource_requests,
    )
    if ray.status == CodeFlareClusterStatus.READY:
        ray.status = RayClusterStatus.READY
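
Both _map_to_ray_cluster and _copy_to_ray now populate dict-valued fields instead of single GPU counts, so code that previously read worker_gpu/head_gpu would look the counts up by resource key. An assumed usage sketch (only the attribute names appear in the diff; the .get access pattern and None-guard are illustrative):

    ray = _copy_to_ray(cluster)  # or the result of _map_to_ray_cluster(rc)
    worker_gpus = (ray.worker_custom_resources or {}).get("nvidia.com/gpu", 0)
    head_gpus = (ray.head_custom_resources or {}).get("nvidia.com/gpu", 0)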