@@ -131,48 +131,7 @@ def create_app_wrapper(self):
        # Validate image configuration
        self.validate_image_config()

-        # Before attempting to create the cluster AW, let's evaluate the ClusterConfig
-
-        name = self.config.name
-        namespace = self.config.namespace
-        head_cpus = self.config.head_cpus
-        head_memory = self.config.head_memory
-        num_head_gpus = self.config.num_head_gpus
-        worker_cpu_requests = self.config.worker_cpu_requests
-        worker_cpu_limits = self.config.worker_cpu_limits
-        worker_memory_requests = self.config.worker_memory_requests
-        worker_memory_limits = self.config.worker_memory_limits
-        num_worker_gpus = self.config.num_worker_gpus
-        workers = self.config.num_workers
-        template = self.config.template
-        image = self.config.image
-        appwrapper = self.config.appwrapper
-        env = self.config.envs
-        image_pull_secrets = self.config.image_pull_secrets
-        write_to_file = self.config.write_to_file
-        local_queue = self.config.local_queue
-        labels = self.config.labels
-        return generate_appwrapper(
-            name=name,
-            namespace=namespace,
-            head_cpus=head_cpus,
-            head_memory=head_memory,
-            num_head_gpus=num_head_gpus,
-            worker_cpu_requests=worker_cpu_requests,
-            worker_cpu_limits=worker_cpu_limits,
-            worker_memory_requests=worker_memory_requests,
-            worker_memory_limits=worker_memory_limits,
-            num_worker_gpus=num_worker_gpus,
-            workers=workers,
-            template=template,
-            image=image,
-            appwrapper=appwrapper,
-            env=env,
-            image_pull_secrets=image_pull_secrets,
-            write_to_file=write_to_file,
-            local_queue=local_queue,
-            labels=labels,
-        )
+        return generate_appwrapper(self)

    # creates a new cluster with the provided or default spec
    def up(self):
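The hunk above collapses create_app_wrapper: instead of unpacking every ClusterConfiguration field into keyword arguments, the method now hands the whole Cluster object to generate_appwrapper. A minimal sketch of what the receiving side presumably looks like after this change; the body below is illustrative only and is not taken from this patch:

    def generate_appwrapper(cluster):
        # Assumption: the former keyword arguments are now read off cluster.config directly.
        config = cluster.config
        spec = {
            "name": config.name,
            "namespace": config.namespace,
            "workers": config.num_workers,
            "image": config.image,
            "labels": config.labels,
        }
        return spec  # the real function renders the AppWrapper/RayCluster YAML from these values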
@@ -456,6 +415,29 @@ def job_logs(self, job_id: str) -> str:
        """
        return self.job_client.get_job_logs(job_id)

+    @staticmethod
+    def _head_worker_extended_resources_from_rc_dict(rc: Dict) -> Tuple[dict, dict]:
+        head_extended_resources, worker_extended_resources = {}, {}
+        for resource in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+            "containers"
+        ][0]["resources"]["limits"].keys():
+            if resource in ["memory", "cpu"]:
+                continue
+            worker_extended_resources[resource] = rc["spec"]["workerGroupSpecs"][0][
+                "template"
+            ]["spec"]["containers"][0]["resources"]["limits"][resource]
+
+        for resource in rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
+            0
+        ]["resources"]["limits"].keys():
+            if resource in ["memory", "cpu"]:
+                continue
+            head_extended_resources[resource] = rc["spec"]["headGroupSpec"]["template"][
+                "spec"
+            ]["containers"][0]["resources"]["limits"][resource]
+
+        return head_extended_resources, worker_extended_resources
+
    def from_k8_cluster_object(
        rc,
        appwrapper=True,
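The new _head_worker_extended_resources_from_rc_dict helper walks the head and worker container resource limits of a RayCluster dict and keeps everything that is not cpu or memory. A self-contained illustration with an assumed, minimal rc dict (not part of the patch):

    rc = {
        "spec": {
            "headGroupSpec": {
                "template": {
                    "spec": {
                        "containers": [
                            {"resources": {"limits": {"cpu": 2, "memory": "8G"}}}
                        ]
                    }
                }
            },
            "workerGroupSpecs": [
                {
                    "template": {
                        "spec": {
                            "containers": [
                                {
                                    "resources": {
                                        "limits": {
                                            "cpu": 4,
                                            "memory": "8G",
                                            "nvidia.com/gpu": 1,
                                        }
                                    }
                                }
                            ]
                        }
                    }
                }
            ],
        }
    }

    head, worker = Cluster._head_worker_extended_resources_from_rc_dict(rc)
    # head == {}                        (the head container only sets cpu/memory)
    # worker == {"nvidia.com/gpu": 1}   (any non-cpu/memory limit is preserved)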
@@ -469,6 +451,11 @@ def from_k8_cluster_object(
            else []
        )

+        (
+            head_extended_resources,
+            worker_extended_resources,
+        ) = Cluster._head_worker_extended_resources_from_rc_dict(rc)
+
        cluster_config = ClusterConfiguration(
            name=rc["metadata"]["name"],
            namespace=rc["metadata"]["namespace"],
@@ -486,11 +473,8 @@ def from_k8_cluster_object(
            worker_memory_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
                "containers"
            ][0]["resources"]["limits"]["memory"],
-            num_worker_gpus=int(
-                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
-                    "resources"
-                ]["limits"]["nvidia.com/gpu"]
-            ),
+            worker_extended_resource_requests=worker_extended_resources,
+            head_extended_resource_requests=head_extended_resources,
            image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
                0
            ]["image"],
@@ -871,6 +855,11 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
        protocol = "https"
    dashboard_url = f"{protocol}://{ingress.spec.rules[0].host}"

+    (
+        head_extended_resources,
+        worker_extended_resources,
+    ) = Cluster._head_worker_extended_resources_from_rc_dict(rc)
+
    return RayCluster(
        name=rc["metadata"]["name"],
        status=status,
@@ -885,17 +874,15 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
        worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
            0
        ]["resources"]["limits"]["cpu"],
-        worker_gpu=0,  # hard to detect currently how many gpus, can override it with what the user asked for
+        worker_extended_resources=worker_extended_resources,
        namespace=rc["metadata"]["namespace"],
        head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
            "resources"
        ]["limits"]["cpu"],
        head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
            "resources"
        ]["limits"]["memory"],
-        head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
-            "resources"
-        ]["limits"]["nvidia.com/gpu"],
+        head_extended_resources=head_extended_resources,
        dashboard=dashboard_url,
    )

@@ -920,12 +907,12 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
        worker_mem_min=cluster.config.worker_memory_requests,
        worker_mem_max=cluster.config.worker_memory_limits,
        worker_cpu=cluster.config.worker_cpu_requests,
-        worker_gpu=cluster.config.num_worker_gpus,
+        worker_extended_resources=cluster.config.worker_extended_resource_requests,
        namespace=cluster.config.namespace,
        dashboard=cluster.cluster_dashboard_uri(),
        head_cpus=cluster.config.head_cpus,
        head_mem=cluster.config.head_memory,
-        head_gpu=cluster.config.num_head_gpus,
+        head_extended_resources=cluster.config.head_extended_resource_requests,
    )
    if ray.status == CodeFlareClusterStatus.READY:
        ray.status = RayClusterStatus.READY
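The same substitution in _copy_to_ray means the user-facing GPU fields are expressed as extended-resource mappings end to end. A hedged usage sketch, assuming the new ClusterConfiguration fields shown in this diff (worker_extended_resource_requests, head_extended_resource_requests) and the usual top-level SDK imports:

    from codeflare_sdk import Cluster, ClusterConfiguration

    config = ClusterConfiguration(
        name="raytest",
        namespace="default",
        num_workers=2,
        # replaces the old num_worker_gpus / num_head_gpus integers
        worker_extended_resource_requests={"nvidia.com/gpu": 1},
        head_extended_resource_requests={},
    )
    # Cluster(config) status/details output would then surface these mappings
    # through the worker_extended_resources / head_extended_resources fields.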