Skip to content

Commit ee9876d

Browse files
author
Alex Fan
committed
Update SDK args
1 parent 47654d1 commit ee9876d

11 files changed

+193
-120
lines changed

src/codeflare_sdk/cluster/cluster.py

+25-25
Original file line numberDiff line numberDiff line change
@@ -137,12 +137,12 @@ def create_app_wrapper(self):
137137
namespace = self.config.namespace
138138
head_cpus = self.config.head_cpus
139139
head_memory = self.config.head_memory
140-
head_gpus = self.config.head_gpus
141-
min_cpu = self.config.min_cpus
142-
max_cpu = self.config.max_cpus
143-
min_memory = self.config.min_memory
144-
max_memory = self.config.max_memory
145-
gpu = self.config.num_gpus
140+
num_head_gpus = self.config.num_head_gpus
141+
worker_cpu_requests = self.config.worker_cpu_requests
142+
worker_cpu_limits = self.config.worker_cpu_limits
143+
worker_memory_requests = self.config.worker_memory_requests
144+
worker_memory_limits = self.config.worker_memory_limits
145+
num_worker_gpus = self.config.num_worker_gpus
146146
workers = self.config.num_workers
147147
template = self.config.template
148148
image = self.config.image
@@ -157,12 +157,12 @@ def create_app_wrapper(self):
157157
namespace=namespace,
158158
head_cpus=head_cpus,
159159
head_memory=head_memory,
160-
head_gpus=head_gpus,
161-
min_cpu=min_cpu,
162-
max_cpu=max_cpu,
163-
min_memory=min_memory,
164-
max_memory=max_memory,
165-
gpu=gpu,
160+
num_head_gpus=num_head_gpus,
161+
worker_cpu_requests=worker_cpu_requests,
162+
worker_cpu_limits=worker_cpu_limits,
163+
worker_memory_requests=worker_memory_requests,
164+
worker_memory_limits=worker_memory_limits,
165+
num_worker_gpus=num_worker_gpus,
166166
workers=workers,
167167
template=template,
168168
image=image,
@@ -318,7 +318,7 @@ def status(
318318

319319
if print_to_console:
320320
# overriding the number of gpus with requested
321-
cluster.worker_gpu = self.config.num_gpus
321+
cluster.worker_gpu = self.config.num_worker_gpus
322322
pretty_print.print_cluster_status(cluster)
323323
elif print_to_console:
324324
if status == CodeFlareClusterStatus.UNKNOWN:
@@ -474,19 +474,19 @@ def from_k8_cluster_object(
474474
namespace=rc["metadata"]["namespace"],
475475
machine_types=machine_types,
476476
num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
477-
min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
477+
worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
478478
"containers"
479479
][0]["resources"]["requests"]["cpu"],
480-
max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
480+
worker_cpu_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
481481
"containers"
482482
][0]["resources"]["limits"]["cpu"],
483-
min_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
484-
"containers"
485-
][0]["resources"]["requests"]["memory"],
486-
max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
483+
worker_memory_requests=rc["spec"]["workerGroupSpecs"][0]["template"][
484+
"spec"
485+
]["containers"][0]["resources"]["requests"]["memory"],
486+
worker_memory_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
487487
"containers"
488488
][0]["resources"]["limits"]["memory"],
489-
num_gpus=int(
489+
num_worker_gpus=int(
490490
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
491491
"resources"
492492
]["limits"]["nvidia.com/gpu"]
@@ -917,15 +917,15 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
917917
name=cluster.config.name,
918918
status=cluster.status(print_to_console=False)[0],
919919
workers=cluster.config.num_workers,
920-
worker_mem_min=cluster.config.min_memory,
921-
worker_mem_max=cluster.config.max_memory,
922-
worker_cpu=cluster.config.min_cpus,
923-
worker_gpu=cluster.config.num_gpus,
920+
worker_mem_min=cluster.config.worker_memory_requests,
921+
worker_mem_max=cluster.config.worker_memory_limits,
922+
worker_cpu=cluster.config.worker_cpu_requests,
923+
worker_gpu=cluster.config.num_worker_gpus,
924924
namespace=cluster.config.namespace,
925925
dashboard=cluster.cluster_dashboard_uri(),
926926
head_cpus=cluster.config.head_cpus,
927927
head_mem=cluster.config.head_memory,
928-
head_gpu=cluster.config.head_gpus,
928+
head_gpu=cluster.config.num_head_gpus,
929929
)
930930
if ray.status == CodeFlareClusterStatus.READY:
931931
ray.status = RayClusterStatus.READY

src/codeflare_sdk/cluster/config.py

+55-14
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from dataclasses import dataclass, field
2222
import pathlib
2323
import typing
24+
import warnings
2425

2526
dir = pathlib.Path(__file__).parent.parent.resolve()
2627

@@ -37,14 +38,20 @@ class ClusterConfiguration:
3738
head_info: list = field(default_factory=list)
3839
head_cpus: typing.Union[int, str] = 2
3940
head_memory: typing.Union[int, str] = 8
40-
head_gpus: int = 0
41+
head_gpus: int = None # Deprecating
42+
num_head_gpus: int = 0
4143
machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"]
42-
min_cpus: typing.Union[int, str] = 1
43-
max_cpus: typing.Union[int, str] = 1
44+
worker_cpu_requests: typing.Union[int, str] = 1
45+
worker_cpu_limits: typing.Union[int, str] = 1
46+
min_cpus: typing.Union[int, str] = None # Deprecating
47+
max_cpus: typing.Union[int, str] = None # Deprecating
4448
num_workers: int = 1
45-
min_memory: typing.Union[int, str] = 2
46-
max_memory: typing.Union[int, str] = 2
47-
num_gpus: int = 0
49+
worker_memory_requests: typing.Union[int, str] = 2
50+
worker_memory_limits: typing.Union[int, str] = 2
51+
min_memory: typing.Union[int, str] = None # Deprecating
52+
max_memory: typing.Union[int, str] = None # Deprecating
53+
num_worker_gpus: int = 0
54+
num_gpus: int = None # Deprecating
4855
template: str = f"{dir}/templates/base-template.yaml"
4956
appwrapper: bool = False
5057
envs: dict = field(default_factory=dict)
@@ -59,23 +66,57 @@ def __post_init__(self):
5966
print(
6067
"Warning: TLS verification has been disabled - Endpoint checks will be bypassed"
6168
)
69+
6270
self._memory_to_string()
6371
self._str_mem_no_unit_add_GB()
72+
self._memory_to_resource()
73+
self._gpu_to_resource()
74+
self._cpu_to_resource()
6475

6576
def _str_mem_no_unit_add_GB(self):
6677
if isinstance(self.head_memory, str) and self.head_memory.isdecimal():
6778
self.head_memory = f"{self.head_memory}G"
68-
if isinstance(self.min_memory, str) and self.min_memory.isdecimal():
69-
self.min_memory = f"{self.min_memory}G"
70-
if isinstance(self.max_memory, str) and self.max_memory.isdecimal():
71-
self.max_memory = f"{self.max_memory}G"
79+
if (
80+
isinstance(self.worker_memory_requests, str)
81+
and self.worker_memory_requests.isdecimal()
82+
):
83+
self.worker_memory_requests = f"{self.worker_memory_requests}G"
84+
if (
85+
isinstance(self.worker_memory_limits, str)
86+
and self.worker_memory_limits.isdecimal()
87+
):
88+
self.worker_memory_limits = f"{self.worker_memory_limits}G"
7289

7390
def _memory_to_string(self):
7491
if isinstance(self.head_memory, int):
7592
self.head_memory = f"{self.head_memory}G"
76-
if isinstance(self.min_memory, int):
77-
self.min_memory = f"{self.min_memory}G"
78-
if isinstance(self.max_memory, int):
79-
self.max_memory = f"{self.max_memory}G"
93+
if isinstance(self.worker_memory_requests, int):
94+
self.worker_memory_requests = f"{self.worker_memory_requests}G"
95+
if isinstance(self.worker_memory_limits, int):
96+
self.worker_memory_limits = f"{self.worker_memory_limits}G"
97+
98+
def _gpu_to_resource(self):
99+
if self.head_gpus:
100+
warnings.warn("head_gpus is being deprecated, use num_head_gpus")
101+
self.num_head_gpus = self.head_gpus
102+
if self.num_gpus:
103+
warnings.warn("num_gpus is being deprecated, use num_worker_gpus")
104+
self.num_worker_gpus = self.num_gpus
105+
106+
def _cpu_to_resource(self):
107+
if self.min_cpus:
108+
warnings.warn("min_cpus is being deprecated, use worker_cpu_requests")
109+
self.worker_cpu_requests = self.min_cpus
110+
if self.max_cpus:
111+
warnings.warn("max_cpus is being deprecated, use worker_cpu_limits")
112+
self.worker_cpu_limits = self.max_cpus
113+
114+
def _memory_to_resource(self):
115+
if self.min_memory:
116+
warnings.warn("min_memory is being deprecated, use worker_memory_requests")
117+
self.worker_memory_requests = f"{self.min_memory}G"
118+
if self.max_memory:
119+
warnings.warn("max_memory is being deprecated, use worker_memory_limits")
120+
self.worker_memory_limits = f"{self.max_memory}G"
80121

81122
local_queue: str = None

src/codeflare_sdk/utils/generate_yaml.py

+43-29
Original file line numberDiff line numberDiff line change
@@ -106,47 +106,54 @@ def update_env(spec, env):
106106
container["env"] = env
107107

108108

109-
def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
109+
def update_resources(
110+
spec,
111+
worker_cpu_requests,
112+
worker_cpu_limits,
113+
worker_memory_requests,
114+
worker_memory_limits,
115+
num_worker_gpus,
116+
):
110117
container = spec.get("containers")
111118
for resource in container:
112119
requests = resource.get("resources").get("requests")
113120
if requests is not None:
114-
requests["cpu"] = min_cpu
115-
requests["memory"] = min_memory
116-
requests["nvidia.com/gpu"] = gpu
121+
requests["cpu"] = worker_cpu_requests
122+
requests["memory"] = worker_memory_requests
123+
requests["nvidia.com/gpu"] = num_worker_gpus
117124
limits = resource.get("resources").get("limits")
118125
if limits is not None:
119-
limits["cpu"] = max_cpu
120-
limits["memory"] = max_memory
121-
limits["nvidia.com/gpu"] = gpu
126+
limits["cpu"] = worker_cpu_limits
127+
limits["memory"] = worker_memory_limits
128+
limits["nvidia.com/gpu"] = num_worker_gpus
122129

123130

124131
def update_nodes(
125132
cluster_yaml,
126133
appwrapper_name,
127-
min_cpu,
128-
max_cpu,
129-
min_memory,
130-
max_memory,
131-
gpu,
134+
worker_cpu_requests,
135+
worker_cpu_limits,
136+
worker_memory_requests,
137+
worker_memory_limits,
138+
num_worker_gpus,
132139
workers,
133140
image,
134141
env,
135142
image_pull_secrets,
136143
head_cpus,
137144
head_memory,
138-
head_gpus,
145+
num_head_gpus,
139146
):
140147
head = cluster_yaml.get("spec").get("headGroupSpec")
141-
head["rayStartParams"]["num-gpus"] = str(int(head_gpus))
148+
head["rayStartParams"]["num-gpus"] = str(int(num_head_gpus))
142149

143150
worker = cluster_yaml.get("spec").get("workerGroupSpecs")[0]
144151
# Head counts as first worker
145152
worker["replicas"] = workers
146153
worker["minReplicas"] = workers
147154
worker["maxReplicas"] = workers
148155
worker["groupName"] = "small-group-" + appwrapper_name
149-
worker["rayStartParams"]["num-gpus"] = str(int(gpu))
156+
worker["rayStartParams"]["num-gpus"] = str(int(num_worker_gpus))
150157

151158
for comp in [head, worker]:
152159
spec = comp.get("template").get("spec")
@@ -156,10 +163,17 @@ def update_nodes(
156163
if comp == head:
157164
# TODO: Eventually add head node configuration outside of template
158165
update_resources(
159-
spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus
166+
spec, head_cpus, head_cpus, head_memory, head_memory, num_head_gpus
160167
)
161168
else:
162-
update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
169+
update_resources(
170+
spec,
171+
worker_cpu_requests,
172+
worker_cpu_limits,
173+
worker_memory_requests,
174+
worker_memory_limits,
175+
num_worker_gpus,
176+
)
163177

164178

165179
def del_from_list_by_name(l: list, target: typing.List[str]) -> list:
@@ -265,12 +279,12 @@ def generate_appwrapper(
265279
namespace: str,
266280
head_cpus: int,
267281
head_memory: int,
268-
head_gpus: int,
269-
min_cpu: int,
270-
max_cpu: int,
271-
min_memory: int,
272-
max_memory: int,
273-
gpu: int,
282+
num_head_gpus: int,
283+
worker_cpu_requests: int,
284+
worker_cpu_limits: int,
285+
worker_memory_requests: int,
286+
worker_memory_limits: int,
287+
num_worker_gpus: int,
274288
workers: int,
275289
template: str,
276290
image: str,
@@ -287,18 +301,18 @@ def generate_appwrapper(
287301
update_nodes(
288302
cluster_yaml,
289303
appwrapper_name,
290-
min_cpu,
291-
max_cpu,
292-
min_memory,
293-
max_memory,
294-
gpu,
304+
worker_cpu_requests,
305+
worker_cpu_limits,
306+
worker_memory_requests,
307+
worker_memory_limits,
308+
num_worker_gpus,
295309
workers,
296310
image,
297311
env,
298312
image_pull_secrets,
299313
head_cpus,
300314
head_memory,
301-
head_gpus,
315+
num_head_gpus,
302316
)
303317
augment_labels(cluster_yaml, labels)
304318
notebook_annotations(cluster_yaml)

tests/e2e/local_interactive_sdk_kind_test.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,11 @@ def run_local_interactives(self):
3939
num_workers=1,
4040
head_cpus="500m",
4141
head_memory=2,
42-
min_cpus="500m",
43-
max_cpus=1,
44-
min_memory=1,
45-
max_memory=2,
46-
num_gpus=0,
42+
worker_cpu_requests="500m",
43+
worker_cpu_limits=1,
44+
worker_memory_requests=1,
45+
worker_memory_limits=2,
46+
num_worker_gpus=0,
4747
image=ray_image,
4848
write_to_file=True,
4949
verify_tls=False,

tests/e2e/local_interactive_sdk_oauth_test.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,11 @@ def run_local_interactives(self):
4444
namespace=self.namespace,
4545
name=cluster_name,
4646
num_workers=1,
47-
min_cpus=1,
48-
max_cpus=1,
49-
min_memory=4,
50-
max_memory=4,
51-
num_gpus=0,
47+
worker_cpu_requests=1,
48+
worker_cpu_limits=1,
49+
worker_memory_requests=4,
50+
worker_memory_limits=4,
51+
num_worker_gpus=0,
5252
image=ray_image,
5353
verify_tls=False,
5454
)

tests/e2e/mnist_raycluster_sdk_kind_test.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,11 @@ def run_mnist_raycluster_sdk_kind(self):
3737
num_workers=1,
3838
head_cpus="500m",
3939
head_memory=2,
40-
min_cpus="500m",
41-
max_cpus=1,
42-
min_memory=1,
43-
max_memory=2,
44-
num_gpus=0,
40+
worker_cpu_requests="500m",
41+
worker_cpu_limits=1,
42+
worker_memory_requests=1,
43+
worker_memory_limits=2,
44+
num_worker_gpus=0,
4545
image=ray_image,
4646
write_to_file=True,
4747
verify_tls=False,

tests/e2e/mnist_raycluster_sdk_oauth_test.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,11 @@ def run_mnist_raycluster_sdk_oauth(self):
4444
num_workers=1,
4545
head_cpus="500m",
4646
head_memory=2,
47-
min_cpus="500m",
48-
max_cpus=1,
49-
min_memory=1,
50-
max_memory=2,
51-
num_gpus=0,
47+
worker_cpu_requests="500m",
48+
worker_cpu_limits=1,
49+
worker_memory_requests=1,
50+
worker_memory_limits=2,
51+
num_worker_gpus=0,
5252
image=ray_image,
5353
write_to_file=True,
5454
verify_tls=False,

0 commit comments

Comments (0)