Skip to content

update SDK args #547

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 25 additions & 25 deletions src/codeflare_sdk/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,12 @@ def create_app_wrapper(self):
namespace = self.config.namespace
head_cpus = self.config.head_cpus
head_memory = self.config.head_memory
head_gpus = self.config.head_gpus
min_cpu = self.config.min_cpus
max_cpu = self.config.max_cpus
min_memory = self.config.min_memory
max_memory = self.config.max_memory
gpu = self.config.num_gpus
num_head_gpus = self.config.num_head_gpus
worker_cpu_requests = self.config.worker_cpu_requests
worker_cpu_limits = self.config.worker_cpu_limits
worker_memory_requests = self.config.worker_memory_requests
worker_memory_limits = self.config.worker_memory_limits
num_worker_gpus = self.config.num_worker_gpus
workers = self.config.num_workers
template = self.config.template
image = self.config.image
Expand All @@ -157,12 +157,12 @@ def create_app_wrapper(self):
namespace=namespace,
head_cpus=head_cpus,
head_memory=head_memory,
head_gpus=head_gpus,
min_cpu=min_cpu,
max_cpu=max_cpu,
min_memory=min_memory,
max_memory=max_memory,
gpu=gpu,
num_head_gpus=num_head_gpus,
worker_cpu_requests=worker_cpu_requests,
worker_cpu_limits=worker_cpu_limits,
worker_memory_requests=worker_memory_requests,
worker_memory_limits=worker_memory_limits,
num_worker_gpus=num_worker_gpus,
workers=workers,
template=template,
image=image,
Expand Down Expand Up @@ -318,7 +318,7 @@ def status(

if print_to_console:
# overriding the number of gpus with requested
cluster.worker_gpu = self.config.num_gpus
cluster.worker_gpu = self.config.num_worker_gpus
pretty_print.print_cluster_status(cluster)
elif print_to_console:
if status == CodeFlareClusterStatus.UNKNOWN:
Expand Down Expand Up @@ -474,19 +474,19 @@ def from_k8_cluster_object(
namespace=rc["metadata"]["namespace"],
machine_types=machine_types,
num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["cpu"],
max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
worker_cpu_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["cpu"],
min_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["memory"],
max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
worker_memory_requests=rc["spec"]["workerGroupSpecs"][0]["template"][
"spec"
]["containers"][0]["resources"]["requests"]["memory"],
worker_memory_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["memory"],
num_gpus=int(
num_worker_gpus=int(
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["nvidia.com/gpu"]
Expand Down Expand Up @@ -917,15 +917,15 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
name=cluster.config.name,
status=cluster.status(print_to_console=False)[0],
workers=cluster.config.num_workers,
worker_mem_min=cluster.config.min_memory,
worker_mem_max=cluster.config.max_memory,
worker_cpu=cluster.config.min_cpus,
worker_gpu=cluster.config.num_gpus,
worker_mem_min=cluster.config.worker_memory_requests,
worker_mem_max=cluster.config.worker_memory_limits,
worker_cpu=cluster.config.worker_cpu_requests,
worker_gpu=cluster.config.num_worker_gpus,
namespace=cluster.config.namespace,
dashboard=cluster.cluster_dashboard_uri(),
head_cpus=cluster.config.head_cpus,
head_mem=cluster.config.head_memory,
head_gpu=cluster.config.head_gpus,
head_gpu=cluster.config.num_head_gpus,
)
if ray.status == CodeFlareClusterStatus.READY:
ray.status = RayClusterStatus.READY
Expand Down
69 changes: 55 additions & 14 deletions src/codeflare_sdk/cluster/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from dataclasses import dataclass, field
import pathlib
import typing
import warnings

dir = pathlib.Path(__file__).parent.parent.resolve()

Expand All @@ -37,14 +38,20 @@ class ClusterConfiguration:
head_info: list = field(default_factory=list)
head_cpus: typing.Union[int, str] = 2
head_memory: typing.Union[int, str] = 8
head_gpus: int = 0
head_gpus: int = None # Deprecating
num_head_gpus: int = 0
machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"]
min_cpus: typing.Union[int, str] = 1
max_cpus: typing.Union[int, str] = 1
worker_cpu_requests: typing.Union[int, str] = 1
worker_cpu_limits: typing.Union[int, str] = 1
min_cpus: typing.Union[int, str] = None # Deprecating
max_cpus: typing.Union[int, str] = None # Deprecating
num_workers: int = 1
min_memory: typing.Union[int, str] = 2
max_memory: typing.Union[int, str] = 2
num_gpus: int = 0
worker_memory_requests: typing.Union[int, str] = 2
worker_memory_limits: typing.Union[int, str] = 2
min_memory: typing.Union[int, str] = None # Deprecating
max_memory: typing.Union[int, str] = None # Deprecating
num_worker_gpus: int = 0
num_gpus: int = None # Deprecating
template: str = f"{dir}/templates/base-template.yaml"
appwrapper: bool = False
envs: dict = field(default_factory=dict)
Expand All @@ -59,23 +66,57 @@ def __post_init__(self):
print(
"Warning: TLS verification has been disabled - Endpoint checks will be bypassed"
)

self._memory_to_string()
self._str_mem_no_unit_add_GB()
self._memory_to_resource()
self._gpu_to_resource()
self._cpu_to_resource()

def _str_mem_no_unit_add_GB(self):
if isinstance(self.head_memory, str) and self.head_memory.isdecimal():
self.head_memory = f"{self.head_memory}G"
if isinstance(self.min_memory, str) and self.min_memory.isdecimal():
self.min_memory = f"{self.min_memory}G"
if isinstance(self.max_memory, str) and self.max_memory.isdecimal():
self.max_memory = f"{self.max_memory}G"
if (
isinstance(self.worker_memory_requests, str)
and self.worker_memory_requests.isdecimal()
):
self.worker_memory_requests = f"{self.worker_memory_requests}G"
if (
isinstance(self.worker_memory_limits, str)
and self.worker_memory_limits.isdecimal()
):
self.worker_memory_limits = f"{self.worker_memory_limits}G"

def _memory_to_string(self):
if isinstance(self.head_memory, int):
self.head_memory = f"{self.head_memory}G"
if isinstance(self.min_memory, int):
self.min_memory = f"{self.min_memory}G"
if isinstance(self.max_memory, int):
self.max_memory = f"{self.max_memory}G"
if isinstance(self.worker_memory_requests, int):
self.worker_memory_requests = f"{self.worker_memory_requests}G"
if isinstance(self.worker_memory_limits, int):
self.worker_memory_limits = f"{self.worker_memory_limits}G"

def _gpu_to_resource(self):
if self.head_gpus:
warnings.warn("head_gpus is being deprecated, use num_head_gpus")
self.num_head_gpus = self.head_gpus
if self.num_gpus:
warnings.warn("num_gpus is being deprecated, use num_worker_gpus")
self.num_worker_gpus = self.num_gpus

def _cpu_to_resource(self):
if self.min_cpus:
warnings.warn("min_cpus is being deprecated, use worker_cpu_requests")
self.worker_cpu_requests = self.min_cpus
if self.max_cpus:
warnings.warn("max_cpus is being deprecated, use worker_cpu_limits")
self.worker_cpu_limits = self.max_cpus

def _memory_to_resource(self):
if self.min_memory:
warnings.warn("min_memory is being deprecated, use worker_memory_requests")
self.worker_memory_requests = f"{self.min_memory}G"
if self.max_memory:
warnings.warn("max_memory is being deprecated, use worker_memory_limits")
self.worker_memory_limits = f"{self.max_memory}G"

local_queue: str = None
72 changes: 43 additions & 29 deletions src/codeflare_sdk/utils/generate_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,47 +106,54 @@ def update_env(spec, env):
container["env"] = env


def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
def update_resources(
spec,
worker_cpu_requests,
worker_cpu_limits,
worker_memory_requests,
worker_memory_limits,
num_worker_gpus,
):
container = spec.get("containers")
for resource in container:
requests = resource.get("resources").get("requests")
if requests is not None:
requests["cpu"] = min_cpu
requests["memory"] = min_memory
requests["nvidia.com/gpu"] = gpu
requests["cpu"] = worker_cpu_requests
requests["memory"] = worker_memory_requests
requests["nvidia.com/gpu"] = num_worker_gpus
limits = resource.get("resources").get("limits")
if limits is not None:
limits["cpu"] = max_cpu
limits["memory"] = max_memory
limits["nvidia.com/gpu"] = gpu
limits["cpu"] = worker_cpu_limits
limits["memory"] = worker_memory_limits
limits["nvidia.com/gpu"] = num_worker_gpus


def update_nodes(
cluster_yaml,
appwrapper_name,
min_cpu,
max_cpu,
min_memory,
max_memory,
gpu,
worker_cpu_requests,
worker_cpu_limits,
worker_memory_requests,
worker_memory_limits,
num_worker_gpus,
workers,
image,
env,
image_pull_secrets,
head_cpus,
head_memory,
head_gpus,
num_head_gpus,
):
head = cluster_yaml.get("spec").get("headGroupSpec")
head["rayStartParams"]["num-gpus"] = str(int(head_gpus))
head["rayStartParams"]["num-gpus"] = str(int(num_head_gpus))

worker = cluster_yaml.get("spec").get("workerGroupSpecs")[0]
# Head counts as first worker
worker["replicas"] = workers
worker["minReplicas"] = workers
worker["maxReplicas"] = workers
worker["groupName"] = "small-group-" + appwrapper_name
worker["rayStartParams"]["num-gpus"] = str(int(gpu))
worker["rayStartParams"]["num-gpus"] = str(int(num_worker_gpus))

for comp in [head, worker]:
spec = comp.get("template").get("spec")
Expand All @@ -156,10 +163,17 @@ def update_nodes(
if comp == head:
# TODO: Eventually add head node configuration outside of template
update_resources(
spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus
spec, head_cpus, head_cpus, head_memory, head_memory, num_head_gpus
)
else:
update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
update_resources(
spec,
worker_cpu_requests,
worker_cpu_limits,
worker_memory_requests,
worker_memory_limits,
num_worker_gpus,
)


def del_from_list_by_name(l: list, target: typing.List[str]) -> list:
Expand Down Expand Up @@ -265,12 +279,12 @@ def generate_appwrapper(
namespace: str,
head_cpus: int,
head_memory: int,
head_gpus: int,
min_cpu: int,
max_cpu: int,
min_memory: int,
max_memory: int,
gpu: int,
num_head_gpus: int,
worker_cpu_requests: int,
worker_cpu_limits: int,
worker_memory_requests: int,
worker_memory_limits: int,
num_worker_gpus: int,
workers: int,
template: str,
image: str,
Expand All @@ -287,18 +301,18 @@ def generate_appwrapper(
update_nodes(
cluster_yaml,
appwrapper_name,
min_cpu,
max_cpu,
min_memory,
max_memory,
gpu,
worker_cpu_requests,
worker_cpu_limits,
worker_memory_requests,
worker_memory_limits,
num_worker_gpus,
workers,
image,
env,
image_pull_secrets,
head_cpus,
head_memory,
head_gpus,
num_head_gpus,
)
augment_labels(cluster_yaml, labels)
notebook_annotations(cluster_yaml)
Expand Down
10 changes: 5 additions & 5 deletions tests/e2e/local_interactive_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ def run_local_interactives(self):
num_workers=1,
head_cpus="500m",
head_memory=2,
min_cpus="500m",
max_cpus=1,
min_memory=1,
max_memory=2,
num_gpus=0,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
num_worker_gpus=0,
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand Down
10 changes: 5 additions & 5 deletions tests/e2e/local_interactive_sdk_oauth_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ def run_local_interactives(self):
namespace=self.namespace,
name=cluster_name,
num_workers=1,
min_cpus=1,
max_cpus=1,
min_memory=4,
max_memory=4,
num_gpus=0,
worker_cpu_requests=1,
worker_cpu_limits=1,
worker_memory_requests=4,
worker_memory_limits=4,
num_worker_gpus=0,
image=ray_image,
verify_tls=False,
)
Expand Down
10 changes: 5 additions & 5 deletions tests/e2e/mnist_raycluster_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ def run_mnist_raycluster_sdk_kind(self):
num_workers=1,
head_cpus="500m",
head_memory=2,
min_cpus="500m",
max_cpus=1,
min_memory=1,
max_memory=2,
num_gpus=0,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
num_worker_gpus=0,
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand Down
10 changes: 5 additions & 5 deletions tests/e2e/mnist_raycluster_sdk_oauth_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ def run_mnist_raycluster_sdk_oauth(self):
num_workers=1,
head_cpus="500m",
head_memory=2,
min_cpus="500m",
max_cpus=1,
min_memory=1,
max_memory=2,
num_gpus=0,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
num_worker_gpus=0,
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand Down
Loading