Skip to content

Make Local Queue label optional #580

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions src/codeflare_sdk/utils/generate_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from os import urandom
from base64 import b64encode
from urllib3.util import parse_url
from kubernetes.client.exceptions import ApiException


def read_template(template):
Expand Down Expand Up @@ -191,8 +192,11 @@ def get_default_kueue_name(namespace: str):
namespace=namespace,
plural="localqueues",
)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e)
except ApiException as e: # pragma: no cover
if e.status == 404 or e.status == 403:
return
else:
return _kube_api_error_handling(e)
for lq in local_queues["items"]:
if (
"annotations" in lq["metadata"]
Expand All @@ -201,9 +205,6 @@ def get_default_kueue_name(namespace: str):
== "true"
):
return lq["metadata"]["name"]
raise ValueError(
"Default Local Queue with kueue.x-k8s.io/default-queue: true annotation not found please create a default Local Queue or provide the local_queue name in Cluster Configuration"
)


def local_queue_exists(namespace: str, local_queue_name: str):
Expand All @@ -228,7 +229,9 @@ def local_queue_exists(namespace: str, local_queue_name: str):

def add_queue_label(item: dict, namespace: str, local_queue: Optional[str]):
lq_name = local_queue or get_default_kueue_name(namespace)
if not local_queue_exists(namespace, lq_name):
if lq_name == None:
return
elif not local_queue_exists(namespace, lq_name):
raise ValueError(
"local_queue provided does not exist or is not in this namespace. Please provide the correct local_queue name in Cluster Configuration"
)
Expand Down
153 changes: 153 additions & 0 deletions tests/test-case-no-kueue-no-aw.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
apiVersion: ray.io/v1
kind: RayCluster
metadata:
annotations:
app.kubernetes.io/managed-by: test-prefix
labels:
controller-tools.k8s.io: '1.0'
name: unit-test-no-kueue
namespace: ns
spec:
autoscalerOptions:
idleTimeoutSeconds: 60
imagePullPolicy: Always
resources:
limits:
cpu: 500m
memory: 512Mi
requests:
cpu: 500m
memory: 512Mi
upscalingMode: Default
enableInTreeAutoscaling: false
headGroupSpec:
enableIngress: false
rayStartParams:
block: 'true'
dashboard-host: 0.0.0.0
num-gpus: '0'
serviceType: ClusterIP
template:
spec:
containers:
- image: quay.io/rhoai/ray:2.23.0-py39-cu121
imagePullPolicy: Always
lifecycle:
preStop:
exec:
command:
- /bin/sh
- -c
- ray stop
name: ray-head
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
resources:
limits:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
requests:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
volumeMounts:
- mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
subPath: odh-trusted-ca-bundle.crt
- mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
subPath: odh-trusted-ca-bundle.crt
- mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
name: odh-ca-cert
subPath: odh-ca-bundle.crt
- mountPath: /etc/ssl/certs/odh-ca-bundle.crt
name: odh-ca-cert
subPath: odh-ca-bundle.crt
imagePullSecrets:
- name: unit-test-pull-secret
volumes:
- configMap:
items:
- key: ca-bundle.crt
path: odh-trusted-ca-bundle.crt
name: odh-trusted-ca-bundle
optional: true
name: odh-trusted-ca-cert
- configMap:
items:
- key: odh-ca-bundle.crt
path: odh-ca-bundle.crt
name: odh-trusted-ca-bundle
optional: true
name: odh-ca-cert
rayVersion: 2.23.0
workerGroupSpecs:
- groupName: small-group-unit-test-no-kueue
maxReplicas: 2
minReplicas: 2
rayStartParams:
block: 'true'
num-gpus: '7'
replicas: 2
template:
metadata:
annotations:
key: value
labels:
key: value
spec:
containers:
- image: quay.io/rhoai/ray:2.23.0-py39-cu121
lifecycle:
preStop:
exec:
command:
- /bin/sh
- -c
- ray stop
name: machine-learning
resources:
limits:
cpu: 4
memory: 6G
nvidia.com/gpu: 7
requests:
cpu: 3
memory: 5G
nvidia.com/gpu: 7
volumeMounts:
- mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
subPath: odh-trusted-ca-bundle.crt
- mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
subPath: odh-trusted-ca-bundle.crt
- mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
name: odh-ca-cert
subPath: odh-ca-bundle.crt
- mountPath: /etc/ssl/certs/odh-ca-bundle.crt
name: odh-ca-cert
subPath: odh-ca-bundle.crt
imagePullSecrets:
- name: unit-test-pull-secret
volumes:
- configMap:
items:
- key: ca-bundle.crt
path: odh-trusted-ca-bundle.crt
name: odh-trusted-ca-bundle
optional: true
name: odh-trusted-ca-cert
- configMap:
items:
- key: odh-ca-bundle.crt
path: odh-ca-bundle.crt
name: odh-trusted-ca-bundle
optional: true
name: odh-ca-cert
23 changes: 23 additions & 0 deletions tests/unit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,28 @@ def test_cluster_creation(mocker):
)


def test_cluster_no_kueue_no_aw(mocker):
    """Build a cluster with AppWrapper disabled and no Local Queue configured.

    The Kubernetes API calls are mocked, the cluster config is written to
    file, and the test asserts that:

    * the generated YAML path is ``{aw_dir}unit-test-no-kueue.yaml``,
    * ``cluster.config.local_queue`` stays ``None``,
    * the generated file matches ``tests/test-case-no-kueue-no-aw.yaml``.
    """
    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.get_cluster_custom_object",
        return_value={"spec": {"domain": "apps.cluster.awsroute.org"}},
    )
    # Patched with a bare mock (no return value configured).
    # NOTE(review): presumably this makes the Local Queue lookup find no
    # default queue -- confirm against get_default_kueue_name.
    mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object")
    mocker.patch("os.environ.get", return_value="test-prefix")

    config = createClusterConfig()
    config.appwrapper = False
    config.name = "unit-test-no-kueue"
    config.write_to_file = True
    cluster = Cluster(config)

    assert cluster.app_wrapper_yaml == f"{aw_dir}unit-test-no-kueue.yaml"
    # Identity comparison is the idiomatic (PEP 8) way to test for None.
    assert cluster.config.local_queue is None
    assert filecmp.cmp(
        f"{aw_dir}unit-test-no-kueue.yaml",
        f"{parent}/tests/test-case-no-kueue-no-aw.yaml",
        shallow=True,
    )


def test_create_app_wrapper_raises_error_with_no_image():
config = createClusterConfig()
config.image = "" # Clear the image to test error handling
Expand Down Expand Up @@ -2799,6 +2821,7 @@ def test_rjc_list_jobs(ray_job_client, mocker):

# Make sure to always keep this function last
def test_cleanup():
os.remove(f"{aw_dir}unit-test-no-kueue.yaml")
os.remove(f"{aw_dir}unit-test-cluster.yaml")
os.remove(f"{aw_dir}test.yaml")
os.remove(f"{aw_dir}raytest2.yaml")
Expand Down