diff --git a/docs/cluster-configuration.md b/docs/cluster-configuration.md index bb058fa4e..7684db2ca 100644 --- a/docs/cluster-configuration.md +++ b/docs/cluster-configuration.md @@ -22,39 +22,11 @@ cluster = Cluster(ClusterConfiguration( image="quay.io/project-codeflare/ray:latest-py39-cu118", # Mandatory Field instascale=False, # Default False machine_types=["m5.xlarge", "g4dn.xlarge"], - ingress_domain="example.com" # Default None, Mandatory for Vanilla Kubernetes Clusters - ingress_domain is ignored on OpenShift Clusters as a route is created. - local_interactive=False, # Default False )) ``` -Note: On OpenShift, the `ingress_domain` is only required when `local_interactive` is enabled. - This may change soon. Upon creating a cluster configuration with `mcad=True` an appwrapper will be created featuring the Ray Cluster and any Routes, Ingresses or Secrets that are needed to be created along side it.
From there a user can call `cluster.up()` and `cluster.down()` to create and remove the appwrapper thus creating and removing the Ray Cluster. In cases where `mcad=False` a yaml file will be created with the individual Ray Cluster, Route/Ingress and Secret included.
The Ray Cluster and service will be created by KubeRay directly and the other components will be individually created. - -## Ray Cluster Configuration in a Vanilla Kubernetes environment (Non-OpenShift) -To create a Ray Cluster using the CodeFlare SDK in a Vanilla Kubernetes environment an `ingress_domain` must be passed in the Cluster Configuration. -This is used for the creation of the Ray Dashboard and Client ingresses. - -`ingress_options` can be passed to create a custom Ray Dashboard ingress, `ingress_domain` is still a required variable for the Client route/ingress. -An example of `ingress_options` would look like this. - -``` -ingress_options = { - "ingresses": [ - { - "ingressName": "", - "port": , - "pathType": "", - "path": "", - "host":"", - "annotations": { - "foo": "bar", - "foo": "bar", - } - } - ] -} -``` diff --git a/src/codeflare_sdk.egg-info/SOURCES.txt b/src/codeflare_sdk.egg-info/SOURCES.txt index d922d0dbe..42541f1d2 100644 --- a/src/codeflare_sdk.egg-info/SOURCES.txt +++ b/src/codeflare_sdk.egg-info/SOURCES.txt @@ -13,11 +13,9 @@ src/codeflare_sdk/cluster/cluster.py src/codeflare_sdk/cluster/config.py src/codeflare_sdk/cluster/model.py src/codeflare_sdk/job/__init__.py -src/codeflare_sdk/job/jobs.py src/codeflare_sdk/job/ray_jobs.py src/codeflare_sdk/utils/__init__.py src/codeflare_sdk/utils/generate_cert.py src/codeflare_sdk/utils/generate_yaml.py src/codeflare_sdk/utils/kube_api_helpers.py -src/codeflare_sdk/utils/openshift_oauth.py src/codeflare_sdk/utils/pretty_print.py diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 24cbf9a71..295332ae4 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -179,7 +179,6 @@ def create_app_wrapper(self): mcad = self.config.mcad instance_types = self.config.machine_types env = self.config.envs - local_interactive = self.config.local_interactive image_pull_secrets = self.config.image_pull_secrets dispatch_priority = self.config.dispatch_priority write_to_file = self.config.write_to_file @@ -203,7 +202,6 @@ def create_app_wrapper(self): mcad=mcad, instance_types=instance_types, env=env, - local_interactive=local_interactive, image_pull_secrets=image_pull_secrets, dispatch_priority=dispatch_priority, priority_val=priority_val, @@ -479,13 +477,6 @@ def from_k8_cluster_object( verify_tls=True, ): config_check() - if ( - rc["metadata"]["annotations"]["sdk.codeflare.dev/local_interactive"] - == "True" - ): - local_interactive = True - else: - local_interactive = False machine_types = ( rc["metadata"]["labels"]["orderedinstance"].split("_") if "orderedinstance" in rc["metadata"]["labels"] @@ -526,7 +517,6 @@ def from_k8_cluster_object( image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ 0 ]["image"], - local_interactive=local_interactive, mcad=mcad, write_to_file=write_to_file, verify_tls=verify_tls, @@ -534,11 +524,8 @@ def from_k8_cluster_object( return Cluster(cluster_config) def local_client_url(self): - if self.config.local_interactive == True: - ingress_domain = _get_ingress_domain(self) - return f"ray://{ingress_domain}" - else: - return "None" + ingress_domain = _get_ingress_domain(self) + return f"ray://{ingress_domain}" def _component_resources_up( self, namespace: str, api_instance: client.CustomObjectsApi @@ -678,13 +665,6 @@ def _delete_resources( plural="rayclusters", name=name, ) - elif resource["kind"] == "Secret": - name = resource["metadata"]["name"] - secret_instance = client.CoreV1Api(api_config_handler()) - secret_instance.delete_namespaced_secret( - namespace=namespace, - name=name, - ) def _create_resources(yamls, namespace: str, api_instance: client.CustomObjectsApi): @@ -697,12 +677,6 @@ def _create_resources(yamls, namespace: str, api_instance: client.CustomObjectsA plural="rayclusters", body=resource, ) - elif resource["kind"] == "Secret": - secret_instance = client.CoreV1Api(api_config_handler()) - secret_instance.create_namespaced_secret( - namespace=namespace, - body=resource, - ) def _check_aw_exists(name: str, namespace: str) -> bool: diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index f6bcac89c..e4d046f93 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -49,7 +49,6 @@ class ClusterConfiguration: mcad: bool = False envs: dict = field(default_factory=dict) image: str = "" - local_interactive: bool = False image_pull_secrets: list = field(default_factory=list) dispatch_priority: str = None write_to_file: bool = False diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index 5f6036ac8..356e3494e 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -40,8 +40,6 @@ spec: apiVersion: ray.io/v1 kind: RayCluster metadata: - annotations: - sdk.codeflare.dev/local_interactive: "False" labels: workload.codeflare.dev/appwrapper: "aw-kuberay" controller-tools.k8s.io: "1.0" @@ -117,20 +115,7 @@ spec: - "aw-kuberay" containers: # The Ray head pod - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: "0" - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - name: ray-head + - name: ray-head image: quay.io/project-codeflare/ray:latest-py39-cu118 imagePullPolicy: Always ports: @@ -154,12 +139,6 @@ spec: memory: "8G" nvidia.com/gpu: 0 volumeMounts: - - name: ca-vol - mountPath: "/home/ray/workspace/ca" - readOnly: true - - name: server-cert - mountPath: "/home/ray/workspace/tls" - readOnly: true - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert subPath: odh-trusted-ca-bundle.crt @@ -172,30 +151,7 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt - initContainers: - - command: - - sh - - -c - - cd /home/ray/workspace/tls && openssl req -nodes -newkey rsa:2048 -keyout server.key -out server.csr -subj '/CN=ray-head' && printf "authorityKeyIdentifier=keyid,issuer\nbasicConstraints=CA:FALSE\nsubjectAltName = @alt_names\n[alt_names]\nDNS.1 = 127.0.0.1\nDNS.2 = localhost\nDNS.3 = ${FQ_RAY_IP}\nDNS.4 = $(awk 'END{print $1}' /etc/hosts)\nDNS.5 = rayclient-deployment-name-$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).server-name">./domain.ext && cp /home/ray/workspace/ca/* . && openssl x509 -req -CA ca.crt -CAkey ca.key -in server.csr -out server.crt -days 365 -CAcreateserial -extfile domain.ext - image: quay.io/project-codeflare/ray:latest-py39-cu118 - name: create-cert - # securityContext: - # runAsUser: 1000 - # runAsGroup: 1000 - volumeMounts: - - name: ca-vol - mountPath: "/home/ray/workspace/ca" - readOnly: true - - name: server-cert - mountPath: "/home/ray/workspace/tls" - readOnly: false volumes: - - name: ca-vol - secret: - secretName: ca-secret-deployment-name - optional: false - - name: server-cert - emptyDir: {} - name: odh-trusted-ca-cert configMap: name: odh-trusted-ca-bundle @@ -250,40 +206,9 @@ spec: operator: In values: - "aw-kuberay" - initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name - - name: create-cert - image: quay.io/project-codeflare/ray:latest-py39-cu118 - command: - - sh - - -c - - cd /home/ray/workspace/tls && openssl req -nodes -newkey rsa:2048 -keyout server.key -out server.csr -subj '/CN=ray-head' && printf "authorityKeyIdentifier=keyid,issuer\nbasicConstraints=CA:FALSE\nsubjectAltName = @alt_names\n[alt_names]\nDNS.1 = 127.0.0.1\nDNS.2 = localhost\nDNS.3 = ${FQ_RAY_IP}\nDNS.4 = $(awk 'END{print $1}' /etc/hosts)">./domain.ext && cp /home/ray/workspace/ca/* . && openssl x509 -req -CA ca.crt -CAkey ca.key -in server.csr -out server.crt -days 365 -CAcreateserial -extfile domain.ext - # securityContext: - # runAsUser: 1000 - # runAsGroup: 1000 - volumeMounts: - - name: ca-vol - mountPath: "/home/ray/workspace/ca" - readOnly: true - - name: server-cert - mountPath: "/home/ray/workspace/tls" - readOnly: false containers: - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' image: quay.io/project-codeflare/ray:latest-py39-cu118 - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: "0" - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt # environment variables to set in the container.Optional. # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ lifecycle: @@ -300,12 +225,6 @@ spec: memory: "12G" nvidia.com/gpu: "1" volumeMounts: - - name: ca-vol - mountPath: "/home/ray/workspace/ca" - readOnly: true - - name: server-cert - mountPath: "/home/ray/workspace/tls" - readOnly: true - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert subPath: odh-trusted-ca-bundle.crt @@ -319,12 +238,6 @@ spec: name: odh-ca-cert subPath: odh-ca-bundle.crt volumes: - - name: ca-vol - secret: - secretName: ca-secret-deployment-name - optional: false - - name: server-cert - emptyDir: {} - name: odh-trusted-ca-cert configMap: name: odh-trusted-ca-bundle @@ -339,15 +252,3 @@ spec: - key: odh-ca-bundle.crt path: odh-ca-bundle.crt optional: true - - replicas: 1 - generictemplate: - apiVersion: v1 - data: - ca.crt: generated_crt - ca.key: generated_key - kind: Secret - metadata: - name: ca-secret-deployment-name - labels: - # allows me to return name of service that Ray operator creates - odh-ray-cluster-service: deployment-name-head-svc diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index 2088b9102..95c3d04f0 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -85,20 +85,6 @@ def update_names(yaml, item, appwrapper_name, cluster_name, namespace): lower_meta["labels"]["workload.codeflare.dev/appwrapper"] = appwrapper_name lower_meta["name"] = cluster_name lower_meta["namespace"] = namespace - lower_spec = item.get("generictemplate", {}).get("spec") - if is_openshift_cluster(): - cookie_secret_env_var = { - "name": "COOKIE_SECRET", - "valueFrom": { - "secretKeyRef": { - "key": "cookie_secret", - "name": f"{cluster_name}-oauth-config", - } - }, - } - lower_spec["headGroupSpec"]["template"]["spec"]["containers"][0]["env"].append( - cookie_secret_env_var - ) def update_labels(yaml, instascale, instance_types): @@ -280,106 +266,10 @@ def update_nodes( update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu) -def update_ca_secret(ca_secret_item, cluster_name, namespace): - from . import generate_cert - - metadata = ca_secret_item.get("generictemplate", {}).get("metadata") - metadata["name"] = f"ca-secret-{cluster_name}" - metadata["namespace"] = namespace - metadata["labels"]["odh-ray-cluster-service"] = f"{cluster_name}-head-svc" - data = ca_secret_item.get("generictemplate", {}).get("data") - data["ca.key"], data["ca.crt"] = generate_cert.generate_ca_cert(365) - - -def enable_local_interactive(resources, cluster_name, namespace): # pragma: no cover - from ..cluster.cluster import _get_ingress_domain - - ca_secret_item = resources["resources"].get("GenericItems")[1] - item = resources["resources"].get("GenericItems")[0] - update_ca_secret(ca_secret_item, cluster_name, namespace) - # update_ca_secret_volumes - item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"]["volumes"][0][ - "secret" - ]["secretName"] = f"ca-secret-{cluster_name}" - item["generictemplate"]["spec"]["workerGroupSpecs"][0]["template"]["spec"][ - "volumes" - ][0]["secret"]["secretName"] = f"ca-secret-{cluster_name}" - # update_tls_env - item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ - 0 - ]["env"][1]["value"] = "1" - item["generictemplate"]["spec"]["workerGroupSpecs"][0]["template"]["spec"][ - "containers" - ][0]["env"][1]["value"] = "1" - # update_init_container - command = item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"][ - "initContainers" - ][0].get("command")[2] - - command = command.replace("deployment-name", cluster_name) - - domain = "" ## FIX - We can't retrieve ingress domain - move init container to CFO - - command = command.replace("server-name", domain) - item["generictemplate"]["metadata"]["annotations"][ - "sdk.codeflare.dev/local_interactive" - ] = "True" - - item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"][ - "initContainers" - ][0].get("command")[2] = command - - def del_from_list_by_name(l: list, target: typing.List[str]) -> list: return [x for x in l if x["name"] not in target] -def disable_raycluster_tls(resources): - generic_template_spec = resources["GenericItems"][0]["generictemplate"]["spec"] - - headGroupTemplateSpec = generic_template_spec["headGroupSpec"]["template"]["spec"] - headGroupTemplateSpec["volumes"] = del_from_list_by_name( - headGroupTemplateSpec.get("volumes", []), - ["ca-vol", "server-cert"], - ) - - c: dict - for c in generic_template_spec["headGroupSpec"]["template"]["spec"]["containers"]: - c["volumeMounts"] = del_from_list_by_name( - c.get("volumeMounts", []), ["ca-vol", "server-cert"] - ) - - if "initContainers" in generic_template_spec["headGroupSpec"]["template"]["spec"]: - del generic_template_spec["headGroupSpec"]["template"]["spec"]["initContainers"] - - for workerGroup in generic_template_spec.get("workerGroupSpecs", []): - workerGroupSpec = workerGroup["template"]["spec"] - workerGroupSpec["volumes"] = del_from_list_by_name( - workerGroupSpec.get("volumes", []), - ["ca-vol", "server-cert"], - ) - for c in workerGroup["template"]["spec"].get("containers", []): - c["volumeMounts"] = del_from_list_by_name( - c.get("volumeMounts", []), ["ca-vol", "server-cert"] - ) - - del generic_template_spec["workerGroupSpecs"][0]["template"]["spec"][ - "initContainers" - ] - - updated_items = [] - for i in resources["GenericItems"][:]: - if "rayclient-deployment-ingress" in i["generictemplate"]["metadata"]["name"]: - continue - if "rayclient-deployment-route" in i["generictemplate"]["metadata"]["name"]: - continue - if "ca-secret-deployment-name" in i["generictemplate"]["metadata"]["name"]: - continue - updated_items.append(i) - - resources["GenericItems"] = updated_items - - def write_user_appwrapper(user_yaml, output_file_name): # Create the directory if it doesn't exist directory_path = os.path.dirname(output_file_name) @@ -392,75 +282,6 @@ def write_user_appwrapper(user_yaml, output_file_name): print(f"Written to: {output_file_name}") -def enable_openshift_oauth(user_yaml, cluster_name, namespace): - config_check() - k8_client = api_config_handler() or client.ApiClient() - tls_mount_location = "/etc/tls/private" - oauth_port = 8443 - oauth_sa_name = f"{cluster_name}-oauth-proxy" - tls_secret_name = f"{cluster_name}-proxy-tls-secret" - tls_volume_name = "proxy-tls-secret" - port_name = "oauth-proxy" - oauth_sidecar = _create_oauth_sidecar_object( - namespace, - tls_mount_location, - oauth_port, - oauth_sa_name, - tls_volume_name, - port_name, - ) - tls_secret_volume = client.V1Volume( - name=tls_volume_name, - secret=client.V1SecretVolumeSource(secret_name=tls_secret_name), - ) - # allows for setting value of Cluster object when initializing object from an existing AppWrapper on cluster - user_yaml["metadata"]["annotations"] = user_yaml["metadata"].get("annotations", {}) - ray_headgroup_pod = user_yaml["spec"]["resources"]["GenericItems"][0][ - "generictemplate" - ]["spec"]["headGroupSpec"]["template"]["spec"] - ray_headgroup_pod["serviceAccount"] = oauth_sa_name - ray_headgroup_pod["volumes"] = ray_headgroup_pod.get("volumes", []) - - # we use a generic api client here so that the serialization function doesn't need to be mocked for unit tests - ray_headgroup_pod["volumes"].append( - client.ApiClient().sanitize_for_serialization(tls_secret_volume) - ) - ray_headgroup_pod["containers"].append( - client.ApiClient().sanitize_for_serialization(oauth_sidecar) - ) - - -def _create_oauth_sidecar_object( - namespace: str, - tls_mount_location: str, - oauth_port: int, - oauth_sa_name: str, - tls_volume_name: str, - port_name: str, -) -> client.V1Container: - return client.V1Container( - args=[ - f"--https-address=:{oauth_port}", - "--provider=openshift", - f"--openshift-service-account={oauth_sa_name}", - "--upstream=http://localhost:8265", - f"--tls-cert={tls_mount_location}/tls.crt", - f"--tls-key={tls_mount_location}/tls.key", - "--cookie-secret=$(COOKIE_SECRET)", - f'--openshift-delegate-urls={{"/":{{"resource":"pods","namespace":"{namespace}","verb":"get"}}}}', - ], - image="registry.redhat.io/openshift4/ose-oauth-proxy@sha256:1ea6a01bf3e63cdcf125c6064cbd4a4a270deaf0f157b3eabb78f60556840366", - name="oauth-proxy", - ports=[client.V1ContainerPort(container_port=oauth_port, name=port_name)], - resources=client.V1ResourceRequirements(limits=None, requests=None), - volume_mounts=[ - client.V1VolumeMount( - mount_path=tls_mount_location, name=tls_volume_name, read_only=True - ) - ], - ) - - def get_default_kueue_name(namespace: str): # If the local queue is set, use it. Otherwise, try to use the default queue. try: @@ -568,7 +389,6 @@ def generate_appwrapper( mcad: bool, instance_types: list, env, - local_interactive: bool, image_pull_secrets: list, dispatch_priority: str, priority_val: int, @@ -619,14 +439,6 @@ def generate_appwrapper( head_gpus, ) - if local_interactive: - enable_local_interactive(resources, cluster_name, namespace) - else: - disable_raycluster_tls(resources["resources"]) - - if is_openshift_cluster(): - enable_openshift_oauth(user_yaml, cluster_name, namespace) - directory_path = os.path.expanduser("~/.codeflare/resources/") outfile = os.path.join(directory_path, appwrapper_name + ".yaml") diff --git a/tests/test-case-bad.yaml b/tests/test-case-bad.yaml index aeccf5194..6e969e01b 100644 --- a/tests/test-case-bad.yaml +++ b/tests/test-case-bad.yaml @@ -32,8 +32,6 @@ spec: apiVersion: ray.io/v1 kind: RayCluster metadata: - annotations: - sdk.codeflare.dev/local_interactive: 'False' labels: workload.codeflare.dev/appwrapper: unit-test-cluster controller-tools.k8s.io: '1.0' diff --git a/tests/test-case-no-mcad.yamls b/tests/test-case-no-mcad.yamls index e13752a44..aaf9324e6 100644 --- a/tests/test-case-no-mcad.yamls +++ b/tests/test-case-no-mcad.yamls @@ -2,8 +2,6 @@ apiVersion: ray.io/v1 kind: RayCluster metadata: - annotations: - sdk.codeflare.dev/local_interactive: 'False' labels: controller-tools.k8s.io: '1.0' kueue.x-k8s.io/queue-name: local-queue-default @@ -41,20 +39,7 @@ spec: values: - unit-test-cluster-ray containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:latest-py39-cu118 + - image: quay.io/project-codeflare/ray:latest-py39-cu118 imagePullPolicy: Always lifecycle: preStop: @@ -136,20 +121,7 @@ spec: values: - unit-test-cluster-ray containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:latest-py39-cu118 + - image: quay.io/project-codeflare/ray:latest-py39-cu118 lifecycle: preStop: exec: diff --git a/tests/test-case-prio.yaml b/tests/test-case-prio.yaml index 10e161dee..a4d6e68f2 100644 --- a/tests/test-case-prio.yaml +++ b/tests/test-case-prio.yaml @@ -32,8 +32,6 @@ spec: apiVersion: ray.io/v1 kind: RayCluster metadata: - annotations: - sdk.codeflare.dev/local_interactive: 'False' labels: controller-tools.k8s.io: '1.0' workload.codeflare.dev/appwrapper: prio-test-cluster @@ -71,20 +69,7 @@ spec: values: - prio-test-cluster containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:latest-py39-cu118 + - image: quay.io/project-codeflare/ray:latest-py39-cu118 imagePullPolicy: Always lifecycle: preStop: @@ -167,20 +152,7 @@ spec: values: - prio-test-cluster containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:latest-py39-cu118 + - image: quay.io/project-codeflare/ray:latest-py39-cu118 lifecycle: preStop: exec: diff --git a/tests/test-case.yaml b/tests/test-case.yaml index 78d2e4a54..b97d12a49 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -31,8 +31,6 @@ spec: apiVersion: ray.io/v1 kind: RayCluster metadata: - annotations: - sdk.codeflare.dev/local_interactive: 'False' labels: controller-tools.k8s.io: '1.0' workload.codeflare.dev/appwrapper: unit-test-cluster @@ -70,20 +68,7 @@ spec: values: - unit-test-cluster containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:latest-py39-cu118 + - image: quay.io/project-codeflare/ray:latest-py39-cu118 imagePullPolicy: Always lifecycle: preStop: @@ -165,20 +150,7 @@ spec: values: - unit-test-cluster containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:latest-py39-cu118 + - image: quay.io/project-codeflare/ray:latest-py39-cu118 lifecycle: preStop: exec: diff --git a/tests/test-default-appwrapper.yaml b/tests/test-default-appwrapper.yaml index ecab5eac6..c390f619b 100644 --- a/tests/test-default-appwrapper.yaml +++ b/tests/test-default-appwrapper.yaml @@ -29,8 +29,6 @@ spec: apiVersion: ray.io/v1 kind: RayCluster metadata: - annotations: - sdk.codeflare.dev/local_interactive: 'False' labels: controller-tools.k8s.io: '1.0' workload.codeflare.dev/appwrapper: unit-test-default-cluster @@ -59,20 +57,7 @@ spec: template: spec: containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:latest-py39-cu118 + - image: quay.io/project-codeflare/ray:latest-py39-cu118 imagePullPolicy: Always lifecycle: preStop: @@ -144,20 +129,7 @@ spec: key: value spec: containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:latest-py39-cu118 + - image: quay.io/project-codeflare/ray:latest-py39-cu118 lifecycle: preStop: exec: diff --git a/tests/unit_test.py b/tests/unit_test.py index 935cdd100..3892f8e57 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -74,7 +74,7 @@ gen_names, is_openshift_cluster, read_template, - enable_local_interactive, + write_components, ) import openshift @@ -257,7 +257,6 @@ def test_config_creation(): assert config.image_pull_secrets == ["unit-test-pull-secret"] assert config.dispatch_priority == None assert config.mcad == True - assert config.local_interactive == False def test_cluster_creation(mocker): @@ -326,6 +325,7 @@ def test_cluster_creation_no_mcad(mocker): config.write_to_file = True config.mcad = False cluster = Cluster(config) + assert cluster.app_wrapper_yaml == f"{aw_dir}unit-test-cluster-ray.yaml" assert cluster.app_wrapper_name == "unit-test-cluster-ray" assert filecmp.cmp( @@ -370,23 +370,18 @@ def test_cluster_creation_no_mcad_local_queue(mocker): machine_types=["cpu.small", "gpu.large"], image_pull_secrets=["unit-test-pull-secret"], image="quay.io/project-codeflare/ray:latest-py39-cu118", - write_to_file=False, + write_to_file=True, mcad=False, local_queue="local-queue-default", ) cluster = Cluster(config) - test_resources = [] - expected_resources = [] - test_aw = yaml.load_all(cluster.app_wrapper_yaml, Loader=yaml.FullLoader) - for resource in test_aw: - test_resources.append(resource) - with open( + assert cluster.app_wrapper_yaml == f"{aw_dir}unit-test-cluster-ray.yaml" + assert cluster.app_wrapper_name == "unit-test-cluster-ray" + assert filecmp.cmp( + f"{aw_dir}unit-test-cluster-ray.yaml", f"{parent}/tests/test-case-no-mcad.yamls", - ) as f: - default_aw = yaml.load_all(f, Loader=yaml.FullLoader) - for resource in default_aw: - expected_resources.append(resource) - assert test_resources == expected_resources + shallow=True, + ) def test_cluster_creation_priority(mocker): @@ -425,7 +420,8 @@ def test_default_cluster_creation(mocker): mcad=True, ) cluster = Cluster(default_config) - test_aw = yaml.safe_load(cluster.app_wrapper_yaml) + test_aw = yaml.load(cluster.app_wrapper_yaml, Loader=yaml.FullLoader) + with open( f"{parent}/tests/test-default-appwrapper.yaml", ) as f: @@ -534,16 +530,12 @@ def test_cluster_up_down(mocker): def test_cluster_up_down_no_mcad(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch("kubernetes.client.ApisApi.get_api_versions") mocker.patch( "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), ) - mocker.patch("kubernetes.client.ApisApi.get_api_versions") - mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") - mocker.patch( - "kubernetes.client.CustomObjectsApi.get_cluster_custom_object", - return_value={"spec": {"domain": "apps.cluster.awsroute.org"}}, - ) mocker.patch( "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object", side_effect=arg_check_apply_effect, @@ -552,6 +544,12 @@ def test_cluster_up_down_no_mcad(mocker): "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object", side_effect=arg_check_del_effect, ) + mocker.patch( + "kubernetes.client.CoreV1Api.create_namespaced_secret", + ) + mocker.patch( + "kubernetes.client.CoreV1Api.delete_namespaced_secret", + ) mocker.patch( "kubernetes.client.CustomObjectsApi.list_cluster_custom_object", return_value={"items": []}, @@ -678,7 +676,6 @@ def test_local_client_url(mocker): cluster_config = ClusterConfiguration( name="unit-test-cluster-localinter", namespace="ns", - local_interactive=True, write_to_file=True, ) cluster = Cluster(cluster_config) @@ -994,9 +991,6 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "metadata": { "creationTimestamp": "2024-03-05T09:55:37Z", "generation": 1, - "annotations": { - "sdk.codeflare.dev/local_interactive": "True", - }, "labels": { "appwrapper.mcad.ibm.com": "quicktest", "controller-tools.k8s.io": "1.0", @@ -1806,9 +1800,6 @@ def get_aw_obj(group, version, namespace, plural): "apiVersion": "ray.io/v1", "kind": "RayCluster", "metadata": { - "annotations": { - "sdk.codeflare.dev/local_interactive": "False" - }, "labels": { "workload.codeflare.dev/appwrapper": "quicktest1", "controller-tools.k8s.io": "1.0", @@ -2136,9 +2127,6 @@ def get_aw_obj(group, version, namespace, plural): "apiVersion": "ray.io/v1", "kind": "RayCluster", "metadata": { - "annotations": { - "sdk.codeflare.dev/local_interactive": "False" - }, "labels": { "workload.codeflare.dev/appwrapper": "quicktest2", "controller-tools.k8s.io": "1.0", @@ -2450,7 +2438,6 @@ def custom_side_effect(group, version, namespace, plural, **kwargs): assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1 assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2 assert cluster_config.num_gpus == 0 - assert cluster_config.local_interactive == True assert ( cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" @@ -2484,7 +2471,6 @@ def test_get_cluster(mocker): assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2 assert cluster_config.num_gpus == 0 assert cluster_config.instascale - assert cluster_config.local_interactive assert ( cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" @@ -3014,151 +3000,6 @@ def test_export_env(): ) -# def test_enable_local_interactive(mocker): -# template = f"{parent}/src/codeflare_sdk/templates/base-template.yaml" -# user_yaml = read_template(template) -# aw_spec = user_yaml.get("spec", None) -# cluster_name = "test-enable-local" -# namespace = "default" -# ingress_domain = "mytest.domain" -# mocker.patch("kubernetes.client.ApisApi.get_api_versions") -# mocker.patch( -# "codeflare_sdk.utils.generate_yaml.is_openshift_cluster", return_value=False -# ) -# volume_mounts = [ -# {"name": "ca-vol", "mountPath": "/home/ray/workspace/ca", "readOnly": True}, -# { -# "name": "server-cert", -# "mountPath": "/home/ray/workspace/tls", -# "readOnly": False, -# }, -# ] -# volumes = [ -# { -# "name": "ca-vol", -# "secret": {"secretName": "ca-secret-test-enable-local"}, -# "optional": False, -# }, -# {"name": "server-cert", "emptyDir": {}}, -# { -# "name": "odh-trusted-ca-cert", -# "configMap": { -# "name": "odh-trusted-ca-bundle", -# "items": [ -# {"key": "ca-bundle.crt", "path": "odh-trusted-ca-bundle.crt"} -# ], -# "optional": True, -# }, -# }, -# { -# "name": "odh-ca-cert", -# "configMap": { -# "name": "odh-trusted-ca-bundle", -# "items": [{"key": "odh-ca-bundle.crt", "path": "odh-ca-bundle.crt"}], -# "optional": True, -# }, -# }, -# ] -# tls_env = [ -# {"name": "RAY_USE_TLS", "value": "1"}, -# {"name": "RAY_TLS_SERVER_CERT", "value": "/home/ray/workspace/tls/server.crt"}, -# {"name": "RAY_TLS_SERVER_KEY", "value": "/home/ray/workspace/tls/server.key"}, -# {"name": "RAY_TLS_CA_CERT", "value": "/home/ray/workspace/tls/ca.crt"}, -# ] -# assert aw_spec != None -# enable_local_interactive(aw_spec, cluster_name, namespace, ingress_domain) -# head_group_spec = aw_spec["resources"]["GenericItems"][0]["generictemplate"][ -# "spec" -# ]["headGroupSpec"] -# worker_group_spec = aw_spec["resources"]["GenericItems"][0]["generictemplate"][ -# "spec" -# ]["workerGroupSpecs"] -# ca_secret = aw_spec["resources"]["GenericItems"][1]["generictemplate"] -# # At a minimal, make sure the following items are presented in the appwrapper spec.resources. -# # 1. headgroup has the initContainers command to generated TLS cert from the mounted CA cert. -# # Note: In this particular command, the DNS.5 in [alt_name] must match the exposed local_client_url: rayclient-{cluster_name}.{namespace}.{ingress_domain} -# assert ( -# head_group_spec["template"]["spec"]["initContainers"][0]["command"][2] -# == f"cd /home/ray/workspace/tls && openssl req -nodes -newkey rsa:2048 -keyout server.key -out server.csr -subj '/CN=ray-head' && printf \"authorityKeyIdentifier=keyid,issuer\\nbasicConstraints=CA:FALSE\\nsubjectAltName = @alt_names\\n[alt_names]\\nDNS.1 = 127.0.0.1\\nDNS.2 = localhost\\nDNS.3 = ${{FQ_RAY_IP}}\\nDNS.4 = $(awk 'END{{print $1}}' /etc/hosts)\\nDNS.5 = rayclient-{cluster_name}-$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).{ingress_domain}\">./domain.ext && cp /home/ray/workspace/ca/* . && openssl x509 -req -CA ca.crt -CAkey ca.key -in server.csr -out server.crt -days 365 -CAcreateserial -extfile domain.ext" -# ) -# assert ( -# head_group_spec["template"]["spec"]["initContainers"][0]["volumeMounts"] -# == volume_mounts -# ) -# assert head_group_spec["template"]["spec"]["volumes"] == volumes - -# # 2. workerGroupSpec has the initContainers command to generated TLS cert from the mounted CA cert. -# assert ( -# worker_group_spec[0]["template"]["spec"]["initContainers"][0]["command"][2] -# == "cd /home/ray/workspace/tls && openssl req -nodes -newkey rsa:2048 -keyout server.key -out server.csr -subj '/CN=ray-head' && printf \"authorityKeyIdentifier=keyid,issuer\\nbasicConstraints=CA:FALSE\\nsubjectAltName = @alt_names\\n[alt_names]\\nDNS.1 = 127.0.0.1\\nDNS.2 = localhost\\nDNS.3 = ${FQ_RAY_IP}\\nDNS.4 = $(awk 'END{print $1}' /etc/hosts)\">./domain.ext && cp /home/ray/workspace/ca/* . && openssl x509 -req -CA ca.crt -CAkey ca.key -in server.csr -out server.crt -days 365 -CAcreateserial -extfile domain.ext" -# ) -# assert ( -# worker_group_spec[0]["template"]["spec"]["initContainers"][0]["volumeMounts"] -# == volume_mounts -# ) -# assert worker_group_spec[0]["template"]["spec"]["volumes"] == volumes - -# # 3. Required Envs to enable TLS encryption between head and workers -# for i in range(len(tls_env)): -# assert ( -# head_group_spec["template"]["spec"]["containers"][0]["env"][i + 1]["name"] -# == tls_env[i]["name"] -# ) -# assert ( -# head_group_spec["template"]["spec"]["containers"][0]["env"][i + 1]["value"] -# == tls_env[i]["value"] -# ) -# assert ( -# worker_group_spec[0]["template"]["spec"]["containers"][0]["env"][i + 1][ -# "name" -# ] -# == tls_env[i]["name"] -# ) -# assert ( -# worker_group_spec[0]["template"]["spec"]["containers"][0]["env"][i + 1][ -# "value" -# ] -# == tls_env[i]["value"] -# ) - -# # 4. Secret with ca.crt and ca.key -# assert ca_secret["kind"] == "Secret" -# assert ca_secret["data"]["ca.crt"] != None -# assert ca_secret["data"]["ca.key"] != None -# assert ca_secret["metadata"]["name"] == f"ca-secret-{cluster_name}" -# assert ca_secret["metadata"]["namespace"] == namespace - - -def test_gen_app_wrapper_with_oauth(mocker: MockerFixture): - mocker.patch("kubernetes.client.ApisApi.get_api_versions") - mocker.patch( - "codeflare_sdk.cluster.cluster.get_current_namespace", - return_value="opendatahub", - ) - mocker.patch( - "codeflare_sdk.utils.generate_yaml.is_openshift_cluster", return_value=True - ) - write_user_appwrapper = MagicMock() - mocker.patch( - "codeflare_sdk.utils.generate_yaml.write_user_appwrapper", write_user_appwrapper - ) - Cluster( - ClusterConfiguration( - "test_cluster", - image="quay.io/project-codeflare/ray:latest-py39-cu118", - write_to_file=True, - mcad=True, - ) - ) - user_yaml = write_user_appwrapper.call_args.args[0] - assert any( - container["name"] == "oauth-proxy" - for container in user_yaml["spec"]["resources"]["GenericItems"][0][ - "generictemplate" - ]["spec"]["headGroupSpec"]["template"]["spec"]["containers"] - ) - - """ Ray Jobs tests """