From 572d21569cde41b742450d7824ecd48b6f735f33 Mon Sep 17 00:00:00 2001 From: kryanbeane Date: Wed, 21 May 2025 20:42:20 +0100 Subject: [PATCH 1/2] test using custom cfo --- .github/workflows/e2e_tests.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index fca6d6e72..42d0c474f 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -39,7 +39,7 @@ jobs: - name: Checkout CodeFlare operator repository uses: actions/checkout@v4 with: - repository: project-codeflare/codeflare-operator + repository: kryanbeane/codeflare-operator path: codeflare-operator - name: Set Go @@ -117,7 +117,7 @@ jobs: pip install poetry poetry install --with test,docs echo "Running e2e tests..." - poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 + poetry run pytest -v -s ./tests/e2e/local_interactive_sdk_kind_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 env: GRPC_DNS_RESOLVER: "native" From 3b48ba02f6a7b0c4c23022bbfac4952e8220b820 Mon Sep 17 00:00:00 2001 From: kryanbeane Date: Wed, 21 May 2025 23:39:11 +0100 Subject: [PATCH 2/2] use custom image --- .github/workflows/e2e_tests.yaml | 10 ++++- .../common/utils/generate_cert.py | 39 +++++++++++++----- .../ray/cluster/build_ray_cluster.py | 2 +- tests/e2e/local_interactive_sdk_kind_test.py | 40 +++++++++++++++++-- 4 files changed, 76 insertions(+), 15 deletions(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index 42d0c474f..ea202e7da 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -17,7 +17,7 @@ concurrency: cancel-in-progress: true env: - CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev" + CODEFLARE_OPERATOR_IMG: "quay.io/kryanbeane/codeflare-operator:dev" jobs: kubernetes: @@ -106,8 +106,16 @@ jobs: kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user kubectl create clusterrole pod-creator --verb=get,list --resource=pods kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user + kubectl create clusterrole service-reader --verb=get,list --resource=services + kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user kubectl config use-context sdk-user + - name: Setup tmate session for debugging + if: always() + uses: mxschmitt/action-tmate@v3 + with: + detached: true + - name: Run e2e tests run: | export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} diff --git a/src/codeflare_sdk/common/utils/generate_cert.py b/src/codeflare_sdk/common/utils/generate_cert.py index 7c072da0e..0ba0e1ecd 100644 --- a/src/codeflare_sdk/common/utils/generate_cert.py +++ b/src/codeflare_sdk/common/utils/generate_cert.py @@ -230,22 +230,41 @@ def generate_tls_cert(cluster_name, namespace, days=30): def export_env(cluster_name, namespace): """ - Sets environment variables to configure TLS for a Ray cluster. + Sets environment variables to configure TLS for a Ray client connection when mTLS is enabled. + + The `tls.crt` and `tls.key` files generated by `generate_tls_cert` are client-side credentials, + signed by the cluster's CA. `ca.crt` is the cluster's CA certificate. + + This function sets: + - `RAY_USE_TLS="1"` to enable TLS. + - `RAY_TLS_CA_CERT` to the path of `ca.crt` for server certificate verification. + - `RAY_TLS_CLIENT_CERT` and `RAY_TLS_CLIENT_KEY` to the paths of the client's `tls.crt` + and `tls.key` respectively, for client authentication by the server. + - `RAY_TLS_SERVER_CERT` and `RAY_TLS_SERVER_KEY` are also set to the client's `tls.crt` + and `tls.key`. This is maintained based on previous observations that these might be + utilized by certain Ray client setups, ensuring broad compatibility. Args: cluster_name (str): The name of the Ray cluster. namespace (str): The Kubernetes namespace where the Ray cluster is located. - - Environment Variables Set: - - RAY_USE_TLS: Enables TLS for Ray. - - RAY_TLS_SERVER_CERT: Path to the TLS server certificate. - - RAY_TLS_SERVER_KEY: Path to the TLS server private key. - - RAY_TLS_CA_CERT: Path to the CA certificate. """ tls_dir = os.path.join(os.getcwd(), f"tls-{cluster_name}-{namespace}") + client_cert_path = os.path.join(tls_dir, "tls.crt") + client_key_path = os.path.join(tls_dir, "tls.key") + ca_cert_path = os.path.join(tls_dir, "ca.crt") + os.environ["RAY_USE_TLS"] = "1" - os.environ["RAY_TLS_SERVER_CERT"] = os.path.join(tls_dir, "tls.crt") - os.environ["RAY_TLS_SERVER_KEY"] = os.path.join(tls_dir, "tls.key") - os.environ["RAY_TLS_CA_CERT"] = os.path.join(tls_dir, "ca.crt") + + # CA certificate for verifying the server + os.environ["RAY_TLS_CA_CERT"] = ca_cert_path + + # Standard mTLS client variables: client's own certificate and key + os.environ["RAY_TLS_CLIENT_CERT"] = client_cert_path + os.environ["RAY_TLS_CLIENT_KEY"] = client_key_path + + # Also set RAY_TLS_SERVER_CERT/KEY to client cert/key, maintaining previous setup style + # while ensuring client certs are explicitly available via RAY_TLS_CLIENT_* + os.environ["RAY_TLS_SERVER_CERT"] = client_cert_path + os.environ["RAY_TLS_SERVER_KEY"] = client_key_path diff --git a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py index 2a3436b26..cea591674 100644 --- a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py +++ b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py @@ -130,7 +130,7 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"): "resources": get_resources("500m", "500m", "512Mi", "512Mi"), }, "headGroupSpec": { - "serviceType": "ClusterIP", + "serviceType": "NodePort", "enableIngress": False, "rayStartParams": { "dashboard-host": "0.0.0.0", diff --git a/tests/e2e/local_interactive_sdk_kind_test.py b/tests/e2e/local_interactive_sdk_kind_test.py index c20fd8793..37c22be75 100644 --- a/tests/e2e/local_interactive_sdk_kind_test.py +++ b/tests/e2e/local_interactive_sdk_kind_test.py @@ -4,11 +4,12 @@ TokenAuthentication, generate_cert, ) - +import subprocess +import json import pytest import ray import math - +import time from support import * @@ -66,7 +67,40 @@ def run_local_interactives( print(cluster.local_client_url()) ray.shutdown() - ray.init(address=cluster.local_client_url(), logging_level="DEBUG") + + print("RAY DEBUGGING") + print("\n========== PYTHON DEBUG INFO ==========") + print(f"Ray local cluster client URL: {cluster.local_client_url()}") + print(f"Ray cluster client URL: {cluster.cluster_uri()}") + print(f"Cluster name: {cluster_name}") + print(f"Current working directory: {os.getcwd()}") + print(f"Cluster: {cluster}") + print(f"Cluster namespace: {self.namespace}") + print(f"Cluster name: {cluster_name}") + print(f"Cluster config: {cluster.config}") + print(f"Cluster config namespace: {cluster.config.namespace}") + print(f"Cluster config name: {cluster.config.name}") + print(f"Cluster config num_workers: {cluster.config.num_workers}") + print(f"Cluster config num_workers: {cluster.config.num_workers}") + print("END OF RAY DEBUGGING") + + # print("Sleeping for 15 minutes before ray.init for debugging...") + # time.sleep(900) + + svc_json = subprocess.check_output( + f"kubectl get svc -n {self.namespace} {cluster_name}-head-svc -o json", + shell=True, + ) + svc = json.loads(svc_json) + node_port = None + for port in svc["spec"]["ports"]: + if port["port"] == 10001: + node_port = port["nodePort"] + break + + ray_url = f"ray://127.0.0.1:{node_port}" + print(f"Connecting to Ray at: {ray_url}") + ray.init(address=ray_url, logging_level="DEBUG") @ray.remote(num_gpus=number_of_gpus / 2) def heavy_calculation_part(num_iterations):