Skip to content

test using custom cfo #830

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions .github/workflows/e2e_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ concurrency:
cancel-in-progress: true

env:
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
CODEFLARE_OPERATOR_IMG: "quay.io/kryanbeane/codeflare-operator:dev"

jobs:
kubernetes:
Expand All @@ -39,7 +39,7 @@ jobs:
- name: Checkout CodeFlare operator repository
uses: actions/checkout@v4
with:
repository: project-codeflare/codeflare-operator
repository: kryanbeane/codeflare-operator
path: codeflare-operator

- name: Set Go
Expand Down Expand Up @@ -106,8 +106,16 @@ jobs:
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
kubectl create clusterrole pod-creator --verb=get,list --resource=pods
kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
kubectl create clusterrole service-reader --verb=get,list --resource=services
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
kubectl config use-context sdk-user

- name: Setup tmate session for debugging
if: always()
uses: mxschmitt/action-tmate@v3
with:
detached: true

- name: Run e2e tests
run: |
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
Expand All @@ -117,7 +125,7 @@ jobs:
pip install poetry
poetry install --with test,docs
echo "Running e2e tests..."
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
poetry run pytest -v -s ./tests/e2e/local_interactive_sdk_kind_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
env:
GRPC_DNS_RESOLVER: "native"

Expand Down
39 changes: 29 additions & 10 deletions src/codeflare_sdk/common/utils/generate_cert.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,22 +230,41 @@ def generate_tls_cert(cluster_name, namespace, days=30):

def export_env(cluster_name, namespace):
    """
    Set environment variables to configure TLS for a Ray client connection when mTLS is enabled.

    The `tls.crt` and `tls.key` files generated by `generate_tls_cert` are client-side
    credentials signed by the cluster's CA; `ca.crt` is the cluster's CA certificate.
    The certificate files are looked up in `./tls-{cluster_name}-{namespace}` relative
    to the current working directory.

    Environment variables set:
        - RAY_USE_TLS="1": enables TLS for Ray.
        - RAY_TLS_CA_CERT: path to `ca.crt`, used to verify the server's certificate.
        - RAY_TLS_CLIENT_CERT / RAY_TLS_CLIENT_KEY: paths to the client's `tls.crt`
          and `tls.key`, presented to the server for client authentication.
        - RAY_TLS_SERVER_CERT / RAY_TLS_SERVER_KEY: also set to the client's
          `tls.crt`/`tls.key`, maintained based on previous observations that these
          might be utilized by certain Ray client setups, ensuring broad compatibility.

    Args:
        cluster_name (str):
            The name of the Ray cluster.
        namespace (str):
            The Kubernetes namespace where the Ray cluster is located.
    """
    tls_dir = os.path.join(os.getcwd(), f"tls-{cluster_name}-{namespace}")
    client_cert_path = os.path.join(tls_dir, "tls.crt")
    client_key_path = os.path.join(tls_dir, "tls.key")
    ca_cert_path = os.path.join(tls_dir, "ca.crt")

    os.environ["RAY_USE_TLS"] = "1"

    # CA certificate for verifying the server
    os.environ["RAY_TLS_CA_CERT"] = ca_cert_path

    # Standard mTLS client variables: client's own certificate and key
    os.environ["RAY_TLS_CLIENT_CERT"] = client_cert_path
    os.environ["RAY_TLS_CLIENT_KEY"] = client_key_path

    # Also set RAY_TLS_SERVER_CERT/KEY to client cert/key, maintaining previous setup style
    # while ensuring client certs are explicitly available via RAY_TLS_CLIENT_*
    os.environ["RAY_TLS_SERVER_CERT"] = client_cert_path
    os.environ["RAY_TLS_SERVER_KEY"] = client_key_path
2 changes: 1 addition & 1 deletion src/codeflare_sdk/ray/cluster/build_ray_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
"resources": get_resources("500m", "500m", "512Mi", "512Mi"),
},
"headGroupSpec": {
"serviceType": "ClusterIP",
"serviceType": "NodePort",
"enableIngress": False,
"rayStartParams": {
"dashboard-host": "0.0.0.0",
Expand Down
40 changes: 37 additions & 3 deletions tests/e2e/local_interactive_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
TokenAuthentication,
generate_cert,
)

import subprocess
import json
import pytest
import ray
import math

import time
from support import *


Expand Down Expand Up @@ -66,7 +67,40 @@ def run_local_interactives(
print(cluster.local_client_url())

ray.shutdown()
ray.init(address=cluster.local_client_url(), logging_level="DEBUG")

print("RAY DEBUGGING")
print("\n========== PYTHON DEBUG INFO ==========")
print(f"Ray local cluster client URL: {cluster.local_client_url()}")
print(f"Ray cluster client URL: {cluster.cluster_uri()}")
print(f"Cluster name: {cluster_name}")
print(f"Current working directory: {os.getcwd()}")
print(f"Cluster: {cluster}")
print(f"Cluster namespace: {self.namespace}")
print(f"Cluster name: {cluster_name}")
print(f"Cluster config: {cluster.config}")
print(f"Cluster config namespace: {cluster.config.namespace}")
print(f"Cluster config name: {cluster.config.name}")
print(f"Cluster config num_workers: {cluster.config.num_workers}")
print(f"Cluster config num_workers: {cluster.config.num_workers}")
print("END OF RAY DEBUGGING")

# print("Sleeping for 15 minutes before ray.init for debugging...")
# time.sleep(900)

svc_json = subprocess.check_output(
f"kubectl get svc -n {self.namespace} {cluster_name}-head-svc -o json",
shell=True,
)
svc = json.loads(svc_json)
node_port = None
for port in svc["spec"]["ports"]:
if port["port"] == 10001:
node_port = port["nodePort"]
break

ray_url = f"ray://127.0.0.1:{node_port}"
print(f"Connecting to Ray at: {ray_url}")
ray.init(address=ray_url, logging_level="DEBUG")

@ray.remote(num_gpus=number_of_gpus / 2)
def heavy_calculation_part(num_iterations):
Expand Down
Loading