Testing E2E fixes for kueue integration #30
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# End-to-end RayJob tests executed against a KinD cluster with Kueue installed.
name: rayjob-e2e-with-kueue

# Triggered by pull requests that target main, a release branch, or the
# ray-jobs feature branch. Pull requests that only touch documentation or
# the licence file are skipped.
on:
  pull_request:
    branches:
      - main
      - 'release-*'
      - ray-jobs-feature
    paths-ignore:
      - 'docs/**'
      - '**.adoc'
      - '**.md'
      - 'LICENSE'
# Runs are grouped by PR head branch and workflow name; starting a new run in
# a group cancels the one already in progress.
concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true

# Workflow-level environment, visible to every step below.
env:
  CODEFLARE_OPERATOR_IMG: 'quay.io/project-codeflare/codeflare-operator:dev'
  KUEUE_VERSION: 'v0.13.3'
jobs:
  # Single job: build a KinD cluster, install Kueue and KubeRay, switch to a
  # restricted "sdk-user" identity, run the RayJob e2e tests, then collect logs.
  kubernetes:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # The shared repo is checked out under ./common so its composite actions
      # can be referenced by the local "uses: ./common/..." steps below.
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'  # caching pip dependencies

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 2  # Multiple nodes for testing Kueue scheduling

      - name: Verify Kind cluster
        run: |
          echo "Checking Kind clusters..."
          kind get clusters
          echo "Current kubectl context:"
          kubectl config current-context
          echo "Checking nodes:"
          kubectl get nodes

      - name: Deploy Kueue
        run: |
          echo "Deploying Kueue ${KUEUE_VERSION}"
          kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml"

          # Poll until every pod in kueue-system reports Ready=True. The loop is
          # bounded by a deadline so a deployment that never becomes ready fails
          # this step instead of spinning until the job-level timeout.
          echo "Waiting for pods in the kueue-system namespace to become ready"
          deadline=$((SECONDS + 300))
          while [[ $(kubectl get pods -n kueue-system -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' | tr ' ' '\n' | sort -u) != "True" ]]
          do
            if (( SECONDS >= deadline )); then
              echo ""
              echo "Timed out waiting for Kueue pods to become ready" >&2
              kubectl get pods -n kueue-system || true
              exit 1
            fi
            echo -n "." && sleep 1
          done
          echo ""
          # Short grace period after the pods report Ready.
          sleep 5

      - name: Deploy KubeRay operator
        run: |
          KUBERAY_VERSION="v1.4.0"
          echo "Deploying KubeRay ${KUBERAY_VERSION}"

          # Create namespace first
          kubectl create namespace ray-system || true
          kubectl apply --server-side -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}&timeout=180s"

          # Check all namespaces to see where KubeRay was deployed
          echo "Checking for KubeRay deployment in all namespaces..."
          kubectl get deployments -A | grep kuberay || true

          # Resolve the namespace that actually holds the operator deployment
          # instead of guessing; the jsonpath lookup errors on an empty result,
          # which "|| true" turns into an empty string handled just below.
          KUBERAY_NAMESPACE="$(kubectl get deployments -A \
            --field-selector metadata.name=kuberay-operator \
            -o jsonpath='{.items[0].metadata.namespace}' 2>/dev/null || true)"
          if [[ -z "${KUBERAY_NAMESPACE}" ]]; then
            echo "KubeRay operator deployment not found in any namespace" >&2
            kubectl get pods -A || true
            exit 1
          fi

          # Publish the namespace before waiting so the log-collection step can
          # use it even when the wait below times out.
          echo "KUBERAY_NAMESPACE=${KUBERAY_NAMESPACE}" >> "${GITHUB_ENV}"

          # Wait for KubeRay operator to be ready; a timeout fails the step.
          echo "Waiting for KubeRay operator in namespace ${KUBERAY_NAMESPACE} to become ready..."
          kubectl wait --for=condition=Available --timeout=300s deployment/kuberay-operator -n "${KUBERAY_NAMESPACE}"

      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      - name: Configure RBAC for sdk user with limited permissions
        run: |
          # grant ROLE BINDING VERBS RESOURCES
          # Creates a ClusterRole and binds it to sdk-user.
          grant() {
            local role="$1" binding="$2" verbs="$3" resources="$4"
            kubectl create clusterrole "${role}" --verb="${verbs}" --resource="${resources}"
            kubectl create clusterrolebinding "${binding}" --clusterrole="${role}" --user=sdk-user
          }

          # Basic permissions
          grant list-ingresses sdk-user-list-ingresses get,list ingresses
          grant namespace-creator sdk-user-namespace-creator get,list,create,delete,patch namespaces

          # Ray permissions
          grant raycluster-creator sdk-user-raycluster-creator get,list,create,delete,patch,watch rayclusters
          grant rayjob-creator sdk-user-rayjob-creator get,list,create,delete,patch,watch,update rayjobs,rayjobs/status

          # Kueue permissions
          grant resourceflavor-creator sdk-user-resourceflavor-creator get,list,create,delete resourceflavors
          grant clusterqueue-creator sdk-user-clusterqueue-creator get,list,create,delete,patch clusterqueues
          grant localqueue-creator sdk-user-localqueue-creator get,list,create,delete,patch localqueues
          grant workload-creator sdk-user-workload-creator get,list,watch workloads

          # Additional permissions
          grant list-secrets sdk-user-list-secrets get,list secrets
          grant pod-creator sdk-user-pod-creator get,list,watch pods
          grant service-reader sdk-user-service-reader get,list,watch services
          grant port-forward-pods sdk-user-port-forward-pods-binding create pods/portforward
          grant node-reader sdk-user-node-reader get,list nodes

          # Subsequent kubectl usage runs as the restricted user.
          kubectl config use-context sdk-user

      - name: Setup test output directory
        run: |
          CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
          mkdir -p "${CODEFLARE_TEST_OUTPUT_DIR}"
          # Export the directory so later steps can read it via the env context.
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> "${GITHUB_ENV}"

      - name: Run RayJob e2e tests
        run: |
          set -euo pipefail
          pip install poetry
          poetry install --with test,docs

          # Install the SDK in editable mode
          pip install -e .

          echo "Running RayJob e2e tests..."
          # Set environment variable to prevent default queue assignment for non-Kueue tests
          export DISABLE_DEFAULT_KUEUE_QUEUE=true

          # Run only the tests that are designed for Kueue integration.
          # Output is captured to a file and printed by a later step.
          poetry run pytest -v -s \
            ./tests/e2e/rayjob/rayjob_existing_cluster_test.py \
            ./tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py \
            -x > "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log" 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"

      # The remaining steps run even when an earlier step failed, so that
      # diagnostics are always collected.
      - name: Switch to kind-cluster context to print logs
        if: always()
        run: kubectl config use-context kind-cluster

      - name: Print Pytest output log
        if: always()
        run: |
          echo "Printing Pytest output logs"
          cat "${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/pytest_output.log" || true

      - name: Print Kueue operator logs
        if: always()
        run: |
          echo "Printing Kueue operator logs"
          kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager | tee "${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/kueue-operator.log" || true

      - name: Print KubeRay operator logs
        if: always()
        run: |
          echo "Printing KubeRay operator logs"
          # Namespace exported by the deploy step; falls back to ray-system when
          # that step did not get far enough to export it.
          kuberay_ns="${KUBERAY_NAMESPACE:-ray-system}"
          log_file="${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/kuberay.log"

          echo "Checking ${kuberay_ns} namespace contents:"
          kubectl get all -n "${kuberay_ns}" || true

          echo "Attempting to get KubeRay logs with different selectors:"
          # Try each target in turn and keep the first one that yields output.
          # Piping straight into tee would hide kubectl's exit status, and a
          # selector that matches no pods produces no log output, so success is
          # judged by a non-empty log file instead.
          found=false
          for target in \
            "-l app.kubernetes.io/name=kuberay-operator" \
            "-l app.kubernetes.io/component=kuberay-operator" \
            "deployment/kuberay-operator"
          do
            # ${target} is deliberately unquoted so "-l <selector>" splits into
            # two arguments.
            if kubectl logs -n "${kuberay_ns}" --tail -1 ${target} > "${log_file}" && [[ -s "${log_file}" ]]; then
              cat "${log_file}"
              found=true
              break
            fi
          done
          if [[ "${found}" != "true" ]]; then
            echo "Could not find KubeRay operator logs"
          fi

      - name: Export all KinD pod logs
        if: always()
        uses: ./common/github-actions/kind-export-logs
        with:
          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}

      - name: Upload logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
          if-no-files-found: warn