
Commit 14cc729: Run PR check for guided notebooks
Parent: f100ba1

File tree: 6 files changed, +138 -24 lines changed
.github/resources/wait_for_job_cell.json (+20)

@@ -0,0 +1,20 @@
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "from time import sleep\n",
+  "\n",
+  "finished = False\n",
+  "while not finished:\n",
+  "    sleep(5)\n",
+  "    status = client.get_job_status(submission_id)\n",
+  "    finished = (status == \"SUCCEEDED\" or status == \"FAILED\" or status == \"STOPPED\")\n",
+  "    print(status)\n",
+  "print(\"Job status \" + status)\n",
+  "print(\"Logs: \")\n",
+  "print(client.get_job_logs(submission_id))\n",
+  "assert status == \"SUCCEEDED\", \"Job failed or was stopped!\""
+ ]
+}
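For readability, here is the Python that the cell's `source` array encodes: a loop that polls the Ray job status every five seconds until it reaches a terminal state. It assumes the notebook already defines `client` (a Ray job-submission client) and `submission_id` (returned when the job was submitted):

```python
from time import sleep

# Poll the job status every 5 seconds until it reaches a terminal state.
finished = False
while not finished:
    sleep(5)
    status = client.get_job_status(submission_id)
    finished = (status == "SUCCEEDED" or status == "FAILED" or status == "STOPPED")
    print(status)

# Report the final status and logs; fail the notebook run on anything but success.
print("Job status " + status)
print("Logs: ")
print(client.get_job_logs(submission_id))
assert status == "SUCCEEDED", "Job failed or was stopped!"
```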

.github/workflows/e2e_tests.yaml → .github/workflows/guided_notebook_tests.yaml (renamed, +85 -15)
@@ -1,4 +1,4 @@
-name: e2e
+name: Guided notebooks tests
 
 on:
   pull_request:
@@ -68,6 +68,13 @@ jobs:
         python-version: '3.9'
         cache: 'pip' # caching pip dependencies
 
+    - name: Configure AWS Credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: us-east-1
+        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+
     - name: Setup NVidia GPU environment for KinD
       uses: ./common/github-actions/nvidia-gpu-setup
 
@@ -76,6 +83,8 @@ jobs:
 
     - name: Install NVidia GPU operator for KinD
       uses: ./common/github-actions/nvidia-gpu-operator
+      with:
+        enable-time-slicing: 'true'
 
     - name: Deploy CodeFlare stack
       id: deploy
@@ -113,46 +122,107 @@ jobs:
         kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
         kubectl config use-context sdk-user
 
-    - name: Run e2e tests
+    - name: Create AWS S3 bucket for tests
+      id: s3-bucket
       run: |
-        export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
-        echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
+        aws s3 mb s3://rhoai-dw-sdk-test
 
-        set -euo pipefail
-        pip install poetry
+    - name: Setup Guided notebooks execution
+      run: |
+        echo "Installing papermill and dependencies..."
+        pip install poetry papermill ipython ipykernel
+        # Disable virtualenv creation, as packages installed into a virtualenv cause problems for papermill
+        poetry config virtualenvs.create false
+
+        echo "Installing SDK..."
         poetry install --with test,docs
-        echo "Running e2e tests..."
-        poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
+
+    - name: Run 0_basic_ray.ipynb
+      run: |
+        set -euo pipefail
+
+        # Remove login/logout cells, as KinD doesn't support authentication using a token
+        jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
+        jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
+        # Run notebook
+        # poetry run papermill 0_basic_ray.ipynb 0_basic_ray_out.ipynb --log-output --execution-timeout 600
+      working-directory: demo-notebooks/guided-demos
+
+    - name: Run 1_cluster_job_client.ipynb
+      run: |
+        set -euo pipefail
+
+        # Remove login/logout cells, as KinD doesn't support authentication using a token
+        jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
+        jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
+        # Replace async log tailing with waiting for the job to finish, as async logs don't work properly in papermill
+        JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
+        jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
+        # Run notebook
+        # poetry run papermill 1_cluster_job_client.ipynb 1_cluster_job_client_out.ipynb --log-output --execution-timeout 1200
+      working-directory: demo-notebooks/guided-demos
+
+    - name: Run 2_basic_interactive.ipynb
+      run: |
+        set -euo pipefail
+
+        # Remove login/logout cells, as KinD doesn't support authentication using a token
+        jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
+        jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
+        # Rewrite cluster_uri() to local_client_url() to retrieve a client URL reachable from outside the cluster, as the test runs outside of it
+        sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb
+        # Set an explicit namespace, as the SDK (currently) needs it to resolve local queues
+        sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 2_basic_interactive.ipynb
+        # Set env variables for the AWS S3 bucket
+        sed -i "s/\\\\\"pip\\\\\"/\\\\\"env_vars\\\\\": {\\\\\"AWS_ACCESS_KEY_ID\\\\\": \\\\\"${{ secrets.AWS_ACCESS_KEY_ID }}\\\\\", \\\\\"AWS_SECRET_ACCESS_KEY\\\\\": \\\\\"${{ secrets.AWS_SECRET_ACCESS_KEY }}\\\\\", \\\\\"AWS_DEFAULT_REGION\\\\\": \\\\\"us-east-1\\\\\"}, \\\\\"pip\\\\\"/" 2_basic_interactive.ipynb
+        # Configure persistent storage for the Ray trainer
+        sed -i "s/# run_config/run_config/" 2_basic_interactive.ipynb
+        sed -i "s|storage path|s3://rhoai-dw-sdk-test/test/|" 2_basic_interactive.ipynb
+        # Run notebook
+        poetry run papermill 2_basic_interactive.ipynb 2_basic_interactive_out.ipynb --log-output --execution-timeout 1200
       env:
         GRPC_DNS_RESOLVER: "native"
+      working-directory: demo-notebooks/guided-demos
+
+    - name: Delete AWS S3 bucket
+      if: always() && steps.s3-bucket.outcome == 'success'
+      run: |
+        aws s3 rb s3://rhoai-dw-sdk-test --force
 
     - name: Switch to kind-cluster context to print logs
      if: always() && steps.deploy.outcome == 'success'
      run: kubectl config use-context kind-cluster
 
-    - name: Print Pytest output log
+    - name: Print debug info
       if: always() && steps.deploy.outcome == 'success'
       run: |
-        echo "Printing Pytest output logs"
-        cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
+        echo "Printing debug info"
+        kubectl describe pods -n default
 
     - name: Print CodeFlare operator logs
       if: always() && steps.deploy.outcome == 'success'
       run: |
         echo "Printing CodeFlare operator logs"
-        kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
+        kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log
+
+    - name: Print Kueue operator logs
+      if: always() && steps.deploy.outcome == 'success'
+      run: |
+        echo "Printing Kueue operator logs"
+        KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
+        kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log
 
     - name: Print KubeRay operator logs
       if: always() && steps.deploy.outcome == 'success'
       run: |
         echo "Printing KubeRay operator logs"
-        kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
+        kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log
 
     - name: Export all KinD pod logs
       uses: ./common/github-actions/kind-export-logs
       if: always() && steps.deploy.outcome == 'success'
       with:
-        output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
+        output-directory: ${TEMP_DIR}
 
     - name: Upload logs
       uses: actions/upload-artifact@v4
@@ -161,4 +231,4 @@ jobs:
         name: logs
         retention-days: 10
         path: |
-          ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
+          ${{ env.TEMP_DIR }}/**/*.log
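The jq filters above are dense. As a rough sketch (not part of the commit), the same notebook surgery can be expressed with Python's json module: delete any cell whose source mentions the auth markers, then replace the async log-tailing cell with the wait cell from .github/resources/wait_for_job_cell.json:

```python
import json

def strip_and_patch(notebook_path: str, wait_cell_path: str) -> None:
    """Sketch of the jq edits: drop auth cells, swap in the polling cell."""
    with open(notebook_path) as f:
        nb = json.load(f)
    with open(wait_cell_path) as f:
        wait_cell = json.load(f)

    def mentions(cell, text):
        return any(text in line for line in cell.get("source", []))

    # Equivalent of: del(.cells[] | select(.source[] | contains(...)))
    nb["cells"] = [
        c for c in nb["cells"]
        if not mentions(c, "Create authentication object for user permissions")
        and not mentions(c, "auth.logout()")
    ]
    # Equivalent of: (.cells[] | select(...)) |= $job_wait
    nb["cells"] = [
        wait_cell if mentions(c, "async for lines in client.tail_job_logs") else c
        for c in nb["cells"]
    ]
    with open(notebook_path, "w") as f:
        json.dump(nb, f, indent=1)

strip_and_patch("1_cluster_job_client.ipynb", ".github/resources/wait_for_job_cell.json")
```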

demo-notebooks/guided-demos/0_basic_ray.ipynb (+3, -1)
@@ -62,10 +62,12 @@
 "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
 "cluster = Cluster(ClusterConfiguration(\n",
 "    name='raytest', \n",
+"    head_cpus='500m',\n",
+"    head_memory=2,\n",
 "    head_gpus=0, # For GPU enabled workloads set the head_gpus and num_gpus\n",
 "    num_gpus=0,\n",
 "    num_workers=2,\n",
-"    min_cpus=1,\n",
+"    min_cpus='250m',\n",
 "    max_cpus=1,\n",
 "    min_memory=4,\n",
 "    max_memory=4,\n",

demo-notebooks/guided-demos/1_cluster_job_client.ipynb (+3, -1)
@@ -44,10 +44,12 @@
 "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
 "cluster = Cluster(ClusterConfiguration(\n",
 "    name='jobtest',\n",
+"    head_cpus=1,\n",
+"    head_memory=4,\n",
 "    head_gpus=1, # For GPU enabled workloads set the head_gpus and num_gpus\n",
 "    num_gpus=1,\n",
 "    num_workers=2,\n",
-"    min_cpus=1,\n",
+"    min_cpus='250m',\n",
 "    max_cpus=1,\n",
 "    min_memory=4,\n",
 "    max_memory=4,\n",

demo-notebooks/guided-demos/2_basic_interactive.ipynb (+17, -5)
@@ -60,13 +60,15 @@
 "cluster_name = \"interactivetest\"\n",
 "cluster = Cluster(ClusterConfiguration(\n",
 "    name=cluster_name,\n",
+"    head_cpus=1,\n",
+"    head_memory=4,\n",
 "    head_gpus=1, # For GPU enabled workloads set the head_gpus and num_gpus\n",
 "    num_gpus=1,\n",
 "    num_workers=2,\n",
-"    min_cpus=2,\n",
-"    max_cpus=2,\n",
-"    min_memory=8,\n",
-"    max_memory=8,\n",
+"    min_cpus='250m',\n",
+"    max_cpus=1,\n",
+"    min_memory=4,\n",
+"    max_memory=4,\n",
 "    image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n",
 "    write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
 "    # local_queue=\"local-queue-name\" # Specify the local queue manually\n",

@@ -251,7 +253,17 @@
 "\n",
 "    ray_trainer = TorchTrainer(\n",
 "        train_func,\n",
-"        scaling_config=ScalingConfig(num_workers=3, use_gpu=True),\n",
+"        scaling_config=ScalingConfig(\n",
+"            # num_workers = number of worker nodes with the ray head node included\n",
+"            num_workers=3,\n",
+"            use_gpu=True,\n",
+"            resources_per_worker={\n",
+"                \"CPU\": 1,\n",
+"            },\n",
+"            trainer_resources={\n",
+"                \"CPU\": 0,\n",
+"            }\n",
+"        )\n",
 "    # Configure persistent storage that is accessible across \n",
 "    # all worker nodes.\n",
 "    # Uncomment and update the RunConfig below to include your storage details.\n",

demo-notebooks/guided-demos/mnist_fashion.py (+10, -2)
@@ -78,8 +78,16 @@ def train_func_distributed():
 trainer = TorchTrainer(
     train_func_distributed,
     scaling_config=ScalingConfig(
-        num_workers=3, use_gpu=use_gpu
-    ),  # num_workers = number of worker nodes with the ray head node included
+        # num_workers = number of worker nodes with the ray head node included
+        num_workers=3,
+        use_gpu=use_gpu,
+        resources_per_worker={
+            "CPU": 1,
+        },
+        trainer_resources={
+            "CPU": 0,
+        },
+    ),
 )
 
 results = trainer.fit()
