
Commit 14cc729: Run PR check for guided notebooks
Parent: f100ba1

File tree: 6 files changed, +138 -24 lines changed
.github/resources/wait_for_job_cell.json (+20)

@@ -0,0 +1,20 @@
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "from time import sleep\n",
+  "\n",
+  "finished = False\n",
+  "while not finished:\n",
+  "    sleep(5)\n",
+  "    status = client.get_job_status(submission_id)\n",
+  "    finished = (status == \"SUCCEEDED\" or status == \"FAILED\" or status == \"STOPPED\")\n",
+  "    print(status)\n",
+  "print(\"Job status \" + status)\n",
+  "print(\"Logs: \")\n",
+  "print(client.get_job_logs(submission_id))\n",
+  "assert status == \"SUCCEEDED\", \"Job failed or was stopped!\""
+ ]
+}
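For readability, here is the Python that the cell's `source` array encodes: a loop that polls the Ray job status every five seconds until it reaches a terminal state. It assumes the notebook already defines `client` (a Ray job-submission client) and `submission_id` (returned when the job was submitted):

```python
from time import sleep

# Poll the job status every 5 seconds until it reaches a terminal state.
finished = False
while not finished:
    sleep(5)
    status = client.get_job_status(submission_id)
    finished = (status == "SUCCEEDED" or status == "FAILED" or status == "STOPPED")
    print(status)

# Report the final status and logs; fail the notebook run on anything but success.
print("Job status " + status)
print("Logs: ")
print(client.get_job_logs(submission_id))
assert status == "SUCCEEDED", "Job failed or was stopped!"
```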

.github/workflows/e2e_tests.yaml → .github/workflows/guided_notebook_tests.yaml (renamed, +85 -15)
@@ -1,4 +1,4 @@
-name: e2e
+name: Guided notebooks tests
 
 on:
   pull_request:
@@ -68,6 +68,13 @@ jobs:
         python-version: '3.9'
         cache: 'pip' # caching pip dependencies
 
+    - name: Configure AWS Credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: us-east-1
+        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+
     - name: Setup NVidia GPU environment for KinD
       uses: ./common/github-actions/nvidia-gpu-setup
 
@@ -76,6 +83,8 @@ jobs:
 
     - name: Install NVidia GPU operator for KinD
       uses: ./common/github-actions/nvidia-gpu-operator
+      with:
+        enable-time-slicing: 'true'
 
     - name: Deploy CodeFlare stack
       id: deploy
@@ -113,46 +122,107 @@ jobs:
         kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
         kubectl config use-context sdk-user
 
-    - name: Run e2e tests
+    - name: Create AWS S3 bucket for tests
+      id: s3-bucket
       run: |
-        export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
-        echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
+        aws s3 mb s3://rhoai-dw-sdk-test
 
-        set -euo pipefail
-        pip install poetry
+    - name: Setup Guided notebooks execution
+      run: |
+        echo "Installing papermill and dependencies..."
+        pip install poetry papermill ipython ipykernel
+        # Disable virtualenv creation, as packages installed into a virtualenv cause problems for papermill
+        poetry config virtualenvs.create false
+
+        echo "Installing SDK..."
         poetry install --with test,docs
-        echo "Running e2e tests..."
-        poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
+
+    - name: Run 0_basic_ray.ipynb
+      run: |
+        set -euo pipefail
+
+        # Remove login/logout cells, as KinD doesn't support authentication using a token
+        jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
+        jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
+        # Run notebook
+        # poetry run papermill 0_basic_ray.ipynb 0_basic_ray_out.ipynb --log-output --execution-timeout 600
+      working-directory: demo-notebooks/guided-demos
+
+    - name: Run 1_cluster_job_client.ipynb
+      run: |
+        set -euo pipefail
+
+        # Remove login/logout cells, as KinD doesn't support authentication using a token
+        jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
+        jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
+        # Replace async log tailing with waiting for the job to finish, as async logs don't work properly in papermill
+        JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
+        jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
+        # Run notebook
+        # poetry run papermill 1_cluster_job_client.ipynb 1_cluster_job_client_out.ipynb --log-output --execution-timeout 1200
+      working-directory: demo-notebooks/guided-demos
+
+    - name: Run 2_basic_interactive.ipynb
+      run: |
+        set -euo pipefail
+
+        # Remove login/logout cells, as KinD doesn't support authentication using a token
+        jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
+        jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
+        # Rewrite cluster_uri() to local_client_url() to retrieve a client URL reachable from outside the cluster, as the test runs outside of it
+        sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb
+        # Set an explicit namespace, as the SDK (currently) needs it to resolve local queues
+        sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 2_basic_interactive.ipynb
+        # Set env variables for the AWS S3 bucket
+        sed -i "s/\\\\\"pip\\\\\"/\\\\\"env_vars\\\\\": {\\\\\"AWS_ACCESS_KEY_ID\\\\\": \\\\\"${{ secrets.AWS_ACCESS_KEY_ID }}\\\\\", \\\\\"AWS_SECRET_ACCESS_KEY\\\\\": \\\\\"${{ secrets.AWS_SECRET_ACCESS_KEY }}\\\\\", \\\\\"AWS_DEFAULT_REGION\\\\\": \\\\\"us-east-1\\\\\"}, \\\\\"pip\\\\\"/" 2_basic_interactive.ipynb
+        # Configure persistent storage for the Ray trainer
+        sed -i "s/# run_config/run_config/" 2_basic_interactive.ipynb
+        sed -i "s|storage path|s3://rhoai-dw-sdk-test/test/|" 2_basic_interactive.ipynb
+        # Run notebook
+        poetry run papermill 2_basic_interactive.ipynb 2_basic_interactive_out.ipynb --log-output --execution-timeout 1200
       env:
         GRPC_DNS_RESOLVER: "native"
+      working-directory: demo-notebooks/guided-demos
+
+    - name: Delete AWS S3 bucket
+      if: always() && steps.s3-bucket.outcome == 'success'
+      run: |
+        aws s3 rb s3://rhoai-dw-sdk-test --force
 
     - name: Switch to kind-cluster context to print logs
      if: always() && steps.deploy.outcome == 'success'
      run: kubectl config use-context kind-cluster
 
-    - name: Print Pytest output log
+    - name: Print debug info
       if: always() && steps.deploy.outcome == 'success'
       run: |
-        echo "Printing Pytest output logs"
-        cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
+        echo "Printing debug info"
+        kubectl describe pods -n default
 
     - name: Print CodeFlare operator logs
       if: always() && steps.deploy.outcome == 'success'
       run: |
         echo "Printing CodeFlare operator logs"
-        kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
+        kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log
+
+    - name: Print Kueue operator logs
+      if: always() && steps.deploy.outcome == 'success'
+      run: |
+        echo "Printing Kueue operator logs"
+        KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
+        kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log
 
     - name: Print KubeRay operator logs
       if: always() && steps.deploy.outcome == 'success'
       run: |
         echo "Printing KubeRay operator logs"
-        kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
+        kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log
 
     - name: Export all KinD pod logs
       uses: ./common/github-actions/kind-export-logs
       if: always() && steps.deploy.outcome == 'success'
       with:
-        output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
+        output-directory: ${TEMP_DIR}
 
     - name: Upload logs
       uses: actions/upload-artifact@v4
@@ -161,4 +231,4 @@ jobs:
         name: logs
         retention-days: 10
         path: |
-          ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
+          ${{ env.TEMP_DIR }}/**/*.log
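The jq filters above are dense. As a rough sketch (not part of the commit), the same notebook surgery can be expressed with Python's json module: delete any cell whose source mentions the auth markers, then replace the async log-tailing cell with the wait cell from .github/resources/wait_for_job_cell.json:

```python
import json

def strip_and_patch(notebook_path: str, wait_cell_path: str) -> None:
    """Sketch of the jq edits: drop auth cells, swap in the polling cell."""
    with open(notebook_path) as f:
        nb = json.load(f)
    with open(wait_cell_path) as f:
        wait_cell = json.load(f)

    def mentions(cell, text):
        return any(text in line for line in cell.get("source", []))

    # Equivalent of: del(.cells[] | select(.source[] | contains(...)))
    nb["cells"] = [
        c for c in nb["cells"]
        if not mentions(c, "Create authentication object for user permissions")
        and not mentions(c, "auth.logout()")
    ]
    # Equivalent of: (.cells[] | select(...)) |= $job_wait
    nb["cells"] = [
        wait_cell if mentions(c, "async for lines in client.tail_job_logs") else c
        for c in nb["cells"]
    ]
    with open(notebook_path, "w") as f:
        json.dump(nb, f, indent=1)

strip_and_patch("1_cluster_job_client.ipynb", ".github/resources/wait_for_job_cell.json")
```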

demo-notebooks/guided-demos/0_basic_ray.ipynb (+3, -1)
@@ -62,10 +62,12 @@
 "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
 "cluster = Cluster(ClusterConfiguration(\n",
 "    name='raytest', \n",
+"    head_cpus='500m',\n",
+"    head_memory=2,\n",
 "    head_gpus=0, # For GPU enabled workloads set the head_gpus and num_gpus\n",
 "    num_gpus=0,\n",
 "    num_workers=2,\n",
-"    min_cpus=1,\n",
+"    min_cpus='250m',\n",
 "    max_cpus=1,\n",
 "    min_memory=4,\n",
 "    max_memory=4,\n",

demo-notebooks/guided-demos/1_cluster_job_client.ipynb (+3, -1)
@@ -44,10 +44,12 @@
 "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
 "cluster = Cluster(ClusterConfiguration(\n",
 "    name='jobtest',\n",
+"    head_cpus=1,\n",
+"    head_memory=4,\n",
 "    head_gpus=1, # For GPU enabled workloads set the head_gpus and num_gpus\n",
 "    num_gpus=1,\n",
 "    num_workers=2,\n",
-"    min_cpus=1,\n",
+"    min_cpus='250m',\n",
 "    max_cpus=1,\n",
 "    min_memory=4,\n",
 "    max_memory=4,\n",

demo-notebooks/guided-demos/2_basic_interactive.ipynb (+17, -5)
@@ -60,13 +60,15 @@
 "cluster_name = \"interactivetest\"\n",
 "cluster = Cluster(ClusterConfiguration(\n",
 "    name=cluster_name,\n",
+"    head_cpus=1,\n",
+"    head_memory=4,\n",
 "    head_gpus=1, # For GPU enabled workloads set the head_gpus and num_gpus\n",
 "    num_gpus=1,\n",
 "    num_workers=2,\n",
-"    min_cpus=2,\n",
-"    max_cpus=2,\n",
-"    min_memory=8,\n",
-"    max_memory=8,\n",
+"    min_cpus='250m',\n",
+"    max_cpus=1,\n",
+"    min_memory=4,\n",
+"    max_memory=4,\n",
 "    image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n",
 "    write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
 "    # local_queue=\"local-queue-name\" # Specify the local queue manually\n",

@@ -251,7 +253,17 @@
 "\n",
 "    ray_trainer = TorchTrainer(\n",
 "        train_func,\n",
-"        scaling_config=ScalingConfig(num_workers=3, use_gpu=True),\n",
+"        scaling_config=ScalingConfig(\n",
+"            # num_workers = number of worker nodes with the ray head node included\n",
+"            num_workers=3,\n",
+"            use_gpu=True,\n",
+"            resources_per_worker={\n",
+"                \"CPU\": 1,\n",
+"            },\n",
+"            trainer_resources={\n",
+"                \"CPU\": 0,\n",
+"            }\n",
+"        )\n",
 "    # Configure persistent storage that is accessible across \n",
 "    # all worker nodes.\n",
 "    # Uncomment and update the RunConfig below to include your storage details.\n",

demo-notebooks/guided-demos/mnist_fashion.py (+10, -2)
@@ -78,8 +78,16 @@ def train_func_distributed():
 trainer = TorchTrainer(
     train_func_distributed,
     scaling_config=ScalingConfig(
-        num_workers=3, use_gpu=use_gpu
-    ),  # num_workers = number of worker nodes with the ray head node included
+        # num_workers = number of worker nodes with the ray head node included
+        num_workers=3,
+        use_gpu=use_gpu,
+        resources_per_worker={
+            "CPU": 1,
+        },
+        trainer_resources={
+            "CPU": 0,
+        },
+    ),
 )
 
 results = trainer.fit()
