Skip to content

Commit 70c8707

Browse files
committed
Add e2e tests using GPU to execute current test scenarios
1 parent 130e003 commit 70c8707

8 files changed

+73
-43
lines changed

.github/workflows/e2e_tests.yaml

+8-22
Original file line numberDiff line numberDiff line change
@@ -30,26 +30,9 @@ env:
3030
jobs:
3131
kubernetes:
3232

33-
runs-on: ubuntu-20.04
33+
runs-on: ubuntu-20.04-4core-gpu
3434

3535
steps:
36-
- name: Cleanup
37-
run: |
38-
ls -lart
39-
echo "Initial status:"
40-
df -h
41-
echo "Cleaning up resources:"
42-
sudo swapoff -a
43-
sudo rm -f /swapfile
44-
sudo apt clean
45-
sudo rm -rf /usr/share/dotnet
46-
sudo rm -rf /opt/ghc
47-
sudo rm -rf "/usr/local/share/boost"
48-
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
49-
docker rmi $(docker image ls -aq)
50-
echo "Final status:"
51-
df -h
52-
5336
- name: Checkout code
5437
uses: actions/checkout@v4
5538
with:
@@ -85,9 +68,15 @@ jobs:
8568
python-version: '3.9'
8669
cache: 'pip' # caching pip dependencies
8770

71+
- name: Setup NVidia GPU environment for KinD
72+
uses: ./common/github-actions/nvidia-gpu-setup
73+
8874
- name: Setup and start KinD cluster
8975
uses: ./common/github-actions/kind
9076

77+
- name: Install NVidia GPU operator for KinD
78+
uses: ./common/github-actions/nvidia-gpu-operator
79+
9180
- name: Deploy CodeFlare stack
9281
id: deploy
9382
run: |
@@ -104,9 +93,6 @@ jobs:
10493
with:
10594
user-name: sdk-user
10695

107-
- name: Add kueue resources
108-
run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"
109-
11096
- name: Configure RBAC for sdk user with limited permissions
11197
run: |
11298
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
@@ -136,7 +122,7 @@ jobs:
136122
pip install poetry
137123
poetry install --with test,docs
138124
echo "Running e2e tests..."
139-
poetry run pytest -v -s ./tests/e2e -m kind > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
125+
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
140126
env:
141127
GRPC_DNS_RESOLVER: "native"
142128

docs/e2e.md

+8-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
## On KinD clusters
66
Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127.0.0.1 kind`. This will map your localhost IP address to the KinD cluster's hostname. This is already performed on [GitHub Actions](https://github.com/project-codeflare/codeflare-common/blob/1edd775e2d4088a5a0bfddafb06ff3a773231c08/github-actions/kind/action.yml#L70-L72)
77

8+
If the system you run on contains an NVIDIA GPU, you can enable GPU support in KinD; this will also allow you to run the GPU tests.
9+
To enable GPU support on KinD, follow [these instructions](https://www.substratus.ai/blog/kind-with-gpus).
10+
811
- Setup Phase:
912
- Pull the [codeflare-operator repo](https://github.com/project-codeflare/codeflare-operator) and run the following make targets:
1013
```
@@ -66,7 +69,11 @@ Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127
6669
poetry install --with test,docs
6770
poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_test.py
6871
```
69-
72+
- If the cluster doesn't have NVIDIA GPU support, then the NVIDIA GPU tests need to be disabled by providing the proper marker:
73+
```
74+
poetry install --with test,docs
75+
poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_test.py -m 'kind and not nvidia_gpu'
76+
```
7077

7178

7279
## On OpenShift clusters

pyproject.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ filterwarnings = [
5252
]
5353
markers = [
5454
"kind",
55-
"openshift"
55+
"openshift",
56+
"nvidia_gpu"
5657
]
5758
addopts = "--timeout=900"

tests/e2e/local_interactive_sdk_kind_test.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,16 @@ def test_local_interactives(self):
2727
create_kueue_resources(self)
2828
self.run_local_interactives()
2929

30-
def run_local_interactives(self):
30+
@pytest.mark.nvidia_gpu
31+
def test_local_interactives_nvidia_gpu(self):
32+
self.setup_method()
33+
create_namespace(self)
34+
create_kueue_resources(self)
35+
self.run_local_interactives(number_of_gpus=1)
36+
37+
def run_local_interactives(
38+
self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
39+
):
3140
ray_image = get_ray_image()
3241

3342
cluster_name = "test-ray-cluster-li"
@@ -43,6 +52,7 @@ def run_local_interactives(self):
4352
worker_cpu_limits=1,
4453
worker_memory_requests=1,
4554
worker_memory_limits=2,
55+
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
4656
image=ray_image,
4757
write_to_file=True,
4858
verify_tls=False,
@@ -59,7 +69,7 @@ def run_local_interactives(self):
5969
ray.shutdown()
6070
ray.init(address=cluster.local_client_url(), logging_level="DEBUG")
6171

62-
@ray.remote
72+
@ray.remote(num_gpus=number_of_gpus / 2)
6373
def heavy_calculation_part(num_iterations):
6474
result = 0.0
6575
for i in range(num_iterations):
@@ -68,7 +78,7 @@ def heavy_calculation_part(num_iterations):
6878
result += math.sin(i) * math.cos(j) * math.tan(k)
6979
return result
7080

71-
@ray.remote
81+
@ray.remote(num_gpus=number_of_gpus / 2)
7282
def heavy_calculation(num_iterations):
7383
results = ray.get(
7484
[heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]

tests/e2e/mnist.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
3333
print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))
3434

35+
print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
36+
ACCELERATOR = os.getenv("ACCELERATOR")
37+
3538

3639
class LitMNIST(LightningModule):
3740
def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
@@ -149,7 +152,7 @@ def test_dataloader(self):
149152

150153
# Initialize a trainer
151154
trainer = Trainer(
152-
accelerator="auto",
155+
accelerator=ACCELERATOR,
153156
# devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs
154157
max_epochs=3,
155158
callbacks=[TQDMProgressBar(refresh_rate=20)],

tests/e2e/mnist_raycluster_sdk_aw_kind_test.py

+20-9
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
2424
self.setup_method()
2525
create_namespace(self)
2626
create_kueue_resources(self)
27-
self.run_mnist_raycluster_sdk_kind()
27+
self.run_mnist_raycluster_sdk_kind(accelerator="cpu")
2828

29-
def run_mnist_raycluster_sdk_kind(self):
29+
@pytest.mark.nvidia_gpu
30+
def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
31+
self.setup_method()
32+
create_namespace(self)
33+
create_kueue_resources(self)
34+
self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)
35+
36+
def run_mnist_raycluster_sdk_kind(
37+
self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
38+
):
3039
ray_image = get_ray_image()
3140

3241
cluster = Cluster(
@@ -36,11 +45,11 @@ def run_mnist_raycluster_sdk_kind(self):
3645
num_workers=1,
3746
head_cpus="500m",
3847
head_memory=2,
39-
min_cpus="500m",
40-
max_cpus=1,
41-
min_memory=1,
42-
max_memory=2,
43-
num_gpus=0,
48+
worker_cpu_requests="500m",
49+
worker_cpu_limits=1,
50+
worker_memory_requests=1,
51+
worker_memory_limits=4,
52+
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
4453
image=ray_image,
4554
write_to_file=True,
4655
verify_tls=False,
@@ -58,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):
5867

5968
cluster.details()
6069

61-
self.assert_jobsubmit_withoutlogin_kind(cluster)
70+
self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)
6271

6372
# Assertions
6473

65-
def assert_jobsubmit_withoutlogin_kind(self, cluster):
74+
def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
6675
ray_dashboard = cluster.cluster_dashboard_uri()
6776
client = RayJobClient(address=ray_dashboard, verify=False)
6877

@@ -71,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
7180
runtime_env={
7281
"working_dir": "./tests/e2e/",
7382
"pip": "./tests/e2e/mnist_pip_requirements.txt",
83+
"env_vars": {"ACCELERATOR": accelerator},
7484
},
85+
entrypoint_num_gpus=number_of_gpus,
7586
)
7687
print(f"Submitted job with ID: {submission_id}")
7788
done = False

tests/e2e/mnist_raycluster_sdk_kind_test.py

+17-5
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
2525
self.setup_method()
2626
create_namespace(self)
2727
create_kueue_resources(self)
28-
self.run_mnist_raycluster_sdk_kind()
28+
self.run_mnist_raycluster_sdk_kind(accelerator="cpu")
2929

30-
def run_mnist_raycluster_sdk_kind(self):
30+
@pytest.mark.nvidia_gpu
31+
def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
32+
self.setup_method()
33+
create_namespace(self)
34+
create_kueue_resources(self)
35+
self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)
36+
37+
def run_mnist_raycluster_sdk_kind(
38+
self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
39+
):
3140
ray_image = get_ray_image()
3241

3342
cluster = Cluster(
@@ -40,7 +49,8 @@ def run_mnist_raycluster_sdk_kind(self):
4049
worker_cpu_requests="500m",
4150
worker_cpu_limits=1,
4251
worker_memory_requests=1,
43-
worker_memory_limits=2,
52+
worker_memory_limits=4,
53+
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
4454
image=ray_image,
4555
write_to_file=True,
4656
verify_tls=False,
@@ -57,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):
5767

5868
cluster.details()
5969

60-
self.assert_jobsubmit_withoutlogin_kind(cluster)
70+
self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)
6171

6272
# Assertions
6373

64-
def assert_jobsubmit_withoutlogin_kind(self, cluster):
74+
def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
6575
ray_dashboard = cluster.cluster_dashboard_uri()
6676
client = RayJobClient(address=ray_dashboard, verify=False)
6777

@@ -70,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
7080
runtime_env={
7181
"working_dir": "./tests/e2e/",
7282
"pip": "./tests/e2e/mnist_pip_requirements.txt",
83+
"env_vars": {"ACCELERATOR": accelerator},
7384
},
85+
entrypoint_num_gpus=number_of_gpus,
7486
)
7587
print(f"Submitted job with ID: {submission_id}")
7688
done = False

tests/e2e/support.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
9090
"resources": [
9191
{"name": "cpu", "nominalQuota": 9},
9292
{"name": "memory", "nominalQuota": "36Gi"},
93-
{"name": "nvidia.com/gpu", "nominalQuota": 0},
93+
{"name": "nvidia.com/gpu", "nominalQuota": 1},
9494
],
9595
}
9696
],

0 commit comments

Comments
 (0)