
Commit c53bc8d

Adjust e2e tests to use GPU
1 parent 5ce0b2c commit c53bc8d

7 files changed: 69 additions, 45 deletions

.github/workflows/e2e_tests.yaml

12 additions, 25 deletions

@@ -24,29 +24,15 @@ concurrency:
   group: ${{ github.head_ref }}-${{ github.workflow }}
   cancel-in-progress: true

+env:
+  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
+
 jobs:
   kubernetes:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-20.04-4core-gpu

     steps:
-      - name: Cleanup
-        run: |
-          ls -lart
-          echo "Initial status:"
-          df -h
-          echo "Cleaning up resources:"
-          sudo swapoff -a
-          sudo rm -f /swapfile
-          sudo apt clean
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-          docker rmi $(docker image ls -aq)
-          echo "Final status:"
-          df -h
-
       - name: Checkout code
         uses: actions/checkout@v4
         with:

@@ -82,19 +68,23 @@ jobs:
           python-version: '3.9'
           cache: 'pip' # caching pip dependencies

+      - name: Setup NVidia GPU environment for KinD
+        uses: ./common/github-actions/nvidia-gpu-setup
+
       - name: Setup and start KinD cluster
         uses: ./common/github-actions/kind

+      - name: Install NVidia GPU operator for KinD
+        uses: ./common/github-actions/nvidia-gpu-operator
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
           cd codeflare-operator
           echo Setting up CodeFlare stack
           make setup-e2e
           echo Deploying CodeFlare operator
-          IMG="${REGISTRY_ADDRESS}"/codeflare-operator
-          make image-push -e IMG="${IMG}"
-          make deploy -e IMG="${IMG}" -e ENV="e2e"
+          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
           cd ..

@@ -103,9 +93,6 @@ jobs:
         with:
           user-name: sdk-user

-      - name: Add kueue resources
-        run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"
-
       - name: Configure RBAC for sdk user with limited permissions
         run: |
           kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses

@@ -135,7 +122,7 @@ jobs:
           pip install poetry
           poetry install --with test,docs
           echo "Running e2e tests..."
-          poetry run pytest -v -s ./tests/e2e -m kind > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
+          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
         env:
           GRPC_DNS_RESOLVER: "native"

pyproject.toml

2 additions, 1 deletion

@@ -52,6 +52,7 @@ filterwarnings = [
 ]
 markers = [
     "kind",
-    "openshift"
+    "openshift",
+    "nvidia_gpu"
 ]
 addopts = "--timeout=900"
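Note: registering the new mark here keeps pytest from flagging nvidia_gpu as an unknown marker, and the workflow's -m 'kind and nvidia_gpu' expression then collects only tests carrying both marks. A minimal illustration of the selection semantics (test names are hypothetical, not from this repo):

    import pytest


    @pytest.mark.kind
    @pytest.mark.nvidia_gpu
    def test_needs_kind_and_gpu():
        # Collected by: pytest -m 'kind and nvidia_gpu'
        assert True


    @pytest.mark.kind
    def test_kind_only():
        # Deselected by the same expression because it lacks the nvidia_gpu mark.
        assert True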

tests/e2e/local_interactive_sdk_kind_test.py

13 additions, 3 deletions

@@ -27,7 +27,16 @@ def test_local_interactives(self):
         create_kueue_resources(self)
         self.run_local_interactives()

-    def run_local_interactives(self):
+    @pytest.mark.nvidia_gpu
+    def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_local_interactives(number_of_gpus=1)
+
+    def run_local_interactives(
+        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
         ray_image = get_ray_image()

         cluster_name = "test-ray-cluster-li"

@@ -43,6 +52,7 @@ def run_local_interactives(self):
                 worker_cpu_limits=1,
                 worker_memory_requests=1,
                 worker_memory_limits=2,
+                worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
                 image=ray_image,
                 write_to_file=True,
                 verify_tls=False,

@@ -59,7 +69,7 @@ def run_local_interactives(self):
         ray.shutdown()
         ray.init(address=cluster.local_client_url(), logging_level="DEBUG")

-        @ray.remote
+        @ray.remote(num_gpus=number_of_gpus / 2)
         def heavy_calculation_part(num_iterations):
             result = 0.0
             for i in range(num_iterations):

@@ -68,7 +78,7 @@ def heavy_calculation_part(num_iterations):
                         result += math.sin(i) * math.cos(j) * math.tan(k)
             return result

-        @ray.remote
+        @ray.remote(num_gpus=number_of_gpus / 2)
         def heavy_calculation(num_iterations):
             results = ray.get(
                 [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
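The num_gpus=number_of_gpus / 2 request means each of the two remote task definitions reserves half a GPU when number_of_gpus=1, so they can share the single GPU the KinD node exposes; with the default number_of_gpus=0 the decorator degrades to a plain CPU task and the existing test path is unchanged. A standalone sketch of Ray's fractional GPU scheduling, assuming a Ray runtime with at least one GPU available:

    import ray

    ray.init()  # connect to (or start) a Ray instance that exposes at least one GPU


    @ray.remote(num_gpus=0.5)
    def gpu_task():
        # Ray reserves half a GPU for this task and restricts it to that device
        # via CUDA_VISIBLE_DEVICES, so two 0.5-GPU tasks can share one physical GPU.
        return ray.get_gpu_ids()


    # Both tasks fit on a single-GPU node at the same time.
    print(ray.get([gpu_task.remote(), gpu_task.remote()]))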

tests/e2e/mnist.py

4 additions, 1 deletion

@@ -32,6 +32,9 @@
 print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
 print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))

+print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
+ACCELERATOR = os.getenv("ACCELERATOR")
+

 class LitMNIST(LightningModule):
     def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):

@@ -149,7 +152,7 @@ def test_dataloader(self):

 # Initialize a trainer
 trainer = Trainer(
-    accelerator="auto",
+    accelerator=ACCELERATOR,
     # devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs
     max_epochs=3,
     callbacks=[TQDMProgressBar(refresh_rate=20)],
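With this change the Lightning Trainer follows whatever the submitting test injects through the ACCELERATOR environment variable ("cpu" or "gpu"). If the script were run outside the e2e harness the variable would be unset; a hypothetical defensive fallback, not part of this commit, could look like:

    import os

    # Hypothetical fallback: keep Lightning's "auto" device selection when the
    # e2e harness has not injected ACCELERATOR via the Ray job's runtime_env.
    ACCELERATOR = os.getenv("ACCELERATOR", "auto")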

tests/e2e/mnist_raycluster_sdk_aw_kind_test.py

20 additions, 9 deletions

@@ -24,9 +24,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
         self.setup_method()
         create_namespace(self)
         create_kueue_resources(self)
-        self.run_mnist_raycluster_sdk_kind()
+        self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

-    def run_mnist_raycluster_sdk_kind(self):
+    @pytest.mark.nvidia_gpu
+    def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)
+
+    def run_mnist_raycluster_sdk_kind(
+        self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
         ray_image = get_ray_image()

         cluster = Cluster(

@@ -36,11 +45,11 @@ def run_mnist_raycluster_sdk_kind(self):
                 num_workers=1,
                 head_cpus="500m",
                 head_memory=2,
-                min_cpus="500m",
-                max_cpus=1,
-                min_memory=1,
-                max_memory=2,
-                num_gpus=0,
+                worker_cpu_requests="500m",
+                worker_cpu_limits=1,
+                worker_memory_requests=1,
+                worker_memory_limits=4,
+                worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
                 image=ray_image,
                 write_to_file=True,
                 verify_tls=False,

@@ -58,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

         cluster.details()

-        self.assert_jobsubmit_withoutlogin_kind(cluster)
+        self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

     # Assertions

-    def assert_jobsubmit_withoutlogin_kind(self, cluster):
+    def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
         ray_dashboard = cluster.cluster_dashboard_uri()
         client = RayJobClient(address=ray_dashboard, verify=False)

@@ -71,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
             runtime_env={
                 "working_dir": "./tests/e2e/",
                 "pip": "./tests/e2e/mnist_pip_requirements.txt",
+                "env_vars": {"ACCELERATOR": accelerator},
             },
+            entrypoint_num_gpus=number_of_gpus,
         )
         print(f"Submitted job with ID: {submission_id}")
         done = False
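entrypoint_num_gpus reserves GPUs for the job's entrypoint process itself, and runtime_env["env_vars"] is injected into that process's environment, which is how mnist.py picks up ACCELERATOR. A minimal sketch using Ray's plain JobSubmissionClient (the dashboard address is an assumption; the test instead resolves it through the CodeFlare RayJobClient wrapper):

    from ray.job_submission import JobSubmissionClient

    # Assumed dashboard address for illustration only.
    client = JobSubmissionClient("http://127.0.0.1:8265")

    submission_id = client.submit_job(
        # The entrypoint simply prints the variable the runtime_env provides.
        entrypoint="python -c \"import os; print(os.getenv('ACCELERATOR'))\"",
        runtime_env={"env_vars": {"ACCELERATOR": "gpu"}},
        entrypoint_num_gpus=1,  # reserve one GPU for the entrypoint process
    )
    print(submission_id)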

tests/e2e/mnist_raycluster_sdk_kind_test.py

17 additions, 5 deletions

@@ -25,9 +25,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
         self.setup_method()
         create_namespace(self)
         create_kueue_resources(self)
-        self.run_mnist_raycluster_sdk_kind()
+        self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

-    def run_mnist_raycluster_sdk_kind(self):
+    @pytest.mark.nvidia_gpu
+    def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)
+
+    def run_mnist_raycluster_sdk_kind(
+        self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
         ray_image = get_ray_image()

         cluster = Cluster(

@@ -40,7 +49,8 @@ def run_mnist_raycluster_sdk_kind(self):
                 worker_cpu_requests="500m",
                 worker_cpu_limits=1,
                 worker_memory_requests=1,
-                worker_memory_limits=2,
+                worker_memory_limits=4,
+                worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
                 image=ray_image,
                 write_to_file=True,
                 verify_tls=False,

@@ -57,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

         cluster.details()

-        self.assert_jobsubmit_withoutlogin_kind(cluster)
+        self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

     # Assertions

-    def assert_jobsubmit_withoutlogin_kind(self, cluster):
+    def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
         ray_dashboard = cluster.cluster_dashboard_uri()
         client = RayJobClient(address=ray_dashboard, verify=False)

@@ -70,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
             runtime_env={
                 "working_dir": "./tests/e2e/",
                 "pip": "./tests/e2e/mnist_pip_requirements.txt",
+                "env_vars": {"ACCELERATOR": accelerator},
             },
+            entrypoint_num_gpus=number_of_gpus,
         )
         print(f"Submitted job with ID: {submission_id}")
         done = False

tests/e2e/support.py

1 addition, 1 deletion

@@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
                             "resources": [
                                 {"name": "cpu", "nominalQuota": 9},
                                 {"name": "memory", "nominalQuota": "36Gi"},
-                                {"name": "nvidia.com/gpu", "nominalQuota": 0},
+                                {"name": "nvidia.com/gpu", "nominalQuota": 1},
                             ],
                         }
                     ],
