Skip to content

Commit b61409e

Browse files
committed
e2e: gpu: add a basic tensorflow test
Signed-off-by: Tuomas Katila <[email protected]>
1 parent a70651f commit b61409e

File tree

4 files changed

+131
-2
lines changed

4 files changed

+131
-2
lines changed
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
# Pod that runs the TensorFlow training smoke test on an Intel GPU.
# The test script is mounted from the "training-code" ConfigMap
# (generated by the accompanying kustomization) at /code.
apiVersion: v1
kind: Pod
metadata:
  name: training-pod
spec:
  # One-shot job semantics: run once, never restart on completion/failure.
  restartPolicy: Never
  containers:
    - name: testcontainer
      image: intel/intel-extension-for-tensorflow:latest
      imagePullPolicy: IfNotPresent
      command: ["/bin/sh", "-c"]
      args: ["python /code/training.py"]
      resources:
        # Request exactly one Intel GPU via the device plugin resource.
        limits:
          gpu.intel.com/i915: 1
        requests:
          gpu.intel.com/i915: 1
      volumeMounts:
        - mountPath: /code
          name: code
  volumes:
    - configMap:
        name: training-code
      name: code
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
configMapGenerator:
2+
- name: training-code
3+
files:
4+
- training.py
5+
6+
resources:
7+
- deployment.yaml
8+
9+
images:
10+
- name: intel/intel-extension-for-tensorflow
11+
newTag: 1.2.0-gpu
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
# original code from:
# https://github.com/tensorflow/examples/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l02c01_celsius_to_fahrenheit.ipynb
# this is slightly modified to run explicitly with XPU devices

import tensorflow as tf
import intel_extension_for_tensorflow as itex
import numpy as np

print("BACKENDS: ", str(itex.get_backend()))

# Fail fast when TensorFlow cannot see any Intel XPU device.
devs = tf.config.list_physical_devices('XPU')

print(devs)

if not devs:
    raise Exception("No devices found")

with tf.device("/xpu:0"):
    # Training data: Celsius inputs with matching Fahrenheit targets.
    celsius_q = np.array([-40, -10, 0, 8, 15, 22, 38], dtype=float)
    fahrenheit_a = np.array([-40, 14, 32, 46, 59, 72, 100], dtype=float)

    # A single dense unit learns the linear mapping F = 1.8 * C + 32.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=1, input_shape=[1])
    ])

    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(0.1))

    history = model.fit(celsius_q, fahrenheit_a, epochs=500, verbose=False)

    print("model trained")

    test = [100.0]
    p = model.predict(test)

    if len(p) != 1:
        raise Exception("invalid result obj")

    # Convert the size-1 prediction array to a plain float explicitly;
    # implicit size-1 ndarray -> scalar conversion (e.g. in "%f" %
    # formatting) is deprecated since NumPy 1.25 and removed in 2.x.
    prediction = float(p[0])

    # 100 C is exactly 212 F; allow +/- 1 degree of training noise.
    if 211 <= prediction <= 213:
        print("inference ok: %f" % prediction)
    else:
        raise Exception("bad prediction %f" % prediction)

    print("SUCCESS")

test/e2e/gpu/gpu.go

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,10 @@ import (
3535
)
3636

3737
const (
	// Kustomization used to deploy the Intel GPU device plugin.
	kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml"
	// Container name shared by the test pods whose logs are inspected.
	containerName = "testcontainer"
	// Kustomization and pod name for the TensorFlow GPU smoke test.
	tfKustomizationYaml = "deployments/gpu_tensorflow_test/kustomization.yaml"
	tfPodName           = "training-pod"
)
4143

4244
func init() {
@@ -119,4 +121,50 @@ func describe() {
119121
framework.Logf("found card and renderD from the log")
120122
})
121123
})
124+
125+
ginkgo.It("run some tensorflow code on GPU", func() {
126+
ginkgo.By("deploying GPU plugin")
127+
e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath))
128+
129+
ginkgo.By("waiting for GPU plugin's availability")
130+
_, err := e2epod.WaitForPodsWithLabelRunningReady(f.ClientSet, f.Namespace.Name,
131+
labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second)
132+
if err != nil {
133+
e2edebug.DumpAllNamespaceInfo(f.ClientSet, f.Namespace.Name)
134+
e2ekubectl.LogFailedContainers(f.ClientSet, f.Namespace.Name, framework.Logf)
135+
framework.Failf("unable to wait for all pods to be running and ready: %v", err)
136+
}
137+
138+
ginkgo.By("checking if the resource is allocatable")
139+
if err = utils.WaitForNodesWithResource(f.ClientSet, "gpu.intel.com/i915", 30*time.Second); err != nil {
140+
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
141+
}
142+
143+
kustomYaml, err := utils.LocateRepoFile(tfKustomizationYaml)
144+
if err != nil {
145+
framework.Failf("unable to locate %q: %v", tfKustomizationYaml, err)
146+
}
147+
148+
ginkgo.By("submitting demo deployment")
149+
150+
e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomYaml))
151+
152+
ginkgo.By("waiting the pod to finish")
153+
e2epod.NewPodClient(f).WaitForFinish(tfPodName, 240*time.Second)
154+
155+
ginkgo.By("checking log output")
156+
log, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, tfPodName, containerName)
157+
158+
framework.Logf("logs: %s", log)
159+
160+
if err != nil {
161+
framework.Failf("unable to get log from pod: %v", err)
162+
}
163+
164+
if !strings.Contains(log, "SUCCESS") {
165+
framework.Failf("tensorflow execution failed")
166+
}
167+
168+
framework.Logf("tensorflow execution succeeded!")
169+
})
122170
}

0 commit comments

Comments
 (0)