Skip to content

Commit 504f09e

Browse files
committed
e2e: gpu: add a basic tensorflow test
Signed-off-by: Tuomas Katila <[email protected]>
1 parent 44e5886 commit 504f09e

File tree

5 files changed

+126
-4
lines changed

5 files changed

+126
-4
lines changed

Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,11 +234,12 @@ null :=
234234
space := $(null) #
235235
comma := ,
236236
images_json := $(subst $(space),$(comma),[$(addprefix ",$(addsuffix ",$(images) $(demos))]))
237-
skip_images := "ubuntu-demo-openvino"
237+
skip_images_source := ubuntu-demo-openvino intel-gpu-tensorflow
238+
skip_images := $(subst $(space),$(comma),[$(addprefix ",$(addsuffix ",$(skip_images_source))]))
238239

239240
check-github-actions:
240241
@python3 -c 'import sys, yaml, json; json.dump(yaml.load(sys.stdin, Loader=yaml.SafeLoader), sys.stdout)' < .github/workflows/ci.yaml | \
241-
jq -e '$(images_json) - [$(skip_images)] - .jobs.image.strategy.matrix.image == []' > /dev/null || \
242+
jq -e '$(images_json) - $(skip_images) - .jobs.image.strategy.matrix.image == []' > /dev/null || \
242243
(echo "Make sure all images are listed in .github/workflows/ci.yaml"; exit 1)
243244

244245
.PHONY: all format test lint build images $(cmds) $(images) lock-images vendor pre-pull set-version check-github-actions envtest fixture update-fixture install-tools test-image-base-layer
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: training-code
5+
data:
6+
training.py: |
7+
# original code from:
8+
# https://github.com/tensorflow/examples/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l02c01_celsius_to_fahrenheit.ipynb
9+
# this is slightly modified to run explicitly with XPU devices
10+
11+
import tensorflow as tf
12+
import intel_extension_for_tensorflow as itex
13+
import numpy as np
14+
15+
print("BACKENDS: ", str(itex.get_backend()))
16+
17+
devs = tf.config.list_physical_devices('XPU')
18+
19+
print(devs)
20+
21+
if not devs:
22+
raise Exception("No devices found")
23+
24+
with tf.device("/xpu:0"):
25+
celsius_q = np.array([-40, -10, 0, 8, 15, 22, 38], dtype=float)
26+
fahrenheit_a = np.array([-40, 14, 32, 46, 59, 72, 100], dtype=float)
27+
28+
model = tf.keras.Sequential([
29+
tf.keras.layers.Dense(units=1, input_shape=[1])
30+
])
31+
32+
model.compile(loss='mean_squared_error',
33+
optimizer=tf.keras.optimizers.Adam(0.1))
34+
35+
history = model.fit(celsius_q, fahrenheit_a, epochs=500, verbose=False)
36+
print("model trained")
37+
38+
test = [100.0]
39+
p = model.predict(test)
40+
if len(p) == 1:
41+
prediction = p[0]
42+
43+
if prediction >= 212 and prediction < 213:
44+
print("inference tested")
45+
else:
46+
raise Exception("bad prediction")
47+
48+
print("SUCCESS")
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
name: training-pod
5+
spec:
6+
restartPolicy: Never
7+
containers:
8+
- name: testcontainer
9+
image: intel/intel-extension-for-tensorflow:1.1.0-gpu-flex
10+
imagePullPolicy: IfNotPresent
11+
command: ["/bin/sh", "-c"]
12+
args: ["python /code/training.py"]
13+
resources:
14+
limits:
15+
gpu.intel.com/i915: 1
16+
requests:
17+
gpu.intel.com/i915: 1
18+
volumeMounts:
19+
- mountPath: /code
20+
name: code
21+
volumes:
22+
- configMap:
23+
name: training-code
24+
name: code
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
resources:
2+
- configmap.yaml
3+
- deployment.yaml

test/e2e/gpu/gpu.go

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,10 @@ import (
3535
)
3636

3737
const (
38-
kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml"
39-
containerName = "testcontainer"
38+
kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml"
39+
containerName = "testcontainer"
40+
tfKustomizationYaml = "demo/intel-gpu-tensorflow/kustomization.yaml"
41+
tfPodName = "training-pod"
4042
)
4143

4244
func init() {
@@ -113,4 +115,48 @@ func describe() {
113115

114116
framework.Logf("found card and renderD from the log")
115117
})
118+
119+
ginkgo.It("run some tensorflow code on GPU", func() {
120+
ginkgo.By("deploying GPU plugin")
121+
e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath))
122+
123+
ginkgo.By("waiting for GPU plugin's availability")
124+
_, err := e2epod.WaitForPodsWithLabelRunningReady(f.ClientSet, f.Namespace.Name,
125+
labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second)
126+
if err != nil {
127+
e2edebug.DumpAllNamespaceInfo(f.ClientSet, f.Namespace.Name)
128+
e2ekubectl.LogFailedContainers(f.ClientSet, f.Namespace.Name, framework.Logf)
129+
framework.Failf("unable to wait for all pods to be running and ready: %v", err)
130+
}
131+
132+
ginkgo.By("checking if the resource is allocatable")
133+
if err = utils.WaitForNodesWithResource(f.ClientSet, "gpu.intel.com/i915", 30*time.Second); err != nil {
134+
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
135+
}
136+
137+
kustomYaml, err := utils.LocateRepoFile(tfKustomizationYaml)
138+
if err != nil {
139+
// Report the path that was requested: the value returned alongside a
// non-nil error is not meaningful, so it must not be used in the message.
framework.Failf("unable to locate %q: %v", tfKustomizationYaml, err)
140+
}
141+
142+
ginkgo.By("submitting demo deployment")
143+
144+
e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomYaml))
145+
146+
ginkgo.By("waiting the pod to finish successfully")
147+
e2epod.NewPodClient(f).WaitForSuccess(tfPodName, 240*time.Second)
148+
149+
ginkgo.By("checking log output")
150+
log, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, tfPodName, containerName)
151+
152+
if err != nil {
153+
framework.Failf("unable to get log from pod: %v", err)
154+
}
155+
156+
if !strings.Contains(log, "SUCCESS") {
157+
framework.Failf("tensorflow execution failed")
158+
}
159+
160+
framework.Logf("tensorflow execution succeeded!")
161+
})
116162
}

0 commit comments

Comments
 (0)