e2e: gpu: add a basic tensorflow test

tkatila · tkatila · commit 504f09e8f6ae · 2023-04-11T14:01:10.000+03:00
Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
diff --git a/Makefile b/Makefile
@@ -234,11 +234,12 @@ null  :=
 space := $(null) #
 comma := ,
 images_json := $(subst $(space),$(comma),[$(addprefix ",$(addsuffix ",$(images) $(demos))]))
-skip_images := "ubuntu-demo-openvino"
+skip_images_source := ubuntu-demo-openvino intel-gpu-tensorflow
+skip_images := $(subst $(space),$(comma),[$(addprefix ",$(addsuffix ",$(skip_images_source))]))
 
 check-github-actions:
 	@python3 -c 'import sys, yaml, json; json.dump(yaml.load(sys.stdin, Loader=yaml.SafeLoader), sys.stdout)' < .github/workflows/ci.yaml | \
-	jq -e '$(images_json) - [$(skip_images)] - .jobs.image.strategy.matrix.image == []' > /dev/null || \
+	jq -e '$(images_json) - $(skip_images) - .jobs.image.strategy.matrix.image == []' > /dev/null || \
 	(echo "Make sure all images are listed in .github/workflows/ci.yaml"; exit 1)
 
 .PHONY: all format test lint build images $(cmds) $(images) lock-images vendor pre-pull set-version check-github-actions envtest fixture update-fixture install-tools test-image-base-layer
diff --git a/demo/intel-gpu-tensorflow/configmap.yaml b/demo/intel-gpu-tensorflow/configmap.yaml
@@ -0,0 +1,48 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: training-code
+data:
+  training.py: |
+    # original code from:
+    # https://github.com/tensorflow/examples/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l02c01_celsius_to_fahrenheit.ipynb
+    # this is slightly modified to run explicitly with XPU devices
+
+    import tensorflow as tf
+    import intel_extension_for_tensorflow as itex
+    import numpy as np
+
+    print("BACKENDS: ", str(itex.get_backend()))
+
+    devs = tf.config.list_physical_devices('XPU')
+    print(devs)
+
+    if not devs:
+      raise Exception("No devices found")
+
+    with tf.device("/xpu:0"):
+      celsius_q    = np.array([-40, -10,  0,  8, 15, 22,  38],  dtype=float)
+      fahrenheit_a = np.array([-40,  14, 32, 46, 59, 72, 100],  dtype=float)
+
+      model = tf.keras.Sequential([
+        tf.keras.layers.Dense(units=1, input_shape=[1])
+      ])
+
+      model.compile(loss='mean_squared_error',
+                    optimizer=tf.keras.optimizers.Adam(0.1))
+
+      history = model.fit(celsius_q, fahrenheit_a, epochs=500, verbose=False)
+      print("model trained")
+
+      test = [100.0]
+      p = model.predict(test)
+      # Never reach "SUCCESS" unless exactly one prediction came back and was checked.
+      if len(p) != 1:
+        raise Exception("unexpected number of predictions")
+      prediction = p[0]
+      if prediction >= 212 and prediction < 213:
+        print("inference tested")
+      else:
+        raise Exception("bad prediction")
+
+      print("SUCCESS")
diff --git a/demo/intel-gpu-tensorflow/deployment.yaml b/demo/intel-gpu-tensorflow/deployment.yaml
@@ -0,0 +1,24 @@
+apiVersion: v1
+kind: Pod  # one-shot pod that runs training.py on an Intel GPU
+metadata:
+  name: training-pod  # must match tfPodName in test/e2e/gpu/gpu.go
+spec:
+  restartPolicy: Never  # the e2e test waits for this pod to finish successfully
+  containers:
+  - name: testcontainer  # must match containerName in test/e2e/gpu/gpu.go
+    image: intel/intel-extension-for-tensorflow:1.1.0-gpu-flex
+    imagePullPolicy: IfNotPresent
+    command: ["/bin/sh", "-c"]
+    args: ["python /code/training.py"]  # script comes from the ConfigMap mounted at /code
+    resources:
+      limits:
+        gpu.intel.com/i915: 1
+      requests:
+        gpu.intel.com/i915: 1
+    volumeMounts:
+    - mountPath: /code
+      name: code
+  volumes:
+  - configMap:
+      name: training-code  # ConfigMap defined in configmap.yaml (key training.py)
+    name: code
diff --git a/demo/intel-gpu-tensorflow/kustomization.yaml b/demo/intel-gpu-tensorflow/kustomization.yaml
@@ -0,0 +1,3 @@
+resources:
+  - configmap.yaml
+  - deployment.yaml
diff --git a/test/e2e/gpu/gpu.go b/test/e2e/gpu/gpu.go
@@ -35,8 +35,10 @@ import (
 )
 
 const (
-	kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml"
-	containerName     = "testcontainer"
+	kustomizationYaml   = "deployments/gpu_plugin/kustomization.yaml"
+	containerName       = "testcontainer"
+	tfKustomizationYaml = "demo/intel-gpu-tensorflow/kustomization.yaml"
+	tfPodName           = "training-pod"
 )
 
 func init() {
@@ -113,4 +115,48 @@ func describe() {
 
 		framework.Logf("found card and renderD from the log")
 	})
+
+	ginkgo.It("run some tensorflow code on GPU", func() {
+		// Deploys the GPU plugin, waits for the i915 resource, then runs the
+		// TensorFlow demo pod and requires "SUCCESS" in its container log.
+		ginkgo.By("deploying GPU plugin")
+		e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath))
+
+		ginkgo.By("waiting for GPU plugin's availability")
+		_, err := e2epod.WaitForPodsWithLabelRunningReady(f.ClientSet, f.Namespace.Name,
+			labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second)
+		if err != nil {
+			e2edebug.DumpAllNamespaceInfo(f.ClientSet, f.Namespace.Name)
+			e2ekubectl.LogFailedContainers(f.ClientSet, f.Namespace.Name, framework.Logf)
+			framework.Failf("unable to wait for all pods to be running and ready: %v", err)
+		}
+
+		ginkgo.By("checking if the resource is allocatable")
+		if err = utils.WaitForNodesWithResource(f.ClientSet, "gpu.intel.com/i915", 30*time.Second); err != nil {
+			framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
+		}
+
+		kustomYaml, err := utils.LocateRepoFile(tfKustomizationYaml)
+		if err != nil {
+			// Name the requested path; kustomYaml may be empty when the lookup fails.
+			framework.Failf("unable to locate %q: %v", tfKustomizationYaml, err)
+		}
+
+		ginkgo.By("submitting demo deployment")
+		e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomYaml))
+
+		ginkgo.By("waiting the pod to finish successfully")
+		e2epod.NewPodClient(f).WaitForSuccess(tfPodName, 240*time.Second)
+
+		ginkgo.By("checking log output")
+		log, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, tfPodName, containerName)
+		if err != nil {
+			framework.Failf("unable to get log from pod: %v", err)
+		}
+
+		if !strings.Contains(log, "SUCCESS") {
+			framework.Failf("tensorflow execution failed")
+		}
+		framework.Logf("tensorflow execution succeeded!")
+	})
 }

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+resources:`
	`2`	`+ - configmap.yaml`
	`3`	`+ - deployment.yaml`