Skip to content

Commit 80ee520

Browse files
committed
gpu: refactor nfdhook functionality to plugin
NFD v0.14+ doesn't support binary NFD hooks by default, so there is a need to move the label creation away from the GPU nfdhook. Move extended resource label creation to plugin, and drop labels that were already marked deprecated (platform_gen, media_version etc.). Drop init-container from deployment files and operator. It is still possible to use an initcontainer, but the default deployments do not support it. Signed-off-by: Tuomas Katila <[email protected]>
1 parent 8d3ef43 commit 80ee520

File tree

18 files changed

+384
-450
lines changed

18 files changed

+384
-450
lines changed

cmd/gpu_nfdhook/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ Table of Contents
1111

1212
## Introduction
1313

14+
***NOTE:*** NFD's binary hook support will be turned off by default in the 0.14 release. The functionality of the GPU NFD hook has been moved into a new NFD rule and into the GPU plugin, and the capability labels are being removed completely. The GPU plugin deployment no longer supports using an init container. This directory will be removed in the future.
15+
1416
This is the [Node Feature Discovery](https://github.com/kubernetes-sigs/node-feature-discovery)
1517
binary hook implementation for the Intel GPUs. The intel-gpu-initcontainer (which
1618
is built with the other images) can be used as part of the gpu-plugin deployment

cmd/gpu_nfdhook/main.go

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,25 +15,14 @@
1515
package main
1616

1717
import (
18-
"os"
19-
20-
"k8s.io/klog/v2"
18+
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler"
2119
)
2220

2321
const (
24-
sysfsDirectory = "/host-sys"
25-
sysfsDRMDirectory = sysfsDirectory + "/class/drm"
26-
debugfsDRIDirectory = sysfsDirectory + "/kernel/debug/dri"
22+
sysfsDirectory = "/host-sys"
23+
sysfsDRMDirectory = sysfsDirectory + "/class/drm"
2724
)
2825

2926
func main() {
30-
l := newLabeler(sysfsDRMDirectory, debugfsDRIDirectory)
31-
32-
err := l.createLabels()
33-
if err != nil {
34-
klog.Errorf("%+v", err)
35-
os.Exit(1)
36-
}
37-
38-
l.printLabels()
27+
labeler.CreateAndPrintLabels(sysfsDRMDirectory)
3928
}

cmd/gpu_plugin/gpu_plugin.go

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2017-2022 Intel Corporation. All Rights Reserved.
1+
// Copyright 2017-2023 Intel Corporation. All Rights Reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -31,13 +31,16 @@ import (
3131
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
3232

3333
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
34+
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler"
3435
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils"
3536
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
3637
)
3738

3839
const (
3940
sysfsDrmDirectory = "/sys/class/drm"
4041
devfsDriDirectory = "/dev/dri"
42+
nfdFeatureDir = "/etc/kubernetes/node-feature-discovery/features.d"
43+
resourceFilename = "intel-gpu-resources.txt"
4144
gpuDeviceRE = `^card[0-9]+$`
4245
controlDeviceRE = `^controlD[0-9]+$`
4346
pciAddressRE = "^[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\\.[0-9a-f]{1}$"
@@ -53,6 +56,9 @@ const (
5356

5457
// Period of device scans.
5558
scanPeriod = 5 * time.Second
59+
60+
// Labeler's max update interval, 5min.
61+
labelerMaxInterval = 5 * 60 * time.Second
5662
)
5763

5864
type cliOptions struct {
@@ -242,8 +248,9 @@ type devicePlugin struct {
242248
controlDeviceReg *regexp.Regexp
243249
pciAddressReg *regexp.Regexp
244250

245-
scanTicker *time.Ticker
246-
scanDone chan bool
251+
scanTicker *time.Ticker
252+
scanDone chan bool
253+
scanResources chan bool
247254

248255
resMan rm.ResourceManager
249256

@@ -270,6 +277,7 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi
270277
scanTicker: time.NewTicker(scanPeriod),
271278
scanDone: make(chan bool, 1), // buffered as we may send to it before Scan starts receiving from it
272279
bypathFound: true,
280+
scanResources: make(chan bool, 1),
273281
}
274282

275283
if options.resourceManagement {
@@ -347,17 +355,26 @@ func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error {
347355
klog.Warning("Failed to scan: ", err)
348356
}
349357

358+
countChanged := false
359+
350360
for name, prev := range previousCount {
351361
count := devTree.DeviceTypeCount(name)
352362
if count != prev {
353363
klog.V(1).Infof("GPU scan update: %d->%d '%s' resources found", prev, count, name)
354364

355365
previousCount[name] = count
366+
367+
countChanged = true
356368
}
357369
}
358370

359371
notifier.Notify(devTree)
360372

373+
// Trigger resource scan if it's enabled.
374+
if dp.resMan != nil && countChanged {
375+
dp.scanResources <- true
376+
}
377+
361378
select {
362379
case <-dp.scanDone:
363380
return nil
@@ -515,6 +532,18 @@ func main() {
515532
klog.V(1).Infof("GPU device plugin started with %s preferred allocation policy", opts.preferredAllocationPolicy)
516533

517534
plugin := newDevicePlugin(prefix+sysfsDrmDirectory, prefix+devfsDriDirectory, opts)
535+
536+
if plugin.options.resourceManagement {
537+
// Start labeler to export labels file for NFD.
538+
nfdFeatureFile := path.Join(nfdFeatureDir, resourceFilename)
539+
540+
klog.V(2).Infof("NFD feature file location: %s", nfdFeatureFile)
541+
542+
// The labeler catches OS signals and calls os.Exit() as soon as it receives one.
543+
go labeler.Run(prefix+sysfsDrmDirectory, nfdFeatureFile,
544+
labelerMaxInterval, plugin.scanResources)
545+
}
546+
518547
manager := dpapi.NewManager(namespace, plugin)
519548
manager.Run()
520549
}

cmd/gpu_plugin/gpu_plugin_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2017-2021 Intel Corporation. All Rights Reserved.
1+
// Copyright 2017-2023 Intel Corporation. All Rights Reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.

cmd/gpu_plugin/render-device.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/sh
22
#
3-
# Copyright 2021 Intel Corporation.
3+
# Copyright 2021-2023 Intel Corporation.
44
#
55
# SPDX-License-Identifier: Apache-2.0
66
#

cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2021 Intel Corporation. All Rights Reserved.
1+
// Copyright 2021-2023 Intel Corporation. All Rights Reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.

cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2021 Intel Corporation. All Rights Reserved.
1+
// Copyright 2021-2023 Intel Corporation. All Rights Reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.

0 commit comments

Comments
 (0)