From f9221c46fd360ec3932ae3ba421a3bafc3a49fdc Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Thu, 26 Oct 2023 13:17:41 +0300 Subject: [PATCH 1/3] operator: remove one-cr-per-kind limitation Differentiate objects by adding cr names as suffixes Drop kind book keeping and related functions from controllers Signed-off-by: Tuomas Katila --- cmd/operator/README.md | 6 +++ .../v1/dlbdeviceplugin_webhook.go | 9 ---- .../v1/dsadeviceplugin_webhook.go | 8 --- .../v1/fpgadeviceplugin_webhook.go | 9 ---- .../v1/gpudeviceplugin_webhook.go | 8 --- .../v1/iaadeviceplugin_webhook.go | 8 --- .../v1/qatdeviceplugin_webhook.go | 8 --- .../v1/sgxdeviceplugin_webhook.go | 9 ---- pkg/controllers/dlb/controller.go | 11 +--- pkg/controllers/dlb/controller_test.go | 3 +- pkg/controllers/dsa/controller.go | 11 +--- pkg/controllers/dsa/controller_test.go | 3 +- pkg/controllers/fpga/controller.go | 11 +--- pkg/controllers/fpga/controller_test.go | 3 +- pkg/controllers/gpu/controller.go | 26 ++++----- pkg/controllers/gpu/controller_test.go | 9 ++-- pkg/controllers/iaa/controller.go | 10 +--- pkg/controllers/iaa/controller_test.go | 3 +- pkg/controllers/qat/controller.go | 10 +--- pkg/controllers/qat/controller_test.go | 5 +- pkg/controllers/reconciler.go | 53 ++----------------- pkg/controllers/sgx/controller.go | 11 +--- pkg/controllers/sgx/controller_test.go | 3 +- .../dlbdeviceplugin_controller_test.go | 11 ++-- .../dsadeviceplugin_controller_test.go | 12 +++-- .../fpgadeviceplugin_controller_test.go | 11 ++-- .../gpudeviceplugin_controller_test.go | 10 ++-- .../iaadeviceplugin_controller_test.go | 11 ++-- .../qatdeviceplugin_controller_test.go | 11 ++-- .../sgxdeviceplugin_controller_test.go | 11 ++-- 30 files changed, 107 insertions(+), 207 deletions(-) diff --git a/cmd/operator/README.md b/cmd/operator/README.md index 3b4a5c46e..6790e4728 100644 --- a/cmd/operator/README.md +++ b/cmd/operator/README.md @@ -103,6 +103,8 @@ NAME DESIRED READY NODE SELECTOR AGE gpudeviceplugin-sample 1 1 5s ``` +**NOTE:** Intel Device Plugin Operator supports multiple custom resources per Kind (QAT, DSA, etc.). With multiple custom resources and different `nodeSelectors`, it is possible to customize device plugin configuration per node or per group of nodes. See also [known issues](#multiple-custom-resources). + ## Upgrade The upgrade of the deployed plugins can be done by simply installing a new release of the operator. @@ -135,6 +137,10 @@ command line argument multiple times. ## Known issues +### Multiple Custom Resources + +With multiple custom resources, `nodeSelector` has to be carefully set to avoid device plugin DaemonSet getting deployed multiple times on the same node, as operator does not check or prevent this. Multiple plugins managing same resource on a node can cause invalid behavior and/or duplicate device resources on node. + ### Cluster behind a proxy If your cluster operates behind a corporate proxy make sure that the API diff --git a/pkg/apis/deviceplugin/v1/dlbdeviceplugin_webhook.go b/pkg/apis/deviceplugin/v1/dlbdeviceplugin_webhook.go index dd5cfd02f..baff36a75 100644 --- a/pkg/apis/deviceplugin/v1/dlbdeviceplugin_webhook.go +++ b/pkg/apis/deviceplugin/v1/dlbdeviceplugin_webhook.go @@ -15,7 +15,6 @@ package v1 import ( - "github.com/pkg/errors" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -25,10 +24,6 @@ import ( "github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers" ) -const ( - dlbPluginKind = "DlbDevicePlugin" -) - var ( // dlbdevicepluginlog is for logging in this package. dlbdevicepluginlog = logf.Log.WithName("dlbdeviceplugin-resource") @@ -64,10 +59,6 @@ var _ webhook.Validator = &DlbDevicePlugin{} func (r *DlbDevicePlugin) ValidateCreate() (admission.Warnings, error) { dlbdevicepluginlog.Info("validate create", "name", r.Name) - if controllers.GetDevicePluginCount(dlbPluginKind) > 0 { - return nil, errors.Errorf("an instance of %q already exists in the cluster", dlbPluginKind) - } - return nil, r.validatePlugin() } diff --git a/pkg/apis/deviceplugin/v1/dsadeviceplugin_webhook.go b/pkg/apis/deviceplugin/v1/dsadeviceplugin_webhook.go index a5226c66f..59777bf3d 100644 --- a/pkg/apis/deviceplugin/v1/dsadeviceplugin_webhook.go +++ b/pkg/apis/deviceplugin/v1/dsadeviceplugin_webhook.go @@ -25,10 +25,6 @@ import ( "github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers" ) -const ( - dsaPluginKind = "DsaDevicePlugin" -) - var ( // dsadevicepluginlog is for logging in this package. dsadevicepluginlog = logf.Log.WithName("dsadeviceplugin-resource") @@ -64,10 +60,6 @@ var _ webhook.Validator = &DsaDevicePlugin{} func (r *DsaDevicePlugin) ValidateCreate() (admission.Warnings, error) { dsadevicepluginlog.Info("validate create", "name", r.Name) - if controllers.GetDevicePluginCount(dsaPluginKind) > 0 { - return nil, errors.Errorf("an instance of %q already exists in the cluster", dsaPluginKind) - } - return nil, r.validatePlugin() } diff --git a/pkg/apis/deviceplugin/v1/fpgadeviceplugin_webhook.go b/pkg/apis/deviceplugin/v1/fpgadeviceplugin_webhook.go index ba96b7c66..bff8de5df 100644 --- a/pkg/apis/deviceplugin/v1/fpgadeviceplugin_webhook.go +++ b/pkg/apis/deviceplugin/v1/fpgadeviceplugin_webhook.go @@ -15,7 +15,6 @@ package v1 import ( - "github.com/pkg/errors" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -25,10 +24,6 @@ import ( "github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers" ) -const ( - fpgaPluginKind = "FpgaDevicePlugin" -) - var ( // fpgadevicepluginlog is for logging in this package. fpgadevicepluginlog = logf.Log.WithName("fpgadeviceplugin-resource") @@ -68,10 +63,6 @@ var _ webhook.Validator = &FpgaDevicePlugin{} func (r *FpgaDevicePlugin) ValidateCreate() (admission.Warnings, error) { fpgadevicepluginlog.Info("validate create", "name", r.Name) - if controllers.GetDevicePluginCount(fpgaPluginKind) > 0 { - return nil, errors.Errorf("an instance of %q already exists in the cluster", fpgaPluginKind) - } - return nil, r.validatePlugin() } diff --git a/pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go b/pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go index 6a3f55b4a..d306312b3 100644 --- a/pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go +++ b/pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go @@ -25,10 +25,6 @@ import ( "github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers" ) -const ( - gpuPluginKind = "GpuDevicePlugin" -) - var ( // gpudevicepluginlog is for logging in this package. gpudevicepluginlog = logf.Log.WithName("gpudeviceplugin-resource") @@ -64,10 +60,6 @@ var _ webhook.Validator = &GpuDevicePlugin{} func (r *GpuDevicePlugin) ValidateCreate() (admission.Warnings, error) { gpudevicepluginlog.Info("validate create", "name", r.Name) - if controllers.GetDevicePluginCount(gpuPluginKind) > 0 { - return nil, errors.Errorf("an instance of %q already exists in the cluster", gpuPluginKind) - } - return nil, r.validatePlugin() } diff --git a/pkg/apis/deviceplugin/v1/iaadeviceplugin_webhook.go b/pkg/apis/deviceplugin/v1/iaadeviceplugin_webhook.go index f31eb1dc0..aaa20f5ae 100644 --- a/pkg/apis/deviceplugin/v1/iaadeviceplugin_webhook.go +++ b/pkg/apis/deviceplugin/v1/iaadeviceplugin_webhook.go @@ -25,10 +25,6 @@ import ( "github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers" ) -const ( - iaaPluginKind = "IaaDevicePlugin" -) - var ( // iaadevicepluginlog is for logging in this package. iaadevicepluginlog = logf.Log.WithName("iaadeviceplugin-resource") @@ -64,10 +60,6 @@ var _ webhook.Validator = &IaaDevicePlugin{} func (r *IaaDevicePlugin) ValidateCreate() (admission.Warnings, error) { iaadevicepluginlog.Info("validate create", "name", r.Name) - if controllers.GetDevicePluginCount(iaaPluginKind) > 0 { - return nil, errors.Errorf("an instance of %q already exists in the cluster", iaaPluginKind) - } - return nil, r.validatePlugin() } diff --git a/pkg/apis/deviceplugin/v1/qatdeviceplugin_webhook.go b/pkg/apis/deviceplugin/v1/qatdeviceplugin_webhook.go index 7a1e2204c..268da6e9a 100644 --- a/pkg/apis/deviceplugin/v1/qatdeviceplugin_webhook.go +++ b/pkg/apis/deviceplugin/v1/qatdeviceplugin_webhook.go @@ -25,10 +25,6 @@ import ( "github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers" ) -const ( - qatPluginKind = "QatDevicePlugin" -) - var ( // qatdevicepluginlog is for logging in this package. qatdevicepluginlog = logf.Log.WithName("qatdeviceplugin-resource") @@ -64,10 +60,6 @@ var _ webhook.Validator = &QatDevicePlugin{} func (r *QatDevicePlugin) ValidateCreate() (admission.Warnings, error) { qatdevicepluginlog.Info("validate create", "name", r.Name) - if controllers.GetDevicePluginCount(qatPluginKind) > 0 { - return nil, errors.Errorf("an instance of %q already exists in the cluster", qatPluginKind) - } - return nil, r.validatePlugin() } diff --git a/pkg/apis/deviceplugin/v1/sgxdeviceplugin_webhook.go b/pkg/apis/deviceplugin/v1/sgxdeviceplugin_webhook.go index c2c21223a..8bf1e356f 100644 --- a/pkg/apis/deviceplugin/v1/sgxdeviceplugin_webhook.go +++ b/pkg/apis/deviceplugin/v1/sgxdeviceplugin_webhook.go @@ -15,7 +15,6 @@ package v1 import ( - "github.com/pkg/errors" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -25,10 +24,6 @@ import ( "github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers" ) -const ( - sgxPluginKind = "SgxDevicePlugin" -) - var ( // sgxdevicepluginlog is for logging in this package. sgxdevicepluginlog = logf.Log.WithName("sgxdeviceplugin-resource") @@ -64,10 +59,6 @@ var _ webhook.Validator = &SgxDevicePlugin{} func (r *SgxDevicePlugin) ValidateCreate() (admission.Warnings, error) { sgxdevicepluginlog.Info("validate create", "name", r.Name) - if controllers.GetDevicePluginCount(sgxPluginKind) > 0 { - return nil, errors.Errorf("an instance of %q already exists in the cluster", sgxPluginKind) - } - return nil, r.validatePlugin() } diff --git a/pkg/controllers/dlb/controller.go b/pkg/controllers/dlb/controller.go index 5d4f01460..0f2bb1b5c 100644 --- a/pkg/controllers/dlb/controller.go +++ b/pkg/controllers/dlb/controller.go @@ -71,19 +71,12 @@ func (c *controller) Upgrade(ctx context.Context, obj client.Object) bool { return controllers.UpgradeImages(ctx, &dp.Spec.Image, &dp.Spec.InitImage) } -func (c *controller) GetTotalObjectCount(ctx context.Context, clnt client.Client) (int, error) { - var list devicepluginv1.DlbDevicePluginList - if err := clnt.List(ctx, &list); err != nil { - return 0, err - } - - return len(list.Items), nil -} - func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { devicePlugin := rawObj.(*devicepluginv1.DlbDevicePlugin) ds := deployments.DLBPluginDaemonSet() + ds.Name = controllers.SuffixedName(ds.Name, devicePlugin.Name) + if len(devicePlugin.Spec.NodeSelector) > 0 { ds.Spec.Template.Spec.NodeSelector = devicePlugin.Spec.NodeSelector } diff --git a/pkg/controllers/dlb/controller_test.go b/pkg/controllers/dlb/controller_test.go index f0573220d..3bd7cf5bc 100644 --- a/pkg/controllers/dlb/controller_test.go +++ b/pkg/controllers/dlb/controller_test.go @@ -42,7 +42,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet }, ObjectMeta: metav1.ObjectMeta{ Namespace: c.ns, - Name: appLabel, + Name: appLabel + "-" + devicePlugin.Name, Labels: map[string]string{ "app": appLabel, }, @@ -139,6 +139,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet // equal to the expected daemonset. func TestNewDaemonSetDLB(t *testing.T) { plugin := &devicepluginv1.DlbDevicePlugin{} + plugin.Name = "testing" c := &controller{} expected := c.newDaemonSetExpected(plugin) diff --git a/pkg/controllers/dsa/controller.go b/pkg/controllers/dsa/controller.go index cc71a823f..61a452b01 100644 --- a/pkg/controllers/dsa/controller.go +++ b/pkg/controllers/dsa/controller.go @@ -75,15 +75,6 @@ func (c *controller) Upgrade(ctx context.Context, obj client.Object) bool { return controllers.UpgradeImages(ctx, &dp.Spec.Image, &dp.Spec.InitImage) } -func (c *controller) GetTotalObjectCount(ctx context.Context, clnt client.Client) (int, error) { - var list devicepluginv1.DsaDevicePluginList - if err := clnt.List(ctx, &list); err != nil { - return 0, err - } - - return len(list.Items), nil -} - func removeInitContainer(ds *apps.DaemonSet, dp *devicepluginv1.DsaDevicePlugin) { newInitContainers := []v1.Container{} @@ -199,6 +190,8 @@ func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { devicePlugin := rawObj.(*devicepluginv1.DsaDevicePlugin) daemonSet := deployments.DSAPluginDaemonSet() + daemonSet.Name = controllers.SuffixedName(daemonSet.Name, devicePlugin.Name) + if len(devicePlugin.Spec.NodeSelector) > 0 { daemonSet.Spec.Template.Spec.NodeSelector = devicePlugin.Spec.NodeSelector } diff --git a/pkg/controllers/dsa/controller_test.go b/pkg/controllers/dsa/controller_test.go index c168093d6..aa4120bec 100644 --- a/pkg/controllers/dsa/controller_test.go +++ b/pkg/controllers/dsa/controller_test.go @@ -43,7 +43,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet }, ObjectMeta: metav1.ObjectMeta{ Namespace: c.ns, - Name: appLabel, + Name: appLabel + "-" + devicePlugin.Name, Labels: map[string]string{ "app": appLabel, }, @@ -160,6 +160,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet // equal to the expected daemonset. func TestNewDaemonSetDSA(t *testing.T) { plugin := &devicepluginv1.DsaDevicePlugin{} + plugin.Name = "testing" c := &controller{} expected := c.newDaemonSetExpected(plugin) diff --git a/pkg/controllers/fpga/controller.go b/pkg/controllers/fpga/controller.go index 19488714c..afb57f1c2 100644 --- a/pkg/controllers/fpga/controller.go +++ b/pkg/controllers/fpga/controller.go @@ -70,19 +70,12 @@ func (c *controller) Upgrade(ctx context.Context, obj client.Object) bool { return controllers.UpgradeImages(ctx, &dp.Spec.Image, &dp.Spec.InitImage) } -func (c *controller) GetTotalObjectCount(ctx context.Context, clnt client.Client) (int, error) { - var list devicepluginv1.FpgaDevicePluginList - if err := clnt.List(ctx, &list); err != nil { - return 0, err - } - - return len(list.Items), nil -} - func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { devicePlugin := rawObj.(*devicepluginv1.FpgaDevicePlugin) daemonSet := deployments.FPGAPluginDaemonSet() + daemonSet.Name = controllers.SuffixedName(daemonSet.Name, devicePlugin.Name) + if len(devicePlugin.Spec.NodeSelector) > 0 { daemonSet.Spec.Template.Spec.NodeSelector = devicePlugin.Spec.NodeSelector } diff --git a/pkg/controllers/fpga/controller_test.go b/pkg/controllers/fpga/controller_test.go index 76e974f0c..da7e06218 100644 --- a/pkg/controllers/fpga/controller_test.go +++ b/pkg/controllers/fpga/controller_test.go @@ -45,7 +45,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet }, ObjectMeta: metav1.ObjectMeta{ Namespace: c.ns, - Name: appLabel, + Name: appLabel + "-" + devicePlugin.Name, Labels: map[string]string{ "app": appLabel, }, @@ -184,6 +184,7 @@ func TestNewDaemonSetFPGA(t *testing.T) { InitImage: "intel/intel-fpga-initcontainer:devel", }, } + plugin.Name = "testing" expected := c.newDaemonSetExpected(plugin) actual := c.NewDaemonSet(plugin) diff --git a/pkg/controllers/gpu/controller.go b/pkg/controllers/gpu/controller.go index 45bbcf836..d8e6286f6 100644 --- a/pkg/controllers/gpu/controller.go +++ b/pkg/controllers/gpu/controller.go @@ -37,8 +37,9 @@ import ( ) const ( - ownerKey = ".metadata.controller.gpu" - serviceAccountName = "gpu-manager-sa" + ownerKey = ".metadata.controller.gpu" + serviceAccountPrefix = "gpu-manager-sa" + roleBindingPrefix = "gpu-manager-rolebinding" ) var defaultNodeSelector = deployments.GPUPluginDaemonSet().Spec.Template.Spec.NodeSelector @@ -75,21 +76,12 @@ func (c *controller) Upgrade(ctx context.Context, obj client.Object) bool { return controllers.UpgradeImages(ctx, &dp.Spec.Image, &dp.Spec.InitImage) } -func (c *controller) GetTotalObjectCount(ctx context.Context, clnt client.Client) (int, error) { - var list devicepluginv1.GpuDevicePluginList - if err := clnt.List(ctx, &list); err != nil { - return 0, err - } - - return len(list.Items), nil -} - func (c *controller) NewServiceAccount(rawObj client.Object) *v1.ServiceAccount { devicePlugin := rawObj.(*devicepluginv1.GpuDevicePlugin) if devicePlugin.Spec.ResourceManager { sa := v1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-manager-sa", + Name: prefixedName(serviceAccountPrefix, devicePlugin.Name), Namespace: c.ns, }, } @@ -105,13 +97,13 @@ func (c *controller) NewClusterRoleBinding(rawObj client.Object) *rbacv1.Cluster if devicePlugin.Spec.ResourceManager { rb := rbacv1.ClusterRoleBinding{ ObjectMeta: metav1.ObjectMeta{ - Name: "gpu-manager-rolebinding", + Name: prefixedName(roleBindingPrefix, devicePlugin.Name), Namespace: c.ns, }, Subjects: []rbacv1.Subject{ { Kind: "ServiceAccount", - Name: "gpu-manager-sa", + Name: prefixedName(serviceAccountPrefix, devicePlugin.Name), Namespace: c.ns, }, }, @@ -132,6 +124,8 @@ func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { devicePlugin := rawObj.(*devicepluginv1.GpuDevicePlugin) daemonSet := deployments.GPUPluginDaemonSet() + daemonSet.Name = controllers.SuffixedName(daemonSet.Name, devicePlugin.Name) + if len(devicePlugin.Spec.NodeSelector) > 0 { daemonSet.Spec.Template.Spec.NodeSelector = devicePlugin.Spec.NodeSelector } @@ -149,7 +143,7 @@ func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { // add service account if resource manager is enabled if devicePlugin.Spec.ResourceManager { - daemonSet.Spec.Template.Spec.ServiceAccountName = serviceAccountName + daemonSet.Spec.Template.Spec.ServiceAccountName = prefixedName(serviceAccountPrefix, devicePlugin.Name) addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory) addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", false) addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "kubeletcrt", "/var/lib/kubelet/pki/kubelet.crt", v1.HostPathFileOrCreate) @@ -330,7 +324,7 @@ func (c *controller) UpdateDaemonSet(rawObj client.Object, ds *apps.DaemonSet) ( newServiceAccountName := "default" if dp.Spec.ResourceManager { - newServiceAccountName = serviceAccountName + newServiceAccountName = prefixedName(serviceAccountPrefix, dp.Name) } if ds.Spec.Template.Spec.ServiceAccountName != newServiceAccountName { diff --git a/pkg/controllers/gpu/controller_test.go b/pkg/controllers/gpu/controller_test.go index c0b8b8241..b7a69c81c 100644 --- a/pkg/controllers/gpu/controller_test.go +++ b/pkg/controllers/gpu/controller_test.go @@ -45,7 +45,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet }, ObjectMeta: metav1.ObjectMeta{ Namespace: c.ns, - Name: appLabel, + Name: appLabel + "-" + devicePlugin.Name, Labels: map[string]string{ "app": appLabel, }, @@ -146,7 +146,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet // add service account if resource manager is enabled if devicePlugin.Spec.ResourceManager { - daemonSet.Spec.Template.Spec.ServiceAccountName = serviceAccountName + daemonSet.Spec.Template.Spec.ServiceAccountName = serviceAccountPrefix + "-" + devicePlugin.Name addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory) addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", false) @@ -169,7 +169,7 @@ func (c *controller) updateDaemonSetExpected(rawObj client.Object, ds *apps.Daem hadRM := strings.Contains(argString, "-resource-manager") if !hadRM && dp.Spec.ResourceManager { - ds.Spec.Template.Spec.ServiceAccountName = "gpu-manager-sa" + ds.Spec.Template.Spec.ServiceAccountName = serviceAccountPrefix + "-" + dp.Name addVolumeIfMissing(&ds.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory) addVolumeMountIfMissing(&ds.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", false) @@ -220,6 +220,7 @@ func TestNewDamonSetGPU(t *testing.T) { for _, tc := range tcases { plugin := &devicepluginv1.GpuDevicePlugin{} + plugin.Name = "new-gpu-cr-testing" plugin.Spec.ResourceManager = tc.rm t.Run(tc.name, func(t *testing.T) { @@ -252,10 +253,12 @@ func TestUpdateDamonSetGPU(t *testing.T) { for _, tc := range tcases { before := &devicepluginv1.GpuDevicePlugin{} + before.Name = "update-gpu-cr-testing" before.Spec.ResourceManager = tc.rmInitially after := &devicepluginv1.GpuDevicePlugin{} + after.Name = "update-gpu-cr-testing" after.Spec.ResourceManager = !tc.rmInitially diff --git a/pkg/controllers/iaa/controller.go b/pkg/controllers/iaa/controller.go index f0dd5112d..87ec87c88 100644 --- a/pkg/controllers/iaa/controller.go +++ b/pkg/controllers/iaa/controller.go @@ -73,15 +73,6 @@ func (c *controller) Upgrade(ctx context.Context, obj client.Object) bool { return controllers.UpgradeImages(ctx, &dp.Spec.Image, &dp.Spec.InitImage) } -func (c *controller) GetTotalObjectCount(ctx context.Context, clnt client.Client) (int, error) { - var list devicepluginv1.IaaDevicePluginList - if err := clnt.List(ctx, &list); err != nil { - return 0, err - } - - return len(list.Items), nil -} - func removeInitContainer(ds *apps.DaemonSet, dp *devicepluginv1.IaaDevicePlugin) { newInitContainers := []v1.Container{} @@ -198,6 +189,7 @@ func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { devicePlugin := rawObj.(*devicepluginv1.IaaDevicePlugin) daemonSet := deployments.IAAPluginDaemonSet() + daemonSet.Name = controllers.SuffixedName(daemonSet.Name, devicePlugin.Name) if len(devicePlugin.Spec.NodeSelector) > 0 { daemonSet.Spec.Template.Spec.NodeSelector = devicePlugin.Spec.NodeSelector diff --git a/pkg/controllers/iaa/controller_test.go b/pkg/controllers/iaa/controller_test.go index f1cd5ab28..c47eaad92 100644 --- a/pkg/controllers/iaa/controller_test.go +++ b/pkg/controllers/iaa/controller_test.go @@ -43,7 +43,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet }, ObjectMeta: metav1.ObjectMeta{ Namespace: c.ns, - Name: appLabel, + Name: appLabel + "-" + devicePlugin.Name, Labels: map[string]string{ "app": appLabel, }, @@ -160,6 +160,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet // equal to the expected daemonset. func TestNewDaemonSetIAA(t *testing.T) { plugin := &devicepluginv1.IaaDevicePlugin{} + plugin.Name = "testing" c := &controller{} expected := c.newDaemonSetExpected(plugin) diff --git a/pkg/controllers/qat/controller.go b/pkg/controllers/qat/controller.go index d1019dc62..ddacf02a4 100644 --- a/pkg/controllers/qat/controller.go +++ b/pkg/controllers/qat/controller.go @@ -75,21 +75,13 @@ func (c *controller) Upgrade(ctx context.Context, obj client.Object) bool { return controllers.UpgradeImages(ctx, &dp.Spec.Image, &dp.Spec.InitImage) } -func (c *controller) GetTotalObjectCount(ctx context.Context, clnt client.Client) (int, error) { - var list devicepluginv1.QatDevicePluginList - if err := clnt.List(ctx, &list); err != nil { - return 0, err - } - - return len(list.Items), nil -} - func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { devicePlugin := rawObj.(*devicepluginv1.QatDevicePlugin) annotations := devicePlugin.ObjectMeta.DeepCopy().Annotations daemonSet := deployments.QATPluginDaemonSet() + daemonSet.Name = controllers.SuffixedName(daemonSet.Name, devicePlugin.Name) daemonSet.Annotations = annotations daemonSet.Spec.Template.Annotations = annotations diff --git a/pkg/controllers/qat/controller_test.go b/pkg/controllers/qat/controller_test.go index c2eb12bef..5030f73c2 100644 --- a/pkg/controllers/qat/controller_test.go +++ b/pkg/controllers/qat/controller_test.go @@ -26,6 +26,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" devicepluginv1 "github.com/intel/intel-device-plugins-for-kubernetes/pkg/apis/deviceplugin/v1" + "github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers" ) const appLabel = "intel-qat-plugin" @@ -45,7 +46,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet }, ObjectMeta: metav1.ObjectMeta{ Namespace: c.ns, - Name: appLabel, + Name: appLabel + "-" + devicePlugin.Name, Labels: map[string]string{ "app": appLabel, }, @@ -164,6 +165,8 @@ func TestNewDaemonSetQAT(t *testing.T) { c := &controller{} plugin := &devicepluginv1.QatDevicePlugin{} + plugin.Name = "testing" + plugin.Spec.InitImage = "intel/intel-qat-initcontainer:" + controllers.ImageMinVersion.String() expected := c.newDaemonSetExpected(plugin) actual := c.NewDaemonSet(plugin) diff --git a/pkg/controllers/reconciler.go b/pkg/controllers/reconciler.go index 420241157..6d9782bea 100644 --- a/pkg/controllers/reconciler.go +++ b/pkg/controllers/reconciler.go @@ -20,7 +20,6 @@ import ( "os" "path/filepath" "strings" - "sync" "github.com/go-logr/logr" "github.com/google/go-cmp/cmp" @@ -38,39 +37,9 @@ import ( ) var ( - bKeeper = &bookKeeper{} ImageMinVersion = versionutil.MustParseSemantic("0.28.0") ) -func init() { - bKeeper.pluginCounter = make(map[string]int) -} - -//nolint:govet -type bookKeeper struct { - sync.Mutex - pluginCounter map[string]int -} - -func (b *bookKeeper) set(pluginKind string, count int) { - b.Lock() - defer b.Unlock() - - b.pluginCounter[pluginKind] = count -} - -func (b *bookKeeper) count(pluginKind string) int { - b.Lock() - defer b.Unlock() - - return b.pluginCounter[pluginKind] -} - -// GetDevicePluginCount returns number of device plugin CRs registered. -func GetDevicePluginCount(pluginKind string) int { - return bKeeper.count(pluginKind) -} - // +kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;delete // +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;watch;create;delete @@ -102,7 +71,6 @@ func (d *DefaultServiceAccountFactory) NewClusterRoleBinding(rawObj client.Objec type DevicePluginController interface { ServiceAccountFactory CreateEmptyObject() (devicePlugin client.Object) - GetTotalObjectCount(ctx context.Context, client client.Client) (count int, err error) NewDaemonSet(devicePlugin client.Object) *apps.DaemonSet UpdateDaemonSet(client.Object, *apps.DaemonSet) (updated bool) UpdateStatus(client.Object, *apps.DaemonSet, []string) (updated bool, err error) @@ -117,6 +85,11 @@ type reconciler struct { ownerKey string } +// Combine base and suffix with a dash +func SuffixedName(base, suffix string) string { + return base + "-" + suffix +} + // fetchObjects returns the required objects for Reconcile. func (r *reconciler) fetchObjects(ctx context.Context, req ctrl.Request, log logr.Logger) ( *apps.DaemonSetList, *v1.ServiceAccountList, *rbacv1.ClusterRoleBindingList, error) { @@ -212,11 +185,6 @@ func upgrade(ctx context.Context, r *reconciler, devicePlugin client.Object) { func (r *reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := log.FromContext(ctx) - if err := r.updateBookKeeper(ctx); err != nil { - log.Error(err, "unable to total count of device plugins") - return ctrl.Result{}, err - } - childDaemonSets, childServiceAccounts, childClusterRoleBindings, err2 := r.fetchObjects(ctx, req, log) if err2 != nil { return ctrl.Result{}, err2 @@ -407,17 +375,6 @@ func SetupWithManager(mgr ctrl.Manager, controller DevicePluginController, apiGV Complete(r) } -func (r *reconciler) updateBookKeeper(ctx context.Context) error { - count, err := r.controller.GetTotalObjectCount(ctx, r) - if err != nil { - return err - } - - bKeeper.set(r.pluginKind, count) - - return nil -} - func (r *reconciler) createClusterRoleBinding(ctx context.Context, dp client.Object, log logr.Logger) (ctrl.Result, error) { rb := r.controller.NewClusterRoleBinding(dp) if rb == nil { diff --git a/pkg/controllers/sgx/controller.go b/pkg/controllers/sgx/controller.go index e60d14cfe..05f0b6d71 100644 --- a/pkg/controllers/sgx/controller.go +++ b/pkg/controllers/sgx/controller.go @@ -71,15 +71,6 @@ func (c *controller) CreateEmptyObject() client.Object { return &devicepluginv1.SgxDevicePlugin{} } -func (c *controller) GetTotalObjectCount(ctx context.Context, clnt client.Client) (int, error) { - var list devicepluginv1.SgxDevicePluginList - if err := clnt.List(ctx, &list); err != nil { - return 0, err - } - - return len(list.Items), nil -} - func addVolumeIfMissing(spec *v1.PodSpec, name, path string, hpType v1.HostPathType) { for _, vol := range spec.Volumes { if vol.Name == name { @@ -125,6 +116,8 @@ func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { devicePlugin := rawObj.(*devicepluginv1.SgxDevicePlugin) daemonSet := deployments.SGXPluginDaemonSet() + daemonSet.Name = controllers.SuffixedName(daemonSet.Name, devicePlugin.Name) + if len(devicePlugin.Spec.NodeSelector) > 0 { daemonSet.Spec.Template.Spec.NodeSelector = devicePlugin.Spec.NodeSelector } diff --git a/pkg/controllers/sgx/controller_test.go b/pkg/controllers/sgx/controller_test.go index 2281d78da..eb7916ba6 100644 --- a/pkg/controllers/sgx/controller_test.go +++ b/pkg/controllers/sgx/controller_test.go @@ -45,7 +45,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet }, ObjectMeta: metav1.ObjectMeta{ Namespace: c.ns, - Name: appLabel, + Name: appLabel + "-" + devicePlugin.Name, Labels: map[string]string{ "app": appLabel, }, @@ -142,6 +142,7 @@ func TestNewDaemonSetSGX(t *testing.T) { c := &controller{} plugin := &devicepluginv1.SgxDevicePlugin{} + plugin.Name = "testing" expected := c.newDaemonSetExpected(plugin) actual := c.NewDaemonSet(plugin) diff --git a/test/envtest/dlbdeviceplugin_controller_test.go b/test/envtest/dlbdeviceplugin_controller_test.go index 5ad8ad1b4..0e6e50c4a 100644 --- a/test/envtest/dlbdeviceplugin_controller_test.go +++ b/test/envtest/dlbdeviceplugin_controller_test.go @@ -52,6 +52,8 @@ var _ = Describe("DlbDevicePlugin Controller", func() { Spec: spec, } + expectedDsName := "intel-dlb-plugin-dlbdeviceplugin-test" + By("creating DlbDevicePlugin successfully") Expect(k8sClient.Create(context.Background(), toCreate)).Should(Succeed()) time.Sleep(time.Second * 5) @@ -64,7 +66,8 @@ var _ = Describe("DlbDevicePlugin Controller", func() { By("checking DaemonSet is created successfully") ds := &apps.DaemonSet{} - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-dlb-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.Containers[0].Image).To(Equal(spec.Image)) Expect(ds.Spec.Template.Spec.InitContainers).To(HaveLen(1)) Expect(ds.Spec.Template.Spec.InitContainers[0].Image).To(Equal(spec.InitImage)) @@ -90,7 +93,8 @@ var _ = Describe("DlbDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-dlb-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) expectArgs := []string{ "-v", @@ -113,7 +117,8 @@ var _ = Describe("DlbDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated with different values successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-dlb-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.NodeSelector).Should(And(HaveLen(1), HaveKeyWithValue("kubernetes.io/arch", "amd64"))) By("deleting DlbDevicePlugin successfully") diff --git a/test/envtest/dsadeviceplugin_controller_test.go b/test/envtest/dsadeviceplugin_controller_test.go index 93ab191a7..260b6f15d 100644 --- a/test/envtest/dsadeviceplugin_controller_test.go +++ b/test/envtest/dsadeviceplugin_controller_test.go @@ -53,6 +53,8 @@ var _ = Describe("DsaDevicePlugin Controller", func() { Spec: spec, } + expectedDsName := "intel-dsa-plugin-dsadeviceplugin-test" + By("creating DsaDevicePlugin successfully") Expect(k8sClient.Create(context.Background(), toCreate)).Should(Succeed()) time.Sleep(time.Second * 5) @@ -65,7 +67,8 @@ var _ = Describe("DsaDevicePlugin Controller", func() { By("checking DaemonSet is created successfully") ds := &apps.DaemonSet{} - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-dsa-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.Containers[0].Image).To(Equal(spec.Image)) Expect(ds.Spec.Template.Spec.InitContainers).To(HaveLen(1)) Expect(ds.Spec.Template.Spec.InitContainers[0].Image).To(Equal(spec.InitImage)) @@ -95,7 +98,9 @@ var _ = Describe("DsaDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-dsa-plugin"}, ds) + + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) expectArgs := []string{ "-v", @@ -133,7 +138,8 @@ var _ = Describe("DsaDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated with different values successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-dsa-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.InitContainers).To(HaveLen(0)) Expect(ds.Spec.Template.Spec.Volumes).ShouldNot(ContainElement(expectedVolume)) Expect(ds.Spec.Template.Spec.NodeSelector).Should(And(HaveLen(1), HaveKeyWithValue("kubernetes.io/arch", "amd64"))) diff --git a/test/envtest/fpgadeviceplugin_controller_test.go b/test/envtest/fpgadeviceplugin_controller_test.go index a22d68730..fb6b255e4 100644 --- a/test/envtest/fpgadeviceplugin_controller_test.go +++ b/test/envtest/fpgadeviceplugin_controller_test.go @@ -52,6 +52,8 @@ var _ = Describe("FpgaDevicePlugin Controller", func() { Spec: spec, } + expectedDsName := "intel-fpga-plugin-fpgadeviceplugin-test" + By("creating FpgaDevicePlugin successfully") Expect(k8sClient.Create(context.Background(), toCreate)).Should(Succeed()) time.Sleep(time.Second * 5) @@ -64,7 +66,8 @@ var _ = Describe("FpgaDevicePlugin Controller", func() { By("checking DaemonSet is created successfully") ds := &apps.DaemonSet{} - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-fpga-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.Containers[0].Image).To(Equal(spec.Image)) Expect(ds.Spec.Template.Spec.InitContainers).To(HaveLen(1)) Expect(ds.Spec.Template.Spec.InitContainers[0].Image).To(Equal(spec.InitImage)) @@ -92,7 +95,8 @@ var _ = Describe("FpgaDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-fpga-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) expectArgs := []string{ "-v", @@ -114,7 +118,8 @@ var _ = Describe("FpgaDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated with different values successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-fpga-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) time.Sleep(interval) Expect(ds.Spec.Template.Spec.NodeSelector).Should(And(HaveLen(1), HaveKeyWithValue("kubernetes.io/arch", "amd64"))) diff --git a/test/envtest/gpudeviceplugin_controller_test.go b/test/envtest/gpudeviceplugin_controller_test.go index 24b0bb525..4917b6331 100644 --- a/test/envtest/gpudeviceplugin_controller_test.go +++ b/test/envtest/gpudeviceplugin_controller_test.go @@ -52,6 +52,8 @@ var _ = Describe("GpuDevicePlugin Controller", func() { Spec: spec, } + expectedDsName := "intel-gpu-plugin-gpudeviceplugin-test" + By("creating GpuDevicePlugin successfully") Expect(k8sClient.Create(context.Background(), toCreate)).Should(Succeed()) time.Sleep(time.Second * 5) @@ -64,7 +66,8 @@ var _ = Describe("GpuDevicePlugin Controller", func() { By("checking DaemonSet is created successfully") ds := &apps.DaemonSet{} - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-gpu-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.Containers[0].Image).To(Equal(spec.Image)) Expect(ds.Spec.Template.Spec.InitContainers).To(HaveLen(1)) Expect(ds.Spec.Template.Spec.InitContainers[0].Image).To(Equal(spec.InitImage)) @@ -92,7 +95,7 @@ var _ = Describe("GpuDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-gpu-plugin"}, ds) + _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) expectArgs := []string{ "-v", @@ -119,7 +122,8 @@ var _ = Describe("GpuDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated with different values successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-gpu-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.InitContainers).To(HaveLen(0)) Expect(ds.Spec.Template.Spec.NodeSelector).Should(And(HaveLen(1), HaveKeyWithValue("kubernetes.io/arch", "amd64"))) diff --git a/test/envtest/iaadeviceplugin_controller_test.go b/test/envtest/iaadeviceplugin_controller_test.go index caa8e69a1..b0fb796b9 100644 --- a/test/envtest/iaadeviceplugin_controller_test.go +++ b/test/envtest/iaadeviceplugin_controller_test.go @@ -56,6 +56,8 @@ var _ = Describe("IaaDevicePlugin Controller", func() { Spec: spec, } + expectedDsName := "intel-iaa-plugin-iaadeviceplugin-test" + By("creating IaaDevicePlugin successfully") Expect(k8sClient.Create(context.Background(), toCreate)).Should(Succeed()) time.Sleep(time.Second * 5) @@ -68,7 +70,8 @@ var _ = Describe("IaaDevicePlugin Controller", func() { By("checking DaemonSet is created successfully") ds := &apps.DaemonSet{} - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-iaa-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.Containers[0].Image).To(Equal(spec.Image)) Expect(ds.Spec.Template.Spec.InitContainers).To(HaveLen(1)) Expect(ds.Spec.Template.Spec.InitContainers[0].Image).To(Equal(spec.InitImage)) @@ -100,7 +103,8 @@ var _ = Describe("IaaDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-iaa-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) expectArgs := []string{ "-v", @@ -138,7 +142,8 @@ var _ = Describe("IaaDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated with different values successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-iaa-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.InitContainers).To(HaveLen(0)) Expect(ds.Spec.Template.Spec.Volumes).ShouldNot(ContainElement(expectedVolume)) Expect(ds.Spec.Template.Spec.NodeSelector).Should(And(HaveLen(1), HaveKeyWithValue("kubernetes.io/arch", "amd64"))) diff --git a/test/envtest/qatdeviceplugin_controller_test.go b/test/envtest/qatdeviceplugin_controller_test.go index 1d4b6ddd9..c24fb008e 100644 --- a/test/envtest/qatdeviceplugin_controller_test.go +++ b/test/envtest/qatdeviceplugin_controller_test.go @@ -58,6 +58,8 @@ var _ = Describe("QatDevicePlugin Controller", func() { Spec: spec, } + expectedDsName := "intel-qat-plugin-qatdeviceplugin-test" + By("creating QatDevicePlugin successfully") Expect(k8sClient.Create(context.Background(), toCreate)).Should(Succeed()) time.Sleep(time.Second * 5) @@ -70,7 +72,8 @@ var _ = Describe("QatDevicePlugin Controller", func() { By("checking DaemonSet is created successfully") ds := &apps.DaemonSet{} - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-qat-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.Containers[0].Image).To(Equal(spec.Image)) Expect(ds.Spec.Template.Spec.InitContainers).To(HaveLen(1)) Expect(ds.Spec.Template.Spec.InitContainers[0].Image).To(Equal(spec.InitImage)) @@ -120,7 +123,8 @@ var _ = Describe("QatDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-qat-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) expectArgs := []string{ "-v", @@ -165,7 +169,8 @@ var _ = Describe("QatDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated with different values successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-qat-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.InitContainers).To(HaveLen(0)) Expect(ds.Spec.Template.Spec.Volumes).ShouldNot(ContainElement(expectedVolume)) Expect(ds.Spec.Template.Spec.NodeSelector).Should(And(HaveLen(1), HaveKeyWithValue("kubernetes.io/arch", "amd64"))) diff --git a/test/envtest/sgxdeviceplugin_controller_test.go b/test/envtest/sgxdeviceplugin_controller_test.go index 3f186caa8..932563773 100644 --- a/test/envtest/sgxdeviceplugin_controller_test.go +++ b/test/envtest/sgxdeviceplugin_controller_test.go @@ -52,6 +52,8 @@ var _ = Describe("SgxDevicePlugin Controller", func() { Spec: spec, } + expectedDsName := "intel-sgx-plugin-sgxdeviceplugin-test" + By("creating SgxDevicePlugin successfully") Expect(k8sClient.Create(context.Background(), toCreate)).Should(Succeed()) time.Sleep(time.Second * 5) @@ -64,7 +66,8 @@ var _ = Describe("SgxDevicePlugin Controller", func() { By("checking DaemonSet is created successfully") ds := &apps.DaemonSet{} - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-sgx-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.Containers[0].Image).To(Equal(spec.Image)) Expect(ds.Spec.Template.Spec.InitContainers).To(HaveLen(1)) Expect(ds.Spec.Template.Spec.InitContainers[0].Image).To(Equal(spec.InitImage)) @@ -94,7 +97,8 @@ var _ = Describe("SgxDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-sgx-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) expectArgs := []string{ "-v", @@ -121,7 +125,8 @@ var _ = Describe("SgxDevicePlugin Controller", func() { time.Sleep(interval) By("checking DaemonSet is updated with different values successfully") - _ = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: "intel-sgx-plugin"}, ds) + err = k8sClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: expectedDsName}, ds) + Expect(err).To(BeNil()) Expect(ds.Spec.Template.Spec.InitContainers).To(HaveLen(0)) Expect(ds.Spec.Template.Spec.NodeSelector).Should(And(HaveLen(1), HaveKeyWithValue("kubernetes.io/arch", "amd64"))) From 6d9e96856d3fd721c113c0728ea880b50c13de42 Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 1 Nov 2023 11:13:36 +0200 Subject: [PATCH 2/3] operator: modify service accounts and role bindings to be shared Additional objects are shared between device plugin CRs. Once the last CR is removed, the additional objects are also removed. Signed-off-by: Tuomas Katila --- pkg/controllers/gpu/controller.go | 82 ++++---- pkg/controllers/gpu/controller_test.go | 4 +- pkg/controllers/reconciler.go | 255 +++++++++---------------- 3 files changed, 136 insertions(+), 205 deletions(-) diff --git a/pkg/controllers/gpu/controller.go b/pkg/controllers/gpu/controller.go index d8e6286f6..a03ce9634 100644 --- a/pkg/controllers/gpu/controller.go +++ b/pkg/controllers/gpu/controller.go @@ -37,9 +37,9 @@ import ( ) const ( - ownerKey = ".metadata.controller.gpu" - serviceAccountPrefix = "gpu-manager-sa" - roleBindingPrefix = "gpu-manager-rolebinding" + ownerKey = ".metadata.controller.gpu" + serviceAccountName = "gpu-manager-sa" + roleBindingName = "gpu-manager-rolebinding" ) var defaultNodeSelector = deployments.GPUPluginDaemonSet().Spec.Template.Spec.NodeSelector @@ -76,48 +76,54 @@ func (c *controller) Upgrade(ctx context.Context, obj client.Object) bool { return controllers.UpgradeImages(ctx, &dp.Spec.Image, &dp.Spec.InitImage) } -func (c *controller) NewServiceAccount(rawObj client.Object) *v1.ServiceAccount { - devicePlugin := rawObj.(*devicepluginv1.GpuDevicePlugin) - if devicePlugin.Spec.ResourceManager { - sa := v1.ServiceAccount{ - ObjectMeta: metav1.ObjectMeta{ - Name: prefixedName(serviceAccountPrefix, devicePlugin.Name), +func (c *controller) NewSharedServiceAccount() *v1.ServiceAccount { + return &v1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceAccountName, + Namespace: c.ns, + }, + } +} + +func (c *controller) NewSharedClusterRoleBinding() *rbacv1.ClusterRoleBinding { + return &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: roleBindingName, + Namespace: c.ns, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: serviceAccountName, Namespace: c.ns, }, - } - - return &sa + }, + RoleRef: rbacv1.RoleRef{ + Kind: "ClusterRole", + Name: "inteldeviceplugins-gpu-manager-role", + APIGroup: "rbac.authorization.k8s.io", + }, } +} - return nil +func (c *controller) PluginMayRequireSharedObjects() bool { + return true } -func (c *controller) NewClusterRoleBinding(rawObj client.Object) *rbacv1.ClusterRoleBinding { - devicePlugin := rawObj.(*devicepluginv1.GpuDevicePlugin) - if devicePlugin.Spec.ResourceManager { - rb := rbacv1.ClusterRoleBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: prefixedName(roleBindingPrefix, devicePlugin.Name), - Namespace: c.ns, - }, - Subjects: []rbacv1.Subject{ - { - Kind: "ServiceAccount", - Name: prefixedName(serviceAccountPrefix, devicePlugin.Name), - Namespace: c.ns, - }, - }, - RoleRef: rbacv1.RoleRef{ - Kind: "ClusterRole", - Name: "inteldeviceplugins-gpu-manager-role", - APIGroup: "rbac.authorization.k8s.io", - }, - } +func (c *controller) PluginRequiresSharedObjects(ctx context.Context, client client.Client) bool { + var list devicepluginv1.GpuDevicePluginList - return &rb + if err := client.List(ctx, &list); err != nil { + return false } - return nil + for _, cr := range list.Items { + if cr.Spec.ResourceManager { + return true + } + } + + return false } func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { @@ -143,7 +149,7 @@ func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { // add service account if resource manager is enabled if devicePlugin.Spec.ResourceManager { - daemonSet.Spec.Template.Spec.ServiceAccountName = prefixedName(serviceAccountPrefix, devicePlugin.Name) + daemonSet.Spec.Template.Spec.ServiceAccountName = serviceAccountName addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory) addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", false) addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "kubeletcrt", "/var/lib/kubelet/pki/kubelet.crt", v1.HostPathFileOrCreate) @@ -324,7 +330,7 @@ func (c *controller) UpdateDaemonSet(rawObj client.Object, ds *apps.DaemonSet) ( newServiceAccountName := "default" if dp.Spec.ResourceManager { - newServiceAccountName = prefixedName(serviceAccountPrefix, dp.Name) + newServiceAccountName = serviceAccountName } if ds.Spec.Template.Spec.ServiceAccountName != newServiceAccountName { diff --git a/pkg/controllers/gpu/controller_test.go b/pkg/controllers/gpu/controller_test.go index b7a69c81c..ccdb7c30d 100644 --- a/pkg/controllers/gpu/controller_test.go +++ b/pkg/controllers/gpu/controller_test.go @@ -146,7 +146,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet // add service account if resource manager is enabled if devicePlugin.Spec.ResourceManager { - daemonSet.Spec.Template.Spec.ServiceAccountName = serviceAccountPrefix + "-" + devicePlugin.Name + daemonSet.Spec.Template.Spec.ServiceAccountName = serviceAccountName addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory) addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", false) @@ -169,7 +169,7 @@ func (c *controller) updateDaemonSetExpected(rawObj client.Object, ds *apps.Daem hadRM := strings.Contains(argString, "-resource-manager") if !hadRM && dp.Spec.ResourceManager { - ds.Spec.Template.Spec.ServiceAccountName = serviceAccountPrefix + "-" + dp.Name + ds.Spec.Template.Spec.ServiceAccountName = serviceAccountName addVolumeIfMissing(&ds.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory) addVolumeMountIfMissing(&ds.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", false) diff --git a/pkg/controllers/reconciler.go b/pkg/controllers/reconciler.go index 6d9782bea..c4ea0598e 100644 --- a/pkg/controllers/reconciler.go +++ b/pkg/controllers/reconciler.go @@ -40,6 +40,12 @@ var ( ImageMinVersion = versionutil.MustParseSemantic("0.28.0") ) +const ( + sharedObjectsNone = iota + sharedObjectsMayUse + sharedObjectsUsed +) + // +kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;delete // +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;watch;create;delete @@ -49,27 +55,37 @@ var ( // +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,resourceNames=privileged,verbs=use // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,resourceNames=d1c7b6d5.intel.com,verbs=get;update -// ServiceAccountFactory provides functions for creating a service account and related objects +// SharedObjectsFactory provides functions for creating service account and cluster rule binding objects. // Note that the rbac Role can be generated from kubebuilder:rbac comment (some examples above), // which is the reason why this interface does not yet have a NewRole function. -type ServiceAccountFactory interface { - NewServiceAccount(rawObj client.Object) *v1.ServiceAccount - NewClusterRoleBinding(rawObj client.Object) *rbacv1.ClusterRoleBinding +type SharedObjectsFactory interface { + // Indicates if plugin will ever require shared objects. Not all plugins do. + PluginMayRequireSharedObjects() bool + // Indicates if plugin currently require shared objects. + PluginRequiresSharedObjects(ctx context.Context, client client.Client) bool + NewSharedServiceAccount() *v1.ServiceAccount + NewSharedClusterRoleBinding() *rbacv1.ClusterRoleBinding } // DefaultServiceAccountFactory is an empty ServiceAccountFactory. "default" will be used for the service account then. type DefaultServiceAccountFactory struct{} -func (d *DefaultServiceAccountFactory) NewServiceAccount(rawObj client.Object) *v1.ServiceAccount { +func (d *DefaultServiceAccountFactory) NewSharedServiceAccount() *v1.ServiceAccount { return nil } -func (d *DefaultServiceAccountFactory) NewClusterRoleBinding(rawObj client.Object) *rbacv1.ClusterRoleBinding { +func (d *DefaultServiceAccountFactory) NewSharedClusterRoleBinding() *rbacv1.ClusterRoleBinding { return nil } +func (d *DefaultServiceAccountFactory) PluginMayRequireSharedObjects() bool { + return false +} +func (d *DefaultServiceAccountFactory) PluginRequiresSharedObjects(ctx context.Context, client client.Client) bool { + return false +} // DevicePluginController provides functionality for manipulating actual device plugin CRD objects. type DevicePluginController interface { - ServiceAccountFactory + SharedObjectsFactory CreateEmptyObject() (devicePlugin client.Object) NewDaemonSet(devicePlugin client.Object) *apps.DaemonSet UpdateDaemonSet(client.Object, *apps.DaemonSet) (updated bool) @@ -85,58 +101,40 @@ type reconciler struct { ownerKey string } -// Combine base and suffix with a dash +// Combine base and suffix with a dash. func SuffixedName(base, suffix string) string { return base + "-" + suffix } // fetchObjects returns the required objects for Reconcile. func (r *reconciler) fetchObjects(ctx context.Context, req ctrl.Request, log logr.Logger) ( - *apps.DaemonSetList, *v1.ServiceAccountList, *rbacv1.ClusterRoleBindingList, error) { + *apps.DaemonSetList, error) { // Fetch the plugin's DaemonSet. var childDaemonSets apps.DaemonSetList if err := r.List(ctx, &childDaemonSets, client.MatchingFields{r.ownerKey: req.Name}); err != nil { log.Error(err, "unable to list child DaemonSets") - return nil, nil, nil, err + return nil, err } - // Fetch the plugin's ServiceAccount. - var childServiceAccounts v1.ServiceAccountList - if err := r.List(ctx, &childServiceAccounts, client.MatchingFields{r.ownerKey: req.Name}); err != nil { - log.Error(err, "unable to list child ServiceAccounts") - return nil, nil, nil, err - } - - // Fetch the plugin's RoleBinding. - var childClusterRoleBindings rbacv1.ClusterRoleBindingList - if err := r.List(ctx, &childClusterRoleBindings, client.MatchingFields{r.ownerKey: req.Name}); err != nil { - log.Error(err, "unable to list child RoleBindings") - return nil, nil, nil, err - } - - return &childDaemonSets, &childServiceAccounts, &childClusterRoleBindings, nil + return &childDaemonSets, nil } -// createObjects creates required objects for Reconcile. -func (r *reconciler) createObjects(ctx context.Context, - log logr.Logger, - childServiceAccounts *v1.ServiceAccountList, - childClusterRoleBindings *rbacv1.ClusterRoleBindingList, - devicePlugin client.Object) (result ctrl.Result, err error) { - // Create service account for the plugin if it doesn't exist - if len(childServiceAccounts.Items) == 0 { - result, err = r.createServiceAccount(ctx, devicePlugin, log) - if err != nil { - return result, err - } +// createSharedObjects creates required objects for Reconcile. +func (r *reconciler) createSharedObjects(ctx context.Context, log logr.Logger) (result ctrl.Result, err error) { + // Since ServiceAccount and ClusterRoleBinding are can be shared by many, + // it's not owned by the CR. 'SetControllerReference' in the create daemonset function. + sa := r.controller.NewSharedServiceAccount() + + if err := r.Create(ctx, sa); client.IgnoreAlreadyExists(err) != nil { + log.Error(err, "unable to create shared ServiceAccount") + return result, err } - // Create role binding for the plugin if it doesn't exist - if len(childClusterRoleBindings.Items) == 0 { - result, err = r.createClusterRoleBinding(ctx, devicePlugin, log) - if err != nil { - return result, err - } + rb := r.controller.NewSharedClusterRoleBinding() + + if err := r.Create(ctx, rb); client.IgnoreAlreadyExists(err) != nil { + log.Error(err, "unable to create shared ClusterRoleBinding") + return ctrl.Result{}, err } return result, nil @@ -172,7 +170,8 @@ func UpgradeImages(ctx context.Context, image *string, initimage *string) (upgra return upgrade } -func upgrade(ctx context.Context, r *reconciler, devicePlugin client.Object) { +// upgradeDevicePluginImages calls controller's Upgrade function which mostly calls reconcilers' UpgradeImages. +func upgradeDevicePluginImages(ctx context.Context, r *reconciler, devicePlugin client.Object) { if r.controller.Upgrade(ctx, devicePlugin) { if err := r.Update(ctx, devicePlugin); err != nil { log := log.FromContext(ctx) @@ -181,25 +180,52 @@ func upgrade(ctx context.Context, r *reconciler, devicePlugin client.Object) { } } +// determinateSharedObjectReqs Determinates if the installed plugins require shared objects. +// The result is one of three: no, may use and uses currently. +func (r *reconciler) determinateSharedObjectReqs(ctx context.Context, req ctrl.Request) int { + ret := sharedObjectsNone + + if !r.controller.PluginMayRequireSharedObjects() { + return ret + } + + ret = sharedObjectsMayUse + + // Decide from the untyped objects the need to have shared objects. + if r.controller.PluginRequiresSharedObjects(ctx, r.Client) { + ret = sharedObjectsUsed + } + + return ret +} + // Reconcile reconciles a device plugin object. func (r *reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := log.FromContext(ctx) - childDaemonSets, childServiceAccounts, childClusterRoleBindings, err2 := r.fetchObjects(ctx, req, log) + childDaemonSets, err2 := r.fetchObjects(ctx, req, log) if err2 != nil { return ctrl.Result{}, err2 } + sharedObjectsNeed := r.determinateSharedObjectReqs(ctx, req) devicePlugin := r.controller.CreateEmptyObject() + if err := r.Get(ctx, req.NamespacedName, devicePlugin); err != nil { + // Delete shared objects if they are not needed anymore. + r.maybeDeleteSharedObjects(ctx, sharedObjectsNeed, log) + return r.maybeDeleteDaemonSets(ctx, err, childDaemonSets.Items, log) } - if result, err := r.createObjects(ctx, log, childServiceAccounts, childClusterRoleBindings, devicePlugin); err != nil { - return result, err + if sharedObjectsNeed == sharedObjectsUsed { + if result, err := r.createSharedObjects(ctx, log); err != nil { + return result, err + } } - upgrade(ctx, r, devicePlugin) + // Upgrade device plugin object's image, initImage etc. + upgradeDevicePluginImages(ctx, r, devicePlugin) // Create a daemon set for the plugin if it doesn't exist. if len(childDaemonSets.Items) == 0 { @@ -249,8 +275,8 @@ func (r *reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu // Drop redundant daemon sets, role bindings and service accounts, if any. r.maybeDeleteRedundantDaemonSets(ctx, childDaemonSets.Items, log) - r.maybeDeleteRedundantClusterRoleBindings(ctx, devicePlugin, childClusterRoleBindings.Items, log) - r.maybeDeleteRedundantServiceAccounts(ctx, devicePlugin, childServiceAccounts.Items, log) + // Delete shared objects if they are not needed anymore. + r.maybeDeleteSharedObjects(ctx, sharedObjectsNeed, log) return ctrl.Result{}, nil } @@ -275,46 +301,6 @@ func indexDaemonSets(ctx context.Context, mgr ctrl.Manager, apiGVString, pluginK }) } -func indexServiceAccounts(ctx context.Context, mgr ctrl.Manager, apiGVString, pluginKind, ownerKey string) error { - return mgr.GetFieldIndexer().IndexField(ctx, &v1.ServiceAccount{}, ownerKey, - func(rawObj client.Object) []string { - // grab the ServiceAccounts object, extract the owner... - sa := rawObj.(*v1.ServiceAccount) - owner := metav1.GetControllerOf(sa) - if owner == nil { - return nil - } - - // make sure it's a device plugin - if owner.APIVersion != apiGVString || owner.Kind != pluginKind { - return nil - } - - // and if so, return it. - return []string{owner.Name} - }) -} - -func indexRoleBindings(ctx context.Context, mgr ctrl.Manager, apiGVString, pluginKind, ownerKey string) error { - return mgr.GetFieldIndexer().IndexField(ctx, &rbacv1.ClusterRoleBinding{}, ownerKey, - func(rawObj client.Object) []string { - // grab the ServiceAccounts object, extract the owner... - rb := rawObj.(*rbacv1.ClusterRoleBinding) - owner := metav1.GetControllerOf(rb) - if owner == nil { - return nil - } - - // make sure it's a device plugin - if owner.APIVersion != apiGVString || owner.Kind != pluginKind { - return nil - } - - // and if so, return it. - return []string{owner.Name} - }) -} - func indexPods(ctx context.Context, mgr ctrl.Manager, apiGVString, pluginKind, ownerKey string) error { return mgr.GetFieldIndexer().IndexField(ctx, &v1.Pod{}, ownerKey, func(rawObj client.Object) []string { @@ -352,16 +338,6 @@ func SetupWithManager(mgr ctrl.Manager, controller DevicePluginController, apiGV return err } - // Index ServiceAccounts with their owner (e.g. QatDevicePlugin). - if err := indexServiceAccounts(ctx, mgr, apiGVString, pluginKind, ownerKey); err != nil { - return err - } - - // Index RoleBindings with their owner (e.g. QatDevicePlugin). - if err := indexRoleBindings(ctx, mgr, apiGVString, pluginKind, ownerKey); err != nil { - return err - } - // Index Pods with their owner (DaemonSet). if err := indexPods(ctx, mgr, apiGVString, pluginKind, ownerKey); err != nil { return err @@ -370,51 +346,9 @@ func SetupWithManager(mgr ctrl.Manager, controller DevicePluginController, apiGV return ctrl.NewControllerManagedBy(mgr). For(r.controller.CreateEmptyObject()). Owns(&apps.DaemonSet{}). - Owns(&rbacv1.ClusterRoleBinding{}). - Owns(&v1.ServiceAccount{}). Complete(r) } -func (r *reconciler) createClusterRoleBinding(ctx context.Context, dp client.Object, log logr.Logger) (ctrl.Result, error) { - rb := r.controller.NewClusterRoleBinding(dp) - if rb == nil { - // most controllers don't need role bindings - return ctrl.Result{}, nil - } - - if err := ctrl.SetControllerReference(dp.(metav1.Object), rb, r.scheme); err != nil { - log.Error(err, "unable to set controller reference") - return ctrl.Result{}, err - } - - if err := r.Create(ctx, rb); err != nil { - log.Error(err, "unable to create ClusterRoleBinding") - return ctrl.Result{}, err - } - - return ctrl.Result{}, nil -} - -func (r *reconciler) createServiceAccount(ctx context.Context, dp client.Object, log logr.Logger) (ctrl.Result, error) { - sa := r.controller.NewServiceAccount(dp) - if sa == nil { - // most controllers don't need service accounts - return ctrl.Result{}, nil - } - - if err := ctrl.SetControllerReference(dp.(metav1.Object), sa, r.scheme); err != nil { - log.Error(err, "unable to set controller reference") - return ctrl.Result{}, err - } - - if err := r.Create(ctx, sa); err != nil { - log.Error(err, "unable to create ServiceAccount") - return ctrl.Result{}, err - } - - return ctrl.Result{}, nil -} - func (r *reconciler) createDaemonSet(ctx context.Context, dp client.Object, log logr.Logger) (ctrl.Result, error) { ds := r.controller.NewDaemonSet(dp) @@ -466,30 +400,21 @@ func (r *reconciler) maybeDeleteRedundantDaemonSets(ctx context.Context, dsets [ } } -func (r *reconciler) maybeDeleteRedundantServiceAccounts(ctx context.Context, dp client.Object, sas []v1.ServiceAccount, log logr.Logger) { - sa := r.controller.NewServiceAccount(dp) - if sa == nil { - for _, sa := range sas { - saCopy := sa - if err := r.Delete(ctx, &saCopy, client.PropagationPolicy(metav1.DeletePropagationBackground)); client.IgnoreNotFound(err) != nil { - log.Error(err, "unable to delete redundant ServiceAccount", "ServiceAccount", sa) - } else { - log.V(1).Info("deleted redundant ServiceAccount", "ServiceAccount", sa) - } - } +func (r *reconciler) maybeDeleteSharedObjects(ctx context.Context, sharedObjectsNeed int, log logr.Logger) { + // Delete shared objects only if plugin may use but is not currently using any. + if sharedObjectsNeed != sharedObjectsMayUse { + return } -} -func (r *reconciler) maybeDeleteRedundantClusterRoleBindings(ctx context.Context, dp client.Object, rbs []rbacv1.ClusterRoleBinding, log logr.Logger) { - rb := r.controller.NewClusterRoleBinding(dp) - if rb == nil { - for _, rb := range rbs { - rbCopy := rb - if err := r.Delete(ctx, &rbCopy, client.PropagationPolicy(metav1.DeletePropagationBackground)); client.IgnoreNotFound(err) != nil { - log.Error(err, "unable to delete redundant ClusterRoleBinding", "ClusterRoleBinding", rb) - } else { - log.V(1).Info("deleted redundant ClusterRoleBinding", "ClusterRoleBinding", rb) - } - } + sa := r.controller.NewSharedServiceAccount() + + if err := r.Delete(ctx, sa, client.PropagationPolicy(metav1.DeletePropagationBackground)); client.IgnoreNotFound(err) != nil { + log.Error(err, "unable to delete redundant shared ServiceAccount", "ServiceAccount", sa) + } + + crb := r.controller.NewSharedClusterRoleBinding() + + if err := r.Delete(ctx, crb, client.PropagationPolicy(metav1.DeletePropagationBackground)); client.IgnoreNotFound(err) != nil { + log.Error(err, "unable to delete redundant shared ClusterRoleBinding", "ClusterRoleBinding", crb) } } From 4e066900632b5e37704b30ba8f050bc981f6813f Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 1 Nov 2023 09:35:51 +0200 Subject: [PATCH 3/3] operator: gpu: prevent scenario where CRs both enable and disable resource management Signed-off-by: Tuomas Katila --- .../v1/gpudeviceplugin_webhook.go | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go b/pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go index d306312b3..84343e86a 100644 --- a/pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go +++ b/pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go @@ -15,9 +15,12 @@ package v1 import ( + "context" + "github.com/pkg/errors" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" @@ -32,8 +35,12 @@ var ( gpuMinVersion = controllers.ImageMinVersion ) +var cli client.Client + // SetupWebhookWithManager sets up a webhook for GpuDevicePlugin custom resources. func (r *GpuDevicePlugin) SetupWebhookWithManager(mgr ctrl.Manager) error { + cli = mgr.GetClient() + return ctrl.NewWebhookManagedBy(mgr). For(r). Complete() @@ -77,6 +84,30 @@ func (r *GpuDevicePlugin) ValidateDelete() (admission.Warnings, error) { return nil, nil } +func (r *GpuDevicePlugin) crossCheckResourceManagement() bool { + ctx := context.Background() + gpuCrs := GpuDevicePluginList{} + + if err := cli.List(ctx, &gpuCrs); err != nil { + gpudevicepluginlog.Info("unable to list GPU CRs") + + return false + } + + for _, cr := range gpuCrs.Items { + // Ignore itself. + if cr.Name == r.Name { + continue + } + + if cr.Spec.ResourceManager != r.Spec.ResourceManager { + return false + } + } + + return true +} + func (r *GpuDevicePlugin) validatePlugin() error { if r.Spec.SharedDevNum == 1 && r.Spec.PreferredAllocationPolicy != "none" { return errors.Errorf("PreferredAllocationPolicy is valid only when setting sharedDevNum > 1") @@ -86,5 +117,9 @@ func (r *GpuDevicePlugin) validatePlugin() error { return errors.Errorf("resourceManager is valid only when setting sharedDevNum > 1") } + if !r.crossCheckResourceManagement() { + return errors.Errorf("All GPU CRs must be with or without resource management") + } + return validatePluginImage(r.Spec.Image, "intel-gpu-plugin", gpuMinVersion) }