Skip to content

Commit f8eeeef

Browse files
authored
Merge pull request #1579 from tkatila/operator/multiple-crs
operator: remove one-cr-per-kind limitation
2 parents 29c8ed9 + 4e06690 commit f8eeeef

30 files changed

+258
-392
lines changed

cmd/operator/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ NAME DESIRED READY NODE SELECTOR AGE
103103
gpudeviceplugin-sample 1 1 5s
104104
```
105105

106+
**NOTE:** The Intel Device Plugin Operator supports multiple custom resources per Kind (QAT, DSA, etc.). By giving each custom resource a different `nodeSelector`, it is possible to customize the device plugin configuration per node or per group of nodes. See also [known issues](#multiple-custom-resources).
107+
106108
## Upgrade
107109

108110
The upgrade of the deployed plugins can be done by simply installing a new release of the operator.
@@ -135,6 +137,10 @@ command line argument multiple times.
135137

136138
## Known issues
137139

140+
### Multiple Custom Resources
141+
142+
With multiple custom resources, `nodeSelector` has to be set carefully to avoid the device plugin DaemonSet being deployed multiple times on the same node, as the operator does not check for or prevent this. Multiple plugins managing the same resource on a node can cause invalid behavior and/or duplicate device resources on that node.
143+
138144
### Cluster behind a proxy
139145

140146
If your cluster operates behind a corporate proxy make sure that the API

pkg/apis/deviceplugin/v1/dlbdeviceplugin_webhook.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
package v1
1616

1717
import (
18-
"github.com/pkg/errors"
1918
"k8s.io/apimachinery/pkg/runtime"
2019
ctrl "sigs.k8s.io/controller-runtime"
2120
logf "sigs.k8s.io/controller-runtime/pkg/log"
@@ -25,10 +24,6 @@ import (
2524
"github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers"
2625
)
2726

28-
const (
29-
dlbPluginKind = "DlbDevicePlugin"
30-
)
31-
3227
var (
3328
// dlbdevicepluginlog is for logging in this package.
3429
dlbdevicepluginlog = logf.Log.WithName("dlbdeviceplugin-resource")
@@ -64,10 +59,6 @@ var _ webhook.Validator = &DlbDevicePlugin{}
6459
func (r *DlbDevicePlugin) ValidateCreate() (admission.Warnings, error) {
6560
dlbdevicepluginlog.Info("validate create", "name", r.Name)
6661

67-
if controllers.GetDevicePluginCount(dlbPluginKind) > 0 {
68-
return nil, errors.Errorf("an instance of %q already exists in the cluster", dlbPluginKind)
69-
}
70-
7162
return nil, r.validatePlugin()
7263
}
7364

pkg/apis/deviceplugin/v1/dsadeviceplugin_webhook.go

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,6 @@ import (
2525
"github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers"
2626
)
2727

28-
const (
29-
dsaPluginKind = "DsaDevicePlugin"
30-
)
31-
3228
var (
3329
// dsadevicepluginlog is for logging in this package.
3430
dsadevicepluginlog = logf.Log.WithName("dsadeviceplugin-resource")
@@ -64,10 +60,6 @@ var _ webhook.Validator = &DsaDevicePlugin{}
6460
func (r *DsaDevicePlugin) ValidateCreate() (admission.Warnings, error) {
6561
dsadevicepluginlog.Info("validate create", "name", r.Name)
6662

67-
if controllers.GetDevicePluginCount(dsaPluginKind) > 0 {
68-
return nil, errors.Errorf("an instance of %q already exists in the cluster", dsaPluginKind)
69-
}
70-
7163
return nil, r.validatePlugin()
7264
}
7365

pkg/apis/deviceplugin/v1/fpgadeviceplugin_webhook.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
package v1
1616

1717
import (
18-
"github.com/pkg/errors"
1918
"k8s.io/apimachinery/pkg/runtime"
2019
ctrl "sigs.k8s.io/controller-runtime"
2120
logf "sigs.k8s.io/controller-runtime/pkg/log"
@@ -25,10 +24,6 @@ import (
2524
"github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers"
2625
)
2726

28-
const (
29-
fpgaPluginKind = "FpgaDevicePlugin"
30-
)
31-
3227
var (
3328
// fpgadevicepluginlog is for logging in this package.
3429
fpgadevicepluginlog = logf.Log.WithName("fpgadeviceplugin-resource")
@@ -68,10 +63,6 @@ var _ webhook.Validator = &FpgaDevicePlugin{}
6863
func (r *FpgaDevicePlugin) ValidateCreate() (admission.Warnings, error) {
6964
fpgadevicepluginlog.Info("validate create", "name", r.Name)
7065

71-
if controllers.GetDevicePluginCount(fpgaPluginKind) > 0 {
72-
return nil, errors.Errorf("an instance of %q already exists in the cluster", fpgaPluginKind)
73-
}
74-
7566
return nil, r.validatePlugin()
7667
}
7768

pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,29 +15,32 @@
1515
package v1
1616

1717
import (
18+
"context"
19+
1820
"github.com/pkg/errors"
1921
"k8s.io/apimachinery/pkg/runtime"
2022
ctrl "sigs.k8s.io/controller-runtime"
23+
"sigs.k8s.io/controller-runtime/pkg/client"
2124
logf "sigs.k8s.io/controller-runtime/pkg/log"
2225
"sigs.k8s.io/controller-runtime/pkg/webhook"
2326
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
2427

2528
"github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers"
2629
)
2730

28-
const (
29-
gpuPluginKind = "GpuDevicePlugin"
30-
)
31-
3231
var (
3332
// gpudevicepluginlog is for logging in this package.
3433
gpudevicepluginlog = logf.Log.WithName("gpudeviceplugin-resource")
3534

3635
gpuMinVersion = controllers.ImageMinVersion
3736
)
3837

38+
var cli client.Client
39+
3940
// SetupWebhookWithManager sets up a webhook for GpuDevicePlugin custom resources.
4041
func (r *GpuDevicePlugin) SetupWebhookWithManager(mgr ctrl.Manager) error {
42+
cli = mgr.GetClient()
43+
4144
return ctrl.NewWebhookManagedBy(mgr).
4245
For(r).
4346
Complete()
@@ -64,10 +67,6 @@ var _ webhook.Validator = &GpuDevicePlugin{}
6467
func (r *GpuDevicePlugin) ValidateCreate() (admission.Warnings, error) {
6568
gpudevicepluginlog.Info("validate create", "name", r.Name)
6669

67-
if controllers.GetDevicePluginCount(gpuPluginKind) > 0 {
68-
return nil, errors.Errorf("an instance of %q already exists in the cluster", gpuPluginKind)
69-
}
70-
7170
return nil, r.validatePlugin()
7271
}
7372

@@ -85,6 +84,30 @@ func (r *GpuDevicePlugin) ValidateDelete() (admission.Warnings, error) {
8584
return nil, nil
8685
}
8786

87+
func (r *GpuDevicePlugin) crossCheckResourceManagement() bool {
88+
ctx := context.Background()
89+
gpuCrs := GpuDevicePluginList{}
90+
91+
if err := cli.List(ctx, &gpuCrs); err != nil {
92+
gpudevicepluginlog.Info("unable to list GPU CRs")
93+
94+
return false
95+
}
96+
97+
for _, cr := range gpuCrs.Items {
98+
// Ignore itself.
99+
if cr.Name == r.Name {
100+
continue
101+
}
102+
103+
if cr.Spec.ResourceManager != r.Spec.ResourceManager {
104+
return false
105+
}
106+
}
107+
108+
return true
109+
}
110+
88111
func (r *GpuDevicePlugin) validatePlugin() error {
89112
if r.Spec.SharedDevNum == 1 && r.Spec.PreferredAllocationPolicy != "none" {
90113
return errors.Errorf("PreferredAllocationPolicy is valid only when setting sharedDevNum > 1")
@@ -94,5 +117,9 @@ func (r *GpuDevicePlugin) validatePlugin() error {
94117
return errors.Errorf("resourceManager is valid only when setting sharedDevNum > 1")
95118
}
96119

120+
if !r.crossCheckResourceManagement() {
121+
return errors.Errorf("All GPU CRs must be with or without resource management")
122+
}
123+
97124
return validatePluginImage(r.Spec.Image, "intel-gpu-plugin", gpuMinVersion)
98125
}

pkg/apis/deviceplugin/v1/iaadeviceplugin_webhook.go

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,6 @@ import (
2525
"github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers"
2626
)
2727

28-
const (
29-
iaaPluginKind = "IaaDevicePlugin"
30-
)
31-
3228
var (
3329
// iaadevicepluginlog is for logging in this package.
3430
iaadevicepluginlog = logf.Log.WithName("iaadeviceplugin-resource")
@@ -64,10 +60,6 @@ var _ webhook.Validator = &IaaDevicePlugin{}
6460
func (r *IaaDevicePlugin) ValidateCreate() (admission.Warnings, error) {
6561
iaadevicepluginlog.Info("validate create", "name", r.Name)
6662

67-
if controllers.GetDevicePluginCount(iaaPluginKind) > 0 {
68-
return nil, errors.Errorf("an instance of %q already exists in the cluster", iaaPluginKind)
69-
}
70-
7163
return nil, r.validatePlugin()
7264
}
7365

pkg/apis/deviceplugin/v1/qatdeviceplugin_webhook.go

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,6 @@ import (
2525
"github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers"
2626
)
2727

28-
const (
29-
qatPluginKind = "QatDevicePlugin"
30-
)
31-
3228
var (
3329
// qatdevicepluginlog is for logging in this package.
3430
qatdevicepluginlog = logf.Log.WithName("qatdeviceplugin-resource")
@@ -64,10 +60,6 @@ var _ webhook.Validator = &QatDevicePlugin{}
6460
func (r *QatDevicePlugin) ValidateCreate() (admission.Warnings, error) {
6561
qatdevicepluginlog.Info("validate create", "name", r.Name)
6662

67-
if controllers.GetDevicePluginCount(qatPluginKind) > 0 {
68-
return nil, errors.Errorf("an instance of %q already exists in the cluster", qatPluginKind)
69-
}
70-
7163
return nil, r.validatePlugin()
7264
}
7365

pkg/apis/deviceplugin/v1/sgxdeviceplugin_webhook.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
package v1
1616

1717
import (
18-
"github.com/pkg/errors"
1918
"k8s.io/apimachinery/pkg/runtime"
2019
ctrl "sigs.k8s.io/controller-runtime"
2120
logf "sigs.k8s.io/controller-runtime/pkg/log"
@@ -25,10 +24,6 @@ import (
2524
"github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers"
2625
)
2726

28-
const (
29-
sgxPluginKind = "SgxDevicePlugin"
30-
)
31-
3227
var (
3328
// sgxdevicepluginlog is for logging in this package.
3429
sgxdevicepluginlog = logf.Log.WithName("sgxdeviceplugin-resource")
@@ -64,10 +59,6 @@ var _ webhook.Validator = &SgxDevicePlugin{}
6459
func (r *SgxDevicePlugin) ValidateCreate() (admission.Warnings, error) {
6560
sgxdevicepluginlog.Info("validate create", "name", r.Name)
6661

67-
if controllers.GetDevicePluginCount(sgxPluginKind) > 0 {
68-
return nil, errors.Errorf("an instance of %q already exists in the cluster", sgxPluginKind)
69-
}
70-
7162
return nil, r.validatePlugin()
7263
}
7364

pkg/controllers/dlb/controller.go

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,19 +71,12 @@ func (c *controller) Upgrade(ctx context.Context, obj client.Object) bool {
7171
return controllers.UpgradeImages(ctx, &dp.Spec.Image, &dp.Spec.InitImage)
7272
}
7373

74-
func (c *controller) GetTotalObjectCount(ctx context.Context, clnt client.Client) (int, error) {
75-
var list devicepluginv1.DlbDevicePluginList
76-
if err := clnt.List(ctx, &list); err != nil {
77-
return 0, err
78-
}
79-
80-
return len(list.Items), nil
81-
}
82-
8374
func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet {
8475
devicePlugin := rawObj.(*devicepluginv1.DlbDevicePlugin)
8576

8677
ds := deployments.DLBPluginDaemonSet()
78+
ds.Name = controllers.SuffixedName(ds.Name, devicePlugin.Name)
79+
8780
if len(devicePlugin.Spec.NodeSelector) > 0 {
8881
ds.Spec.Template.Spec.NodeSelector = devicePlugin.Spec.NodeSelector
8982
}

pkg/controllers/dlb/controller_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet
4242
},
4343
ObjectMeta: metav1.ObjectMeta{
4444
Namespace: c.ns,
45-
Name: appLabel,
45+
Name: appLabel + "-" + devicePlugin.Name,
4646
Labels: map[string]string{
4747
"app": appLabel,
4848
},
@@ -139,6 +139,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet
139139
// equal to the expected daemonset.
140140
func TestNewDaemonSetDLB(t *testing.T) {
141141
plugin := &devicepluginv1.DlbDevicePlugin{}
142+
plugin.Name = "testing"
142143
c := &controller{}
143144

144145
expected := c.newDaemonSetExpected(plugin)

pkg/controllers/dsa/controller.go

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -75,15 +75,6 @@ func (c *controller) Upgrade(ctx context.Context, obj client.Object) bool {
7575
return controllers.UpgradeImages(ctx, &dp.Spec.Image, &dp.Spec.InitImage)
7676
}
7777

78-
func (c *controller) GetTotalObjectCount(ctx context.Context, clnt client.Client) (int, error) {
79-
var list devicepluginv1.DsaDevicePluginList
80-
if err := clnt.List(ctx, &list); err != nil {
81-
return 0, err
82-
}
83-
84-
return len(list.Items), nil
85-
}
86-
8778
func removeInitContainer(ds *apps.DaemonSet, dp *devicepluginv1.DsaDevicePlugin) {
8879
newInitContainers := []v1.Container{}
8980

@@ -199,6 +190,8 @@ func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet {
199190
devicePlugin := rawObj.(*devicepluginv1.DsaDevicePlugin)
200191

201192
daemonSet := deployments.DSAPluginDaemonSet()
193+
daemonSet.Name = controllers.SuffixedName(daemonSet.Name, devicePlugin.Name)
194+
202195
if len(devicePlugin.Spec.NodeSelector) > 0 {
203196
daemonSet.Spec.Template.Spec.NodeSelector = devicePlugin.Spec.NodeSelector
204197
}

pkg/controllers/dsa/controller_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet
4343
},
4444
ObjectMeta: metav1.ObjectMeta{
4545
Namespace: c.ns,
46-
Name: appLabel,
46+
Name: appLabel + "-" + devicePlugin.Name,
4747
Labels: map[string]string{
4848
"app": appLabel,
4949
},
@@ -160,6 +160,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet
160160
// equal to the expected daemonset.
161161
func TestNewDaemonSetDSA(t *testing.T) {
162162
plugin := &devicepluginv1.DsaDevicePlugin{}
163+
plugin.Name = "testing"
163164
c := &controller{}
164165

165166
expected := c.newDaemonSetExpected(plugin)

pkg/controllers/fpga/controller.go

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -70,19 +70,12 @@ func (c *controller) Upgrade(ctx context.Context, obj client.Object) bool {
7070
return controllers.UpgradeImages(ctx, &dp.Spec.Image, &dp.Spec.InitImage)
7171
}
7272

73-
func (c *controller) GetTotalObjectCount(ctx context.Context, clnt client.Client) (int, error) {
74-
var list devicepluginv1.FpgaDevicePluginList
75-
if err := clnt.List(ctx, &list); err != nil {
76-
return 0, err
77-
}
78-
79-
return len(list.Items), nil
80-
}
81-
8273
func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet {
8374
devicePlugin := rawObj.(*devicepluginv1.FpgaDevicePlugin)
8475

8576
daemonSet := deployments.FPGAPluginDaemonSet()
77+
daemonSet.Name = controllers.SuffixedName(daemonSet.Name, devicePlugin.Name)
78+
8679
if len(devicePlugin.Spec.NodeSelector) > 0 {
8780
daemonSet.Spec.Template.Spec.NodeSelector = devicePlugin.Spec.NodeSelector
8881
}

pkg/controllers/fpga/controller_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet
4545
},
4646
ObjectMeta: metav1.ObjectMeta{
4747
Namespace: c.ns,
48-
Name: appLabel,
48+
Name: appLabel + "-" + devicePlugin.Name,
4949
Labels: map[string]string{
5050
"app": appLabel,
5151
},
@@ -184,6 +184,7 @@ func TestNewDaemonSetFPGA(t *testing.T) {
184184
InitImage: "intel/intel-fpga-initcontainer:devel",
185185
},
186186
}
187+
plugin.Name = "testing"
187188

188189
expected := c.newDaemonSetExpected(plugin)
189190
actual := c.NewDaemonSet(plugin)

0 commit comments

Comments
 (0)