Commit d09f59e

Merge pull request #2335 from dlipovetsky/controlplane-scale
✨ Adds kubeadm control plane scale up/down
2 parents 497be37 + 386d3bf commit d09f59e

5 files changed: +387 additions, −383 deletions

controlplane/kubeadm/controllers/kubeadm_control_plane_controller.go

Lines changed: 101 additions & 34 deletions
@@ -19,6 +19,7 @@ package controllers
 import (
     "context"
     "fmt"
+    "sort"
     "strings"
     "time"

@@ -59,8 +60,18 @@ const (
     // DeleteRequeueAfter is how long to wait before checking again to see if
     // all control plane machines have been deleted.
     DeleteRequeueAfter = 30 * time.Second
+
+    // HealthCheckFailedRequeueAfter is how long to wait before trying to scale
+    // up/down if some target cluster health check has failed
+    HealthCheckFailedRequeueAfter = 20 * time.Second
 )

+type managementCluster interface {
+    GetMachinesForCluster(ctx context.Context, cluster types.NamespacedName, filters ...func(machine *clusterv1.Machine) bool) ([]*clusterv1.Machine, error)
+    TargetClusterControlPlaneIsHealthy(ctx context.Context, clusterKey types.NamespacedName, controlPlaneName string) error
+    TargetClusterEtcdIsHealthy(ctx context.Context, clusterKey types.NamespacedName, controlPlaneName string) error
+}
+
 // +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch
 // +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;patch
 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io;bootstrap.cluster.x-k8s.io;controlplane.cluster.x-k8s.io,resources=*,verbs=get;list;watch;create;update;patch;delete
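
The new managementCluster interface narrows what the reconciler needs from internal.ManagementCluster down to the three calls the scale paths use, which lets tests substitute a fake instead of a connection to a real workload cluster. A minimal sketch of such a fake, reusing the file's existing imports; the type and field names (fakeManagementCluster, machines, controlPlaneHealthy, etcdHealthy) are hypothetical and not part of this diff:

type fakeManagementCluster struct {
    machines            []*clusterv1.Machine
    controlPlaneHealthy bool
    etcdHealthy         bool
}

// GetMachinesForCluster returns the canned machines; a fuller fake would also apply the filters.
func (f *fakeManagementCluster) GetMachinesForCluster(_ context.Context, _ types.NamespacedName, _ ...func(machine *clusterv1.Machine) bool) ([]*clusterv1.Machine, error) {
    return f.machines, nil
}

func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ types.NamespacedName, _ string) error {
    if !f.controlPlaneHealthy {
        return errors.New("control plane is not healthy")
    }
    return nil
}

func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ types.NamespacedName, _ string) error {
    if !f.etcdHealthy {
        return errors.New("etcd cluster is not healthy")
    }
    return nil
}
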
@@ -77,7 +88,7 @@ type KubeadmControlPlaneReconciler struct {

     remoteClientGetter remote.ClusterClientGetter

-    managementCluster *internal.ManagementCluster
+    managementCluster managementCluster
 }

 func (r *KubeadmControlPlaneReconciler) SetupWithManager(mgr ctrl.Manager, options controller.Options) error {
@@ -210,7 +221,6 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
     if requeueErr, ok := errors.Cause(err).(capierrors.HasRequeueAfterError); ok {
         logger.Error(err, "required certificates not found, requeueing")
         return ctrl.Result{
-            Requeue: true,
             RequeueAfter: requeueErr.GetRequeueAfter(),
         }, nil
     }
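
Dropping Requeue: true here (and in reconcileDelete further down) does not change behavior: controller-runtime requeues a request whenever RequeueAfter is non-zero, so the explicit flag was redundant. A minimal sketch of the resulting pattern, with a hypothetical helper name:

// requeueAfter builds a delayed-requeue result; a non-zero RequeueAfter alone
// is enough for controller-runtime to schedule the request again after d.
func requeueAfter(d time.Duration) ctrl.Result {
    return ctrl.Result{RequeueAfter: d}
}
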
@@ -251,29 +261,36 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
     switch {
     // We are creating the first replica
     case numMachines < desiredReplicas && numMachines == 0:
-        // create new Machine w/ init
-        logger.Info("Scaling to 1", "Desired Replicas", desiredReplicas, "Existing Replicas", numMachines)
-        if err := r.initializeControlPlane(ctx, cluster, kcp); err != nil {
-            logger.Error(err, "Failed to initialize the Control Plane")
-            r.recorder.Eventf(kcp, corev1.EventTypeWarning, "FailedInitialization", "Failed to initialize the control plane: %v", err)
-            return ctrl.Result{}, err
+        // Create new Machine w/ init
+        logger.Info("Initializing control plane", "Desired", desiredReplicas, "Existing", numMachines)
+        result, err := r.initializeControlPlane(ctx, cluster, kcp)
+        if err != nil {
+            logger.Error(err, "Failed to initialize control plane")
+            r.recorder.Eventf(kcp, corev1.EventTypeWarning, "FailedInitialization", "Failed to initialize cluster %s/%s control plane: %v", cluster.Namespace, cluster.Name, err)
         }
-    // scaling up
+        // TODO: return the error if it is unexpected and should cause an immediate requeue
+        return result, nil
+    // We are scaling up
     case numMachines < desiredReplicas && numMachines > 0:
-        // create a new Machine w/ join
-        logger.Info("Scaling up", "Desired Replicas", desiredReplicas, "Existing Replicas", numMachines)
-        wantMachines := desiredReplicas - numMachines
-        if err := r.scaleUpControlPlane(ctx, cluster, kcp, wantMachines); err != nil {
-            logger.Error(err, "Failed to scale up the Control Plane")
-            r.recorder.Eventf(kcp, corev1.EventTypeWarning, "FailedScaleUp", "Failed to scale up the control plane: %v", err)
-            return ctrl.Result{}, err
+        // Create a new Machine w/ join
+        logger.Info("Scaling up control plane", "Desired", desiredReplicas, "Existing", numMachines)
+        result, err := r.scaleUpControlPlane(ctx, cluster, kcp)
+        if err != nil {
+            logger.Error(err, "Failed to scale up control plane")
+            r.recorder.Eventf(kcp, corev1.EventTypeWarning, "FailedScaleUp", "Failed to scale up cluster %s/%s control plane: %v", cluster.Namespace, cluster.Name, err)
         }
-    // scaling down
+        // TODO: return the error if it is unexpected and should cause an immediate requeue
+        return result, nil
+    // We are scaling down
     case numMachines > desiredReplicas:
-        logger.Info("Scaling down", "Desired Replicas", desiredReplicas, "Existing Replicas", numMachines)
-        err := errors.New("Not Implemented")
-        logger.Error(err, "Should delete the appropriate Machine here.")
-        return ctrl.Result{}, err
+        logger.Info("Scaling down control plane", "Desired", desiredReplicas, "Existing", numMachines)
+        result, err := r.scaleDownControlPlane(ctx, cluster, kcp)
+        if err != nil {
+            logger.Error(err, "Failed to scale down control plane")
+            r.recorder.Eventf(kcp, corev1.EventTypeWarning, "FailedScaleDown", "Failed to scale down cluster %s/%s control plane: %v", cluster.Namespace, cluster.Name, err)
+        }
+        // TODO: return the error if it is unexpected and should cause an immediate requeue
+        return result, nil
     }

     return ctrl.Result{}, nil
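
With this change each branch of the switch creates or deletes at most one Machine and then returns a Result that asks to be requeued, so a scale from 0 to 3 replicas converges over several reconcile passes (one init, then two joins) rather than in a single pass. A rough sketch of that loop, assuming reconcile takes (ctx, cluster, kcp), consistent with the calls made inside the switch, and ignoring the RequeueAfter delay for brevity:

// driveToConvergence is illustrative only: it repeatedly calls reconcile until
// the controller stops asking to be requeued, mirroring what controller-runtime
// does between passes (minus the actual delays and watch events).
func driveToConvergence(ctx context.Context, r *KubeadmControlPlaneReconciler, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) error {
    for {
        result, err := r.reconcile(ctx, cluster, kcp)
        if err != nil {
            return err
        }
        if !result.Requeue && result.RequeueAfter == 0 {
            return nil
        }
    }
}
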
@@ -346,28 +363,70 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane(ctx context.Context,
     return nil
 }

-func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, numMachines int) error {
-    var errs []error
+func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (ctrl.Result, error) {
+    bootstrapSpec := kcp.Spec.KubeadmConfigSpec.DeepCopy()
+    bootstrapSpec.JoinConfiguration = nil
+
+    if err := r.cloneConfigsAndGenerateMachine(ctx, cluster, kcp, bootstrapSpec); err != nil {
+        return ctrl.Result{}, errors.Wrapf(err, "failed to create control plane Machine for cluster %s/%s", cluster.Name, cluster.Namespace)
+    }
+
+    // Requeue the control plane, in case we are going to scale up
+    return ctrl.Result{Requeue: true}, nil
+}
+
+func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (ctrl.Result, error) {
+    if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, clusterKey(cluster), kcp.Name); err != nil {
+        return ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}, errors.Wrap(err, "control plane is not healthy")
+    }
+
+    if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, clusterKey(cluster), kcp.Name); err != nil {
+        return ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}, errors.Wrap(err, "etcd cluster is not healthy")
+    }

     // Create the bootstrap configuration
     bootstrapSpec := kcp.Spec.KubeadmConfigSpec.DeepCopy()
     bootstrapSpec.InitConfiguration = nil
     bootstrapSpec.ClusterConfiguration = nil

-    for i := 0; i < numMachines; i++ {
-        err := r.cloneConfigsAndGenerateMachine(ctx, cluster, kcp, bootstrapSpec)
-        if err != nil {
-            errs = append(errs, errors.Wrap(err, "failed to clone and create an additional control plane Machine"))
-        }
+    if err := r.cloneConfigsAndGenerateMachine(ctx, cluster, kcp, bootstrapSpec); err != nil {
+        return ctrl.Result{}, errors.Wrapf(err, "failed to create control plane Machine for cluster %s/%s", cluster.Name, cluster.Namespace)
     }

-    return kerrors.NewAggregate(errs)
+    // Requeue the control plane, in case we are not done scaling up
+    return ctrl.Result{Requeue: true}, nil
 }

-func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) error {
-    bootstrapSpec := kcp.Spec.KubeadmConfigSpec.DeepCopy()
-    bootstrapSpec.JoinConfiguration = nil
-    return r.cloneConfigsAndGenerateMachine(ctx, cluster, kcp, bootstrapSpec)
+func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (ctrl.Result, error) {
+    if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, clusterKey(cluster), kcp.Name); err != nil {
+        return ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}, errors.Wrap(err, "control plane is not healthy")
+    }
+
+    if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, clusterKey(cluster), kcp.Name); err != nil {
+        return ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}, errors.Wrap(err, "etcd cluster is not healthy")
+    }
+
+    ownedMachines, err := r.managementCluster.GetMachinesForCluster(ctx, clusterKey(cluster), internal.OwnedControlPlaneMachines(kcp.Name))
+    if err != nil {
+        return ctrl.Result{}, err
+    }
+
+    // Wait for any delete in progress to complete before deleting another Machine
+    if len(internal.FilterMachines(ownedMachines, internal.HasDeletionTimestamp())) > 0 {
+        return ctrl.Result{RequeueAfter: DeleteRequeueAfter}, nil
+    }
+
+    machineToDelete, err := oldestMachine(ownedMachines)
+    if err != nil {
+        return ctrl.Result{}, errors.Wrap(err, "failed to pick control plane Machine to delete")
+    }
+
+    if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) {
+        return ctrl.Result{}, errors.Wrapf(err, "failed to delete control plane Machine %s/%s", machineToDelete.Namespace, machineToDelete.Name)
+    }
+
+    // Requeue the control plane, in case we are not done scaling down
+    return ctrl.Result{Requeue: true}, nil
 }

 func (r *KubeadmControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, bootstrapSpec *bootstrapv1.KubeadmConfigSpec) error {
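
scaleUpControlPlane and scaleDownControlPlane now share the same precondition: both target cluster health checks must pass before any Machine is created or deleted, and on failure the functions return a result that retries after HealthCheckFailedRequeueAfter. Each successful pass still handles only a single Machine and returns Requeue: true, so the caller's switch re-evaluates the replica count on the next reconcile. A hedged sketch of how the repeated checks could be factored into one helper (the helper name is hypothetical and not part of this change):

func (r *KubeadmControlPlaneReconciler) targetClusterIsHealthy(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) error {
    // Refuse to add or remove a control plane Machine while the workload
    // cluster's API servers or etcd members are reporting problems.
    if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, clusterKey(cluster), kcp.Name); err != nil {
        return errors.Wrap(err, "control plane is not healthy")
    }
    if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, clusterKey(cluster), kcp.Name); err != nil {
        return errors.Wrap(err, "etcd cluster is not healthy")
    }
    return nil
}
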
@@ -558,7 +617,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu
     if errs != nil {
         return ctrl.Result{}, kerrors.NewAggregate(errs)
     }
-    return ctrl.Result{Requeue: true, RequeueAfter: DeleteRequeueAfter}, nil
+    return ctrl.Result{RequeueAfter: DeleteRequeueAfter}, nil
 }

 func (r *KubeadmControlPlaneReconciler) reconcileKubeconfig(ctx context.Context, clusterName types.NamespacedName, endpoint clusterv1.APIEndpoint, kcp *controlplanev1.KubeadmControlPlane) error {
@@ -667,3 +726,11 @@ func clusterKey(cluster *clusterv1.Cluster) types.NamespacedName {
         Name: cluster.Name,
     }
 }
+
+func oldestMachine(machines []*clusterv1.Machine) (*clusterv1.Machine, error) {
+    if len(machines) == 0 {
+        return &clusterv1.Machine{}, errors.New("no machines given")
+    }
+    sort.Sort(util.MachinesByCreationTimestamp(machines))
+    return machines[0], nil
+}
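
oldestMachine sorts the owned Machines with util.MachinesByCreationTimestamp and returns the one at index 0; scaleDownControlPlane then deletes it, so scale-down always removes the oldest control plane Machine first. As a reference for what that sort is assumed to do, here is an equivalent comparator (illustrative only, not the util package's actual code):

// machinesByAge is an illustrative stand-in for util.MachinesByCreationTimestamp.
type machinesByAge []*clusterv1.Machine

func (m machinesByAge) Len() int      { return len(m) }
func (m machinesByAge) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m machinesByAge) Less(i, j int) bool {
    // Earlier CreationTimestamp sorts first, so index 0 ends up being the oldest Machine.
    return m[i].CreationTimestamp.Before(&m[j].CreationTimestamp)
}
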
