@@ -19,6 +19,7 @@ package controllers
1919import (
2020 "context"
2121 "fmt"
22+ "sort"
2223 "strings"
2324 "time"
2425
@@ -59,8 +60,18 @@ const (
5960 // DeleteRequeueAfter is how long to wait before checking again to see if
6061 // all control plane machines have been deleted.
6162 DeleteRequeueAfter = 30 * time .Second
63+
64+ // HealthCheckFailedRequeueAfter is how long to wait before trying to scale
65+ // up/down if some target cluster health check has failed
66+ HealthCheckFailedRequeueAfter = 20 * time .Second
6267)
6368
69+ type managementCluster interface {
70+ GetMachinesForCluster (ctx context.Context , cluster types.NamespacedName , filters ... func (machine * clusterv1.Machine ) bool ) ([]* clusterv1.Machine , error )
71+ TargetClusterControlPlaneIsHealthy (ctx context.Context , clusterKey types.NamespacedName , controlPlaneName string ) error
72+ TargetClusterEtcdIsHealthy (ctx context.Context , clusterKey types.NamespacedName , controlPlaneName string ) error
73+ }
74+
6475// +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch
6576// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;patch
6677// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io;bootstrap.cluster.x-k8s.io;controlplane.cluster.x-k8s.io,resources=*,verbs=get;list;watch;create;update;patch;delete
@@ -77,7 +88,7 @@ type KubeadmControlPlaneReconciler struct {
7788
7889 remoteClientGetter remote.ClusterClientGetter
7990
80- managementCluster * internal. ManagementCluster
91+ managementCluster managementCluster
8192}
8293
8394func (r * KubeadmControlPlaneReconciler ) SetupWithManager (mgr ctrl.Manager , options controller.Options ) error {
@@ -210,7 +221,6 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
210221 if requeueErr , ok := errors .Cause (err ).(capierrors.HasRequeueAfterError ); ok {
211222 logger .Error (err , "required certificates not found, requeueing" )
212223 return ctrl.Result {
213- Requeue : true ,
214224 RequeueAfter : requeueErr .GetRequeueAfter (),
215225 }, nil
216226 }
@@ -251,29 +261,36 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
251261 switch {
252262 // We are creating the first replica
253263 case numMachines < desiredReplicas && numMachines == 0 :
254- // create new Machine w/ init
255- logger .Info ("Scaling to 1 " , "Desired Replicas " , desiredReplicas , "Existing Replicas " , numMachines )
256- if err := r .initializeControlPlane (ctx , cluster , kcp ); err != nil {
257- logger . Error ( err , "Failed to initialize the Control Plane" )
258- r . recorder . Eventf ( kcp , corev1 . EventTypeWarning , "FailedInitialization" , " Failed to initialize the control plane: %v" , err )
259- return ctrl. Result {}, err
264+ // Create new Machine w/ init
265+ logger .Info ("Initializing control plane " , "Desired" , desiredReplicas , "Existing" , numMachines )
266+ result , err := r .initializeControlPlane (ctx , cluster , kcp )
267+ if err != nil {
268+ logger . Error ( err , " Failed to initialize control plane" )
269+ r . recorder . Eventf ( kcp , corev1 . EventTypeWarning , "FailedInitialization" , "Failed to initialize cluster %s/%s control plane: %v" , cluster . Namespace , cluster . Name , err )
260270 }
261- // scaling up
271+ // TODO: return the error if it is unexpected and should cause an immediate requeue
272+ return result , nil
273+ // We are scaling up
262274 case numMachines < desiredReplicas && numMachines > 0 :
263- // create a new Machine w/ join
264- logger .Info ("Scaling up" , "Desired Replicas" , desiredReplicas , "Existing Replicas" , numMachines )
265- wantMachines := desiredReplicas - numMachines
266- if err := r .scaleUpControlPlane (ctx , cluster , kcp , wantMachines ); err != nil {
267- logger .Error (err , "Failed to scale up the Control Plane" )
268- r .recorder .Eventf (kcp , corev1 .EventTypeWarning , "FailedScaleUp" , "Failed to scale up the control plane: %v" , err )
269- return ctrl.Result {}, err
275+ // Create a new Machine w/ join
276+ logger .Info ("Scaling up control plane" , "Desired" , desiredReplicas , "Existing" , numMachines )
277+ result , err := r .scaleUpControlPlane (ctx , cluster , kcp )
278+ if err != nil {
279+ logger .Error (err , "Failed to scale up control plane" )
280+ r .recorder .Eventf (kcp , corev1 .EventTypeWarning , "FailedScaleUp" , "Failed to scale up cluster %s/%s control plane: %v" , cluster .Namespace , cluster .Name , err )
270281 }
271- // scaling down
282+ // TODO: return the error if it is unexpected and should cause an immediate requeue
283+ return result , nil
284+ // We are scaling down
272285 case numMachines > desiredReplicas :
273- logger .Info ("Scaling down" , "Desired Replicas" , desiredReplicas , "Existing Replicas" , numMachines )
274- err := errors .New ("Not Implemented" )
275- logger .Error (err , "Should delete the appropriate Machine here." )
276- return ctrl.Result {}, err
286+ logger .Info ("Scaling down control plane" , "Desired" , desiredReplicas , "Existing" , numMachines )
287+ result , err := r .scaleDownControlPlane (ctx , cluster , kcp )
288+ if err != nil {
289+ logger .Error (err , "Failed to scale down control plane" )
290+ r .recorder .Eventf (kcp , corev1 .EventTypeWarning , "FailedScaleDown" , "Failed to scale down cluster %s/%s control plane: %v" , cluster .Namespace , cluster .Name , err )
291+ }
292+ // TODO: return the error if it is unexpected and should cause an immediate requeue
293+ return result , nil
277294 }
278295
279296 return ctrl.Result {}, nil
@@ -346,28 +363,70 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane(ctx context.Context,
346363 return nil
347364}
348365
349- func (r * KubeadmControlPlaneReconciler ) scaleUpControlPlane (ctx context.Context , cluster * clusterv1.Cluster , kcp * controlplanev1.KubeadmControlPlane , numMachines int ) error {
350- var errs []error
366+ func (r * KubeadmControlPlaneReconciler ) initializeControlPlane (ctx context.Context , cluster * clusterv1.Cluster , kcp * controlplanev1.KubeadmControlPlane ) (ctrl.Result , error ) {
367+ bootstrapSpec := kcp .Spec .KubeadmConfigSpec .DeepCopy ()
368+ bootstrapSpec .JoinConfiguration = nil
369+
370+ if err := r .cloneConfigsAndGenerateMachine (ctx , cluster , kcp , bootstrapSpec ); err != nil {
371+ return ctrl.Result {}, errors .Wrapf (err , "failed to create control plane Machine for cluster %s/%s" , cluster .Name , cluster .Namespace )
372+ }
373+
374+ // Requeue the control plane, in case we are going to scale up
375+ return ctrl.Result {Requeue : true }, nil
376+ }
377+
378+ func (r * KubeadmControlPlaneReconciler ) scaleUpControlPlane (ctx context.Context , cluster * clusterv1.Cluster , kcp * controlplanev1.KubeadmControlPlane ) (ctrl.Result , error ) {
379+ if err := r .managementCluster .TargetClusterControlPlaneIsHealthy (ctx , clusterKey (cluster ), kcp .Name ); err != nil {
380+ return ctrl.Result {RequeueAfter : HealthCheckFailedRequeueAfter }, errors .Wrap (err , "control plane is not healthy" )
381+ }
382+
383+ if err := r .managementCluster .TargetClusterEtcdIsHealthy (ctx , clusterKey (cluster ), kcp .Name ); err != nil {
384+ return ctrl.Result {RequeueAfter : HealthCheckFailedRequeueAfter }, errors .Wrap (err , "etcd cluster is not healthy" )
385+ }
351386
352387 // Create the bootstrap configuration
353388 bootstrapSpec := kcp .Spec .KubeadmConfigSpec .DeepCopy ()
354389 bootstrapSpec .InitConfiguration = nil
355390 bootstrapSpec .ClusterConfiguration = nil
356391
357- for i := 0 ; i < numMachines ; i ++ {
358- err := r .cloneConfigsAndGenerateMachine (ctx , cluster , kcp , bootstrapSpec )
359- if err != nil {
360- errs = append (errs , errors .Wrap (err , "failed to clone and create an additional control plane Machine" ))
361- }
392+ if err := r .cloneConfigsAndGenerateMachine (ctx , cluster , kcp , bootstrapSpec ); err != nil {
393+ return ctrl.Result {}, errors .Wrapf (err , "failed to create control plane Machine for cluster %s/%s" , cluster .Name , cluster .Namespace )
362394 }
363395
364- return kerrors .NewAggregate (errs )
396+ // Requeue the control plane, in case we are not done scaling up
397+ return ctrl.Result {Requeue : true }, nil
365398}
366399
367- func (r * KubeadmControlPlaneReconciler ) initializeControlPlane (ctx context.Context , cluster * clusterv1.Cluster , kcp * controlplanev1.KubeadmControlPlane ) error {
368- bootstrapSpec := kcp .Spec .KubeadmConfigSpec .DeepCopy ()
369- bootstrapSpec .JoinConfiguration = nil
370- return r .cloneConfigsAndGenerateMachine (ctx , cluster , kcp , bootstrapSpec )
400+ func (r * KubeadmControlPlaneReconciler ) scaleDownControlPlane (ctx context.Context , cluster * clusterv1.Cluster , kcp * controlplanev1.KubeadmControlPlane ) (ctrl.Result , error ) {
401+ if err := r .managementCluster .TargetClusterControlPlaneIsHealthy (ctx , clusterKey (cluster ), kcp .Name ); err != nil {
402+ return ctrl.Result {RequeueAfter : HealthCheckFailedRequeueAfter }, errors .Wrap (err , "control plane is not healthy" )
403+ }
404+
405+ if err := r .managementCluster .TargetClusterEtcdIsHealthy (ctx , clusterKey (cluster ), kcp .Name ); err != nil {
406+ return ctrl.Result {RequeueAfter : HealthCheckFailedRequeueAfter }, errors .Wrap (err , "etcd cluster is not healthy" )
407+ }
408+
409+ ownedMachines , err := r .managementCluster .GetMachinesForCluster (ctx , clusterKey (cluster ), internal .OwnedControlPlaneMachines (kcp .Name ))
410+ if err != nil {
411+ return ctrl.Result {}, err
412+ }
413+
414+ // Wait for any delete in progress to complete before deleting another Machine
415+ if len (internal .FilterMachines (ownedMachines , internal .HasDeletionTimestamp ())) > 0 {
416+ return ctrl.Result {RequeueAfter : DeleteRequeueAfter }, nil
417+ }
418+
419+ machineToDelete , err := oldestMachine (ownedMachines )
420+ if err != nil {
421+ return ctrl.Result {}, errors .Wrap (err , "failed to pick control plane Machine to delete" )
422+ }
423+
424+ if err := r .Client .Delete (ctx , machineToDelete ); err != nil && ! apierrors .IsNotFound (err ) {
425+ return ctrl.Result {}, errors .Wrapf (err , "failed to delete control plane Machine %s/%s" , machineToDelete .Namespace , machineToDelete .Name )
426+ }
427+
428+ // Requeue the control plane, in case we are not done scaling down
429+ return ctrl.Result {Requeue : true }, nil
371430}
372431
373432func (r * KubeadmControlPlaneReconciler ) cloneConfigsAndGenerateMachine (ctx context.Context , cluster * clusterv1.Cluster , kcp * controlplanev1.KubeadmControlPlane , bootstrapSpec * bootstrapv1.KubeadmConfigSpec ) error {
@@ -558,7 +617,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu
558617 if errs != nil {
559618 return ctrl.Result {}, kerrors .NewAggregate (errs )
560619 }
561- return ctrl.Result {Requeue : true , RequeueAfter : DeleteRequeueAfter }, nil
620+ return ctrl.Result {RequeueAfter : DeleteRequeueAfter }, nil
562621}
563622
564623func (r * KubeadmControlPlaneReconciler ) reconcileKubeconfig (ctx context.Context , clusterName types.NamespacedName , endpoint clusterv1.APIEndpoint , kcp * controlplanev1.KubeadmControlPlane ) error {
@@ -667,3 +726,11 @@ func clusterKey(cluster *clusterv1.Cluster) types.NamespacedName {
667726 Name : cluster .Name ,
668727 }
669728}
729+
730+ func oldestMachine (machines []* clusterv1.Machine ) (* clusterv1.Machine , error ) {
731+ if len (machines ) == 0 {
732+ return & clusterv1.Machine {}, errors .New ("no machines given" )
733+ }
734+ sort .Sort (util .MachinesByCreationTimestamp (machines ))
735+ return machines [0 ], nil
736+ }
0 commit comments