@@ -3,16 +3,18 @@ package reconciler
33
44import (
55 "context"
6+ "errors"
67 "fmt"
78
8- "github.com/pkg/errors"
9+ pkgerrors "github.com/pkg/errors"
910 "github.com/sirupsen/logrus"
1011 corev1 "k8s.io/api/core/v1"
1112 rbacv1 "k8s.io/api/rbac/v1"
1213 apierrors "k8s.io/apimachinery/pkg/api/errors"
1314 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1415 "k8s.io/apimachinery/pkg/labels"
1516 "k8s.io/apimachinery/pkg/util/intstr"
17+ "k8s.io/utils/ptr"
1618
1719 "github.com/operator-framework/api/pkg/operators/v1alpha1"
1820 "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
@@ -284,19 +286,19 @@ func (c *ConfigMapRegistryReconciler) EnsureRegistryServer(catalogSource *v1alph
284286
285287 //TODO: if any of these error out, we should write a status back (possibly set RegistryServiceStatus to nil so they get recreated)
286288 if err := c .ensureServiceAccount (source , overwrite ); err != nil {
287- return errors .Wrapf (err , "error ensuring service account: %s" , source .serviceAccountName ())
289+ return pkgerrors .Wrapf (err , "error ensuring service account: %s" , source .serviceAccountName ())
288290 }
289291 if err := c .ensureRole (source , overwrite ); err != nil {
290- return errors .Wrapf (err , "error ensuring role: %s" , source .roleName ())
292+ return pkgerrors .Wrapf (err , "error ensuring role: %s" , source .roleName ())
291293 }
292294 if err := c .ensureRoleBinding (source , overwrite ); err != nil {
293- return errors .Wrapf (err , "error ensuring rolebinding: %s" , source .RoleBinding ().GetName ())
295+ return pkgerrors .Wrapf (err , "error ensuring rolebinding: %s" , source .RoleBinding ().GetName ())
294296 }
295297 if err := c .ensurePod (source , overwritePod ); err != nil {
296- return errors .Wrapf (err , "error ensuring pod: %s" , source .Pod (image ).GetName ())
298+ return pkgerrors .Wrapf (err , "error ensuring pod: %s" , source .Pod (image ).GetName ())
297299 }
298300 if err := c .ensureService (source , overwrite ); err != nil {
299- return errors .Wrapf (err , "error ensuring service: %s" , source .Service ().GetName ())
301+ return pkgerrors .Wrapf (err , "error ensuring service: %s" , source .Service ().GetName ())
300302 }
301303
302304 if overwritePod {
@@ -363,15 +365,15 @@ func (c *ConfigMapRegistryReconciler) ensurePod(source configMapCatalogSourceDec
363365 }
364366 for _ , p := range currentPods {
365367 if err := c .OpClient .KubernetesInterface ().CoreV1 ().Pods (pod .GetNamespace ()).Delete (context .TODO (), p .GetName (), * metav1 .NewDeleteOptions (1 )); err != nil && ! apierrors .IsNotFound (err ) {
366- return errors .Wrapf (err , "error deleting old pod: %s" , p .GetName ())
368+ return pkgerrors .Wrapf (err , "error deleting old pod: %s" , p .GetName ())
367369 }
368370 }
369371 }
370372 _ , err := c .OpClient .KubernetesInterface ().CoreV1 ().Pods (pod .GetNamespace ()).Create (context .TODO (), pod , metav1.CreateOptions {})
371373 if err == nil {
372374 return nil
373375 }
374- return errors .Wrapf (err , "error creating new pod: %s" , pod .GetGenerateName ())
376+ return pkgerrors .Wrapf (err , "error creating new pod: %s" , pod .GetGenerateName ())
375377}
376378
377379func (c * ConfigMapRegistryReconciler ) ensureService (source configMapCatalogSourceDecorator , overwrite bool ) error {
@@ -390,16 +392,15 @@ func (c *ConfigMapRegistryReconciler) ensureService(source configMapCatalogSourc
390392}
391393
392394// CheckRegistryServer returns true if the given CatalogSource is considered healthy; false otherwise.
393- func (c * ConfigMapRegistryReconciler ) CheckRegistryServer (catalogSource * v1alpha1.CatalogSource ) (healthy bool , err error ) {
395+ func (c * ConfigMapRegistryReconciler ) CheckRegistryServer (catalogSource * v1alpha1.CatalogSource ) (bool , error ) {
394396 source := configMapCatalogSourceDecorator {catalogSource , c .createPodAsUser }
395-
396397 image := c .Image
397398 if source .Spec .SourceType == "grpc" {
398399 image = source .Spec .Image
399400 }
400401 if image == "" {
401- err = fmt .Errorf ("no image for registry" )
402- return
402+ err : = fmt .Errorf ("no image for registry" )
403+ return false , err
403404 }
404405
405406 if source .Spec .SourceType == v1alpha1 .SourceTypeConfigmap || source .Spec .SourceType == v1alpha1 .SourceTypeInternal {
@@ -426,10 +427,59 @@ func (c *ConfigMapRegistryReconciler) CheckRegistryServer(catalogSource *v1alpha
426427 c .currentRoleBinding (source ) == nil ||
427428 c .currentService (source ) == nil ||
428429 len (c .currentPods (source , c .Image )) < 1 {
429- healthy = false
430- return
430+
431+ return false , nil
431432 }
432433
433- healthy = true
434- return
434+ podsAreLive , e := detectAndDeleteDeadPods (c .OpClient , c .currentPods (source , c .Image ), source .GetNamespace ())
435+ if e != nil {
436+ return false , fmt .Errorf ("error deleting dead pods: %v" , e )
437+ }
438+ return podsAreLive , nil
439+ }
440+
441+ // detectAndDeleteDeadPods determines if there are registry client pods that are in the deleted state
442+ // but have not been removed by GC (eg the node goes down before GC can remove them), and attempts to
443+ // force delete the pods. If there are live registry pods remaining, it returns true, otherwise returns false.
444+ func detectAndDeleteDeadPods (client operatorclient.ClientInterface , pods []* corev1.Pod , sourceNamespace string ) (bool , error ) {
445+ var forceDeletionErrs []error
446+ livePodFound := false
447+ for _ , pod := range pods {
448+ if ! isPodDead (pod ) {
449+ livePodFound = true
450+ continue
451+ }
452+ if err := client .KubernetesInterface ().CoreV1 ().Pods (sourceNamespace ).Delete (context .TODO (), pod .GetName (), metav1.DeleteOptions {
453+ GracePeriodSeconds : ptr.To [int64 ](0 ),
454+ }); err != nil && ! apierrors .IsNotFound (err ) {
455+ forceDeletionErrs = append (forceDeletionErrs , err )
456+ }
457+ }
458+ if len (forceDeletionErrs ) > 0 {
459+ return false , errors .Join (forceDeletionErrs ... )
460+ }
461+ return livePodFound , nil
462+ }
463+
464+ func isPodDead (pod * corev1.Pod ) bool {
465+ for _ , check := range []func (* corev1.Pod ) bool {
466+ isPodDeletedByTaintManager ,
467+ } {
468+ if check (pod ) {
469+ return true
470+ }
471+ }
472+ return false
473+ }
474+
475+ func isPodDeletedByTaintManager (pod * corev1.Pod ) bool {
476+ if pod .DeletionTimestamp == nil {
477+ return false
478+ }
479+ for _ , condition := range pod .Status .Conditions {
480+ if condition .Type == corev1 .DisruptionTarget && condition .Reason == "DeletionByTaintManager" && condition .Status == corev1 .ConditionTrue {
481+ return true
482+ }
483+ }
484+ return false
435485}
0 commit comments