@@ -8,20 +8,23 @@ import (
88
99 "github.com/google/go-cmp/cmp"
1010 "github.com/operator-framework/api/pkg/operators/v1alpha1"
11- "github.com/pkg/errors"
11+ pkgerrors "github.com/pkg/errors"
1212 "github.com/sirupsen/logrus"
1313 corev1 "k8s.io/api/core/v1"
1414 apierrors "k8s.io/apimachinery/pkg/api/errors"
1515 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1616 "k8s.io/apimachinery/pkg/labels"
1717 "k8s.io/apimachinery/pkg/util/intstr"
18+ "k8s.io/utils/ptr"
1819
20+ "errors"
1921 "github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
2022 controllerclient "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/controller-runtime/client"
2123 hashutil "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubernetes/pkg/util/hash"
2224 "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
2325 "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorlister"
2426 "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/ownerutil"
27+ "slices"
2528)
2629
2730const (
@@ -262,7 +265,7 @@ func (c *GrpcRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry, cata
262265 //TODO: if any of these error out, we should write a status back (possibly set RegistryServiceStatus to nil so they get recreated)
263266 sa , err := c .ensureSA (source )
264267 if err != nil && ! apierrors .IsAlreadyExists (err ) {
265- return errors .Wrapf (err , "error ensuring service account: %s" , source .GetName ())
268+ return pkgerrors .Wrapf (err , "error ensuring service account: %s" , source .GetName ())
266269 }
267270
268271 sa , err = c .OpClient .GetServiceAccount (sa .GetNamespace (), sa .GetName ())
@@ -285,20 +288,20 @@ func (c *GrpcRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry, cata
285288 return err
286289 }
287290 if err := c .ensurePod (logger , source , sa , overwritePod ); err != nil {
288- return errors .Wrapf (err , "error ensuring pod: %s" , pod .GetName ())
291+ return pkgerrors .Wrapf (err , "error ensuring pod: %s" , pod .GetName ())
289292 }
290293 if err := c .ensureUpdatePod (logger , sa , source ); err != nil {
291294 if _ , ok := err .(UpdateNotReadyErr ); ok {
292295 return err
293296 }
294- return errors .Wrapf (err , "error ensuring updated catalog source pod: %s" , pod .GetName ())
297+ return pkgerrors .Wrapf (err , "error ensuring updated catalog source pod: %s" , pod .GetName ())
295298 }
296299 service , err := source .Service ()
297300 if err != nil {
298301 return err
299302 }
300303 if err := c .ensureService (source , overwrite ); err != nil {
301- return errors .Wrapf (err , "error ensuring service: %s" , service .GetName ())
304+ return pkgerrors .Wrapf (err , "error ensuring service: %s" , service .GetName ())
302305 }
303306
304307 if overwritePod {
@@ -338,16 +341,35 @@ func isRegistryServiceStatusValid(source *grpcCatalogSourceDecorator) (bool, err
338341}
339342
340343func (c * GrpcRegistryReconciler ) ensurePod (logger * logrus.Entry , source grpcCatalogSourceDecorator , serviceAccount * corev1.ServiceAccount , overwrite bool ) error {
341- // currentLivePods refers to the currently live instances of the catalog source
342- currentLivePods := c .currentPods (logger , source )
343- if len (currentLivePods ) > 0 {
344+ // currentPods refers to the current pod instances of the catalog source
345+ currentPods := c .currentPods (logger , source )
346+
347+ var forceDeleteErrs []error
348+ currentPods = slices .DeleteFunc (currentPods , func (pod * corev1.Pod ) bool {
349+ if ! isPodDead (pod ) {
350+ logger .WithFields (logrus.Fields {"pod.namespace" : source .GetNamespace (), "pod.name" : pod .GetName ()}).Debug ("pod is alive" )
351+ return false
352+ }
353+ logger .WithFields (logrus.Fields {"pod.namespace" : source .GetNamespace (), "pod.name" : pod .GetName ()}).Info ("force deleting dead pod" )
354+ if err := c .OpClient .KubernetesInterface ().CoreV1 ().Pods (source .GetNamespace ()).Delete (context .TODO (), pod .GetName (), metav1.DeleteOptions {
355+ GracePeriodSeconds : ptr.To [int64 ](0 ),
356+ }); err != nil && ! apierrors .IsNotFound (err ) {
357+ forceDeleteErrs = append (forceDeleteErrs , pkgerrors .Wrapf (err , "error deleting old pod: %s" , pod .GetName ()))
358+ }
359+ return true
360+ })
361+ if len (forceDeleteErrs ) > 0 {
362+ return errors .Join (forceDeleteErrs ... )
363+ }
364+
365+ if len (currentPods ) > 0 {
344366 if ! overwrite {
345367 return nil
346368 }
347- for _ , p := range currentLivePods {
369+ for _ , p := range currentPods {
348370 logger .WithFields (logrus.Fields {"pod.namespace" : source .GetNamespace (), "pod.name" : p .GetName ()}).Info ("deleting current pod" )
349371 if err := c .OpClient .KubernetesInterface ().CoreV1 ().Pods (source .GetNamespace ()).Delete (context .TODO (), p .GetName (), * metav1 .NewDeleteOptions (1 )); err != nil && ! apierrors .IsNotFound (err ) {
350- return errors .Wrapf (err , "error deleting old pod: %s" , p .GetName ())
372+ return pkgerrors .Wrapf (err , "error deleting old pod: %s" , p .GetName ())
351373 }
352374 }
353375 }
@@ -358,7 +380,7 @@ func (c *GrpcRegistryReconciler) ensurePod(logger *logrus.Entry, source grpcCata
358380 logger .WithFields (logrus.Fields {"pod.namespace" : desiredPod .GetNamespace (), "pod.name" : desiredPod .GetName ()}).Info ("creating desired pod" )
359381 _ , err = c .OpClient .KubernetesInterface ().CoreV1 ().Pods (source .GetNamespace ()).Create (context .TODO (), desiredPod , metav1.CreateOptions {})
360382 if err != nil {
361- return errors .Wrapf (err , "error creating new pod: %s" , desiredPod .GetGenerateName ())
383+ return pkgerrors .Wrapf (err , "error creating new pod: %s" , desiredPod .GetGenerateName ())
362384 }
363385
364386 return nil
@@ -378,7 +400,7 @@ func (c *GrpcRegistryReconciler) ensureUpdatePod(logger *logrus.Entry, serviceAc
378400 logger .Infof ("catalog update required at %s" , time .Now ().String ())
379401 pod , err := c .createUpdatePod (source , serviceAccount )
380402 if err != nil {
381- return errors .Wrapf (err , "creating update catalog source pod" )
403+ return pkgerrors .Wrapf (err , "creating update catalog source pod" )
382404 }
383405 source .SetLastUpdateTime ()
384406 return UpdateNotReadyErr {catalogName : source .GetName (), podName : pod .GetName ()}
@@ -410,7 +432,7 @@ func (c *GrpcRegistryReconciler) ensureUpdatePod(logger *logrus.Entry, serviceAc
410432 for _ , p := range currentLivePods {
411433 logger .WithFields (logrus.Fields {"live-pod.namespace" : source .GetNamespace (), "live-pod.name" : p .Name }).Info ("deleting current live pods" )
412434 if err := c .OpClient .KubernetesInterface ().CoreV1 ().Pods (source .GetNamespace ()).Delete (context .TODO (), p .GetName (), * metav1 .NewDeleteOptions (1 )); err != nil && ! apierrors .IsNotFound (err ) {
413- return errors .Wrapf (errors .Wrapf (err , "error deleting pod: %s" , p .GetName ()), "detected imageID change: error deleting old catalog source pod" )
435+ return pkgerrors .Wrapf (pkgerrors .Wrapf (err , "error deleting pod: %s" , p .GetName ()), "detected imageID change: error deleting old catalog source pod" )
414436 }
415437 }
416438 // done syncing
@@ -420,7 +442,7 @@ func (c *GrpcRegistryReconciler) ensureUpdatePod(logger *logrus.Entry, serviceAc
420442 // delete update pod right away, since the digest match, to prevent long-lived duplicate catalog pods
421443 logger .WithFields (logrus.Fields {"update-pod.namespace" : updatePod .Namespace , "update-pod.name" : updatePod .Name }).Debug ("catalog polling result: no update; removing duplicate update pod" )
422444 if err := c .OpClient .KubernetesInterface ().CoreV1 ().Pods (source .GetNamespace ()).Delete (context .TODO (), updatePod .GetName (), * metav1 .NewDeleteOptions (1 )); err != nil && ! apierrors .IsNotFound (err ) {
423- return errors .Wrapf (errors .Wrapf (err , "error deleting pod: %s" , updatePod .GetName ()), "duplicate catalog polling pod" )
445+ return pkgerrors .Wrapf (pkgerrors .Wrapf (err , "error deleting pod: %s" , updatePod .GetName ()), "duplicate catalog polling pod" )
424446 }
425447 }
426448
@@ -523,6 +545,29 @@ func imageChanged(logger *logrus.Entry, updatePod *corev1.Pod, servingPods []*co
523545 return false
524546}
525547
548+ func isPodDead (pod * corev1.Pod ) bool {
549+ for _ , check := range []func (* corev1.Pod ) bool {
550+ isPodDeletedByTaintManager ,
551+ } {
552+ if check (pod ) {
553+ return true
554+ }
555+ }
556+ return false
557+ }
558+
559+ func isPodDeletedByTaintManager (pod * corev1.Pod ) bool {
560+ if pod .DeletionTimestamp == nil {
561+ return false
562+ }
563+ for _ , condition := range pod .Status .Conditions {
564+ if condition .Type == corev1 .DisruptionTarget && condition .Reason == "DeletionByTaintManager" && condition .Status == corev1 .ConditionTrue {
565+ return true
566+ }
567+ }
568+ return false
569+ }
570+
526571// imageID returns the ImageID of the primary catalog source container or an empty string if the image ID isn't available yet.
527572// Note: the pod must be running and the container in a ready status to return a valid ImageID.
528573func imageID (pod * corev1.Pod ) string {
@@ -545,7 +590,7 @@ func imageID(pod *corev1.Pod) string {
545590func (c * GrpcRegistryReconciler ) removePods (pods []* corev1.Pod , namespace string ) error {
546591 for _ , p := range pods {
547592 if err := c .OpClient .KubernetesInterface ().CoreV1 ().Pods (namespace ).Delete (context .TODO (), p .GetName (), * metav1 .NewDeleteOptions (1 )); err != nil && ! apierrors .IsNotFound (err ) {
548- return errors .Wrapf (err , "error deleting pod: %s" , p .GetName ())
593+ return pkgerrors .Wrapf (err , "error deleting pod: %s" , p .GetName ())
549594 }
550595 }
551596 return nil
@@ -623,7 +668,7 @@ func (c *GrpcRegistryReconciler) podFailed(pod *corev1.Pod) (bool, error) {
623668 logrus .WithField ("UpdatePod" , pod .GetName ()).Infof ("catalog polling result: update pod %s failed to start" , pod .GetName ())
624669 err := c .removePods ([]* corev1.Pod {pod }, pod .GetNamespace ())
625670 if err != nil {
626- return true , errors .Wrapf (err , "error deleting failed catalog polling pod: %s" , pod .GetName ())
671+ return true , pkgerrors .Wrapf (err , "error deleting failed catalog polling pod: %s" , pod .GetName ())
627672 }
628673 return true , nil
629674 }
0 commit comments