From 57dcfe7d558d44b55ec7a074f02b32bab48a206c Mon Sep 17 00:00:00 2001 From: Samuel Lang Date: Thu, 10 Dec 2020 15:14:02 +0100 Subject: [PATCH 1/2] :bug: fixing crashes on State-change events NTH crashes often from "EC2 Instance State-change Notification" events. The underlying issue was that an empty parsed NodeName derived from PrivateDnsName for forwarded unverified, creating cascading problems. This solves the root cause, that "EC2 Instance State-change Notification" can arrive at a time where the instance is in shutting-down, terminated or any other not-online situation where the PrivateDnsName metadata is empty! Instead of just ignoring these errors this implementation gets and decides based on the state of the instance message if it is an error to fail or to ignore. Therefor such messages are dropped in above situation because they are useless. --- pkg/monitor/sqsevent/sqs-monitor.go | 26 +++++++++++++++-- pkg/monitor/sqsevent/sqs-monitor_test.go | 36 ++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 46f81a99..d8054e6c 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -15,6 +15,7 @@ package sqsevent import ( "encoding/json" + "errors" "fmt" "github.com/aws/aws-node-termination-handler/pkg/monitor" @@ -33,6 +34,9 @@ const ( SQSTerminateKind = "SQS_TERMINATE" ) +// ErrNodeStateNotRunning forwards condition that the instance is terminated thus metadata missing +var ErrNodeStateNotRunning = errors.New("node metadata unavailable") + // SQSMonitor is a struct definition that knows how to process events from Amazon EventBridge type SQSMonitor struct { InterruptionChan chan<- monitor.InterruptionEvent @@ -54,6 +58,10 @@ func (m SQSMonitor) Kind() string { func (m SQSMonitor) Monitor() error { interruptionEvent, err := m.checkForSQSMessage() if err != nil { + if errors.Is(err, ErrNodeStateNotRunning) { + log.Warn().Err(err).Msg("dropping event for an already terminated node") + return nil + } return err } if interruptionEvent != nil && interruptionEvent.Kind == SQSTerminateKind { @@ -175,10 +183,24 @@ func (m SQSMonitor) retrieveNodeName(instanceID string) (string, error) { } instance := result.Reservations[0].Instances[0] - log.Debug().Msgf("Got nodename from private ip %s", *instance.PrivateDnsName) + nodeName := *instance.PrivateDnsName + log.Debug().Msgf("Got nodename from private ip %s", nodeName) instanceJSON, _ := json.MarshalIndent(*instance, " ", " ") log.Debug().Msgf("Got nodename from ec2 describe call: %s", instanceJSON) - return *instance.PrivateDnsName, nil + + if nodeName == "" { + state := "unknown" + // safe access instance.State potentially being nil + if instance.State != nil { + state = *instance.State.Name + } + // anything except running might not contain PrivateDnsName + if state != ec2.InstanceStateNameRunning { + return "", ErrNodeStateNotRunning + } + return "", fmt.Errorf("unable to retrieve PrivateDnsName name for '%s' in state '%s'", instanceID, state) + } + return nodeName, nil } // isInstanceManaged returns whether the instance specified should be managed by node termination handler diff --git a/pkg/monitor/sqsevent/sqs-monitor_test.go b/pkg/monitor/sqsevent/sqs-monitor_test.go index 06cfe0ba..b220406b 100644 --- a/pkg/monitor/sqsevent/sqs-monitor_test.go +++ b/pkg/monitor/sqsevent/sqs-monitor_test.go @@ -410,6 +410,42 @@ func TestMonitor_EC2NoDNSName(t *testing.T) { h.Ok(t, err) } +func TestMonitor_EC2NoDNSNameOnTerminatedInstance(t *testing.T) { + msg, err := getSQSMessageFromEvent(asgLifecycleEvent) + h.Ok(t, err) + messages := []*sqs.Message{ + &msg, + } + sqsMock := h.MockedSQS{ + ReceiveMessageResp: sqs.ReceiveMessageOutput{Messages: messages}, + ReceiveMessageErr: nil, + DeleteMessageResp: sqs.DeleteMessageOutput{}, + } + ec2Mock := h.MockedEC2{ + DescribeInstancesResp: getDescribeInstancesResp(""), + } + ec2Mock.DescribeInstancesResp.Reservations[0].Instances[0].State = &ec2.InstanceState{ + Name: aws.String("running"), + } + drainChan := make(chan monitor.InterruptionEvent) + + sqsMonitor := sqsevent.SQSMonitor{ + SQS: sqsMock, + EC2: ec2Mock, + ASG: mockIsManagedTrue(nil), + CheckIfManaged: true, + QueueURL: "https://test-queue", + InterruptionChan: drainChan, + } + go func() { + result := <-drainChan + h.Equals(t, result.Kind, sqsevent.SQSTerminateKind) + }() + + err = sqsMonitor.Monitor() + h.Nok(t, err) +} + func TestMonitor_SQSDeleteFailure(t *testing.T) { msg, err := getSQSMessageFromEvent(asgLifecycleEvent) h.Ok(t, err) From 12bde5b8f10ddb43e9e274b1c9bbf23037c0569a Mon Sep 17 00:00:00 2001 From: Samuel Lang Date: Thu, 10 Dec 2020 17:08:54 +0100 Subject: [PATCH 2/2] ignore events for nodes that are not belonging to the cluster --- cmd/node-termination-handler.go | 35 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 069b8e76..da4f2024 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -37,6 +37,7 @@ import ( "github.com/aws/aws-sdk-go/service/sqs" "github.com/rs/zerolog" "github.com/rs/zerolog/log" + "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/util/wait" ) @@ -267,23 +268,33 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto if nthConfig.CordonOnly || drainEvent.IsRebalanceRecommendation() { err := node.Cordon(nodeName) if err != nil { - log.Log().Err(err).Msg("There was a problem while trying to cordon the node") - os.Exit(1) - } - log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned") - err = node.LogPods(nodeName) - if err != nil { - log.Log().Err(err).Msg("There was a problem while trying to log all pod names on the node") + if errors.IsNotFound(err) { + log.Warn().Err(err).Msgf("node '%s' not found in the cluster", nodeName) + } else { + log.Log().Err(err).Msg("There was a problem while trying to cordon the node") + os.Exit(1) + } + } else { + log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned") + err = node.LogPods(nodeName) + if err != nil { + log.Log().Err(err).Msg("There was a problem while trying to log all pod names on the node") + } + metrics.NodeActionsInc("cordon", nodeName, err) } - metrics.NodeActionsInc("cordon", nodeName, err) } else { err := node.CordonAndDrain(nodeName) if err != nil { - log.Log().Err(err).Msg("There was a problem while trying to cordon and drain the node") - os.Exit(1) + if errors.IsNotFound(err) { + log.Warn().Err(err).Msgf("node '%s' not found in the cluster", nodeName) + } else { + log.Log().Err(err).Msg("There was a problem while trying to cordon and drain the node") + os.Exit(1) + } + } else { + log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned and drained") + metrics.NodeActionsInc("cordon-and-drain", nodeName, err) } - log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned and drained") - metrics.NodeActionsInc("cordon-and-drain", nodeName, err) } interruptionEventStore.MarkAllAsDrained(nodeName)