Skip to content

Commit 9e5d9e2

Browse files
committed
🐛 fixing crashes on State-change events
NTH crashes often from "EC2 Instance State-change Notification" events. The underlying issue was that an empty parsed NodeName derived from PrivateDnsName for forwarded unverified, creating cascading problems. This solves the root cause, that "EC2 Instance State-change Notification" can arrive at a time where the instance is in shutting-down, terminated or any other not-online situation where the PrivateDnsName metadata is empty! Instead of just ignoring these errors this implementation gets and decides based on the state of the instance message if it is an error to fail or to ignore. Therefor such messages are dropped in above situation because they are useless.
1 parent 363674f commit 9e5d9e2

File tree

2 files changed

+61
-3
lines changed

2 files changed

+61
-3
lines changed

pkg/monitor/sqsevent/sqs-monitor.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ package sqsevent
1515

1616
import (
1717
"encoding/json"
18+
"errors"
1819
"fmt"
1920

2021
"github.com/aws/aws-node-termination-handler/pkg/monitor"
@@ -33,6 +34,9 @@ const (
3334
SQSTerminateKind = "SQS_TERMINATE"
3435
)
3536

37+
// ErrNodeTerminated forwards condition that the instance is terminated thus metadata missing
38+
var ErrNodeMetadataUnavailable = errors.New("node metadata unavailable")
39+
3640
// SQSMonitor is a struct definition that knows how to process events from Amazon EventBridge
3741
type SQSMonitor struct {
3842
InterruptionChan chan<- monitor.InterruptionEvent
@@ -54,6 +58,10 @@ func (m SQSMonitor) Kind() string {
5458
func (m SQSMonitor) Monitor() error {
5559
interruptionEvent, err := m.checkForSQSMessage()
5660
if err != nil {
61+
if errors.Is(err, ErrNodeMetadataUnavailable) {
62+
log.Warn().Err(err).Msg("dropping event for an already terminated node")
63+
return nil
64+
}
5765
return err
5866
}
5967
if interruptionEvent != nil && interruptionEvent.Kind == SQSTerminateKind {
@@ -175,10 +183,24 @@ func (m SQSMonitor) retrieveNodeName(instanceID string) (string, error) {
175183
}
176184

177185
instance := result.Reservations[0].Instances[0]
178-
log.Debug().Msgf("Got nodename from private ip %s", *instance.PrivateDnsName)
186+
nodeName := *instance.PrivateDnsName
187+
log.Debug().Msgf("Got nodename from private ip %s", nodeName)
179188
instanceJSON, _ := json.MarshalIndent(*instance, " ", " ")
180189
log.Debug().Msgf("Got nodename from ec2 describe call: %s", instanceJSON)
181-
return *instance.PrivateDnsName, nil
190+
191+
if nodeName == "" {
192+
state := "unknown"
193+
// safe access instance.State potentially being nil
194+
if instance.State != nil {
195+
state = *instance.State.Name
196+
}
197+
// anything except running might not contain PrivateDnsName
198+
if state != ec2.InstanceStateNameRunning {
199+
return "", ErrNodeMetadataUnavailable
200+
}
201+
return "", fmt.Errorf("unable to retrieve PrivateDnsName name for '%s' in state '%s'", instanceID, state)
202+
}
203+
return nodeName, nil
182204
}
183205

184206
// isInstanceManaged returns whether the instance specified should be managed by node termination handler

pkg/monitor/sqsevent/sqs-monitor_test.go

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ func TestMonitor_EC2NoInstances(t *testing.T) {
377377
}
378378
}
379379

380-
func TestMonitor_EC2NoDNSName(t *testing.T) {
380+
func TestMonitor_EC2NoDNSNameFailure(t *testing.T) {
381381
msg, err := getSQSMessageFromEvent(asgLifecycleEvent)
382382
h.Ok(t, err)
383383
messages := []*sqs.Message{
@@ -410,6 +410,42 @@ func TestMonitor_EC2NoDNSName(t *testing.T) {
410410
h.Ok(t, err)
411411
}
412412

413+
func TestMonitor_EC2NoDNSNameOnTerminatedInstance(t *testing.T) {
414+
msg, err := getSQSMessageFromEvent(asgLifecycleEvent)
415+
h.Ok(t, err)
416+
messages := []*sqs.Message{
417+
&msg,
418+
}
419+
sqsMock := h.MockedSQS{
420+
ReceiveMessageResp: sqs.ReceiveMessageOutput{Messages: messages},
421+
ReceiveMessageErr: nil,
422+
DeleteMessageResp: sqs.DeleteMessageOutput{},
423+
}
424+
ec2Mock := h.MockedEC2{
425+
DescribeInstancesResp: getDescribeInstancesResp(""),
426+
}
427+
ec2Mock.DescribeInstancesResp.Reservations[0].Instances[0].State = &ec2.InstanceState{
428+
Name: aws.String("running"),
429+
}
430+
drainChan := make(chan monitor.InterruptionEvent)
431+
432+
sqsMonitor := sqsevent.SQSMonitor{
433+
SQS: sqsMock,
434+
EC2: ec2Mock,
435+
ASG: mockIsManagedTrue(nil),
436+
CheckIfManaged: true,
437+
QueueURL: "https://test-queue",
438+
InterruptionChan: drainChan,
439+
}
440+
go func() {
441+
result := <-drainChan
442+
h.Equals(t, result.Kind, sqsevent.SQSTerminateKind)
443+
}()
444+
445+
err = sqsMonitor.Monitor()
446+
h.Nok(t, err)
447+
}
448+
413449
func TestMonitor_SQSDeleteFailure(t *testing.T) {
414450
msg, err := getSQSMessageFromEvent(asgLifecycleEvent)
415451
h.Ok(t, err)

0 commit comments

Comments
 (0)