Skip to content

Commit 31f4b9f

Browse files
authored
add complete lifecycle action delay (#680)
1 parent d4d447c commit 31f4b9f

File tree

8 files changed

+129
-63
lines changed

8 files changed

+129
-63
lines changed

cmd/node-termination-handler.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -179,15 +179,17 @@ func main() {
179179
}
180180
log.Debug().Msgf("AWS Credentials retrieved from provider: %s", creds.ProviderName)
181181

182+
completeLifecycleActionDelay := time.Duration(nthConfig.CompleteLifecycleActionDelaySeconds) * time.Second
182183
sqsMonitor := sqsevent.SQSMonitor{
183-
CheckIfManaged: nthConfig.CheckTagBeforeDraining,
184-
ManagedTag: nthConfig.ManagedTag,
185-
QueueURL: nthConfig.QueueURL,
186-
InterruptionChan: interruptionChan,
187-
CancelChan: cancelChan,
188-
SQS: sqs.New(sess),
189-
ASG: autoscaling.New(sess),
190-
EC2: ec2.New(sess),
184+
CheckIfManaged: nthConfig.CheckTagBeforeDraining,
185+
ManagedTag: nthConfig.ManagedTag,
186+
QueueURL: nthConfig.QueueURL,
187+
InterruptionChan: interruptionChan,
188+
CancelChan: cancelChan,
189+
SQS: sqs.New(sess),
190+
ASG: autoscaling.New(sess),
191+
EC2: ec2.New(sess),
192+
BeforeCompleteLifecycleAction: func() { <-time.After(completeLifecycleActionDelay) },
191193
}
192194
monitoringFns[sqsEvents] = sqsMonitor
193195
}

config/helm/aws-node-termination-handler/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ The configuration in this table applies to all AWS Node Termination Handler mode
8282
| `podTerminationGracePeriod` | The time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used, which defaults to 30 seconds if not specified for the pod. | `-1` |
8383
| `nodeTerminationGracePeriod` | Period of time in seconds given to each node to terminate gracefully. Node draining will be scheduled based on this value to optimize the amount of compute time, but still safely drain the node before an event. | `120` |
8484
| `emitKubernetesEvents` | If `true`, Kubernetes events will be emitted when interruption events are received and when actions are taken on Kubernetes nodes. In IMDS Processor mode a default set of annotations with all the node metadata gathered from IMDS will be attached to each event. More information [here](https://github.com/aws/aws-node-termination-handler/blob/main/docs/kubernetes_events.md). | `false` |
85+
| `completeLifecycleActionDelaySeconds` | Pause after draining the node before completing the EC2 Autoscaling lifecycle action. This may be helpful if Pods on the node have Persistent Volume Claims. | -1 |
8586
| `kubernetesEventsExtraAnnotations` | A comma-separated list of `key=value` extra annotations to attach to all emitted Kubernetes events (e.g. `first=annotation,sample.annotation/number=two"`). | `""` |
8687
| `webhookURL` | Posts event data to URL upon instance interruption action. | `""` |
8788
| `webhookURLSecretName` | Pass the webhook URL as a Secret using the key `webhookurl`. | `""` |

config/helm/aws-node-termination-handler/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ spec:
106106
value: {{ .Values.nodeTerminationGracePeriod | quote }}
107107
- name: EMIT_KUBERNETES_EVENTS
108108
value: {{ .Values.emitKubernetesEvents | quote }}
109+
- name: COMPLETE_LIFECYCLE_ACTION_DELAY_SECONDS
110+
value: {{ .Values.completeLifecycleActionDelaySeconds | quote }}
109111
{{- with .Values.kubernetesEventsExtraAnnotations }}
110112
- name: KUBERNETES_EVENTS_EXTRA_ANNOTATIONS
111113
value: {{ . | quote }}

config/helm/aws-node-termination-handler/values.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ nodeTerminationGracePeriod: 120
100100
# emitKubernetesEvents If true, Kubernetes events will be emitted when interruption events are received and when actions are taken on Kubernetes nodes. In IMDS Processor mode a default set of annotations with all the node metadata gathered from IMDS will be attached to each event
101101
emitKubernetesEvents: false
102102

103+
# completeLifecycleActionDelaySeconds will pause for the configured duration after draining the node before completing the EC2 Autoscaling lifecycle action. This may be helpful if Pods on the node have Persistent Volume Claims.
104+
completeLifecycleActionDelaySeconds: -1
105+
103106
# kubernetesEventsExtraAnnotations A comma-separated list of key=value extra annotations to attach to all emitted Kubernetes events
104107
# Example: "first=annotation,sample.annotation/number=two"
105108
kubernetesEventsExtraAnnotations: ""

pkg/config/config.go

Lines changed: 49 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -102,57 +102,59 @@ const (
102102
awsRegionConfigKey = "AWS_REGION"
103103
awsEndpointConfigKey = "AWS_ENDPOINT"
104104
queueURLConfigKey = "QUEUE_URL"
105+
completeLifecycleActionDelaySecondsKey = "COMPLETE_LIFECYCLE_ACTION_DELAY_SECONDS"
105106
)
106107

107-
//Config arguments set via CLI, environment variables, or defaults
108+
// Config arguments set via CLI, environment variables, or defaults
108109
type Config struct {
109-
DryRun bool
110-
NodeName string
111-
PodName string
112-
MetadataURL string
113-
IgnoreDaemonSets bool
114-
DeleteLocalData bool
115-
KubernetesServiceHost string
116-
KubernetesServicePort string
117-
PodTerminationGracePeriod int
118-
NodeTerminationGracePeriod int
119-
WebhookURL string
120-
WebhookHeaders string
121-
WebhookTemplate string
122-
WebhookTemplateFile string
123-
WebhookProxy string
124-
EnableScheduledEventDraining bool
125-
EnableSpotInterruptionDraining bool
126-
EnableSQSTerminationDraining bool
127-
EnableRebalanceMonitoring bool
128-
EnableRebalanceDraining bool
129-
CheckASGTagBeforeDraining bool
130-
CheckTagBeforeDraining bool
131-
ManagedAsgTag string
132-
ManagedTag string
133-
MetadataTries int
134-
CordonOnly bool
135-
TaintNode bool
136-
TaintEffect string
137-
ExcludeFromLoadBalancers bool
138-
JsonLogging bool
139-
LogLevel string
140-
UptimeFromFile string
141-
EnablePrometheus bool
142-
PrometheusPort int
143-
EnableProbes bool
144-
ProbesPort int
145-
ProbesEndpoint string
146-
EmitKubernetesEvents bool
147-
KubernetesEventsExtraAnnotations string
148-
AWSRegion string
149-
AWSEndpoint string
150-
QueueURL string
151-
Workers int
152-
UseProviderId bool
110+
DryRun bool
111+
NodeName string
112+
PodName string
113+
MetadataURL string
114+
IgnoreDaemonSets bool
115+
DeleteLocalData bool
116+
KubernetesServiceHost string
117+
KubernetesServicePort string
118+
PodTerminationGracePeriod int
119+
NodeTerminationGracePeriod int
120+
WebhookURL string
121+
WebhookHeaders string
122+
WebhookTemplate string
123+
WebhookTemplateFile string
124+
WebhookProxy string
125+
EnableScheduledEventDraining bool
126+
EnableSpotInterruptionDraining bool
127+
EnableSQSTerminationDraining bool
128+
EnableRebalanceMonitoring bool
129+
EnableRebalanceDraining bool
130+
CheckASGTagBeforeDraining bool
131+
CheckTagBeforeDraining bool
132+
ManagedAsgTag string
133+
ManagedTag string
134+
MetadataTries int
135+
CordonOnly bool
136+
TaintNode bool
137+
TaintEffect string
138+
ExcludeFromLoadBalancers bool
139+
JsonLogging bool
140+
LogLevel string
141+
UptimeFromFile string
142+
EnablePrometheus bool
143+
PrometheusPort int
144+
EnableProbes bool
145+
ProbesPort int
146+
ProbesEndpoint string
147+
EmitKubernetesEvents bool
148+
KubernetesEventsExtraAnnotations string
149+
AWSRegion string
150+
AWSEndpoint string
151+
QueueURL string
152+
Workers int
153+
UseProviderId bool
154+
CompleteLifecycleActionDelaySeconds int
153155
}
154156

155-
//ParseCliArgs parses cli arguments and uses environment variables as fallback values
157+
// ParseCliArgs parses cli arguments and uses environment variables as fallback values
156158
func ParseCliArgs() (config Config, err error) {
157159
var gracePeriod int
158160
defer func() {
@@ -208,6 +210,7 @@ func ParseCliArgs() (config Config, err error) {
208210
flag.StringVar(&config.QueueURL, "queue-url", getEnv(queueURLConfigKey, ""), "Listens for messages on the specified SQS queue URL")
209211
flag.IntVar(&config.Workers, "workers", getIntEnv(workersConfigKey, workersDefault), "The amount of parallel event processors.")
210212
flag.BoolVar(&config.UseProviderId, "use-provider-id", getBoolEnv(useProviderIdConfigKey, useProviderIdDefault), "If true, fetch node name through Kubernetes node spec ProviderID instead of AWS event PrivateDnsHostname.")
213+
flag.IntVar(&config.CompleteLifecycleActionDelaySeconds, "complete-lifecycle-action-delay-seconds", getIntEnv(completeLifecycleActionDelaySecondsKey, -1), "Delay completing the Autoscaling lifecycle action after a node has been drained.")
211214
flag.Parse()
212215

213216
if isConfigProvided("pod-termination-grace-period", podTerminationGracePeriodConfigKey) && isConfigProvided("grace-period", gracePeriodConfigKey) {

pkg/monitor/sqsevent/asg-lifecycle-event.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m
9191
}
9292

9393
interruptionEvent.PostDrainTask = func(interruptionEvent monitor.InterruptionEvent, _ node.Node) error {
94-
_, err := m.ASG.CompleteLifecycleAction(&autoscaling.CompleteLifecycleActionInput{
94+
_, err := m.completeLifecycleAction(&autoscaling.CompleteLifecycleActionInput{
9595
AutoScalingGroupName: &lifecycleDetail.AutoScalingGroupName,
9696
LifecycleActionResult: aws.String("CONTINUE"),
9797
LifecycleHookName: &lifecycleDetail.LifecycleHookName,

pkg/monitor/sqsevent/sqs-monitor.go

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"github.com/aws/aws-node-termination-handler/pkg/monitor"
2222
"github.com/aws/aws-sdk-go/aws"
2323
"github.com/aws/aws-sdk-go/aws/awserr"
24+
"github.com/aws/aws-sdk-go/service/autoscaling"
2425
"github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface"
2526
"github.com/aws/aws-sdk-go/service/ec2"
2627
"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
@@ -41,14 +42,15 @@ const (
4142

4243
// SQSMonitor is a struct definition that knows how to process events from Amazon EventBridge
4344
type SQSMonitor struct {
44-
InterruptionChan chan<- monitor.InterruptionEvent
45-
CancelChan chan<- monitor.InterruptionEvent
46-
QueueURL string
47-
SQS sqsiface.SQSAPI
48-
ASG autoscalingiface.AutoScalingAPI
49-
EC2 ec2iface.EC2API
50-
CheckIfManaged bool
51-
ManagedTag string
45+
InterruptionChan chan<- monitor.InterruptionEvent
46+
CancelChan chan<- monitor.InterruptionEvent
47+
QueueURL string
48+
SQS sqsiface.SQSAPI
49+
ASG autoscalingiface.AutoScalingAPI
50+
EC2 ec2iface.EC2API
51+
CheckIfManaged bool
52+
ManagedTag string
53+
BeforeCompleteLifecycleAction func()
5254
}
5355

5456
// InterruptionEventWrapper is a convenience wrapper for associating an interruption event with its error, if any
@@ -283,6 +285,14 @@ func (m SQSMonitor) deleteMessages(messages []*sqs.Message) []error {
283285
return errs
284286
}
285287

288+
// completeLifecycleAction completes the lifecycle action after calling the "before" hook.
289+
func (m SQSMonitor) completeLifecycleAction(input *autoscaling.CompleteLifecycleActionInput) (*autoscaling.CompleteLifecycleActionOutput, error) {
290+
if m.BeforeCompleteLifecycleAction != nil {
291+
m.BeforeCompleteLifecycleAction()
292+
}
293+
return m.ASG.CompleteLifecycleAction(input)
294+
}
295+
286296
// NodeInfo is relevant information about a single node
287297
type NodeInfo struct {
288298
AsgName string

pkg/monitor/sqsevent/sqs-monitor_test.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,51 @@ func TestMonitor_DrainTasks(t *testing.T) {
345345
}
346346
}
347347

348+
func TestMonitor_DrainTasks_Delay(t *testing.T) {
349+
msg, err := getSQSMessageFromEvent(asgLifecycleEvent)
350+
h.Ok(t, err)
351+
352+
sqsMock := h.MockedSQS{
353+
ReceiveMessageResp: sqs.ReceiveMessageOutput{Messages: []*sqs.Message{&msg}},
354+
ReceiveMessageErr: nil,
355+
DeleteMessageResp: sqs.DeleteMessageOutput{},
356+
}
357+
dnsNodeName := "ip-10-0-0-157.us-east-2.compute.internal"
358+
ec2Mock := h.MockedEC2{
359+
DescribeInstancesResp: getDescribeInstancesResp(dnsNodeName, true, true),
360+
}
361+
asgMock := h.MockedASG{
362+
CompleteLifecycleActionResp: autoscaling.CompleteLifecycleActionOutput{},
363+
}
364+
drainChan := make(chan monitor.InterruptionEvent, 1)
365+
366+
hookCalled := false
367+
sqsMonitor := sqsevent.SQSMonitor{
368+
SQS: sqsMock,
369+
EC2: ec2Mock,
370+
ManagedTag: "aws-node-termination-handler/managed",
371+
ASG: mockIsManagedTrue(&asgMock),
372+
CheckIfManaged: true,
373+
QueueURL: "https://test-queue",
374+
InterruptionChan: drainChan,
375+
BeforeCompleteLifecycleAction: func() { hookCalled = true },
376+
}
377+
378+
err = sqsMonitor.Monitor()
379+
h.Ok(t, err)
380+
381+
t.Run(asgLifecycleEvent.DetailType, func(st *testing.T) {
382+
result := <-drainChan
383+
h.Equals(st, sqsevent.SQSTerminateKind, result.Kind)
384+
h.Equals(st, result.NodeName, dnsNodeName)
385+
h.Assert(st, result.PostDrainTask != nil, "PostDrainTask should have been set")
386+
h.Assert(st, result.PreDrainTask != nil, "PreDrainTask should have been set")
387+
err := result.PostDrainTask(result, node.Node{})
388+
h.Ok(st, err)
389+
h.Assert(st, hookCalled, "BeforeCompleteLifecycleAction hook not called")
390+
})
391+
}
392+
348393
func TestMonitor_DrainTasks_Errors(t *testing.T) {
349394
testEvents := []sqsevent.EventBridgeEvent{spotItnEvent, asgLifecycleEvent, {}, rebalanceRecommendationEvent}
350395
messages := make([]*sqs.Message, 0, len(testEvents))

0 commit comments

Comments
 (0)