Skip to content

Commit 654d8f1

Browse files
committed
improve test + logging
1 parent 316906e commit 654d8f1

File tree

7 files changed

+171
-32
lines changed

7 files changed

+171
-32
lines changed

cmd/node-termination-handler.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ func main() {
120120
log.Fatal().Err(err).Msg("Unable to instantiate a node for various kubernetes node functions,")
121121
}
122122

123-
metrics, err := observability.InitMetrics(nthConfig)
123+
metrics, err := observability.InitMetrics(nthConfig.EnablePrometheus, nthConfig.PrometheusPort)
124124
if err != nil {
125125
nthConfig.Print()
126126
log.Fatal().Err(err).Msg("Unable to instantiate observability metrics,")
@@ -218,7 +218,7 @@ func main() {
218218
ec2Client := ec2.New(sess)
219219

220220
if nthConfig.EnablePrometheus {
221-
go metrics.InitNodeMetrics(node, ec2Client)
221+
go metrics.InitNodeMetrics(nthConfig, node, ec2Client)
222222
}
223223

224224
completeLifecycleActionDelay := time.Duration(nthConfig.CompleteLifecycleActionDelaySeconds) * time.Second

pkg/ec2helper/ec2helper.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ func (h EC2Helper) GetInstanceIdsByTagKey(tag string) ([]string, error) {
5555
}
5656

5757
if result == nil || result.Reservations == nil {
58-
return nil, fmt.Errorf("failed to describe instances")
58+
return nil, fmt.Errorf("describe instances success but return empty response for tag key: %s", tag)
5959
}
6060

6161
for _, reservation := range result.Reservations {
@@ -87,7 +87,7 @@ func (h EC2Helper) GetInstanceIdsMapByTagKey(tag string) (map[string]bool, error
8787
}
8888

8989
if ids == nil {
90-
return nil, fmt.Errorf("failed to describe instances")
90+
return nil, fmt.Errorf("get instance ids success but return empty response for tag key: %s", tag)
9191
}
9292

9393
for _, id := range ids {

pkg/ec2helper/ec2helper_test.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/aws/aws-node-termination-handler/pkg/ec2helper"
2020
h "github.com/aws/aws-node-termination-handler/pkg/test"
2121
"github.com/aws/aws-sdk-go/aws"
22+
"github.com/aws/aws-sdk-go/aws/awserr"
2223
"github.com/aws/aws-sdk-go/service/ec2"
2324
)
2425

@@ -40,6 +41,85 @@ func TestGetInstanceIdsByTagKey(t *testing.T) {
4041
h.Equals(t, instanceId2, instanceIds[1])
4142
}
4243

44+
func TestGetInstanceIdsByTagKeyAPIError(t *testing.T) {
45+
ec2Mock := h.MockedEC2{
46+
DescribeInstancesResp: getDescribeInstancesResp(),
47+
DescribeInstancesErr: awserr.New("ThrottlingException", "Rate exceeded", nil),
48+
}
49+
ec2Helper := ec2helper.New(ec2Mock)
50+
_, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
51+
h.Nok(t, err)
52+
}
53+
54+
func TestGetInstanceIdsByTagKeyNilResponse(t *testing.T) {
55+
ec2Mock := h.MockedEC2{}
56+
ec2Helper := ec2helper.New(ec2Mock)
57+
_, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
58+
h.Nok(t, err)
59+
}
60+
61+
func TestGetInstanceIdsByTagKeyNilReservations(t *testing.T) {
62+
ec2Mock := h.MockedEC2{
63+
DescribeInstancesResp: ec2.DescribeInstancesOutput{
64+
Reservations: nil,
65+
},
66+
}
67+
ec2Helper := ec2helper.New(ec2Mock)
68+
_, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
69+
h.Nok(t, err)
70+
}
71+
72+
func TestGetInstanceIdsByTagKeyEmptyReservation(t *testing.T) {
73+
ec2Mock := h.MockedEC2{
74+
DescribeInstancesResp: ec2.DescribeInstancesOutput{
75+
Reservations: []*ec2.Reservation{},
76+
},
77+
}
78+
ec2Helper := ec2helper.New(ec2Mock)
79+
instanceIds, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
80+
h.Ok(t, err)
81+
h.Equals(t, 0, len(instanceIds))
82+
}
83+
84+
func TestGetInstanceIdsByTagKeyEmptyInstances(t *testing.T) {
85+
ec2Mock := h.MockedEC2{
86+
DescribeInstancesResp: ec2.DescribeInstancesOutput{
87+
Reservations: []*ec2.Reservation{
88+
{
89+
Instances: []*ec2.Instance{},
90+
},
91+
},
92+
},
93+
}
94+
ec2Helper := ec2helper.New(ec2Mock)
95+
instanceIds, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
96+
h.Ok(t, err)
97+
h.Equals(t, 0, len(instanceIds))
98+
}
99+
100+
func TestGetInstanceIdsByTagKeyNilInstancesId(t *testing.T) {
101+
ec2Mock := h.MockedEC2{
102+
DescribeInstancesResp: ec2.DescribeInstancesOutput{
103+
Reservations: []*ec2.Reservation{
104+
{
105+
Instances: []*ec2.Instance{
106+
{
107+
InstanceId: nil,
108+
},
109+
{
110+
InstanceId: aws.String(instanceId1),
111+
},
112+
},
113+
},
114+
},
115+
},
116+
}
117+
ec2Helper := ec2helper.New(ec2Mock)
118+
instanceIds, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
119+
h.Ok(t, err)
120+
h.Equals(t, 1, len(instanceIds))
121+
}
122+
43123
func TestGetInstanceIdsMapByTagKey(t *testing.T) {
44124
ec2Mock := h.MockedEC2{
45125
DescribeInstancesResp: getDescribeInstancesResp(),

pkg/node/node.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -652,20 +652,22 @@ func (n Node) FetchKubernetesNodeInstanceIds() ([]string, error) {
652652
}
653653

654654
if matchingNodes == nil || matchingNodes.Items == nil {
655-
return nil, fmt.Errorf("failed to list nodes")
655+
return nil, fmt.Errorf("list nodes success but return empty response")
656656
}
657657

658658
for _, node := range matchingNodes.Items {
659659
// sample providerID: aws:///us-west-2a/i-0abcd1234efgh5678
660660
parts := strings.Split(node.Spec.ProviderID, "/")
661-
if len(parts) < 2 {
662-
log.Warn().Msgf("Found invalid providerID: %s", node.Spec.ProviderID)
661+
if len(parts) != 5 {
662+
log.Warn().Msgf("Invalid providerID format found for node %s: %s (expected format: aws:///region/instance-id)", node.Name, node.Spec.ProviderID)
663663
continue
664664
}
665665

666666
instanceId := parts[len(parts)-1]
667667
if instanceIDRegex.MatchString(instanceId) {
668668
ids = append(ids, parts[len(parts)-1])
669+
} else {
670+
log.Warn().Msgf("Invalid instance id format found for node %s: %s (expected format: ^i-.*)", node.Name, instanceId)
669671
}
670672
}
671673

pkg/node/node_test.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,61 @@ func TestFetchKubernetesNodeInstanceIds(t *testing.T) {
409409
h.Equals(t, instanceId2, instanceIds[1])
410410
}
411411

412+
func TestFetchKubernetesNodeInstanceIdsEmptyResponse(t *testing.T) {
413+
client := fake.NewSimpleClientset()
414+
415+
_, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
416+
h.Ok(t, err)
417+
418+
node, err := newNode(config.Config{}, client)
419+
h.Ok(t, err)
420+
421+
instanceIds, err := node.FetchKubernetesNodeInstanceIds()
422+
h.Ok(t, err)
423+
h.Equals(t, 0, len(instanceIds))
424+
}
425+
426+
func TestFetchKubernetesNodeInstanceIdsInvalidProviderID(t *testing.T) {
427+
client := fake.NewSimpleClientset(
428+
&v1.Node{
429+
ObjectMeta: metav1.ObjectMeta{Name: "invalid-providerId-1"},
430+
Spec: v1.NodeSpec{ProviderID: "dummyProviderId"},
431+
},
432+
&v1.Node{
433+
ObjectMeta: metav1.ObjectMeta{Name: "invalid-providerId-2"},
434+
Spec: v1.NodeSpec{ProviderID: fmt.Sprintf("aws:/%s", instanceId2)},
435+
},
436+
&v1.Node{
437+
ObjectMeta: metav1.ObjectMeta{Name: "invalid-providerId-3"},
438+
Spec: v1.NodeSpec{ProviderID: fmt.Sprintf("us-west-2a/%s", instanceId2)},
439+
},
440+
&v1.Node{
441+
ObjectMeta: metav1.ObjectMeta{Name: "invalid-providerId-4"},
442+
Spec: v1.NodeSpec{ProviderID: fmt.Sprintf("aws:///us-west-2a/%s/dummyPart", instanceId2)},
443+
},
444+
&v1.Node{
445+
ObjectMeta: metav1.ObjectMeta{Name: "valid-providerId-2"},
446+
Spec: v1.NodeSpec{ProviderID: fmt.Sprintf("aws:///us-west-2a/%s", instanceId2)},
447+
},
448+
&v1.Node{
449+
ObjectMeta: metav1.ObjectMeta{Name: "valid-providerId-1"},
450+
Spec: v1.NodeSpec{ProviderID: fmt.Sprintf("aws:///us-west-2a/%s", instanceId1)},
451+
},
452+
)
453+
454+
_, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
455+
h.Ok(t, err)
456+
457+
node, err := newNode(config.Config{}, client)
458+
h.Ok(t, err)
459+
460+
instanceIds, err := node.FetchKubernetesNodeInstanceIds()
461+
h.Ok(t, err)
462+
h.Equals(t, 2, len(instanceIds))
463+
h.Equals(t, instanceId1, instanceIds[0])
464+
h.Equals(t, instanceId2, instanceIds[1])
465+
}
466+
412467
func TestFilterOutDaemonSetPods(t *testing.T) {
413468
tNode, err := newNode(config.Config{IgnoreDaemonSets: true}, fake.NewSimpleClientset())
414469
h.Ok(t, err)

pkg/observability/opentelemetry.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ type Metrics struct {
6060
}
6161

6262
// InitMetrics will initialize, register and expose, via http server, the metrics with Opentelemetry.
63-
func InitMetrics(nthConfig config.Config) (Metrics, error) {
63+
func InitMetrics(enabled bool, port int) (Metrics, error) {
6464
exporter, err := prometheus.New()
6565
if err != nil {
6666
return Metrics{}, fmt.Errorf("failed to create Prometheus exporter: %w", err)
@@ -70,8 +70,7 @@ func InitMetrics(nthConfig config.Config) (Metrics, error) {
7070
if err != nil {
7171
return Metrics{}, fmt.Errorf("failed to register metrics with Prometheus provider: %w", err)
7272
}
73-
metrics.enabled = nthConfig.EnablePrometheus
74-
metrics.nthConfig = nthConfig
73+
metrics.enabled = enabled
7574

7675
// Starts an async process to collect golang runtime stats
7776
// go.opentelemetry.io/contrib/instrumentation/runtime
@@ -80,14 +79,15 @@ func InitMetrics(nthConfig config.Config) (Metrics, error) {
8079
return Metrics{}, fmt.Errorf("failed to start Go runtime metrics collection: %w", err)
8180
}
8281

83-
if metrics.enabled {
84-
serveMetrics(nthConfig.PrometheusPort)
82+
if enabled {
83+
serveMetrics(port)
8584
}
8685

8786
return metrics, nil
8887
}
8988

90-
func (m Metrics) InitNodeMetrics(node *node.Node, ec2 ec2iface.EC2API) {
89+
func (m Metrics) InitNodeMetrics(nthConfig config.Config, node *node.Node, ec2 ec2iface.EC2API) {
90+
m.nthConfig = nthConfig
9191
m.ec2Helper = ec2helper.New(ec2)
9292
m.node = node
9393

@@ -105,10 +105,10 @@ func (m Metrics) serveNodeMetrics() {
105105
if err != nil || instanceIdsMap == nil {
106106
log.Err(err).Msg("Failed to get AWS instance ids")
107107
return
108-
} else {
109-
m.InstancesRecord(int64(len(instanceIdsMap)))
110108
}
111109

110+
m.InstancesRecord(int64(len(instanceIdsMap)))
111+
112112
nodeInstanceIds, err := m.node.FetchKubernetesNodeInstanceIds()
113113
if err != nil || nodeInstanceIds == nil {
114114
log.Err(err).Msg("Failed to get node instance ids")

test/e2e/prometheus-metrics-sqs-test

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -39,15 +39,20 @@ set +x
3939

4040
sleep 10
4141

42-
RUN_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=blah}]'"
42+
MANAGED_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=blah}]'"
43+
MANAGED_INSTANCE_WITHOUT_TAG_VALUE_CMD="awslocal ec2 run-instances --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=\"\"}]'"
44+
UNMANAGED_INSTANCE_CMD="awslocal ec2 run-instances --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test}]'"
4345
set -x
4446
localstack_pod=$(kubectl get pods --selector app=localstack --field-selector="status.phase=Running" \
4547
-o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \
4648
| awk '$2 >= "'"${START_TIME//+0000/Z}"'" { print $1 }')
4749
echo "🥑 Using localstack pod $localstack_pod"
48-
run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "$RUN_INSTANCE_CMD")
49-
instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId')
50-
echo "🥑 Started mock EC2 instance ($instance_id)"
50+
51+
for instance_cmd in "$MANAGED_INSTANCE_WITHOUT_TAG_VALUE_CMD" "$UNMANAGED_INSTANCE_CMD" "$MANAGED_INSTANCE_CMD"; do
52+
run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "$instance_cmd")
53+
instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId')
54+
echo "🥑 Started mock EC2 instance ($instance_id)"
55+
done
5156

5257
CREATE_SQS_CMD="awslocal sqs create-queue --queue-name "${CLUSTER_NAME}-queue" --attributes MessageRetentionPeriod=300 --region ${AWS_REGION}"
5358
queue_url=$(kubectl exec -i "${localstack_pod}" -- bash -c "$CREATE_SQS_CMD" | jq -r .QueueUrl)
@@ -168,6 +173,7 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do
168173
if [[ ${evicted} -eq 1 && $(kubectl exec -i "${localstack_pod}" -- bash -c "$GET_ATTRS_SQS_CMD" | jq '(.Attributes.ApproximateNumberOfMessagesNotVisible|tonumber) + (.Attributes.ApproximateNumberOfMessages|tonumber)' ) -eq 0 ]]; then
169174
kubectl exec -i "${localstack_pod}" -- bash -c "$GET_ATTRS_SQS_CMD"
170175
echo "✅ Verified the message was deleted from the queue after processing!"
176+
break
171177
fi
172178

173179
echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
@@ -211,7 +217,7 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do
211217
sleep $TAINT_CHECK_SLEEP
212218
done
213219

214-
if [[ -n $failed ]];then
220+
if [[ -n $failed ]]; then
215221
exit 4
216222
fi
217223

@@ -227,15 +233,11 @@ for action in cordon-and-drain post-drain; do
227233
echo "✅ Fetched counter:$counter_value for metric with action:$action"
228234
done
229235

230-
for gauge in nth_tagged_instances; do
231-
query=''$gauge'{otel_scope_name="aws.node.termination.handler",otel_scope_version=""}'
232-
counter_value=$(echo "$METRICS_RESPONSE" | grep -E "${query}[[:space:]]+[0-9]+" | awk '{print $NF}')
233-
if (($counter_value < 1)); then
234-
echo "❌ Failed gauge count for metric:$gauge"
235-
exit 5
236-
fi
237-
echo "✅ Fetched gauge:$counter_value for metric:$gauge"
238-
done
239-
240-
241-
exit 0
236+
gauge="nth_tagged_instances"
237+
query=''$gauge'{otel_scope_name="aws.node.termination.handler",otel_scope_version=""}'
238+
counter_value=$(echo "$METRICS_RESPONSE" | grep -E "${query}[[:space:]]+[0-9]+" | awk '{print $NF}')
239+
if (($counter_value < 2)); then
240+
echo "❌ Failed gauge count for metric:$gauge"
241+
exit 5
242+
fi
243+
echo "✅ Fetched gauge:$counter_value for metric:$gauge"

0 commit comments

Comments
 (0)