Skip to content

Commit 2b50c31

Browse files
reduce prometheus metric cardinality (#950)
1 parent aed263a commit 2b50c31

File tree

4 files changed

+41
-8
lines changed

4 files changed

+41
-8
lines changed

README.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -573,10 +573,11 @@ For build instructions please consult [BUILD.md](./BUILD.md).
573573
## Metrics
574574
Available Prometheus metrics:
575575

576-
| Metric name | Description |
577-
| -------------- | ------------------------------------- |
578-
| `actions_node` | Number of actions per node |
579-
| `events_error` | Number of errors in events processing |
576+
| Metric name | Description |
577+
| -------------- | -------------------------------------------------------------------|
578+
| `actions` | Number of actions |
579+
| `actions_node` | Number of actions per node (Deprecated: use the `actions` metric instead) |
580+
| `events_error` | Number of errors in events processing |
580581

581582

582583
## Communication

config/helm/aws-node-termination-handler/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,6 @@ The configuration in this table applies to AWS Node Termination Handler testing
174174

175175
## Metrics Endpoint Considerations
176176

177-
AWS Node Termination HAndler in IMDS mode runs as a DaemonSet with `useHostNetwork: true` by default. If the Prometheus server is enabled with `enablePrometheusServer: true` nothing else will be able to bind to the configured port (by default `prometheusServerPort: 9092`) in the root network namespace. Therefore, it will need to have a firewall/security group configured on the nodes to block access to the `/metrics` endpoint.
177+
AWS Node Termination Handler in IMDS mode runs as a DaemonSet with `useHostNetwork: true` by default. If the Prometheus server is enabled with `enablePrometheusServer: true`, nothing else will be able to bind to the configured port (by default `prometheusServerPort: 9092`) in the root network namespace. Therefore, you will need to configure a firewall/security group on the nodes to block access to the `/metrics` endpoint.
178178

179179
You can switch NTH in IMDS mode to run with `useHostNetwork: false`, but you will need to make sure that either IMDSv1 is enabled or the IMDSv2 IP hop count is incremented to 2 (see the [IMDSv2 documentation](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html)).

pkg/observability/opentelemetry.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ type Metrics struct {
4545
enabled bool
4646
meter api.Meter
4747
actionsCounter instrument.Int64Counter
48+
actionsCounterV2 instrument.Int64Counter
4849
errorEventsCounter instrument.Int64Counter
4950
}
5051

@@ -89,25 +90,39 @@ func (m Metrics) NodeActionsInc(action, nodeName string, eventID string, err err
8990
}
9091

9192
labels := []attribute.KeyValue{labelNodeActionKey.String(action), labelNodeNameKey.String(nodeName), labelEventIDKey.String(eventID)}
93+
labelsV2 := []attribute.KeyValue{labelNodeActionKey.String(action)}
9294
if err != nil {
9395
labels = append(labels, labelNodeStatusKey.String("error"))
96+
labelsV2 = append(labelsV2, labelNodeStatusKey.String("error"))
9497
} else {
9598
labels = append(labels, labelNodeStatusKey.String("success"))
99+
labelsV2 = append(labelsV2, labelNodeStatusKey.String("success"))
96100
}
97101

98102
m.actionsCounter.Add(context.Background(), 1, labels...)
103+
m.actionsCounterV2.Add(context.Background(), 1, labelsV2...)
99104
}
100105

101106
func registerMetricsWith(provider *metric.MeterProvider) (Metrics, error) {
102107
meter := provider.Meter("aws.node.termination.handler")
103108

109+
// Deprecated: actionsCounter metric has a high label cardinality, resulting in numerous time-series which utilize
110+
// a large amount of memory. Use actionsCounterV2 metric instead.
104111
name := "actions.node"
105112
actionsCounter, err := meter.Int64Counter(name, instrument.WithDescription("Number of actions per node"))
106113
if err != nil {
107114
return Metrics{}, fmt.Errorf("failed to create Prometheus counter %q: %w", name, err)
108115
}
109116
actionsCounter.Add(context.Background(), 0)
110117

118+
// Recommended replacement for actionsCounter metric
119+
name = "actions"
120+
actionsCounterV2, err := meter.Int64Counter(name, instrument.WithDescription("Number of actions"))
121+
if err != nil {
122+
return Metrics{}, fmt.Errorf("failed to create Prometheus counter %q: %w", name, err)
123+
}
124+
actionsCounterV2.Add(context.Background(), 0)
125+
111126
name = "events.error"
112127
errorEventsCounter, err := meter.Int64Counter(name, instrument.WithDescription("Number of errors in events processing"))
113128
if err != nil {
@@ -118,6 +133,7 @@ func registerMetricsWith(provider *metric.MeterProvider) (Metrics, error) {
118133
meter: meter,
119134
errorEventsCounter: errorEventsCounter,
120135
actionsCounter: actionsCounter,
136+
actionsCounterV2: actionsCounterV2,
121137
}, nil
122138
}
123139

test/e2e/prometheus-metrics-test

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -162,11 +162,27 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do
162162
fi
163163
done
164164
if [ -z $failed ]; then
165-
exit 0
165+
break
166166
fi
167167
echo "Metrics Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
168168
sleep $TAINT_CHECK_SLEEP
169169
done
170170

171-
echo "❌ Failed checking metric for $METRIC"
172-
exit 3
171+
if [[ -n $failed ]];then
172+
exit 4
173+
fi
174+
175+
metric_name="actions_total"
176+
for action in cordon-and-drain pre-drain; do
177+
labels='node_action="'$action'",node_status="success",otel_scope_name="aws.node.termination.handler",otel_scope_version=""'
178+
query="$metric_name{$labels}"
179+
counter_value=$(echo "$METRICS_RESPONSE" | grep -E "${query}[[:space:]]+[0-9]+" | awk '{print $NF}')
180+
if (($counter_value <= 1)); then
181+
echo "❌ Failed counter count for metric action:$action"
182+
exit 5
183+
fi
184+
echo "✅ Fetched counter:$counter_value for metric with action:$action"
185+
done
186+
187+
188+
exit 0

0 commit comments

Comments
 (0)