diff --git a/CHANGELOG.md b/CHANGELOG.md index b91b4c0b6de..5233a07d3ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ * [FEATURE] Query Frontend: Add `-frontend.retry-on-too-many-outstanding-requests` to re-enqueue 429 requests if there are multiple query-schedulers available. #5496 * [FEATURE] Store Gateway: Add `-blocks-storage.bucket-store.max-inflight-requests` for store gateways to reject further requests upon reaching the limit. #5553 * [FEATURE] Store Gateway: Add `cortex_bucket_store_block_load_duration_seconds` histogram to track time to load blocks. #5580 +* [FEATURE] AlertManager: Add `cortex_alertmanager_dispatcher_aggregation_groups` and `cortex_alertmanager_dispatcher_alert_processing_duration_seconds` metrics for dispatcher. #5592 * [ENHANCEMENT] Distributor/Ingester: Add span on push path #5319 * [ENHANCEMENT] Support object storage backends for runtime configuration file. #5292 * [ENHANCEMENT] Query Frontend: Reject subquery with too small step size. #5323 diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go index dbedd9bca49..21d77d2b4b0 100644 --- a/pkg/alertmanager/alertmanager_metrics.go +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -60,6 +60,8 @@ type alertmanagerMetrics struct { persistFailed *prometheus.Desc notificationRateLimited *prometheus.Desc + dispatcherAggregationGroups *prometheus.Desc + dispatcherProcessingDuration *prometheus.Desc dispatcherAggregationGroupsLimitReached *prometheus.Desc insertAlertFailures *prometheus.Desc alertsLimiterAlertsCount *prometheus.Desc @@ -217,6 +219,14 @@ func newAlertmanagerMetrics() *alertmanagerMetrics { "cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total", "Number of times when dispatcher failed to create new aggregation group due to limit.", []string{"user"}, nil), + dispatcherAggregationGroups: prometheus.NewDesc( + "cortex_alertmanager_dispatcher_aggregation_groups", + "Number of active aggregation groups.", + []string{"user"}, nil), + dispatcherProcessingDuration: prometheus.NewDesc( + "cortex_alertmanager_dispatcher_alert_processing_duration_seconds", + "Summary of latencies for the processing of alerts.", + []string{"user"}, nil), insertAlertFailures: prometheus.NewDesc( "cortex_alertmanager_alerts_insert_limited_total", "Total number of failures to store alert due to hitting alertmanager limits.", @@ -279,6 +289,8 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) { out <- m.persistTotal out <- m.persistFailed out <- m.notificationRateLimited + out <- m.dispatcherAggregationGroups + out <- m.dispatcherProcessingDuration out <- m.dispatcherAggregationGroupsLimitReached out <- m.insertAlertFailures out <- m.alertsLimiterAlertsCount @@ -330,6 +342,8 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCounters(out, m.persistFailed, "alertmanager_state_persist_failed_total") data.SendSumOfCountersPerUserWithLabels(out, m.notificationRateLimited, "alertmanager_notification_rate_limited_total", "integration") + data.SendSumOfGaugesPerUser(out, m.dispatcherAggregationGroups, "alertmanager_dispatcher_aggregation_groups") + data.SendSumOfSummariesPerUser(out, m.dispatcherProcessingDuration, "alertmanager_dispatcher_alert_processing_duration_seconds") data.SendSumOfCountersPerUser(out, m.dispatcherAggregationGroupsLimitReached, "alertmanager_dispatcher_aggregation_group_limit_reached_total") data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_alerts_insert_limited_total") data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsCount, "alertmanager_alerts_limiter_current_alerts") diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go index dac8753c20d..25cad5f344a 100644 --- a/pkg/alertmanager/alertmanager_metrics_test.go +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -60,6 +60,14 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_config_hash{user="user1"} 0 cortex_alertmanager_config_hash{user="user2"} 0 cortex_alertmanager_config_hash{user="user3"} 0 + # HELP cortex_alertmanager_dispatcher_alert_processing_duration_seconds Summary of latencies for the processing of alerts. + # TYPE cortex_alertmanager_dispatcher_alert_processing_duration_seconds summary + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user1"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user1"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user2"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user2"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user3"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user3"} 0 # HELP cortex_alertmanager_nflog_gc_duration_seconds Duration of the last notification log garbage collection cycle. # TYPE cortex_alertmanager_nflog_gc_duration_seconds summary cortex_alertmanager_nflog_gc_duration_seconds_sum 111 @@ -354,6 +362,14 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { cortex_alertmanager_config_hash{user="user1"} 0 cortex_alertmanager_config_hash{user="user2"} 0 cortex_alertmanager_config_hash{user="user3"} 0 + # HELP cortex_alertmanager_dispatcher_alert_processing_duration_seconds Summary of latencies for the processing of alerts. + # TYPE cortex_alertmanager_dispatcher_alert_processing_duration_seconds summary + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user1"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user1"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user2"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user2"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user3"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user3"} 0 # HELP cortex_alertmanager_nflog_gc_duration_seconds Duration of the last notification log garbage collection cycle. # TYPE cortex_alertmanager_nflog_gc_duration_seconds summary @@ -649,6 +665,12 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { # TYPE cortex_alertmanager_config_hash gauge cortex_alertmanager_config_hash{user="user1"} 0 cortex_alertmanager_config_hash{user="user2"} 0 + # HELP cortex_alertmanager_dispatcher_alert_processing_duration_seconds Summary of latencies for the processing of alerts. + # TYPE cortex_alertmanager_dispatcher_alert_processing_duration_seconds summary + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user1"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user1"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user2"} 0 + cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user2"} 0 # HELP cortex_alertmanager_nflog_gc_duration_seconds Duration of the last notification log garbage collection cycle. # TYPE cortex_alertmanager_nflog_gc_duration_seconds summary