cortexproject · yeya24 · Oct 7, 2023 · Oct 6, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -39,6 +39,7 @@
 * [FEATURE] Query Frontend: Add `-frontend.retry-on-too-many-outstanding-requests` to re-enqueue 429 requests if there are multiple query-schedulers available. #5496
 * [FEATURE] Store Gateway: Add `-blocks-storage.bucket-store.max-inflight-requests` for store gateways to reject further requests upon reaching the limit. #5553
 * [FEATURE] Store Gateway: Add `cortex_bucket_store_block_load_duration_seconds` histogram to track time to load blocks. #5580
+* [FEATURE] Alertmanager: Add `cortex_alertmanager_dispatcher_aggregation_groups` and `cortex_alertmanager_dispatcher_alert_processing_duration_seconds` metrics for the dispatcher. #5592
 * [ENHANCEMENT] Distributor/Ingester: Add span on push path #5319
 * [ENHANCEMENT] Support object storage backends for runtime configuration file. #5292
 * [ENHANCEMENT] Query Frontend: Reject subquery with too small step size. #5323

diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go
@@ -60,6 +60,8 @@ type alertmanagerMetrics struct {
 	persistFailed           *prometheus.Desc
 
 	notificationRateLimited                 *prometheus.Desc
+	dispatcherAggregationGroups             *prometheus.Desc
+	dispatcherProcessingDuration            *prometheus.Desc
 	dispatcherAggregationGroupsLimitReached *prometheus.Desc
 	insertAlertFailures                     *prometheus.Desc
 	alertsLimiterAlertsCount                *prometheus.Desc
@@ -217,6 +219,14 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
 			"cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total",
 			"Number of times when dispatcher failed to create new aggregation group due to limit.",
 			[]string{"user"}, nil),
+		dispatcherAggregationGroups: prometheus.NewDesc(
+			"cortex_alertmanager_dispatcher_aggregation_groups",
+			"Number of active aggregation groups.",
+			[]string{"user"}, nil),
+		dispatcherProcessingDuration: prometheus.NewDesc(
+			"cortex_alertmanager_dispatcher_alert_processing_duration_seconds",
+			"Summary of latencies for the processing of alerts.",
+			[]string{"user"}, nil),
 		insertAlertFailures: prometheus.NewDesc(
 			"cortex_alertmanager_alerts_insert_limited_total",
 			"Total number of failures to store alert due to hitting alertmanager limits.",
@@ -279,6 +289,8 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
 	out <- m.persistTotal
 	out <- m.persistFailed
 	out <- m.notificationRateLimited
+	out <- m.dispatcherAggregationGroups
+	out <- m.dispatcherProcessingDuration
 	out <- m.dispatcherAggregationGroupsLimitReached
 	out <- m.insertAlertFailures
 	out <- m.alertsLimiterAlertsCount
@@ -330,6 +342,8 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
 	data.SendSumOfCounters(out, m.persistFailed, "alertmanager_state_persist_failed_total")
 
 	data.SendSumOfCountersPerUserWithLabels(out, m.notificationRateLimited, "alertmanager_notification_rate_limited_total", "integration")
+	data.SendSumOfGaugesPerUser(out, m.dispatcherAggregationGroups, "alertmanager_dispatcher_aggregation_groups")
+	data.SendSumOfSummariesPerUser(out, m.dispatcherProcessingDuration, "alertmanager_dispatcher_alert_processing_duration_seconds")
 	data.SendSumOfCountersPerUser(out, m.dispatcherAggregationGroupsLimitReached, "alertmanager_dispatcher_aggregation_group_limit_reached_total")
 	data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_alerts_insert_limited_total")
 	data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsCount, "alertmanager_alerts_limiter_current_alerts")

diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go
@@ -60,6 +60,14 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
 		cortex_alertmanager_config_hash{user="user1"} 0
 		cortex_alertmanager_config_hash{user="user2"} 0
 		cortex_alertmanager_config_hash{user="user3"} 0
+		# HELP cortex_alertmanager_dispatcher_alert_processing_duration_seconds Summary of latencies for the processing of alerts.
+		# TYPE cortex_alertmanager_dispatcher_alert_processing_duration_seconds summary
+		cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user1"} 0
+		cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user1"} 0
+		cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user2"} 0
+		cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user2"} 0
+		cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user3"} 0
+		cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user3"} 0
 		# HELP cortex_alertmanager_nflog_gc_duration_seconds Duration of the last notification log garbage collection cycle.
 		# TYPE cortex_alertmanager_nflog_gc_duration_seconds summary
 		cortex_alertmanager_nflog_gc_duration_seconds_sum 111
@@ -354,6 +362,14 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
         	            cortex_alertmanager_config_hash{user="user1"} 0
         	            cortex_alertmanager_config_hash{user="user2"} 0
         	            cortex_alertmanager_config_hash{user="user3"} 0
+						# HELP cortex_alertmanager_dispatcher_alert_processing_duration_seconds Summary of latencies for the processing of alerts.
+						# TYPE cortex_alertmanager_dispatcher_alert_processing_duration_seconds summary
+						cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user1"} 0
+						cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user1"} 0
+						cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user2"} 0
+						cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user2"} 0
+						cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user3"} 0
+						cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user3"} 0
 
         	            # HELP cortex_alertmanager_nflog_gc_duration_seconds Duration of the last notification log garbage collection cycle.
         	            # TYPE cortex_alertmanager_nflog_gc_duration_seconds summary
@@ -649,6 +665,12 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
     		# TYPE cortex_alertmanager_config_hash gauge
     		cortex_alertmanager_config_hash{user="user1"} 0
     		cortex_alertmanager_config_hash{user="user2"} 0
+			# HELP cortex_alertmanager_dispatcher_alert_processing_duration_seconds Summary of latencies for the processing of alerts.
+			# TYPE cortex_alertmanager_dispatcher_alert_processing_duration_seconds summary
+			cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user1"} 0
+			cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user1"} 0
+			cortex_alertmanager_dispatcher_alert_processing_duration_seconds_sum{user="user2"} 0
+			cortex_alertmanager_dispatcher_alert_processing_duration_seconds_count{user="user2"} 0
 
     		# HELP cortex_alertmanager_nflog_gc_duration_seconds Duration of the last notification log garbage collection cycle.
     		# TYPE cortex_alertmanager_nflog_gc_duration_seconds summary