From 950221fdc47854f8756552e243b0a83285852dc3 Mon Sep 17 00:00:00 2001
From: Goutham Veeramachaneni
Date: Wed, 19 Aug 2020 12:22:19 +0200
Subject: [PATCH 1/2] Add integration label to notification metrics

Also, add a metric for when creation of the rules manager fails.

Signed-off-by: Goutham Veeramachaneni
---
 CHANGELOG.md                                  |  2 +
 pkg/alertmanager/alertmanager_metrics.go      |  8 +--
 pkg/alertmanager/alertmanager_metrics_test.go | 54 ++++++++++++++++---
 pkg/alertmanager/multitenant.go               |  2 +-
 pkg/alertmanager/multitenant_test.go          |  8 +--
 pkg/ruler/manager.go                          | 22 ++++++++
 pkg/ruler/ruler.go                            | 10 ----
 7 files changed, 81 insertions(+), 25 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 29ed51f8fa7..1097c584b63 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,8 @@
 * [ENHANCEMENT] Expose `storage.aws.dynamodb.backoff_config` configuration file field. #3026
 * [ENHANCEMENT] Added `cortex_request_message_bytes` and `cortex_response_message_bytes` histograms to track received and sent gRPC message and HTTP request/response sizes. Added `cortex_inflight_requests` gauge to track number of inflight gRPC and HTTP requests. #3064
 * [ENHANCEMENT] Add config validation to the experimental Alertmanager API. Invalid configs are no longer accepted. #3053
+* [ENHANCEMENT] Add "integration" as a label for `cortex_alertmanager_notifications_total` and `cortex_alertmanager_notifications_failed_total` metrics. #3056
+* [ENHANCEMENT] Add `cortex_ruler_config_last_reload_successful` and `cortex_ruler_config_last_reload_successful_seconds` to check the status of each user's rule manager. #3056
 * [BUGFIX] Query-frontend: Fixed rounding for incoming query timestamps, to be 100% Prometheus compatible. #2990
 * [BUGFIX] Querier: query /series from ingesters regardless the `-querier.query-ingesters-within` setting. #3035
 * [BUGFIX] Experimental blocks storage: Ingester is less likely to hit gRPC message size limit when streaming data to queriers. #3015
diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go
index 721053dcbb4..354370d016a 100644
--- a/pkg/alertmanager/alertmanager_metrics.go
+++ b/pkg/alertmanager/alertmanager_metrics.go
@@ -62,11 +62,11 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
 		numNotifications: prometheus.NewDesc(
 			"cortex_alertmanager_notifications_total",
 			"The total number of attempted notifications.",
-			[]string{"user"}, nil),
+			[]string{"user", "integration"}, nil),
 		numFailedNotifications: prometheus.NewDesc(
 			"cortex_alertmanager_notifications_failed_total",
 			"The total number of failed notifications.",
-			[]string{"user"}, nil),
+			[]string{"user", "integration"}, nil),
 		notificationLatencySeconds: prometheus.NewDesc(
 			"cortex_alertmanager_notification_latency_seconds",
 			"The latency of notifications in seconds.",
@@ -186,8 +186,8 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
 	data.SendSumOfCountersPerUser(out, m.alertsReceived, "alertmanager_alerts_received_total")
 	data.SendSumOfCountersPerUser(out, m.alertsInvalid, "alertmanager_alerts_invalid_total")

-	data.SendSumOfCountersPerUser(out, m.numNotifications, "alertmanager_notifications_total")
-	data.SendSumOfCountersPerUser(out, m.numFailedNotifications, "alertmanager_notifications_failed_total")
+	data.SendSumOfCountersPerUserWithLabels(out, m.numNotifications, "alertmanager_notifications_total", "integration")
+	data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration")
 	data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds")

 	data.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state")
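The two Descs above now carry both `user` and `integration`, and `Collect` re-exposes the per-tenant sums through them. Below is a minimal, self-contained sketch of that output shape only; it is not the `util.SendSumOfCountersPerUserWithLabels` implementation, and `sendSums` plus the hard-coded sample values are invented for illustration:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// Same label set as the updated Desc above.
var numNotifications = prometheus.NewDesc(
	"cortex_alertmanager_notifications_total",
	"The total number of attempted notifications.",
	[]string{"user", "integration"}, nil,
)

// sendSums emits one const counter per (user, integration) pair, which is the
// shape the per-tenant sums take once the extra label is carried through.
func sendSums(out chan<- prometheus.Metric, sums map[string]map[string]float64) {
	for user, byIntegration := range sums {
		for integration, v := range byIntegration {
			out <- prometheus.MustNewConstMetric(numNotifications, prometheus.CounterValue, v, user, integration)
		}
	}
}

func main() {
	out := make(chan prometheus.Metric, 8)
	sendSums(out, map[string]map[string]float64{
		"user1": {"webhook": 6, "slack": 4},
	})
	close(out)
	for m := range out {
		fmt.Println(m.Desc())
	}
}
```

Each (user, integration) pair becomes its own counter series, which is the layout the updated test expectations below assert.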
diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go
index 494ec9c72b6..3005791056b 100644
--- a/pkg/alertmanager/alertmanager_metrics_test.go
+++ b/pkg/alertmanager/alertmanager_metrics_test.go
@@ -99,14 +99,56 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
 		cortex_alertmanager_notification_latency_seconds_count 24
 		# HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications.
 		# TYPE cortex_alertmanager_notifications_failed_total counter
-		cortex_alertmanager_notifications_failed_total{user="user1"} 28
-		cortex_alertmanager_notifications_failed_total{user="user2"} 280
-		cortex_alertmanager_notifications_failed_total{user="user3"} 2800
+		cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0
+		cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0
+		cortex_alertmanager_notifications_failed_total{integration="email",user="user3"} 0
+		cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5
+		cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50
+		cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user3"} 500
+		cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 1
+		cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user2"} 10
+		cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user3"} 100
+		cortex_alertmanager_notifications_failed_total{integration="pushover",user="user1"} 3
+		cortex_alertmanager_notifications_failed_total{integration="pushover",user="user2"} 30
+		cortex_alertmanager_notifications_failed_total{integration="pushover",user="user3"} 300
+		cortex_alertmanager_notifications_failed_total{integration="slack",user="user1"} 4
+		cortex_alertmanager_notifications_failed_total{integration="slack",user="user2"} 40
+		cortex_alertmanager_notifications_failed_total{integration="slack",user="user3"} 400
+		cortex_alertmanager_notifications_failed_total{integration="victorops",user="user1"} 7
+		cortex_alertmanager_notifications_failed_total{integration="victorops",user="user2"} 70
+		cortex_alertmanager_notifications_failed_total{integration="victorops",user="user3"} 700
+		cortex_alertmanager_notifications_failed_total{integration="webhook",user="user1"} 6
+		cortex_alertmanager_notifications_failed_total{integration="webhook",user="user2"} 60
+		cortex_alertmanager_notifications_failed_total{integration="webhook",user="user3"} 600
+		cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 2
+		cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 20
+		cortex_alertmanager_notifications_failed_total{integration="wechat",user="user3"} 200
 		# HELP cortex_alertmanager_notifications_total The total number of attempted notifications.
 		# TYPE cortex_alertmanager_notifications_total counter
-		cortex_alertmanager_notifications_total{user="user1"} 28
-		cortex_alertmanager_notifications_total{user="user2"} 280
-		cortex_alertmanager_notifications_total{user="user3"} 2800
+		cortex_alertmanager_notifications_total{integration="email",user="user1"} 0
+		cortex_alertmanager_notifications_total{integration="email",user="user2"} 0
+		cortex_alertmanager_notifications_total{integration="email",user="user3"} 0
+		cortex_alertmanager_notifications_total{integration="opsgenie",user="user1"} 5
+		cortex_alertmanager_notifications_total{integration="opsgenie",user="user2"} 50
+		cortex_alertmanager_notifications_total{integration="opsgenie",user="user3"} 500
+		cortex_alertmanager_notifications_total{integration="pagerduty",user="user1"} 1
+		cortex_alertmanager_notifications_total{integration="pagerduty",user="user2"} 10
+		cortex_alertmanager_notifications_total{integration="pagerduty",user="user3"} 100
+		cortex_alertmanager_notifications_total{integration="pushover",user="user1"} 3
+		cortex_alertmanager_notifications_total{integration="pushover",user="user2"} 30
+		cortex_alertmanager_notifications_total{integration="pushover",user="user3"} 300
+		cortex_alertmanager_notifications_total{integration="slack",user="user1"} 4
+		cortex_alertmanager_notifications_total{integration="slack",user="user2"} 40
+		cortex_alertmanager_notifications_total{integration="slack",user="user3"} 400
+		cortex_alertmanager_notifications_total{integration="victorops",user="user1"} 7
+		cortex_alertmanager_notifications_total{integration="victorops",user="user2"} 70
+		cortex_alertmanager_notifications_total{integration="victorops",user="user3"} 700
+		cortex_alertmanager_notifications_total{integration="webhook",user="user1"} 6
+		cortex_alertmanager_notifications_total{integration="webhook",user="user2"} 60
+		cortex_alertmanager_notifications_total{integration="webhook",user="user3"} 600
+		cortex_alertmanager_notifications_total{integration="wechat",user="user1"} 2
+		cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 20
+		cortex_alertmanager_notifications_total{integration="wechat",user="user3"} 200
 		# HELP cortex_alertmanager_silences How many silences by state.
 		# TYPE cortex_alertmanager_silences gauge
 		cortex_alertmanager_silences{state="active",user="user1"} 1
diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go
index 322e5f63f2b..d54c7419a6d 100644
--- a/pkg/alertmanager/multitenant.go
+++ b/pkg/alertmanager/multitenant.go
@@ -135,7 +135,7 @@ func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAl
 	m.invalidConfig = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
 		Namespace: "cortex",
 		Name:      "alertmanager_config_invalid",
-		Help:      "Whenever the Alertmanager config is invalid for a user.",
+		Help:      "Boolean set to 1 whenever the Alertmanager config is invalid for a user.",
 	}, []string{"user"})

 	return m
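The test changes that follow assert the reworded help string through `testutil.GatherAndCompare`, which diffs a registry against hand-written exposition text. A small stand-alone sketch of that assertion style, outside the Cortex codebase (the registry wiring and `main` harness here are illustrative, not part of this patch):

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewRegistry()

	// Same metric shape as multitenant.go above: a per-user boolean gauge.
	invalidConfig := promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "cortex",
		Name:      "alertmanager_config_invalid",
		Help:      "Boolean set to 1 whenever the Alertmanager config is invalid for a user.",
	}, []string{"user"})

	// Pretend user1's config loaded fine and user2's did not.
	invalidConfig.WithLabelValues("user1").Set(0)
	invalidConfig.WithLabelValues("user2").Set(1)

	expected := `
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 1
`
	// GatherAndCompare parses the expected text and diffs it against the registry.
	err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expected))
	fmt.Println("registry matches expected exposition:", err == nil)
}
```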
diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go
index 14b920cdd57..e807f0d33d9 100644
--- a/pkg/alertmanager/multitenant_test.go
+++ b/pkg/alertmanager/multitenant_test.go
@@ -96,7 +96,7 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.Equal(t, simpleConfigOne, currentConfig.RawConfig)

 	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
-	# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
+	# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
 	# TYPE cortex_alertmanager_config_invalid gauge
 	cortex_alertmanager_config_invalid{user="user1"} 0
 	cortex_alertmanager_config_invalid{user="user2"} 0
@@ -113,7 +113,7 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.Len(t, am.alertmanagers, 3)

 	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
-	# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
+	# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
 	# TYPE cortex_alertmanager_config_invalid gauge
 	cortex_alertmanager_config_invalid{user="user1"} 0
 	cortex_alertmanager_config_invalid{user="user2"} 0
@@ -146,7 +146,7 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.False(t, userAM.IsActive())

 	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
-	# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
+	# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
 	# TYPE cortex_alertmanager_config_invalid gauge
 	cortex_alertmanager_config_invalid{user="user1"} 0
 	cortex_alertmanager_config_invalid{user="user2"} 0
@@ -170,7 +170,7 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.True(t, userAM.IsActive())

 	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
-	# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
+	# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
 	# TYPE cortex_alertmanager_config_invalid gauge
 	cortex_alertmanager_config_invalid{user="user1"} 0
 	cortex_alertmanager_config_invalid{user="user2"} 0
diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go
index f1a2082f7b3..281b0efc58f 100644
--- a/pkg/ruler/manager.go
+++ b/pkg/ruler/manager.go
@@ -20,6 +20,24 @@ import (
 	"github.com/cortexproject/cortex/pkg/util"
 )

+var (
+	configUpdatesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "cortex",
+		Name:      "ruler_config_updates_total",
+		Help:      "Total number of config updates triggered by a user",
+	}, []string{"user"})
+	configUpdateFailuresTotal = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "cortex",
+		Name:      "ruler_config_update_failures_total",
+		Help:      "Total number of config update failures triggered by a user",
+	}, []string{"user", "reason"})
+	userManagerFailed = promauto.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: "cortex",
+		Name:      "ruler_manager_failed",
+		Help:      "Boolean set to 1 whenever the Ruler manager failed to start for a user.",
+	}, []string{"user"})
+)
+
 type DefaultMultiTenantManager struct {
 	cfg         Config
 	notifierCfg *config.Config
@@ -112,6 +130,7 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
 			manager, err = r.newManager(ctx, user)
 			if err != nil {
 				configUpdateFailuresTotal.WithLabelValues(user, "rule-manager-creation-failure").Inc()
+				userManagerFailed.WithLabelValues(user).Set(1)
 				level.Error(r.logger).Log("msg", "unable to create rule manager", "user", user, "err", err)
 				return
 			}
@@ -123,9 +142,12 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
 		err = manager.Update(r.cfg.EvaluationInterval, files, nil)
 		if err != nil {
 			configUpdateFailuresTotal.WithLabelValues(user, "rules-update-failure").Inc()
+			userManagerFailed.WithLabelValues(user).Set(1)
 			level.Error(r.logger).Log("msg", "unable to update rule manager", "user", user, "err", err)
 			return
 		}
+
+		userManagerFailed.WithLabelValues(user).Set(0)
 	}
 }
diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go
index 6c8c23f4e12..24a72457c36 100644
--- a/pkg/ruler/ruler.go
+++ b/pkg/ruler/ruler.go
@@ -38,16 +38,6 @@ var (
 		Name: "ruler_ring_check_errors_total",
 		Help: "Number of errors that have occurred when checking the ring for ownership",
 	})
-	configUpdatesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: "cortex",
-		Name:      "ruler_config_updates_total",
-		Help:      "Total number of config updates triggered by a user",
-	}, []string{"user"})
-	configUpdateFailuresTotal = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: "cortex",
-		Name:      "ruler_config_update_failures_total",
-		Help:      "Total number of config update failures triggered by a user",
-	}, []string{"user", "reason"})
 )

 // Config is the configuration for the recording rules server.
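In this first patch the new ruler metrics are package-level `promauto` vars, i.e. they land on the default registry and cannot be dropped when a tenant goes away. The follow-up commit below instead hangs them off the manager and registers them with the injected `prometheus.Registerer`. A rough sketch of that pattern under the same metric names; `reloadMetrics`, `recordReload`, `forget` and the `main` harness are invented for illustration, not code from this PR:

```go
package main

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// reloadMetrics owns the per-user gauges instead of declaring them as
// package-level vars on the default registry.
type reloadMetrics struct {
	lastReloadSuccessful          *prometheus.GaugeVec
	lastReloadSuccessfulTimestamp *prometheus.GaugeVec
}

func newReloadMetrics(reg prometheus.Registerer) *reloadMetrics {
	return &reloadMetrics{
		lastReloadSuccessful: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Namespace: "cortex",
			Name:      "ruler_config_last_reload_successful",
			Help:      "Boolean set to 1 whenever the last configuration reload attempt was successful.",
		}, []string{"user"}),
		lastReloadSuccessfulTimestamp: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Namespace: "cortex",
			Name:      "ruler_config_last_reload_successful_seconds",
			Help:      "Timestamp of the last successful configuration reload.",
		}, []string{"user"}),
	}
}

// recordReload flips the boolean gauge per user and stamps the time of the
// last successful reload, mirroring the bookkeeping in syncRulesToManager.
func (m *reloadMetrics) recordReload(user string, err error) {
	if err != nil {
		m.lastReloadSuccessful.WithLabelValues(user).Set(0)
		return
	}
	m.lastReloadSuccessful.WithLabelValues(user).Set(1)
	m.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
}

// forget drops a deleted user's series so stale values are not exported.
func (m *reloadMetrics) forget(user string) {
	m.lastReloadSuccessful.DeleteLabelValues(user)
	m.lastReloadSuccessfulTimestamp.DeleteLabelValues(user)
}

func main() {
	reg := prometheus.NewRegistry()
	m := newReloadMetrics(reg)
	m.recordReload("user1", nil)
	m.forget("user1")
}
```

Keeping the vectors on the struct is also what makes `DeleteLabelValues` possible when a tenant's rule manager is removed in `SyncRuleGroups`.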
From c3727b32651755590448f5f22c2f581e0cdac151 Mon Sep 17 00:00:00 2001
From: Goutham Veeramachaneni
Date: Fri, 21 Aug 2020 15:00:04 +0200
Subject: [PATCH 2/2] Address feedback

Signed-off-by: Goutham Veeramachaneni
---
 CHANGELOG.md         |  1 +
 pkg/ruler/manager.go | 57 +++++++++++++++++++++++---------------------
 2 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1097c584b63..5d801118f7e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@
 * [CHANGE] Experimental Delete Series: `/api/v1/admin/tsdb/delete_series` and `/api/v1/admin/tsdb/cancel_delete_request` purger APIs to return status code `204` instead of `200` for success. #2946
 * [CHANGE] Histogram `cortex_memcache_request_duration_seconds` `method` label value changes from `Memcached.Get` to `Memcached.GetBatched` for batched lookups, and is not reported for non-batched lookups (label value `Memcached.GetMulti` remains, and had exactly the same value as `Get` in nonbatched lookups). The same change applies to tracing spans. #3046
 * [CHANGE] TLS server validation is now enabled by default, a new parameter `tls_insecure_skip_verify` can be set to true to skip validation optionally. #3030
+* [CHANGE] `cortex_ruler_config_update_failures_total` has been removed in favor of `cortex_ruler_config_last_reload_successful`. #3056
 * [ENHANCEMENT] Add support for azure storage in China, German and US Government environments. #2988
 * [ENHANCEMENT] Query-tee: added a small tolerance to floating point sample values comparison. #2994
 * [ENHANCEMENT] Query-tee: add support for doing a passthrough of requests to preferred backend for unregistered routes #3018
diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go
index 281b0efc58f..6e1fdfefe26 100644
--- a/pkg/ruler/manager.go
+++ b/pkg/ruler/manager.go
@@ -20,24 +20,6 @@ import (
 	"github.com/cortexproject/cortex/pkg/util"
 )

-var (
-	configUpdatesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: "cortex",
-		Name:      "ruler_config_updates_total",
-		Help:      "Total number of config updates triggered by a user",
-	}, []string{"user"})
-	configUpdateFailuresTotal = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: "cortex",
-		Name:      "ruler_config_update_failures_total",
-		Help:      "Total number of config update failures triggered by a user",
-	}, []string{"user", "reason"})
-	userManagerFailed = promauto.NewGaugeVec(prometheus.GaugeOpts{
-		Namespace: "cortex",
-		Name:      "ruler_manager_failed",
-		Help:      "Boolean set to 1 whenever the Ruler manager failed to start for a user.",
-	}, []string{"user"})
-)
-
 type DefaultMultiTenantManager struct {
 	cfg         Config
 	notifierCfg *config.Config
@@ -55,9 +37,12 @@ type DefaultMultiTenantManager struct {
 	notifiersMtx sync.Mutex
 	notifiers    map[string]*rulerNotifier

-	managersTotal prometheus.Gauge
-	registry      prometheus.Registerer
-	logger        log.Logger
+	managersTotal                 prometheus.Gauge
+	lastReloadSuccessful          *prometheus.GaugeVec
+	lastReloadSuccessfulTimestamp *prometheus.GaugeVec
+	configUpdatesTotal            *prometheus.CounterVec
+	registry                      prometheus.Registerer
+	logger                        log.Logger
 }

 func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) {
@@ -84,6 +69,21 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg
 			Name: "ruler_managers_total",
 			Help: "Total number of managers registered and running in the ruler",
 		}),
+		lastReloadSuccessful: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "cortex",
+			Name:      "ruler_config_last_reload_successful",
+			Help:      "Boolean set to 1 whenever the last configuration reload attempt was successful.",
+		}, []string{"user"}),
+		lastReloadSuccessfulTimestamp: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "cortex",
+			Name:      "ruler_config_last_reload_successful_seconds",
+			Help:      "Timestamp of the last successful configuration reload.",
+		}, []string{"user"}),
+		configUpdatesTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
+			Namespace: "cortex",
+			Name:      "ruler_config_updates_total",
+			Help:      "Total number of config updates triggered by a user",
+		}, []string{"user"}),
 		registry: reg,
 		logger:   logger,
 	}, nil
@@ -104,6 +104,9 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou
 		if _, exists := ruleGroups[userID]; !exists {
 			go mngr.Stop()
 			delete(r.userManagers, userID)
+			r.lastReloadSuccessful.DeleteLabelValues(userID)
+			r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID)
+			r.configUpdatesTotal.DeleteLabelValues(userID)
 			level.Info(r.logger).Log("msg", "deleting rule manager", "user", userID)
 		}
 	}
@@ -118,19 +121,19 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
 	// have been updated
 	update, files, err := r.mapper.MapRules(user, groups.Formatted())
 	if err != nil {
+		r.lastReloadSuccessful.WithLabelValues(user).Set(0)
 		level.Error(r.logger).Log("msg", "unable to map rule files", "user", user, "err", err)
 		return
 	}

 	if update {
 		level.Debug(r.logger).Log("msg", "updating rules", "user", "user")
-		configUpdatesTotal.WithLabelValues(user).Inc()
+		r.configUpdatesTotal.WithLabelValues(user).Inc()
 		manager, exists := r.userManagers[user]
 		if !exists {
 			manager, err = r.newManager(ctx, user)
 			if err != nil {
-				configUpdateFailuresTotal.WithLabelValues(user, "rule-manager-creation-failure").Inc()
-				userManagerFailed.WithLabelValues(user).Set(1)
+				r.lastReloadSuccessful.WithLabelValues(user).Set(0)
 				level.Error(r.logger).Log("msg", "unable to create rule manager", "user", user, "err", err)
 				return
 			}
@@ -141,13 +144,13 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
 		}
 		err = manager.Update(r.cfg.EvaluationInterval, files, nil)
 		if err != nil {
-			configUpdateFailuresTotal.WithLabelValues(user, "rules-update-failure").Inc()
-			userManagerFailed.WithLabelValues(user).Set(1)
+			r.lastReloadSuccessful.WithLabelValues(user).Set(0)
 			level.Error(r.logger).Log("msg", "unable to update rule manager", "user", user, "err", err)
 			return
 		}

-		userManagerFailed.WithLabelValues(user).Set(0)
+		r.lastReloadSuccessful.WithLabelValues(user).Set(1)
+		r.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
 	}
 }