From 950221fdc47854f8756552e243b0a83285852dc3 Mon Sep 17 00:00:00 2001
From: Goutham Veeramachaneni
Date: Wed, 19 Aug 2020 12:22:19 +0200
Subject: [PATCH 1/2] Add integration label to notification metrics

Also, add a metric for when creation of the rules manager fails.

Signed-off-by: Goutham Veeramachaneni
---
 CHANGELOG.md                                  |  2 +
 pkg/alertmanager/alertmanager_metrics.go      |  8 +--
 pkg/alertmanager/alertmanager_metrics_test.go | 54 ++++++++++++++++---
 pkg/alertmanager/multitenant.go               |  2 +-
 pkg/alertmanager/multitenant_test.go          |  8 +--
 pkg/ruler/manager.go                          | 22 ++++++++
 pkg/ruler/ruler.go                            | 10 ----
 7 files changed, 81 insertions(+), 25 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 29ed51f8fa7..1097c584b63 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,8 @@
 * [ENHANCEMENT] Expose `storage.aws.dynamodb.backoff_config` configuration file field. #3026
 * [ENHANCEMENT] Added `cortex_request_message_bytes` and `cortex_response_message_bytes` histograms to track received and sent gRPC message and HTTP request/response sizes. Added `cortex_inflight_requests` gauge to track number of inflight gRPC and HTTP requests. #3064
 * [ENHANCEMENT] Add config validation to the experimental Alertmanager API. Invalid configs are no longer accepted. #3053
+* [ENHANCEMENT] Add "integration" as a label for `cortex_alertmanager_notifications_total` and `cortex_alertmanager_notifications_failed_total` metrics. #3056
+* [ENHANCEMENT] Add `cortex_ruler_config_last_reload_successful` and `cortex_ruler_config_last_reload_successful_seconds` to check the status of each user's rule manager. #3056
 * [BUGFIX] Query-frontend: Fixed rounding for incoming query timestamps, to be 100% Prometheus compatible. #2990
 * [BUGFIX] Querier: query /series from ingesters regardless the `-querier.query-ingesters-within` setting. #3035
 * [BUGFIX] Experimental blocks storage: Ingester is less likely to hit gRPC message size limit when streaming data to queriers. #3015
diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go
index 721053dcbb4..354370d016a 100644
--- a/pkg/alertmanager/alertmanager_metrics.go
+++ b/pkg/alertmanager/alertmanager_metrics.go
@@ -62,11 +62,11 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
 		numNotifications: prometheus.NewDesc(
 			"cortex_alertmanager_notifications_total",
 			"The total number of attempted notifications.",
-			[]string{"user"}, nil),
+			[]string{"user", "integration"}, nil),
 		numFailedNotifications: prometheus.NewDesc(
 			"cortex_alertmanager_notifications_failed_total",
 			"The total number of failed notifications.",
-			[]string{"user"}, nil),
+			[]string{"user", "integration"}, nil),
 		notificationLatencySeconds: prometheus.NewDesc(
 			"cortex_alertmanager_notification_latency_seconds",
 			"The latency of notifications in seconds.",
@@ -186,8 +186,8 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
 	data.SendSumOfCountersPerUser(out, m.alertsReceived, "alertmanager_alerts_received_total")
 	data.SendSumOfCountersPerUser(out, m.alertsInvalid, "alertmanager_alerts_invalid_total")

-	data.SendSumOfCountersPerUser(out, m.numNotifications, "alertmanager_notifications_total")
-	data.SendSumOfCountersPerUser(out, m.numFailedNotifications, "alertmanager_notifications_failed_total")
+	data.SendSumOfCountersPerUserWithLabels(out, m.numNotifications, "alertmanager_notifications_total", "integration")
+	data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration")
 	data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds")

 	data.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state")
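The two Descs above now carry both `user` and `integration`, and `Collect` re-exposes the per-tenant sums through them. Below is a minimal, self-contained sketch of that output shape only; it is not the `util.SendSumOfCountersPerUserWithLabels` implementation, and `sendSums` plus the hard-coded sample values are invented for illustration:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// Same label set as the updated Desc above.
var numNotifications = prometheus.NewDesc(
	"cortex_alertmanager_notifications_total",
	"The total number of attempted notifications.",
	[]string{"user", "integration"}, nil,
)

// sendSums emits one const counter per (user, integration) pair, which is the
// shape the per-tenant sums take once the extra label is carried through.
func sendSums(out chan<- prometheus.Metric, sums map[string]map[string]float64) {
	for user, byIntegration := range sums {
		for integration, v := range byIntegration {
			out <- prometheus.MustNewConstMetric(numNotifications, prometheus.CounterValue, v, user, integration)
		}
	}
}

func main() {
	out := make(chan prometheus.Metric, 8)
	sendSums(out, map[string]map[string]float64{
		"user1": {"webhook": 6, "slack": 4},
	})
	close(out)
	for m := range out {
		fmt.Println(m.Desc())
	}
}
```

Each (user, integration) pair becomes its own counter series, which is the layout the updated test expectations below assert.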
diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go
index 494ec9c72b6..3005791056b 100644
--- a/pkg/alertmanager/alertmanager_metrics_test.go
+++ b/pkg/alertmanager/alertmanager_metrics_test.go
@@ -99,14 +99,56 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
 		cortex_alertmanager_notification_latency_seconds_count 24
 		# HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications.
 		# TYPE cortex_alertmanager_notifications_failed_total counter
-		cortex_alertmanager_notifications_failed_total{user="user1"} 28
-		cortex_alertmanager_notifications_failed_total{user="user2"} 280
-		cortex_alertmanager_notifications_failed_total{user="user3"} 2800
+		cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0
+		cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0
+		cortex_alertmanager_notifications_failed_total{integration="email",user="user3"} 0
+		cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5
+		cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50
+		cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user3"} 500
+		cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 1
+		cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user2"} 10
+		cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user3"} 100
+		cortex_alertmanager_notifications_failed_total{integration="pushover",user="user1"} 3
+		cortex_alertmanager_notifications_failed_total{integration="pushover",user="user2"} 30
+		cortex_alertmanager_notifications_failed_total{integration="pushover",user="user3"} 300
+		cortex_alertmanager_notifications_failed_total{integration="slack",user="user1"} 4
+		cortex_alertmanager_notifications_failed_total{integration="slack",user="user2"} 40
+		cortex_alertmanager_notifications_failed_total{integration="slack",user="user3"} 400
+		cortex_alertmanager_notifications_failed_total{integration="victorops",user="user1"} 7
+		cortex_alertmanager_notifications_failed_total{integration="victorops",user="user2"} 70
+		cortex_alertmanager_notifications_failed_total{integration="victorops",user="user3"} 700
+		cortex_alertmanager_notifications_failed_total{integration="webhook",user="user1"} 6
+		cortex_alertmanager_notifications_failed_total{integration="webhook",user="user2"} 60
+		cortex_alertmanager_notifications_failed_total{integration="webhook",user="user3"} 600
+		cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 2
+		cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 20
+		cortex_alertmanager_notifications_failed_total{integration="wechat",user="user3"} 200
 		# HELP cortex_alertmanager_notifications_total The total number of attempted notifications.
 		# TYPE cortex_alertmanager_notifications_total counter
-		cortex_alertmanager_notifications_total{user="user1"} 28
-		cortex_alertmanager_notifications_total{user="user2"} 280
-		cortex_alertmanager_notifications_total{user="user3"} 2800
+		cortex_alertmanager_notifications_total{integration="email",user="user1"} 0
+		cortex_alertmanager_notifications_total{integration="email",user="user2"} 0
+		cortex_alertmanager_notifications_total{integration="email",user="user3"} 0
+		cortex_alertmanager_notifications_total{integration="opsgenie",user="user1"} 5
+		cortex_alertmanager_notifications_total{integration="opsgenie",user="user2"} 50
+		cortex_alertmanager_notifications_total{integration="opsgenie",user="user3"} 500
+		cortex_alertmanager_notifications_total{integration="pagerduty",user="user1"} 1
+		cortex_alertmanager_notifications_total{integration="pagerduty",user="user2"} 10
+		cortex_alertmanager_notifications_total{integration="pagerduty",user="user3"} 100
+		cortex_alertmanager_notifications_total{integration="pushover",user="user1"} 3
+		cortex_alertmanager_notifications_total{integration="pushover",user="user2"} 30
+		cortex_alertmanager_notifications_total{integration="pushover",user="user3"} 300
+		cortex_alertmanager_notifications_total{integration="slack",user="user1"} 4
+		cortex_alertmanager_notifications_total{integration="slack",user="user2"} 40
+		cortex_alertmanager_notifications_total{integration="slack",user="user3"} 400
+		cortex_alertmanager_notifications_total{integration="victorops",user="user1"} 7
+		cortex_alertmanager_notifications_total{integration="victorops",user="user2"} 70
+		cortex_alertmanager_notifications_total{integration="victorops",user="user3"} 700
+		cortex_alertmanager_notifications_total{integration="webhook",user="user1"} 6
+		cortex_alertmanager_notifications_total{integration="webhook",user="user2"} 60
+		cortex_alertmanager_notifications_total{integration="webhook",user="user3"} 600
+		cortex_alertmanager_notifications_total{integration="wechat",user="user1"} 2
+		cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 20
+		cortex_alertmanager_notifications_total{integration="wechat",user="user3"} 200
 		# HELP cortex_alertmanager_silences How many silences by state.
 		# TYPE cortex_alertmanager_silences gauge
 		cortex_alertmanager_silences{state="active",user="user1"} 1
diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go
index 322e5f63f2b..d54c7419a6d 100644
--- a/pkg/alertmanager/multitenant.go
+++ b/pkg/alertmanager/multitenant.go
@@ -135,7 +135,7 @@ func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAl
 	m.invalidConfig = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
 		Namespace: "cortex",
 		Name:      "alertmanager_config_invalid",
-		Help:      "Whenever the Alertmanager config is invalid for a user.",
+		Help:      "Boolean set to 1 whenever the Alertmanager config is invalid for a user.",
 	}, []string{"user"})

 	return m
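The test changes that follow assert the reworded help string through `testutil.GatherAndCompare`, which diffs a registry against hand-written exposition text. A small stand-alone sketch of that assertion style, outside the Cortex codebase (the registry wiring and `main` harness here are illustrative, not part of this patch):

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewRegistry()

	// Same metric shape as multitenant.go above: a per-user boolean gauge.
	invalidConfig := promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "cortex",
		Name:      "alertmanager_config_invalid",
		Help:      "Boolean set to 1 whenever the Alertmanager config is invalid for a user.",
	}, []string{"user"})

	// Pretend user1's config loaded fine and user2's did not.
	invalidConfig.WithLabelValues("user1").Set(0)
	invalidConfig.WithLabelValues("user2").Set(1)

	expected := `
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 1
`
	// GatherAndCompare parses the expected text and diffs it against the registry.
	err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expected))
	fmt.Println("registry matches expected exposition:", err == nil)
}
```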
diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go
index 14b920cdd57..e807f0d33d9 100644
--- a/pkg/alertmanager/multitenant_test.go
+++ b/pkg/alertmanager/multitenant_test.go
@@ -96,7 +96,7 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.Equal(t, simpleConfigOne, currentConfig.RawConfig)

 	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
-	# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
+	# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
 	# TYPE cortex_alertmanager_config_invalid gauge
 	cortex_alertmanager_config_invalid{user="user1"} 0
 	cortex_alertmanager_config_invalid{user="user2"} 0
@@ -113,7 +113,7 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.Len(t, am.alertmanagers, 3)

 	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
-	# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
+	# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
 	# TYPE cortex_alertmanager_config_invalid gauge
 	cortex_alertmanager_config_invalid{user="user1"} 0
 	cortex_alertmanager_config_invalid{user="user2"} 0
@@ -146,7 +146,7 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.False(t, userAM.IsActive())

 	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
-	# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
+	# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
 	# TYPE cortex_alertmanager_config_invalid gauge
 	cortex_alertmanager_config_invalid{user="user1"} 0
 	cortex_alertmanager_config_invalid{user="user2"} 0
@@ -170,7 +170,7 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.True(t, userAM.IsActive())

 	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
-	# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
+	# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
 	# TYPE cortex_alertmanager_config_invalid gauge
 	cortex_alertmanager_config_invalid{user="user1"} 0
 	cortex_alertmanager_config_invalid{user="user2"} 0
diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go
index f1a2082f7b3..281b0efc58f 100644
--- a/pkg/ruler/manager.go
+++ b/pkg/ruler/manager.go
@@ -20,6 +20,24 @@ import (
 	"github.com/cortexproject/cortex/pkg/util"
 )

+var (
+	configUpdatesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "cortex",
+		Name:      "ruler_config_updates_total",
+		Help:      "Total number of config updates triggered by a user",
+	}, []string{"user"})
+	configUpdateFailuresTotal = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "cortex",
+		Name:      "ruler_config_update_failures_total",
+		Help:      "Total number of config update failures triggered by a user",
+	}, []string{"user", "reason"})
+	userManagerFailed = promauto.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: "cortex",
+		Name:      "ruler_manager_failed",
+		Help:      "Boolean set to 1 whenever the Ruler manager failed to start for a user.",
+	}, []string{"user"})
+)
+
 type DefaultMultiTenantManager struct {
 	cfg         Config
 	notifierCfg *config.Config
@@ -112,6 +130,7 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
 			manager, err = r.newManager(ctx, user)
 			if err != nil {
 				configUpdateFailuresTotal.WithLabelValues(user, "rule-manager-creation-failure").Inc()
+				userManagerFailed.WithLabelValues(user).Set(1)
 				level.Error(r.logger).Log("msg", "unable to create rule manager", "user", user, "err", err)
 				return
 			}
@@ -123,9 +142,12 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
 		err = manager.Update(r.cfg.EvaluationInterval, files, nil)
 		if err != nil {
 			configUpdateFailuresTotal.WithLabelValues(user, "rules-update-failure").Inc()
+			userManagerFailed.WithLabelValues(user).Set(1)
 			level.Error(r.logger).Log("msg", "unable to update rule manager", "user", user, "err", err)
 			return
 		}
+
+		userManagerFailed.WithLabelValues(user).Set(0)
 	}
 }
diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go
index 6c8c23f4e12..24a72457c36 100644
--- a/pkg/ruler/ruler.go
+++ b/pkg/ruler/ruler.go
@@ -38,16 +38,6 @@ var (
 		Name: "ruler_ring_check_errors_total",
 		Help: "Number of errors that have occurred when checking the ring for ownership",
 	})
-	configUpdatesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: "cortex",
-		Name:      "ruler_config_updates_total",
-		Help:      "Total number of config updates triggered by a user",
-	}, []string{"user"})
-	configUpdateFailuresTotal = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: "cortex",
-		Name:      "ruler_config_update_failures_total",
-		Help:      "Total number of config update failures triggered by a user",
-	}, []string{"user", "reason"})
 )

 // Config is the configuration for the recording rules server.
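In this first patch the new ruler metrics are package-level `promauto` vars, i.e. they land on the default registry and cannot be dropped when a tenant goes away. The follow-up commit below instead hangs them off the manager and registers them with the injected `prometheus.Registerer`. A rough sketch of that pattern under the same metric names; `reloadMetrics`, `recordReload`, `forget` and the `main` harness are invented for illustration, not code from this PR:

```go
package main

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// reloadMetrics owns the per-user gauges instead of declaring them as
// package-level vars on the default registry.
type reloadMetrics struct {
	lastReloadSuccessful          *prometheus.GaugeVec
	lastReloadSuccessfulTimestamp *prometheus.GaugeVec
}

func newReloadMetrics(reg prometheus.Registerer) *reloadMetrics {
	return &reloadMetrics{
		lastReloadSuccessful: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Namespace: "cortex",
			Name:      "ruler_config_last_reload_successful",
			Help:      "Boolean set to 1 whenever the last configuration reload attempt was successful.",
		}, []string{"user"}),
		lastReloadSuccessfulTimestamp: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Namespace: "cortex",
			Name:      "ruler_config_last_reload_successful_seconds",
			Help:      "Timestamp of the last successful configuration reload.",
		}, []string{"user"}),
	}
}

// recordReload flips the boolean gauge per user and stamps the time of the
// last successful reload, mirroring the bookkeeping in syncRulesToManager.
func (m *reloadMetrics) recordReload(user string, err error) {
	if err != nil {
		m.lastReloadSuccessful.WithLabelValues(user).Set(0)
		return
	}
	m.lastReloadSuccessful.WithLabelValues(user).Set(1)
	m.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
}

// forget drops a deleted user's series so stale values are not exported.
func (m *reloadMetrics) forget(user string) {
	m.lastReloadSuccessful.DeleteLabelValues(user)
	m.lastReloadSuccessfulTimestamp.DeleteLabelValues(user)
}

func main() {
	reg := prometheus.NewRegistry()
	m := newReloadMetrics(reg)
	m.recordReload("user1", nil)
	m.forget("user1")
}
```

Keeping the vectors on the struct is also what makes `DeleteLabelValues` possible when a tenant's rule manager is removed in `SyncRuleGroups`.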
From c3727b32651755590448f5f22c2f581e0cdac151 Mon Sep 17 00:00:00 2001
From: Goutham Veeramachaneni
Date: Fri, 21 Aug 2020 15:00:04 +0200
Subject: [PATCH 2/2] Address feedback

Signed-off-by: Goutham Veeramachaneni
---
 CHANGELOG.md         |  1 +
 pkg/ruler/manager.go | 57 +++++++++++++++++++++++---------------------
 2 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1097c584b63..5d801118f7e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@
 * [CHANGE] Experimental Delete Series: `/api/v1/admin/tsdb/delete_series` and `/api/v1/admin/tsdb/cancel_delete_request` purger APIs to return status code `204` instead of `200` for success. #2946
 * [CHANGE] Histogram `cortex_memcache_request_duration_seconds` `method` label value changes from `Memcached.Get` to `Memcached.GetBatched` for batched lookups, and is not reported for non-batched lookups (label value `Memcached.GetMulti` remains, and had exactly the same value as `Get` in nonbatched lookups). The same change applies to tracing spans. #3046
 * [CHANGE] TLS server validation is now enabled by default, a new parameter `tls_insecure_skip_verify` can be set to true to skip validation optionally. #3030
+* [CHANGE] `cortex_ruler_config_update_failures_total` has been removed in favor of `cortex_ruler_config_last_reload_successful`. #3056
 * [ENHANCEMENT] Add support for azure storage in China, German and US Government environments. #2988
 * [ENHANCEMENT] Query-tee: added a small tolerance to floating point sample values comparison. #2994
 * [ENHANCEMENT] Query-tee: add support for doing a passthrough of requests to preferred backend for unregistered routes #3018
diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go
index 281b0efc58f..6e1fdfefe26 100644
--- a/pkg/ruler/manager.go
+++ b/pkg/ruler/manager.go
@@ -20,24 +20,6 @@ import (
 	"github.com/cortexproject/cortex/pkg/util"
 )

-var (
-	configUpdatesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: "cortex",
-		Name:      "ruler_config_updates_total",
-		Help:      "Total number of config updates triggered by a user",
-	}, []string{"user"})
-	configUpdateFailuresTotal = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: "cortex",
-		Name:      "ruler_config_update_failures_total",
-		Help:      "Total number of config update failures triggered by a user",
-	}, []string{"user", "reason"})
-	userManagerFailed = promauto.NewGaugeVec(prometheus.GaugeOpts{
-		Namespace: "cortex",
-		Name:      "ruler_manager_failed",
-		Help:      "Boolean set to 1 whenever the Ruler manager failed to start for a user.",
-	}, []string{"user"})
-)
-
 type DefaultMultiTenantManager struct {
 	cfg         Config
 	notifierCfg *config.Config
@@ -55,9 +37,12 @@ type DefaultMultiTenantManager struct {
 	notifiersMtx sync.Mutex
 	notifiers    map[string]*rulerNotifier

-	managersTotal prometheus.Gauge
-	registry      prometheus.Registerer
-	logger        log.Logger
+	managersTotal                 prometheus.Gauge
+	lastReloadSuccessful          *prometheus.GaugeVec
+	lastReloadSuccessfulTimestamp *prometheus.GaugeVec
+	configUpdatesTotal            *prometheus.CounterVec
+	registry                      prometheus.Registerer
+	logger                        log.Logger
 }

 func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) {
@@ -84,6 +69,21 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg
 			Name: "ruler_managers_total",
 			Help: "Total number of managers registered and running in the ruler",
 		}),
+		lastReloadSuccessful: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "cortex",
+			Name:      "ruler_config_last_reload_successful",
+			Help:      "Boolean set to 1 whenever the last configuration reload attempt was successful.",
+		}, []string{"user"}),
+		lastReloadSuccessfulTimestamp: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "cortex",
+			Name:      "ruler_config_last_reload_successful_seconds",
+			Help:      "Timestamp of the last successful configuration reload.",
+		}, []string{"user"}),
+		configUpdatesTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
+			Namespace: "cortex",
+			Name:      "ruler_config_updates_total",
+			Help:      "Total number of config updates triggered by a user",
+		}, []string{"user"}),
 		registry: reg,
 		logger:   logger,
 	}, nil
@@ -104,6 +104,9 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou
 		if _, exists := ruleGroups[userID]; !exists {
 			go mngr.Stop()
 			delete(r.userManagers, userID)
+			r.lastReloadSuccessful.DeleteLabelValues(userID)
+			r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID)
+			r.configUpdatesTotal.DeleteLabelValues(userID)
 			level.Info(r.logger).Log("msg", "deleting rule manager", "user", userID)
 		}
 	}
@@ -118,19 +121,19 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
 	// have been updated
 	update, files, err := r.mapper.MapRules(user, groups.Formatted())
 	if err != nil {
+		r.lastReloadSuccessful.WithLabelValues(user).Set(0)
 		level.Error(r.logger).Log("msg", "unable to map rule files", "user", user, "err", err)
 		return
 	}

 	if update {
 		level.Debug(r.logger).Log("msg", "updating rules", "user", "user")
-		configUpdatesTotal.WithLabelValues(user).Inc()
+		r.configUpdatesTotal.WithLabelValues(user).Inc()
 		manager, exists := r.userManagers[user]
 		if !exists {
 			manager, err = r.newManager(ctx, user)
 			if err != nil {
-				configUpdateFailuresTotal.WithLabelValues(user, "rule-manager-creation-failure").Inc()
-				userManagerFailed.WithLabelValues(user).Set(1)
+				r.lastReloadSuccessful.WithLabelValues(user).Set(0)
 				level.Error(r.logger).Log("msg", "unable to create rule manager", "user", user, "err", err)
 				return
 			}
@@ -141,13 +144,13 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
 		}
 		err = manager.Update(r.cfg.EvaluationInterval, files, nil)
 		if err != nil {
-			configUpdateFailuresTotal.WithLabelValues(user, "rules-update-failure").Inc()
-			userManagerFailed.WithLabelValues(user).Set(1)
+			r.lastReloadSuccessful.WithLabelValues(user).Set(0)
 			level.Error(r.logger).Log("msg", "unable to update rule manager", "user", user, "err", err)
 			return
 		}

-		userManagerFailed.WithLabelValues(user).Set(0)
+		r.lastReloadSuccessful.WithLabelValues(user).Set(1)
+		r.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
 	}
 }