3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -12,12 +12,15 @@
* [CHANGE] Experimental Delete Series: `/api/v1/admin/tsdb/delete_series` and `/api/v1/admin/tsdb/cancel_delete_request` purger APIs to return status code `204` instead of `200` for success. #2946
* [CHANGE] Histogram `cortex_memcache_request_duration_seconds` `method` label value changes from `Memcached.Get` to `Memcached.GetBatched` for batched lookups, and is not reported for non-batched lookups (label value `Memcached.GetMulti` remains, and had exactly the same value as `Get` in non-batched lookups). The same change applies to tracing spans. #3046
* [CHANGE] TLS server validation is now enabled by default; the new `tls_insecure_skip_verify` parameter can be set to true to optionally skip validation. #3030
* [CHANGE] `cortex_ruler_config_update_failures_total` has been removed in favor of `cortex_ruler_config_last_reload_successful`. #3056
* [ENHANCEMENT] Add support for Azure storage in China, Germany, and US Government environments. #2988
* [ENHANCEMENT] Query-tee: added a small tolerance to floating point sample values comparison. #2994
* [ENHANCEMENT] Query-tee: add support for passing requests through to the preferred backend for unregistered routes. #3018
* [ENHANCEMENT] Expose `storage.aws.dynamodb.backoff_config` configuration file field. #3026
* [ENHANCEMENT] Added `cortex_request_message_bytes` and `cortex_response_message_bytes` histograms to track received and sent gRPC message and HTTP request/response sizes. Added `cortex_inflight_requests` gauge to track number of inflight gRPC and HTTP requests. #3064
* [ENHANCEMENT] Add config validation to the experimental Alertmanager API. Invalid configs are no longer accepted. #3053
* [ENHANCEMENT] Add "integration" as a label for `cortex_alertmanager_notifications_total` and `cortex_alertmanager_notifications_failed_total` metrics. #3056
* [ENHANCEMENT] Add `cortex_ruler_config_last_reload_successful` and `cortex_ruler_config_last_reload_successful_seconds` metrics to check the status of each user's rule manager. #3056
* [BUGFIX] Query-frontend: Fixed rounding for incoming query timestamps, to be 100% Prometheus compatible. #2990
* [BUGFIX] Querier: query /series from ingesters regardless of the `-querier.query-ingesters-within` setting. #3035
* [BUGFIX] Experimental blocks storage: Ingester is less likely to hit gRPC message size limit when streaming data to queriers. #3015
8 changes: 4 additions & 4 deletions pkg/alertmanager/alertmanager_metrics.go
@@ -62,11 +62,11 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
numNotifications: prometheus.NewDesc(
"cortex_alertmanager_notifications_total",
"The total number of attempted notifications.",
[]string{"user"}, nil),
[]string{"user", "integration"}, nil),
numFailedNotifications: prometheus.NewDesc(
"cortex_alertmanager_notifications_failed_total",
"The total number of failed notifications.",
[]string{"user"}, nil),
[]string{"user", "integration"}, nil),
notificationLatencySeconds: prometheus.NewDesc(
"cortex_alertmanager_notification_latency_seconds",
"The latency of notifications in seconds.",
@@ -186,8 +186,8 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCountersPerUser(out, m.alertsReceived, "alertmanager_alerts_received_total")
data.SendSumOfCountersPerUser(out, m.alertsInvalid, "alertmanager_alerts_invalid_total")

data.SendSumOfCountersPerUser(out, m.numNotifications, "alertmanager_notifications_total")
data.SendSumOfCountersPerUser(out, m.numFailedNotifications, "alertmanager_notifications_failed_total")
data.SendSumOfCountersPerUserWithLabels(out, m.numNotifications, "alertmanager_notifications_total", "integration")
data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration")
data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
data.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state")

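Note on the metrics change above: the collector now aggregates the per-tenant Alertmanager registries with a helper that keeps the `integration` label in addition to `user`, so each notification counter is exported per (user, integration) pair. As a rough, self-contained sketch of what that label shape looks like — this is not the Cortex `util` helper; the metric name and values below are made up for illustration — assuming only the standard Prometheus Go client:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	reg := prometheus.NewRegistry()

	// Hypothetical counter mirroring the new label set: one series per
	// (user, integration) pair instead of a single series per user.
	notifications := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "example_notifications_total",
		Help: "Attempted notifications (illustrative only).",
	}, []string{"user", "integration"})
	reg.MustRegister(notifications)

	notifications.WithLabelValues("user1", "slack").Add(4)
	notifications.WithLabelValues("user1", "pagerduty").Add(1)
	notifications.WithLabelValues("user2", "slack").Add(40)

	// Gather and print the resulting series; each one carries both labels,
	// which is the shape the updated test expectations below assert.
	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		for _, m := range mf.GetMetric() {
			for _, lp := range m.GetLabel() {
				fmt.Printf("%s=%q ", lp.GetName(), lp.GetValue())
			}
			fmt.Println(mf.GetName(), m.GetCounter().GetValue())
		}
	}
}
```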
54 changes: 48 additions & 6 deletions pkg/alertmanager/alertmanager_metrics_test.go
@@ -99,14 +99,56 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
cortex_alertmanager_notification_latency_seconds_count 24
# HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications.
# TYPE cortex_alertmanager_notifications_failed_total counter
cortex_alertmanager_notifications_failed_total{user="user1"} 28
cortex_alertmanager_notifications_failed_total{user="user2"} 280
cortex_alertmanager_notifications_failed_total{user="user3"} 2800
cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0
cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0
cortex_alertmanager_notifications_failed_total{integration="email",user="user3"} 0
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user3"} 500
cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 1
cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user2"} 10
cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user3"} 100
cortex_alertmanager_notifications_failed_total{integration="pushover",user="user1"} 3
cortex_alertmanager_notifications_failed_total{integration="pushover",user="user2"} 30
cortex_alertmanager_notifications_failed_total{integration="pushover",user="user3"} 300
cortex_alertmanager_notifications_failed_total{integration="slack",user="user1"} 4
cortex_alertmanager_notifications_failed_total{integration="slack",user="user2"} 40
cortex_alertmanager_notifications_failed_total{integration="slack",user="user3"} 400
cortex_alertmanager_notifications_failed_total{integration="victorops",user="user1"} 7
cortex_alertmanager_notifications_failed_total{integration="victorops",user="user2"} 70
cortex_alertmanager_notifications_failed_total{integration="victorops",user="user3"} 700
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user1"} 6
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user2"} 60
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user3"} 600
cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 2
cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 20
cortex_alertmanager_notifications_failed_total{integration="wechat",user="user3"} 200
# HELP cortex_alertmanager_notifications_total The total number of attempted notifications.
# TYPE cortex_alertmanager_notifications_total counter
cortex_alertmanager_notifications_total{user="user1"} 28
cortex_alertmanager_notifications_total{user="user2"} 280
cortex_alertmanager_notifications_total{user="user3"} 2800
cortex_alertmanager_notifications_total{integration="email",user="user1"} 0
cortex_alertmanager_notifications_total{integration="email",user="user2"} 0
cortex_alertmanager_notifications_total{integration="email",user="user3"} 0
cortex_alertmanager_notifications_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notifications_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notifications_total{integration="opsgenie",user="user3"} 500
cortex_alertmanager_notifications_total{integration="pagerduty",user="user1"} 1
cortex_alertmanager_notifications_total{integration="pagerduty",user="user2"} 10
cortex_alertmanager_notifications_total{integration="pagerduty",user="user3"} 100
cortex_alertmanager_notifications_total{integration="pushover",user="user1"} 3
cortex_alertmanager_notifications_total{integration="pushover",user="user2"} 30
cortex_alertmanager_notifications_total{integration="pushover",user="user3"} 300
cortex_alertmanager_notifications_total{integration="slack",user="user1"} 4
cortex_alertmanager_notifications_total{integration="slack",user="user2"} 40
cortex_alertmanager_notifications_total{integration="slack",user="user3"} 400
cortex_alertmanager_notifications_total{integration="victorops",user="user1"} 7
cortex_alertmanager_notifications_total{integration="victorops",user="user2"} 70
cortex_alertmanager_notifications_total{integration="victorops",user="user3"} 700
cortex_alertmanager_notifications_total{integration="webhook",user="user1"} 6
cortex_alertmanager_notifications_total{integration="webhook",user="user2"} 60
cortex_alertmanager_notifications_total{integration="webhook",user="user3"} 600
cortex_alertmanager_notifications_total{integration="wechat",user="user1"} 2
cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 20
cortex_alertmanager_notifications_total{integration="wechat",user="user3"} 200
# HELP cortex_alertmanager_silences How many silences by state.
# TYPE cortex_alertmanager_silences gauge
cortex_alertmanager_silences{state="active",user="user1"} 1
2 changes: 1 addition & 1 deletion pkg/alertmanager/multitenant.go
@@ -135,7 +135,7 @@ func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAl
m.invalidConfig = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "alertmanager_config_invalid",
Help: "Whenever the Alertmanager config is invalid for a user.",
Help: "Boolean set to 1 whenever the Alertmanager config is invalid for a user.",
}, []string{"user"})

return m
8 changes: 4 additions & 4 deletions pkg/alertmanager/multitenant_test.go
@@ -96,7 +96,7 @@ func TestLoadAllConfigs(t *testing.T) {
require.Equal(t, simpleConfigOne, currentConfig.RawConfig)

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
@@ -113,7 +113,7 @@ func TestLoadAllConfigs(t *testing.T) {
require.Len(t, am.alertmanagers, 3)

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
@@ -146,7 +146,7 @@ func TestLoadAllConfigs(t *testing.T) {
require.False(t, userAM.IsActive())

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
@@ -170,7 +170,7 @@ func TestLoadAllConfigs(t *testing.T) {
require.True(t, userAM.IsActive())

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
37 changes: 31 additions & 6 deletions pkg/ruler/manager.go
@@ -37,9 +37,12 @@ type DefaultMultiTenantManager struct {
notifiersMtx sync.Mutex
notifiers map[string]*rulerNotifier

managersTotal prometheus.Gauge
registry prometheus.Registerer
logger log.Logger
managersTotal prometheus.Gauge
lastReloadSuccessful *prometheus.GaugeVec
lastReloadSuccessfulTimestamp *prometheus.GaugeVec
configUpdatesTotal *prometheus.CounterVec
registry prometheus.Registerer
logger log.Logger
}

func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) {
@@ -66,6 +69,21 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg
Name: "ruler_managers_total",
Help: "Total number of managers registered and running in the ruler",
}),
lastReloadSuccessful: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "ruler_config_last_reload_successful",
Help: "Boolean set to 1 whenever the last configuration reload attempt was successful.",
}, []string{"user"}),
lastReloadSuccessfulTimestamp: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "ruler_config_last_reload_successful_seconds",
Help: "Timestamp of the last successful configuration reload.",
}, []string{"user"}),
configUpdatesTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Namespace: "cortex",
Name: "ruler_config_updates_total",
Help: "Total number of config updates triggered by a user",
}, []string{"user"}),
registry: reg,
logger: logger,
}, nil
@@ -86,6 +104,9 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou
if _, exists := ruleGroups[userID]; !exists {
go mngr.Stop()
delete(r.userManagers, userID)
r.lastReloadSuccessful.DeleteLabelValues(userID)
r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID)
r.configUpdatesTotal.DeleteLabelValues(userID)
level.Info(r.logger).Log("msg", "deleting rule manager", "user", userID)
}
}
@@ -100,18 +121,19 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
// have been updated
update, files, err := r.mapper.MapRules(user, groups.Formatted())
if err != nil {
r.lastReloadSuccessful.WithLabelValues(user).Set(0)
level.Error(r.logger).Log("msg", "unable to map rule files", "user", user, "err", err)
return
}

if update {
level.Debug(r.logger).Log("msg", "updating rules", "user", user)
configUpdatesTotal.WithLabelValues(user).Inc()
r.configUpdatesTotal.WithLabelValues(user).Inc()
manager, exists := r.userManagers[user]
if !exists {
manager, err = r.newManager(ctx, user)
if err != nil {
configUpdateFailuresTotal.WithLabelValues(user, "rule-manager-creation-failure").Inc()
r.lastReloadSuccessful.WithLabelValues(user).Set(0)
level.Error(r.logger).Log("msg", "unable to create rule manager", "user", user, "err", err)
return
}
@@ -122,10 +144,13 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
}
err = manager.Update(r.cfg.EvaluationInterval, files, nil)
if err != nil {
configUpdateFailuresTotal.WithLabelValues(user, "rules-update-failure").Inc()
r.lastReloadSuccessful.WithLabelValues(user).Set(0)
level.Error(r.logger).Log("msg", "unable to update rule manager", "user", user, "err", err)
return
}

r.lastReloadSuccessful.WithLabelValues(user).Set(1)
r.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
}
}

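The manager.go changes above replace the old failure counter with a per-user success gauge (set to 0 or 1 on every reload attempt) plus a timestamp gauge that is only bumped on success, and both series are deleted when a user's rule manager is stopped. A minimal sketch of that pattern in isolation, assuming only the Prometheus Go client (the type and metric names here are illustrative, not the Cortex code):

```go
package main

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// reloadTracker records per-user config reload outcomes using the same
// boolean-gauge-plus-timestamp pattern as the ruler metrics above.
type reloadTracker struct {
	lastSuccess   *prometheus.GaugeVec
	lastSuccessTS *prometheus.GaugeVec
}

func newReloadTracker(reg prometheus.Registerer) *reloadTracker {
	return &reloadTracker{
		lastSuccess: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Name: "example_config_last_reload_successful",
			Help: "Boolean set to 1 whenever the last reload attempt succeeded.",
		}, []string{"user"}),
		lastSuccessTS: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Name: "example_config_last_reload_successful_seconds",
			Help: "Timestamp of the last successful reload.",
		}, []string{"user"}),
	}
}

// record sets the boolean gauge on every attempt and the timestamp gauge
// only when the reload succeeded.
func (t *reloadTracker) record(user string, err error) {
	if err != nil {
		t.lastSuccess.WithLabelValues(user).Set(0)
		return
	}
	t.lastSuccess.WithLabelValues(user).Set(1)
	t.lastSuccessTS.WithLabelValues(user).SetToCurrentTime()
}

// forget drops the user's series, mirroring the DeleteLabelValues calls
// made when a rule manager is removed.
func (t *reloadTracker) forget(user string) {
	t.lastSuccess.DeleteLabelValues(user)
	t.lastSuccessTS.DeleteLabelValues(user)
}

func main() {
	reg := prometheus.NewRegistry()
	tr := newReloadTracker(reg)
	tr.record("user1", nil) // successful reload
	tr.forget("user1")      // user removed
}
```

Alerting on the boolean gauge being 0, or on the timestamp gauge going stale, gives the same signal the removed failure counter used to provide, without needing a `reason` label.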
10 changes: 0 additions & 10 deletions pkg/ruler/ruler.go
@@ -38,16 +38,6 @@ var (
Name: "ruler_ring_check_errors_total",
Help: "Number of errors that have occurred when checking the ring for ownership",
})
configUpdatesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "cortex",
Name: "ruler_config_updates_total",
Help: "Total number of config updates triggered by a user",
}, []string{"user"})
configUpdateFailuresTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "cortex",
Name: "ruler_config_update_failures_total",
Help: "Total number of config update failures triggered by a user",
}, []string{"user", "reason"})
)

// Config is the configuration for the recording rules server.
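The block removed from ruler.go above held package-level collectors registered on the global default registry; their replacements in manager.go are built with `promauto.With(reg)` against the injected `Registerer`, so the series live with the manager, can be deleted per user, and can be asserted in tests against a throwaway registry. A small sketch of the two styles side by side (names are illustrative, not the actual Cortex metrics):

```go
package example

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// Package-level, registered on the global default registry: hard to reset
// between tests and impossible to scope to a single instance. (Old style.)
var globalUpdates = promauto.NewCounterVec(prometheus.CounterOpts{
	Name: "example_config_updates_total",
	Help: "Config updates (global-registry style).",
}, []string{"user"})

// Instance-scoped, registered on whatever Registerer the caller passes in;
// a test can hand in prometheus.NewRegistry() and inspect it in isolation.
// (Style this PR moves toward.)
type manager struct {
	configUpdatesTotal *prometheus.CounterVec
}

func newManager(reg prometheus.Registerer) *manager {
	return &manager{
		configUpdatesTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
			Name: "example_config_updates_total",
			Help: "Config updates (injected-registry style).",
		}, []string{"user"}),
	}
}
```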