Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
- `-ruler.ring.instance-interface` renamed to `-ruler.ring.instance-interface-names`
* [CHANGE] Renamed `-<prefix>.redis.enable-tls` CLI flag to `-<prefix>.redis.tls-enabled`, and its respective YAML config option from `enable_tls` to `tls_enabled`. #3298
* [CHANGE] Increased default `-<prefix>.redis.timeout` from `100ms` to `500ms`. #3301
* [CHANGE] `cortex_alertmanager_config_invalid` has been removed in favor of `cortex_alertmanager_config_last_reload_successful`. #3289
* [FEATURE] Added support for shuffle-sharding queriers in the query-frontend. When configured (`-frontend.max-queriers-per-tenant` globally, or using per-tenant limit `max_queriers_per_tenant`), each tenant's requests will be handled by a different set of queriers. #3113 #3257
* [FEATURE] Query-frontend: added `compression` config to support results cache with compression. #3217
* [ENHANCEMENT] Allow specifying multiple comma-separated Cortex services in the `-target` CLI option (or its respective YAML config option). For example, `-target=all,compactor` can be used to start Cortex single-binary with compactor as well. #3275
Expand Down Expand Up @@ -77,6 +78,7 @@
* [ENHANCEMENT] Return an explicit error when the store-gateway is explicitly requested without a blocks storage engine. #3287
* [ENHANCEMENT] Ruler: only load rules that belong to the ruler. Improves rules syncing performance when ruler sharding is enabled. #3269
* [ENHANCEMENT] Added `-<prefix>.redis.tls-insecure-skip-verify` flag. #3298
* [ENHANCEMENT] Added `cortex_alertmanager_config_last_reload_successful_seconds` metric to show the timestamp of the last successful Alertmanager config reload. #3289
* [BUGFIX] No-longer-needed ingester operations for queries triggered by queriers and rulers are now canceled. #3178
* [BUGFIX] Ruler: directories in the configured `rules-path` will be removed on startup and shutdown in order to ensure they don't persist between runs. #3195
* [BUGFIX] Handle hash-collisions in the query path. #3192
Expand Down
11 changes: 8 additions & 3 deletions integration/alertmanager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func TestAlertmanager(t *testing.T) {
"",
)
require.NoError(t, s.StartAndWaitReady(alertmanager))
require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_config_invalid"))
require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(1), "cortex_alertmanager_config_last_reload_successful"))

c, err := e2ecortex.NewClient("", "", alertmanager.HTTPEndpoint(), "", "user-1")
require.NoError(t, err)
Expand Down Expand Up @@ -81,7 +81,10 @@ func TestAlertmanagerStoreAPI(t *testing.T) {
err = c.SetAlertmanagerConfig(context.Background(), cortexAlertmanagerUserConfigYaml, map[string]string{})
require.NoError(t, err)

require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_alertmanager_config_invalid"},
require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_alertmanager_config_last_reload_successful"},
e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")),
e2e.WaitMissingMetrics))
require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Greater(0), []string{"cortex_alertmanager_config_last_reload_successful_seconds"},
e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")),
e2e.WaitMissingMetrics))

Expand All @@ -108,7 +111,9 @@ func TestAlertmanagerStoreAPI(t *testing.T) {

// The deleted config is applied asynchronously, so we should wait until the metrics
// disappear for the specific user.
require.NoError(t, am.WaitRemovedMetric("cortex_alertmanager_config_invalid", e2e.WithLabelMatchers(
require.NoError(t, am.WaitRemovedMetric("cortex_alertmanager_config_last_reload_successful", e2e.WithLabelMatchers(
labels.MustNewMatcher(labels.MatchEqual, "user", "user-1"))))
require.NoError(t, am.WaitRemovedMetric("cortex_alertmanager_config_last_reload_successful_seconds", e2e.WithLabelMatchers(
labels.MustNewMatcher(labels.MatchEqual, "user", "user-1"))))

cfg, err = c.GetAlertmanagerConfig(context.Background())
Expand Down
23 changes: 16 additions & 7 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,16 +126,23 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) {
}

type multitenantAlertmanagerMetrics struct {
invalidConfig *prometheus.GaugeVec
lastReloadSuccessful *prometheus.GaugeVec
lastReloadSuccessfulTimestamp *prometheus.GaugeVec
}

func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics {
m := &multitenantAlertmanagerMetrics{}

m.invalidConfig = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
m.lastReloadSuccessful = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "alertmanager_config_invalid",
Help: "Boolean set to 1 whenever the Alertmanager config is invalid for a user.",
Name: "alertmanager_config_last_reload_successful",
Help: "Boolean set to 1 whenever the last configuration reload attempt was successful.",
}, []string{"user"})

m.lastReloadSuccessfulTimestamp = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "alertmanager_config_last_reload_successful_seconds",
Help: "Timestamp of the last successful configuration reload.",
}, []string{"user"})

return m
Expand Down Expand Up @@ -314,12 +321,13 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi
for user, cfg := range cfgs {
err := am.setConfig(cfg)
if err != nil {
am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(1))
am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0))
level.Warn(am.logger).Log("msg", "error applying config", "err", err)
continue
}

am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(0))
am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(1))
am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
}

am.alertmanagersMtx.Lock()
Expand All @@ -332,7 +340,8 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi
level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", user)
userAM.Pause()
delete(am.cfgs, user)
am.multitenantMetrics.invalidConfig.DeleteLabelValues(user)
am.multitenantMetrics.lastReloadSuccessful.DeleteLabelValues(user)
am.multitenantMetrics.lastReloadSuccessfulTimestamp.DeleteLabelValues(user)
level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", user)
}
}
Expand Down
44 changes: 22 additions & 22 deletions pkg/alertmanager/multitenant_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,11 @@ func TestLoadAllConfigs(t *testing.T) {
require.Equal(t, simpleConfigOne, currentConfig.RawConfig)

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
`), "cortex_alertmanager_config_invalid"))
# HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful.
# TYPE cortex_alertmanager_config_last_reload_successful gauge
cortex_alertmanager_config_last_reload_successful{user="user1"} 1
cortex_alertmanager_config_last_reload_successful{user="user2"} 1
`), "cortex_alertmanager_config_last_reload_successful"))

// Ensure when a 3rd config is added, it is synced correctly
mockStore.configs["user3"] = alerts.AlertConfigDesc{
Expand All @@ -113,12 +113,12 @@ func TestLoadAllConfigs(t *testing.T) {
require.Len(t, am.alertmanagers, 3)

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
cortex_alertmanager_config_invalid{user="user3"} 0
`), "cortex_alertmanager_config_invalid"))
# HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful.
# TYPE cortex_alertmanager_config_last_reload_successful gauge
cortex_alertmanager_config_last_reload_successful{user="user1"} 1
cortex_alertmanager_config_last_reload_successful{user="user2"} 1
cortex_alertmanager_config_last_reload_successful{user="user3"} 1
`), "cortex_alertmanager_config_last_reload_successful"))

// Ensure the config is updated
mockStore.configs["user1"] = alerts.AlertConfigDesc{
Expand Down Expand Up @@ -146,11 +146,11 @@ func TestLoadAllConfigs(t *testing.T) {
require.False(t, userAM.IsActive())

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
`), "cortex_alertmanager_config_invalid"))
# HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful.
# TYPE cortex_alertmanager_config_last_reload_successful gauge
cortex_alertmanager_config_last_reload_successful{user="user1"} 1
cortex_alertmanager_config_last_reload_successful{user="user2"} 1
`), "cortex_alertmanager_config_last_reload_successful"))

// Ensure when a 3rd config is re-added, it is synced correctly
mockStore.configs["user3"] = alerts.AlertConfigDesc{
Expand All @@ -170,12 +170,12 @@ func TestLoadAllConfigs(t *testing.T) {
require.True(t, userAM.IsActive())

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
cortex_alertmanager_config_invalid{user="user3"} 0
`), "cortex_alertmanager_config_invalid"))
# HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful.
# TYPE cortex_alertmanager_config_last_reload_successful gauge
cortex_alertmanager_config_last_reload_successful{user="user1"} 1
cortex_alertmanager_config_last_reload_successful{user="user2"} 1
cortex_alertmanager_config_last_reload_successful{user="user3"} 1
`), "cortex_alertmanager_config_last_reload_successful"))
}

func TestAlertmanager_NoExternalURL(t *testing.T) {
Expand Down