diff --git a/CHANGELOG.md b/CHANGELOG.md index 824c46567b8..7c3a3dade8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## master / unreleased +* [CHANGE] Replace the metric `cortex_alertmanager_configs` with `cortex_alertmanager_config_invalid` exposed by Alertmanager. #2960 * [CHANGE] Experimental Delete Series: Change target flag for purger from `data-purger` to `purger`. #2777 * [CHANGE] Experimental TSDB: The max concurrent queries against the long-term storage, configured via `-experimental.tsdb.bucket-store.max-concurrent`, is now a limit shared across all tenants and not a per-tenant limit anymore. The default value has changed from `20` to `100` and the following new metrics have been added: #2797 * `cortex_bucket_stores_gate_queries_concurrent_max` diff --git a/integration/alertmanager_test.go b/integration/alertmanager_test.go index c4a01a10419..3069dc10b2a 100644 --- a/integration/alertmanager_test.go +++ b/integration/alertmanager_test.go @@ -5,6 +5,7 @@ package main import ( "context" "testing" + "time" "github.com/stretchr/testify/require" @@ -29,7 +30,7 @@ func TestAlertmanager(t *testing.T) { "", ) require.NoError(t, s.StartAndWaitReady(alertmanager)) - require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(1), "cortex_alertmanager_configs")) + require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_config_invalid")) c, err := e2ecortex.NewClient("", "", alertmanager.HTTPEndpoint(), "", "user-1") require.NoError(t, err) @@ -67,7 +68,7 @@ func TestAlertmanagerStoreAPI(t *testing.T) { ) require.NoError(t, s.StartAndWaitReady(am)) - require.NoError(t, am.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_configs")) + require.NoError(t, am.WaitSumMetrics(e2e.Equals(1), "alertmanager_cluster_members")) c, err := e2ecortex.NewClient("", "", am.HTTPEndpoint(), "", "user-1") require.NoError(t, err) @@ -79,7 +80,8 @@ func TestAlertmanagerStoreAPI(t *testing.T) { err = c.SetAlertmanagerConfig(context.Background(), cortexAlertmanagerUserConfigYaml, map[string]string{}) require.NoError(t, err) - require.NoError(t, am.WaitSumMetrics(e2e.Equals(1), "cortex_alertmanager_configs")) + time.Sleep(2 * time.Second) + require.NoError(t, am.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_config_invalid")) cfg, err := c.GetAlertmanagerConfig(context.Background()) require.NoError(t, err) @@ -95,7 +97,8 @@ func TestAlertmanagerStoreAPI(t *testing.T) { err = c.DeleteAlertmanagerConfig(context.Background()) require.NoError(t, err) - require.NoError(t, am.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_configs")) + time.Sleep(2 * time.Second) + cfg, err = c.GetAlertmanagerConfig(context.Background()) require.Error(t, err) require.Nil(t, cfg) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 4a13ff1c242..ae9deafe1af 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -39,9 +39,6 @@ const ( // a URL derived from Config.AutoWebhookRoot autoWebhookURL = "http://internal.monitor" - configStatusValid = "valid" - configStatusInvalid = "invalid" - statusPage = ` @@ -129,19 +126,17 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) { } type multitenantAlertmanagerMetrics struct { - totalConfigs *prometheus.GaugeVec + invalidConfig *prometheus.GaugeVec } func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics { m := &multitenantAlertmanagerMetrics{} - m.totalConfigs = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + m.invalidConfig = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ Namespace: "cortex", - Name: "alertmanager_configs", - Help: "How many configs the multitenant alertmanager knows about.", - }, []string{"status"}) - m.totalConfigs.WithLabelValues(configStatusInvalid).Set(0) - m.totalConfigs.WithLabelValues(configStatusValid).Set(0) + Name: "alertmanager_config_invalid", + Help: "Whenever the Alertmanager config is invalid for a user.", + }, []string{"user"}) return m } @@ -311,15 +306,16 @@ func (am *MultitenantAlertmanager) poll() (map[string]alerts.AlertConfigDesc, er } func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfigDesc) { - invalid := 0 // Count the number of invalid configs as we go. - level.Debug(am.logger).Log("msg", "adding configurations", "num_configs", len(cfgs)) - for _, cfg := range cfgs { + for user, cfg := range cfgs { err := am.setConfig(cfg) if err != nil { - invalid++ + am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(1)) level.Warn(am.logger).Log("msg", "error applying config", "err", err) + continue } + + am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(0)) } am.alertmanagersMtx.Lock() @@ -332,11 +328,10 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", user) userAM.Pause() delete(am.cfgs, user) + am.multitenantMetrics.invalidConfig.DeleteLabelValues(user) level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", user) } } - am.multitenantMetrics.totalConfigs.WithLabelValues(configStatusInvalid).Set(float64(invalid)) - am.multitenantMetrics.totalConfigs.WithLabelValues(configStatusValid).Set(float64(len(am.cfgs) - invalid)) } func (am *MultitenantAlertmanager) transformConfig(userID string, amConfig *amconfig.Config) (*amconfig.Config, error) { @@ -407,7 +402,7 @@ func (am *MultitenantAlertmanager) setConfig(cfg alerts.AlertConfigDesc) error { if am.fallbackConfig == "" { return fmt.Errorf("blank Alertmanager configuration for %v", cfg.User) } - level.Info(am.logger).Log("msg", "blank Alertmanager configuration; using fallback", "user_id", cfg.User) + level.Info(am.logger).Log("msg", "blank Alertmanager configuration; using fallback", "user", cfg.User) userAmConfig, err = amconfig.Load(am.fallbackConfig) if err != nil { return fmt.Errorf("unable to load fallback configuration for %v: %v", cfg.User, err) diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 6801d148895..14b920cdd57 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -96,11 +96,11 @@ func TestLoadAllConfigs(t *testing.T) { require.Equal(t, simpleConfigOne, currentConfig.RawConfig) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` - # HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about. - # TYPE cortex_alertmanager_configs gauge - cortex_alertmanager_configs{status="valid"} 2 - cortex_alertmanager_configs{status="invalid"} 0 - `), "cortex_alertmanager_configs")) + # HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user. + # TYPE cortex_alertmanager_config_invalid gauge + cortex_alertmanager_config_invalid{user="user1"} 0 + cortex_alertmanager_config_invalid{user="user2"} 0 + `), "cortex_alertmanager_config_invalid")) // Ensure when a 3rd config is added, it is synced correctly mockStore.configs["user3"] = alerts.AlertConfigDesc{ @@ -113,11 +113,12 @@ func TestLoadAllConfigs(t *testing.T) { require.Len(t, am.alertmanagers, 3) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` - # HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about. - # TYPE cortex_alertmanager_configs gauge - cortex_alertmanager_configs{status="valid"} 3 - cortex_alertmanager_configs{status="invalid"} 0 - `), "cortex_alertmanager_configs")) + # HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user. + # TYPE cortex_alertmanager_config_invalid gauge + cortex_alertmanager_config_invalid{user="user1"} 0 + cortex_alertmanager_config_invalid{user="user2"} 0 + cortex_alertmanager_config_invalid{user="user3"} 0 + `), "cortex_alertmanager_config_invalid")) // Ensure the config is updated mockStore.configs["user1"] = alerts.AlertConfigDesc{ @@ -145,11 +146,11 @@ func TestLoadAllConfigs(t *testing.T) { require.False(t, userAM.IsActive()) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` - # HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about. - # TYPE cortex_alertmanager_configs gauge - cortex_alertmanager_configs{status="valid"} 2 - cortex_alertmanager_configs{status="invalid"} 0 - `), "cortex_alertmanager_configs")) + # HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user. + # TYPE cortex_alertmanager_config_invalid gauge + cortex_alertmanager_config_invalid{user="user1"} 0 + cortex_alertmanager_config_invalid{user="user2"} 0 + `), "cortex_alertmanager_config_invalid")) // Ensure when a 3rd config is re-added, it is synced correctly mockStore.configs["user3"] = alerts.AlertConfigDesc{ @@ -169,9 +170,10 @@ func TestLoadAllConfigs(t *testing.T) { require.True(t, userAM.IsActive()) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` - # HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about. - # TYPE cortex_alertmanager_configs gauge - cortex_alertmanager_configs{status="valid"} 3 - cortex_alertmanager_configs{status="invalid"} 0 - `), "cortex_alertmanager_configs")) + # HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user. + # TYPE cortex_alertmanager_config_invalid gauge + cortex_alertmanager_config_invalid{user="user1"} 0 + cortex_alertmanager_config_invalid{user="user2"} 0 + cortex_alertmanager_config_invalid{user="user3"} 0 + `), "cortex_alertmanager_config_invalid")) }