Skip to content

Commit b404a95

Browse files
authored
AM Metric: Add tenant label to valid/invalid configs (#2960)
* AM Metric: Add tenant label to valid/invalid configs

  Gives us a way to know whenever a tenant has an invalid configuration in place.

  Signed-off-by: gotjosh <[email protected]>

* Remove the metrics when we have no active config

  Signed-off-by: gotjosh <[email protected]>

* These metrics no longer get initialised on starting up the component

  Signed-off-by: gotjosh <[email protected]>

* Address review feedback

  Signed-off-by: gotjosh <[email protected]>

* s/alertmanager_invalid_config/alertmanager_config_invalid

  Signed-off-by: gotjosh <[email protected]>
1 parent e2f8663 commit b404a95

File tree

4 files changed

+42
-41
lines changed

4 files changed

+42
-41
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
## master / unreleased
44

5+
* [CHANGE] Replace the metric `cortex_alertmanager_configs` with `cortex_alertmanager_config_invalid` exposed by Alertmanager. #2960
56
* [CHANGE] Experimental Delete Series: Change target flag for purger from `data-purger` to `purger`. #2777
67
* [CHANGE] Experimental TSDB: The max concurrent queries against the long-term storage, configured via `-experimental.tsdb.bucket-store.max-concurrent`, is now a limit shared across all tenants and not a per-tenant limit anymore. The default value has changed from `20` to `100` and the following new metrics have been added: #2797
78
* `cortex_bucket_stores_gate_queries_concurrent_max`

integration/alertmanager_test.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package main
55
import (
66
"context"
77
"testing"
8+
"time"
89

910
"github.com/stretchr/testify/require"
1011

@@ -29,7 +30,7 @@ func TestAlertmanager(t *testing.T) {
2930
"",
3031
)
3132
require.NoError(t, s.StartAndWaitReady(alertmanager))
32-
require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(1), "cortex_alertmanager_configs"))
33+
require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_config_invalid"))
3334

3435
c, err := e2ecortex.NewClient("", "", alertmanager.HTTPEndpoint(), "", "user-1")
3536
require.NoError(t, err)
@@ -67,7 +68,7 @@ func TestAlertmanagerStoreAPI(t *testing.T) {
6768
)
6869

6970
require.NoError(t, s.StartAndWaitReady(am))
70-
require.NoError(t, am.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_configs"))
71+
require.NoError(t, am.WaitSumMetrics(e2e.Equals(1), "alertmanager_cluster_members"))
7172

7273
c, err := e2ecortex.NewClient("", "", am.HTTPEndpoint(), "", "user-1")
7374
require.NoError(t, err)
@@ -79,7 +80,8 @@ func TestAlertmanagerStoreAPI(t *testing.T) {
7980
err = c.SetAlertmanagerConfig(context.Background(), cortexAlertmanagerUserConfigYaml, map[string]string{})
8081
require.NoError(t, err)
8182

82-
require.NoError(t, am.WaitSumMetrics(e2e.Equals(1), "cortex_alertmanager_configs"))
83+
time.Sleep(2 * time.Second)
84+
require.NoError(t, am.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_config_invalid"))
8385

8486
cfg, err := c.GetAlertmanagerConfig(context.Background())
8587
require.NoError(t, err)
@@ -95,7 +97,8 @@ func TestAlertmanagerStoreAPI(t *testing.T) {
9597
err = c.DeleteAlertmanagerConfig(context.Background())
9698
require.NoError(t, err)
9799

98-
require.NoError(t, am.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_configs"))
100+
time.Sleep(2 * time.Second)
101+
99102
cfg, err = c.GetAlertmanagerConfig(context.Background())
100103
require.Error(t, err)
101104
require.Nil(t, cfg)

pkg/alertmanager/multitenant.go

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,6 @@ const (
3939
// a URL derived from Config.AutoWebhookRoot
4040
autoWebhookURL = "http://internal.monitor"
4141

42-
configStatusValid = "valid"
43-
configStatusInvalid = "invalid"
44-
4542
statusPage = `
4643
<!doctype html>
4744
<html>
@@ -129,19 +126,17 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) {
129126
}
130127

131128
type multitenantAlertmanagerMetrics struct {
132-
totalConfigs *prometheus.GaugeVec
129+
invalidConfig *prometheus.GaugeVec
133130
}
134131

135132
func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics {
136133
m := &multitenantAlertmanagerMetrics{}
137134

138-
m.totalConfigs = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
135+
m.invalidConfig = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
139136
Namespace: "cortex",
140-
Name: "alertmanager_configs",
141-
Help: "How many configs the multitenant alertmanager knows about.",
142-
}, []string{"status"})
143-
m.totalConfigs.WithLabelValues(configStatusInvalid).Set(0)
144-
m.totalConfigs.WithLabelValues(configStatusValid).Set(0)
137+
Name: "alertmanager_config_invalid",
138+
Help: "Whenever the Alertmanager config is invalid for a user.",
139+
}, []string{"user"})
145140

146141
return m
147142
}
@@ -311,15 +306,16 @@ func (am *MultitenantAlertmanager) poll() (map[string]alerts.AlertConfigDesc, er
311306
}
312307

313308
func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfigDesc) {
314-
invalid := 0 // Count the number of invalid configs as we go.
315-
316309
level.Debug(am.logger).Log("msg", "adding configurations", "num_configs", len(cfgs))
317-
for _, cfg := range cfgs {
310+
for user, cfg := range cfgs {
318311
err := am.setConfig(cfg)
319312
if err != nil {
320-
invalid++
313+
am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(1))
321314
level.Warn(am.logger).Log("msg", "error applying config", "err", err)
315+
continue
322316
}
317+
318+
am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(0))
323319
}
324320

325321
am.alertmanagersMtx.Lock()
@@ -332,11 +328,10 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi
332328
level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", user)
333329
userAM.Pause()
334330
delete(am.cfgs, user)
331+
am.multitenantMetrics.invalidConfig.DeleteLabelValues(user)
335332
level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", user)
336333
}
337334
}
338-
am.multitenantMetrics.totalConfigs.WithLabelValues(configStatusInvalid).Set(float64(invalid))
339-
am.multitenantMetrics.totalConfigs.WithLabelValues(configStatusValid).Set(float64(len(am.cfgs) - invalid))
340335
}
341336

342337
func (am *MultitenantAlertmanager) transformConfig(userID string, amConfig *amconfig.Config) (*amconfig.Config, error) {
@@ -407,7 +402,7 @@ func (am *MultitenantAlertmanager) setConfig(cfg alerts.AlertConfigDesc) error {
407402
if am.fallbackConfig == "" {
408403
return fmt.Errorf("blank Alertmanager configuration for %v", cfg.User)
409404
}
410-
level.Info(am.logger).Log("msg", "blank Alertmanager configuration; using fallback", "user_id", cfg.User)
405+
level.Info(am.logger).Log("msg", "blank Alertmanager configuration; using fallback", "user", cfg.User)
411406
userAmConfig, err = amconfig.Load(am.fallbackConfig)
412407
if err != nil {
413408
return fmt.Errorf("unable to load fallback configuration for %v: %v", cfg.User, err)

pkg/alertmanager/multitenant_test.go

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -96,11 +96,11 @@ func TestLoadAllConfigs(t *testing.T) {
9696
require.Equal(t, simpleConfigOne, currentConfig.RawConfig)
9797

9898
assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
99-
# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
100-
# TYPE cortex_alertmanager_configs gauge
101-
cortex_alertmanager_configs{status="valid"} 2
102-
cortex_alertmanager_configs{status="invalid"} 0
103-
`), "cortex_alertmanager_configs"))
99+
# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
100+
# TYPE cortex_alertmanager_config_invalid gauge
101+
cortex_alertmanager_config_invalid{user="user1"} 0
102+
cortex_alertmanager_config_invalid{user="user2"} 0
103+
`), "cortex_alertmanager_config_invalid"))
104104

105105
// Ensure when a 3rd config is added, it is synced correctly
106106
mockStore.configs["user3"] = alerts.AlertConfigDesc{
@@ -113,11 +113,12 @@ func TestLoadAllConfigs(t *testing.T) {
113113
require.Len(t, am.alertmanagers, 3)
114114

115115
assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
116-
# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
117-
# TYPE cortex_alertmanager_configs gauge
118-
cortex_alertmanager_configs{status="valid"} 3
119-
cortex_alertmanager_configs{status="invalid"} 0
120-
`), "cortex_alertmanager_configs"))
116+
# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
117+
# TYPE cortex_alertmanager_config_invalid gauge
118+
cortex_alertmanager_config_invalid{user="user1"} 0
119+
cortex_alertmanager_config_invalid{user="user2"} 0
120+
cortex_alertmanager_config_invalid{user="user3"} 0
121+
`), "cortex_alertmanager_config_invalid"))
121122

122123
// Ensure the config is updated
123124
mockStore.configs["user1"] = alerts.AlertConfigDesc{
@@ -145,11 +146,11 @@ func TestLoadAllConfigs(t *testing.T) {
145146
require.False(t, userAM.IsActive())
146147

147148
assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
148-
# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
149-
# TYPE cortex_alertmanager_configs gauge
150-
cortex_alertmanager_configs{status="valid"} 2
151-
cortex_alertmanager_configs{status="invalid"} 0
152-
`), "cortex_alertmanager_configs"))
149+
# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
150+
# TYPE cortex_alertmanager_config_invalid gauge
151+
cortex_alertmanager_config_invalid{user="user1"} 0
152+
cortex_alertmanager_config_invalid{user="user2"} 0
153+
`), "cortex_alertmanager_config_invalid"))
153154

154155
// Ensure when a 3rd config is re-added, it is synced correctly
155156
mockStore.configs["user3"] = alerts.AlertConfigDesc{
@@ -169,9 +170,10 @@ func TestLoadAllConfigs(t *testing.T) {
169170
require.True(t, userAM.IsActive())
170171

171172
assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
172-
# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
173-
# TYPE cortex_alertmanager_configs gauge
174-
cortex_alertmanager_configs{status="valid"} 3
175-
cortex_alertmanager_configs{status="invalid"} 0
176-
`), "cortex_alertmanager_configs"))
173+
# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
174+
# TYPE cortex_alertmanager_config_invalid gauge
175+
cortex_alertmanager_config_invalid{user="user1"} 0
176+
cortex_alertmanager_config_invalid{user="user2"} 0
177+
cortex_alertmanager_config_invalid{user="user3"} 0
178+
`), "cortex_alertmanager_config_invalid"))
177179
}

0 commit comments

Comments
 (0)