1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -2,6 +2,7 @@

## master / unreleased

* [CHANGE] Replace the metric `cortex_alertmanager_configs` with `cortex_alertmanager_config_invalid` exposed by Alertmanager. #2960
* [CHANGE] Experimental Delete Series: Change target flag for purger from `data-purger` to `purger`. #2777
* [CHANGE] Experimental TSDB: The max concurrent queries against the long-term storage, configured via `-experimental.tsdb.bucket-store.max-concurrent`, is now a limit shared across all tenants and not a per-tenant limit anymore. The default value has changed from `20` to `100` and the following new metrics have been added: #2797
* `cortex_bucket_stores_gate_queries_concurrent_max`
11 changes: 7 additions & 4 deletions integration/alertmanager_test.go
@@ -5,6 +5,7 @@ package main
import (
"context"
"testing"
"time"

"github.com/stretchr/testify/require"

@@ -29,7 +30,7 @@ func TestAlertmanager(t *testing.T) {
"",
)
require.NoError(t, s.StartAndWaitReady(alertmanager))
require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(1), "cortex_alertmanager_configs"))
require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_config_invalid"))

c, err := e2ecortex.NewClient("", "", alertmanager.HTTPEndpoint(), "", "user-1")
require.NoError(t, err)
@@ -67,7 +68,7 @@ func TestAlertmanagerStoreAPI(t *testing.T) {
)

require.NoError(t, s.StartAndWaitReady(am))
require.NoError(t, am.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_configs"))
require.NoError(t, am.WaitSumMetrics(e2e.Equals(1), "alertmanager_cluster_members"))

c, err := e2ecortex.NewClient("", "", am.HTTPEndpoint(), "", "user-1")
require.NoError(t, err)
@@ -79,7 +80,8 @@ func TestAlertmanagerStoreAPI(t *testing.T) {
err = c.SetAlertmanagerConfig(context.Background(), cortexAlertmanagerUserConfigYaml, map[string]string{})
require.NoError(t, err)

require.NoError(t, am.WaitSumMetrics(e2e.Equals(1), "cortex_alertmanager_configs"))
time.Sleep(2 * time.Second)
@gotjosh (Contributor Author) commented on Aug 3, 2020:

I solved the CI failure with a quick hack, but I dislike Sleep in tests as it is a general source of flaky tests. The problem here is that the metric (being per tenant) is not registered until much later.

One solution would be to create a new method (or an argument to WaitSumMetrics) that waits for a metric to appear instead of bailing out when the metric is not present on the initial metrics page load. I can see this being useful in other areas as well.

Please let me know and I'll be happy to make the change.
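
For illustration only, here is a minimal sketch of such an option, assuming a simple retry loop; the names (e2eutil, WaitMissingMetrics, waitSumMetric, fetchSum) are hypothetical and not the actual e2e framework API:

package e2eutil

import (
	"errors"
	"fmt"
	"time"
)

// errMissingMetric would be returned by the (hypothetical) fetch function when
// the metric is not yet present on the /metrics page.
var errMissingMetric = errors.New("metric not found")

type waitOptions struct {
	waitMissingMetrics bool // keep retrying when the metric is absent instead of bailing out
}

// WaitOption is a functional option for waitSumMetric.
type WaitOption func(*waitOptions)

// WaitMissingMetrics treats a missing metric as "not ready yet" rather than as a failure.
func WaitMissingMetrics(o *waitOptions) { o.waitMissingMetrics = true }

// waitSumMetric polls fetchSum until the metric sums to expected, the retry
// budget runs out, or a hard error occurs.
func waitSumMetric(fetchSum func(name string) (float64, error), name string, expected float64, opts ...WaitOption) error {
	var options waitOptions
	for _, o := range opts {
		o(&options)
	}

	for retries := 0; retries < 50; retries++ {
		sum, err := fetchSum(name)
		switch {
		case err == nil && sum == expected:
			return nil
		case errors.Is(err, errMissingMetric) && options.waitMissingMetrics:
			// Metric not registered yet (e.g. lazily-created per-tenant metrics): keep waiting.
		case err != nil:
			return err
		}
		time.Sleep(100 * time.Millisecond)
	}
	return fmt.Errorf("metric %q never summed to %v", name, expected)
}

With something along those lines, the test could call WaitSumMetrics with the missing-metric option and drop the time.Sleep.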

A Contributor replied:

Let's work on it in a separate PR. Could you open an issue, please? I have this pending PR #2522 which could be used as a baseline to add a functional option to enable the logic you suggest.

@gotjosh (Contributor Author) replied:

Thanks Marco, filed in #2975.

require.NoError(t, am.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_config_invalid"))

cfg, err := c.GetAlertmanagerConfig(context.Background())
require.NoError(t, err)
@@ -95,7 +97,8 @@ func TestAlertmanagerStoreAPI(t *testing.T) {
err = c.DeleteAlertmanagerConfig(context.Background())
require.NoError(t, err)

require.NoError(t, am.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_configs"))
time.Sleep(2 * time.Second)

cfg, err = c.GetAlertmanagerConfig(context.Background())
require.Error(t, err)
require.Nil(t, cfg)
29 changes: 12 additions & 17 deletions pkg/alertmanager/multitenant.go
@@ -39,9 +39,6 @@ const (
// a URL derived from Config.AutoWebhookRoot
autoWebhookURL = "http://internal.monitor"

configStatusValid = "valid"
configStatusInvalid = "invalid"

statusPage = `
<!doctype html>
<html>
@@ -129,19 +126,17 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) {
}

type multitenantAlertmanagerMetrics struct {
totalConfigs *prometheus.GaugeVec
invalidConfig *prometheus.GaugeVec
}

func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics {
m := &multitenantAlertmanagerMetrics{}

m.totalConfigs = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
m.invalidConfig = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "alertmanager_configs",
Help: "How many configs the multitenant alertmanager knows about.",
}, []string{"status"})
m.totalConfigs.WithLabelValues(configStatusInvalid).Set(0)
m.totalConfigs.WithLabelValues(configStatusValid).Set(0)
Name: "alertmanager_config_invalid",
Help: "Whenever the Alertmanager config is invalid for a user.",
}, []string{"user"})

return m
}
@@ -311,15 +306,16 @@ func (am *MultitenantAlertmanager) poll() (map[string]alerts.AlertConfigDesc, er
}

func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfigDesc) {
invalid := 0 // Count the number of invalid configs as we go.

level.Debug(am.logger).Log("msg", "adding configurations", "num_configs", len(cfgs))
for _, cfg := range cfgs {
for user, cfg := range cfgs {
err := am.setConfig(cfg)
if err != nil {
invalid++
am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(1))
level.Warn(am.logger).Log("msg", "error applying config", "err", err)
continue
}

am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(0))
}

am.alertmanagersMtx.Lock()
@@ -332,11 +328,10 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi
level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", user)
userAM.Pause()
delete(am.cfgs, user)
am.multitenantMetrics.invalidConfig.DeleteLabelValues(user)
level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", user)
}
}
am.multitenantMetrics.totalConfigs.WithLabelValues(configStatusInvalid).Set(float64(invalid))
am.multitenantMetrics.totalConfigs.WithLabelValues(configStatusValid).Set(float64(len(am.cfgs) - invalid))
}

func (am *MultitenantAlertmanager) transformConfig(userID string, amConfig *amconfig.Config) (*amconfig.Config, error) {
@@ -407,7 +402,7 @@ func (am *MultitenantAlertmanager) setConfig(cfg alerts.AlertConfigDesc) error {
if am.fallbackConfig == "" {
return fmt.Errorf("blank Alertmanager configuration for %v", cfg.User)
}
level.Info(am.logger).Log("msg", "blank Alertmanager configuration; using fallback", "user_id", cfg.User)
level.Info(am.logger).Log("msg", "blank Alertmanager configuration; using fallback", "user", cfg.User)
userAmConfig, err = amconfig.Load(am.fallbackConfig)
if err != nil {
return fmt.Errorf("unable to load fallback configuration for %v: %v", cfg.User, err)
42 changes: 22 additions & 20 deletions pkg/alertmanager/multitenant_test.go
@@ -96,11 +96,11 @@ func TestLoadAllConfigs(t *testing.T) {
require.Equal(t, simpleConfigOne, currentConfig.RawConfig)

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
# TYPE cortex_alertmanager_configs gauge
cortex_alertmanager_configs{status="valid"} 2
cortex_alertmanager_configs{status="invalid"} 0
`), "cortex_alertmanager_configs"))
# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
`), "cortex_alertmanager_config_invalid"))

// Ensure when a 3rd config is added, it is synced correctly
mockStore.configs["user3"] = alerts.AlertConfigDesc{
@@ -113,11 +113,12 @@ func TestLoadAllConfigs(t *testing.T) {
require.Len(t, am.alertmanagers, 3)

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
# TYPE cortex_alertmanager_configs gauge
cortex_alertmanager_configs{status="valid"} 3
cortex_alertmanager_configs{status="invalid"} 0
`), "cortex_alertmanager_configs"))
# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
cortex_alertmanager_config_invalid{user="user3"} 0
`), "cortex_alertmanager_config_invalid"))

// Ensure the config is updated
mockStore.configs["user1"] = alerts.AlertConfigDesc{
@@ -145,11 +146,11 @@ func TestLoadAllConfigs(t *testing.T) {
require.False(t, userAM.IsActive())

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
# TYPE cortex_alertmanager_configs gauge
cortex_alertmanager_configs{status="valid"} 2
cortex_alertmanager_configs{status="invalid"} 0
`), "cortex_alertmanager_configs"))
# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
`), "cortex_alertmanager_config_invalid"))

// Ensure when a 3rd config is re-added, it is synced correctly
mockStore.configs["user3"] = alerts.AlertConfigDesc{
@@ -169,9 +170,10 @@ func TestLoadAllConfigs(t *testing.T) {
require.True(t, userAM.IsActive())

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
# TYPE cortex_alertmanager_configs gauge
cortex_alertmanager_configs{status="valid"} 3
cortex_alertmanager_configs{status="invalid"} 0
`), "cortex_alertmanager_configs"))
# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
cortex_alertmanager_config_invalid{user="user3"} 0
`), "cortex_alertmanager_config_invalid"))
}