Skip to content

Commit 6267964

Browse files
gotjosh and pracucci authored
Support multiple Alertmanagers explicitly in the Ruler (#2851)
* Support Multiple Alertmanager URLs At the moment, the Ruler only supports sending alerts to multiple Alertmanagers via DNS-based service discovery. However, sending to multiple Alertmanager groups is something that Prometheus allows. I believe the Ruler should support this too and this commit introduces that. To keep backward compatibility, we're reusing the same flag but allowing a list of one or more URLs. With this, we can treat each URL as an Alertmanager group, where multiple Alertmanagers per group are only supported if DNS service discovery is enabled. Signed-off-by: gotjosh <[email protected]> * Address review comments Signed-off-by: gotjosh <[email protected]> * Add the host check when service discovery is enabled We need to ensure the addresses provided follow the SRV DNS specification format. Signed-off-by: gotjosh <[email protected]> * It is actually space-separated Signed-off-by: gotjosh <[email protected]> * Update pkg/ruler/notifier.go Signed-off-by: Marco Pracucci <[email protected]> * Update pkg/ruler/notifier_test.go Signed-off-by: Marco Pracucci <[email protected]> Co-authored-by: Marco Pracucci <[email protected]>
1 parent 192fc86 commit 6267964

File tree

5 files changed

+268
-53
lines changed

5 files changed

+268
-53
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
* Optimized labels regex matchers for patterns containing literals (eg. `foo.*`, `.*foo`, `.*foo.*`)
5555
* [ENHANCEMENT] Add metric `cortex_ruler_config_update_failures_total` to Ruler to track failures of loading rules files. #2857
5656
* [ENHANCEMENT] Experimental Alertmanager: Alertmanager configuration persisted to object storage using an experimental API that accepts and returns YAML-based Alertmanager configuration. #2768
57+
* [ENHANCEMENT] Ruler: `-ruler.alertmanager-url` now supports multiple URLs. Each URL is treated as a separate Alertmanager group. Support for multiple Alertmanagers in a group can be achieved by using DNS service discovery. #2851
5758
* [BUGFIX] Fixed a bug in the index intersect code causing storage to return more chunks/series than required. #2796
5859
* [BUGFIX] Fixed the number of reported keys in the background cache queue. #2764
5960
* [BUGFIX] Fix race in processing of headers in sharded queries. #2762

docs/configuration/config-file-reference.md

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,19 +1000,22 @@ storage:
10001000
# CLI flag: -ruler.rule-path
10011001
[rule_path: <string> | default = "/rules"]
10021002

1003-
# URL of the Alertmanager to send notifications to.
1003+
# Space-separated list of URL(s) of the Alertmanager(s) to send notifications
1004+
# to. Each Alertmanager URL is treated as a separate group in the configuration.
1005+
# Multiple Alertmanagers in HA per group can be supported by using DNS
1006+
# resolution via -ruler.alertmanager-discovery.
10041007
# CLI flag: -ruler.alertmanager-url
1005-
[alertmanager_url: <url> | default = ]
1008+
[alertmanager_url: <list of string> | default = ]
10061009

1007-
# Use DNS SRV records to discover alertmanager hosts.
1010+
# Use DNS SRV records to discover Alertmanager hosts.
10081011
# CLI flag: -ruler.alertmanager-discovery
10091012
[enable_alertmanager_discovery: <boolean> | default = false]
10101013

1011-
# How long to wait between refreshing alertmanager hosts.
1014+
# How long to wait between refreshing DNS resolutions of Alertmanager hosts.
10121015
# CLI flag: -ruler.alertmanager-refresh-interval
10131016
[alertmanager_refresh_interval: <duration> | default = 1m]
10141017

1015-
# If enabled requests to alertmanager will utilize the V2 API.
1018+
# If enabled requests to Alertmanager will utilize the V2 API.
10161019
# CLI flag: -ruler.alertmanager-use-v2
10171020
[enable_alertmanager_v2: <boolean> | default = false]
10181021

pkg/ruler/notifier.go

Lines changed: 59 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ package ruler
33
import (
44
"context"
55
"fmt"
6-
"strings"
6+
"net/url"
7+
"regexp"
78
"sync"
89

910
gklog "github.com/go-kit/kit/log"
@@ -74,68 +75,86 @@ func (rn *rulerNotifier) stop() {
7475
// Builds a Prometheus config.Config from a ruler.Config with just the required
7576
// options to configure notifications to Alertmanager.
7677
func buildNotifierConfig(rulerConfig *Config) (*config.Config, error) {
77-
if rulerConfig.AlertmanagerURL.URL == nil {
78-
return &config.Config{}, nil
79-
}
78+
validURLs := make([]*url.URL, 0, len(rulerConfig.AlertmanagerURL))
8079

81-
u := rulerConfig.AlertmanagerURL
82-
var sdConfig sd_config.ServiceDiscoveryConfig
83-
if rulerConfig.AlertmanagerDiscovery {
84-
if !strings.Contains(u.Host, "_tcp.") {
85-
return nil, fmt.Errorf("When alertmanager-discovery is on, host name must be of the form _portname._tcp.service.fqdn (is %q)", u.Host)
80+
srvDNSregexp := regexp.MustCompile(`^_.+._.+`)
81+
for _, h := range rulerConfig.AlertmanagerURL {
82+
url, err := url.Parse(h)
83+
if err != nil {
84+
return nil, err
8685
}
87-
dnsSDConfig := dns.SDConfig{
88-
Names: []string{u.Host},
89-
RefreshInterval: model.Duration(rulerConfig.AlertmanagerRefreshInterval),
90-
Type: "SRV",
91-
Port: 0, // Ignored, because of SRV.
92-
}
93-
sdConfig = sd_config.ServiceDiscoveryConfig{
94-
DNSSDConfigs: []*dns.SDConfig{&dnsSDConfig},
86+
87+
if url.String() == "" {
88+
continue
9589
}
96-
} else {
97-
sdConfig = sd_config.ServiceDiscoveryConfig{
98-
StaticConfigs: []*targetgroup.Group{
99-
{
100-
Targets: []model.LabelSet{
101-
{
102-
model.AddressLabel: model.LabelValue(u.Host),
103-
},
104-
},
105-
},
106-
},
90+
91+
// Given we only support SRV lookups as part of service discovery, we need to ensure
92+
// hosts provided follow this specification: _service._proto.name
93+
// e.g. _http._tcp.alertmanager.com
94+
if rulerConfig.AlertmanagerDiscovery && !srvDNSregexp.MatchString(url.Host) {
95+
return nil, fmt.Errorf("when alertmanager-discovery is on, host name must be of the form _portname._tcp.service.fqdn (is %q)", url.Host)
10796
}
97+
98+
validURLs = append(validURLs, url)
10899
}
109100

110-
amConfig := &config.AlertmanagerConfig{
111-
APIVersion: config.AlertmanagerAPIVersionV1,
112-
Scheme: u.Scheme,
113-
PathPrefix: u.Path,
114-
Timeout: model.Duration(rulerConfig.NotificationTimeout),
115-
ServiceDiscoveryConfig: sdConfig,
101+
if len(validURLs) == 0 {
102+
return &config.Config{}, nil
116103
}
117104

105+
apiVersion := config.AlertmanagerAPIVersionV1
118106
if rulerConfig.AlertmanangerEnableV2API {
119-
amConfig.APIVersion = config.AlertmanagerAPIVersionV2
107+
apiVersion = config.AlertmanagerAPIVersionV2
108+
}
109+
110+
amConfigs := make([]*config.AlertmanagerConfig, 0, len(validURLs))
111+
for _, url := range validURLs {
112+
amConfigs = append(amConfigs, amConfigFromURL(rulerConfig, url, apiVersion))
120113
}
121114

122115
promConfig := &config.Config{
123116
AlertingConfig: config.AlertingConfig{
124-
AlertmanagerConfigs: []*config.AlertmanagerConfig{amConfig},
117+
AlertmanagerConfigs: amConfigs,
125118
},
126119
}
127120

128-
if u.User != nil {
121+
return promConfig, nil
122+
}
123+
124+
func amConfigFromURL(rulerConfig *Config, url *url.URL, apiVersion config.AlertmanagerAPIVersion) *config.AlertmanagerConfig {
125+
var sdConfig sd_config.ServiceDiscoveryConfig
126+
if rulerConfig.AlertmanagerDiscovery {
127+
sdConfig.DNSSDConfigs = []*dns.SDConfig{{
128+
Names: []string{url.Host},
129+
RefreshInterval: model.Duration(rulerConfig.AlertmanagerRefreshInterval),
130+
Type: "SRV",
131+
Port: 0, // Ignored, because of SRV.
132+
}}
133+
} else {
134+
sdConfig.StaticConfigs = []*targetgroup.Group{{
135+
Targets: []model.LabelSet{{model.AddressLabel: model.LabelValue(url.Host)}},
136+
}}
137+
}
138+
139+
amConfig := &config.AlertmanagerConfig{
140+
APIVersion: apiVersion,
141+
Scheme: url.Scheme,
142+
PathPrefix: url.Path,
143+
Timeout: model.Duration(rulerConfig.NotificationTimeout),
144+
ServiceDiscoveryConfig: sdConfig,
145+
}
146+
147+
if url.User != nil {
129148
amConfig.HTTPClientConfig = config_util.HTTPClientConfig{
130149
BasicAuth: &config_util.BasicAuth{
131-
Username: u.User.Username(),
150+
Username: url.User.Username(),
132151
},
133152
}
134153

135-
if password, isSet := u.User.Password(); isSet {
154+
if password, isSet := url.User.Password(); isSet {
136155
amConfig.HTTPClientConfig.BasicAuth.Password = config_util.Secret(password)
137156
}
138157
}
139158

140-
return promConfig, nil
159+
return amConfig
141160
}

pkg/ruler/notifier_test.go

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
package ruler
2+
3+
import (
4+
"fmt"
5+
"testing"
6+
"time"
7+
8+
config_util "github.com/prometheus/common/config"
9+
"github.com/prometheus/common/model"
10+
"github.com/prometheus/prometheus/config"
11+
sd_config "github.com/prometheus/prometheus/discovery/config"
12+
"github.com/prometheus/prometheus/discovery/dns"
13+
"github.com/prometheus/prometheus/discovery/targetgroup"
14+
"github.com/stretchr/testify/require"
15+
)
16+
17+
func TestBuildNotifierConfig(t *testing.T) {
18+
tests := []struct {
19+
name string
20+
cfg *Config
21+
ncfg *config.Config
22+
err error
23+
}{
24+
{
25+
name: "with no valid hosts, returns an empty config",
26+
cfg: &Config{},
27+
ncfg: &config.Config{},
28+
},
29+
{
30+
name: "with a single URL and no service discovery",
31+
cfg: &Config{
32+
AlertmanagerURL: []string{"http://alertmanager.default.svc.cluster.local/alertmanager"},
33+
},
34+
ncfg: &config.Config{
35+
AlertingConfig: config.AlertingConfig{
36+
AlertmanagerConfigs: []*config.AlertmanagerConfig{
37+
{
38+
APIVersion: "v1",
39+
Scheme: "http",
40+
PathPrefix: "/alertmanager",
41+
ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{StaticConfigs: []*targetgroup.Group{{
42+
Targets: []model.LabelSet{{"__address__": "alertmanager.default.svc.cluster.local"}},
43+
}}},
44+
},
45+
},
46+
},
47+
},
48+
},
49+
{
50+
name: "with a single URL and service discovery",
51+
cfg: &Config{
52+
AlertmanagerURL: []string{"http://_http._tcp.alertmanager.default.svc.cluster.local/alertmanager"},
53+
AlertmanagerDiscovery: true,
54+
AlertmanagerRefreshInterval: time.Duration(60),
55+
},
56+
ncfg: &config.Config{
57+
AlertingConfig: config.AlertingConfig{
58+
AlertmanagerConfigs: []*config.AlertmanagerConfig{
59+
{
60+
APIVersion: "v1",
61+
Scheme: "http",
62+
PathPrefix: "/alertmanager",
63+
ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{DNSSDConfigs: []*dns.SDConfig{{
64+
Names: []string{"_http._tcp.alertmanager.default.svc.cluster.local"},
65+
RefreshInterval: 60,
66+
Type: "SRV",
67+
Port: 0,
68+
}}},
69+
},
70+
},
71+
},
72+
},
73+
},
74+
{
75+
name: "with service discovery and an invalid URL",
76+
cfg: &Config{
77+
AlertmanagerURL: []string{"http://_http.default.svc.cluster.local/alertmanager"},
78+
AlertmanagerDiscovery: true,
79+
},
80+
err: fmt.Errorf("when alertmanager-discovery is on, host name must be of the form _portname._tcp.service.fqdn (is \"alertmanager.default.svc.cluster.local\")"),
81+
},
82+
{
83+
name: "with multiple URLs and no service discovery",
84+
cfg: &Config{
85+
AlertmanagerURL: []string{
86+
"http://alertmanager-0.default.svc.cluster.local/alertmanager",
87+
"http://alertmanager-1.default.svc.cluster.local/alertmanager",
88+
},
89+
},
90+
ncfg: &config.Config{
91+
AlertingConfig: config.AlertingConfig{
92+
AlertmanagerConfigs: []*config.AlertmanagerConfig{
93+
{
94+
APIVersion: "v1",
95+
Scheme: "http",
96+
PathPrefix: "/alertmanager",
97+
ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{StaticConfigs: []*targetgroup.Group{{
98+
Targets: []model.LabelSet{{"__address__": "alertmanager-0.default.svc.cluster.local"}},
99+
}}},
100+
},
101+
{
102+
APIVersion: "v1",
103+
Scheme: "http",
104+
PathPrefix: "/alertmanager",
105+
ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{StaticConfigs: []*targetgroup.Group{{
106+
Targets: []model.LabelSet{{"__address__": "alertmanager-1.default.svc.cluster.local"}},
107+
}}},
108+
},
109+
},
110+
},
111+
},
112+
},
113+
{
114+
name: "with multiple URLs and service discovery",
115+
cfg: &Config{
116+
AlertmanagerURL: []string{
117+
"http://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager",
118+
"http://_http._tcp.alertmanager-1.default.svc.cluster.local/alertmanager",
119+
},
120+
AlertmanagerDiscovery: true,
121+
AlertmanagerRefreshInterval: time.Duration(60),
122+
},
123+
ncfg: &config.Config{
124+
AlertingConfig: config.AlertingConfig{
125+
AlertmanagerConfigs: []*config.AlertmanagerConfig{
126+
{
127+
APIVersion: "v1",
128+
Scheme: "http",
129+
PathPrefix: "/alertmanager",
130+
ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{DNSSDConfigs: []*dns.SDConfig{{
131+
Names: []string{"_http._tcp.alertmanager-0.default.svc.cluster.local"},
132+
RefreshInterval: 60,
133+
Type: "SRV",
134+
Port: 0,
135+
}}},
136+
},
137+
{
138+
APIVersion: "v1",
139+
Scheme: "http",
140+
PathPrefix: "/alertmanager",
141+
ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{DNSSDConfigs: []*dns.SDConfig{{
142+
Names: []string{"_http._tcp.alertmanager-1.default.svc.cluster.local"},
143+
RefreshInterval: 60,
144+
Type: "SRV",
145+
Port: 0,
146+
}}},
147+
},
148+
},
149+
},
150+
},
151+
},
152+
{
153+
name: "with Basic Authentication",
154+
cfg: &Config{
155+
AlertmanagerURL: []string{
156+
"http://marco:[email protected]/alertmanager",
157+
},
158+
},
159+
ncfg: &config.Config{
160+
AlertingConfig: config.AlertingConfig{
161+
AlertmanagerConfigs: []*config.AlertmanagerConfig{
162+
{
163+
HTTPClientConfig: config_util.HTTPClientConfig{
164+
BasicAuth: &config_util.BasicAuth{Username: "marco", Password: "hunter2"},
165+
},
166+
APIVersion: "v1",
167+
Scheme: "http",
168+
PathPrefix: "/alertmanager",
169+
ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{StaticConfigs: []*targetgroup.Group{{
170+
Targets: []model.LabelSet{{"__address__": "alertmanager-0.default.svc.cluster.local"}},
171+
}}},
172+
},
173+
},
174+
},
175+
},
176+
},
177+
}
178+
179+
for _, tt := range tests {
180+
t.Run(tt.name, func(t *testing.T) {
181+
ncfg, err := buildNotifierConfig(tt.cfg)
182+
if tt.err == nil {
183+
require.NoError(t, err)
184+
require.Equal(t, tt.ncfg, ncfg)
185+
} else {
186+
require.Error(t, tt.err, err)
187+
}
188+
})
189+
}
190+
}

pkg/ruler/ruler.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -81,12 +81,12 @@ type Config struct {
8181
RulePath string `yaml:"rule_path"`
8282

8383
// URL(s) of the Alertmanager(s) to send notifications to. Each URL is treated as a separate Alertmanager group.
84-
AlertmanagerURL flagext.URLValue `yaml:"alertmanager_url"`
85-
// Whether to use DNS SRV records to discover alertmanagers.
84+
AlertmanagerURL flagext.StringSlice `yaml:"alertmanager_url"`
85+
// Whether to use DNS SRV records to discover Alertmanagers.
8686
AlertmanagerDiscovery bool `yaml:"enable_alertmanager_discovery"`
87-
// How long to wait between refreshing the list of alertmanagers based on DNS service discovery.
87+
// How long to wait between refreshing the list of Alertmanagers based on DNS service discovery.
8888
AlertmanagerRefreshInterval time.Duration `yaml:"alertmanager_refresh_interval"`
89-
// Enables the ruler notifier to use the alertmananger V2 API.
89+
// Enables the ruler notifier to use the Alertmanager V2 API.
9090
AlertmanangerEnableV2API bool `yaml:"enable_alertmanager_v2"`
9191
// Capacity of the queue for notifications to be sent to the Alertmanager.
9292
NotificationQueueCapacity int `yaml:"notification_queue_capacity"`
@@ -133,12 +133,14 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
133133
f.DurationVar(&cfg.EvaluationInterval, "ruler.evaluation-interval", 1*time.Minute, "How frequently to evaluate rules")
134134
f.DurationVar(&cfg.EvaluationDelay, "ruler.evaluation-delay-duration", 0, "Duration to delay the evaluation of rules to ensure they underlying metrics have been pushed to cortex.")
135135
f.DurationVar(&cfg.PollInterval, "ruler.poll-interval", 1*time.Minute, "How frequently to poll for rule changes")
136-
f.Var(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "URL of the Alertmanager to send notifications to.")
137-
f.BoolVar(&cfg.AlertmanagerDiscovery, "ruler.alertmanager-discovery", false, "Use DNS SRV records to discover alertmanager hosts.")
138-
f.DurationVar(&cfg.AlertmanagerRefreshInterval, "ruler.alertmanager-refresh-interval", 1*time.Minute, "How long to wait between refreshing alertmanager hosts.")
139-
f.BoolVar(&cfg.AlertmanangerEnableV2API, "ruler.alertmanager-use-v2", false, "If enabled requests to alertmanager will utilize the V2 API.")
136+
137+
f.Var(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "Space-separated list of URL(s) of the Alertmanager(s) to send notifications to. Each Alertmanager URL is treated as a separate group in the configuration. Multiple Alertmanagers in HA per group can be supported by using DNS resolution via -ruler.alertmanager-discovery.")
138+
f.BoolVar(&cfg.AlertmanagerDiscovery, "ruler.alertmanager-discovery", false, "Use DNS SRV records to discover Alertmanager hosts.")
139+
f.DurationVar(&cfg.AlertmanagerRefreshInterval, "ruler.alertmanager-refresh-interval", 1*time.Minute, "How long to wait between refreshing DNS resolutions of Alertmanager hosts.")
140+
f.BoolVar(&cfg.AlertmanangerEnableV2API, "ruler.alertmanager-use-v2", false, "If enabled requests to Alertmanager will utilize the V2 API.")
140141
f.IntVar(&cfg.NotificationQueueCapacity, "ruler.notification-queue-capacity", 10000, "Capacity of the queue for notifications to be sent to the Alertmanager.")
141142
f.DurationVar(&cfg.NotificationTimeout, "ruler.notification-timeout", 10*time.Second, "HTTP timeout duration when sending notifications to the Alertmanager.")
143+
142144
f.DurationVar(&cfg.SearchPendingFor, "ruler.search-pending-for", 5*time.Minute, "Time to spend searching for a pending ruler when shutting down.")
143145
f.BoolVar(&cfg.EnableSharding, "ruler.enable-sharding", false, "Distribute rule evaluation using ring backend")
144146
f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.")

0 commit comments

Comments
 (0)