-
Notifications
You must be signed in to change notification settings - Fork 816
disable rule groups #5521
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
disable rule groups #5521
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,6 +71,14 @@ const ( | |
recordingRuleFilter string = "record" | ||
) | ||
|
||
type DisabledRuleGroupErr struct { | ||
Message string | ||
} | ||
|
||
func (e *DisabledRuleGroupErr) Error() string { | ||
return e.Message | ||
} | ||
|
||
// Config is the configuration for the recording rules server. | ||
type Config struct { | ||
// This is used for template expansion in alerts; must be a valid URL. | ||
|
@@ -400,6 +408,17 @@ func SendAlerts(n sender, externalURL string) promRules.NotifyFunc { | |
} | ||
} | ||
|
||
func ruleGroupDisabled(ruleGroup *rulespb.RuleGroupDesc, disabledRuleGroupsForUser validation.DisabledRuleGroups) bool { | ||
for _, disabledRuleGroupForUser := range disabledRuleGroupsForUser { | ||
if ruleGroup.Namespace == disabledRuleGroupForUser.Namespace && | ||
ruleGroup.Name == disabledRuleGroupForUser.Name && | ||
ruleGroup.User == disabledRuleGroupForUser.User { | ||
return true | ||
} | ||
} | ||
return false | ||
} | ||
|
||
var sep = []byte("/") | ||
|
||
func tokenForGroup(g *rulespb.RuleGroupDesc) uint32 { | ||
|
@@ -415,15 +434,21 @@ func tokenForGroup(g *rulespb.RuleGroupDesc) uint32 { | |
return ringHasher.Sum32() | ||
} | ||
|
||
func instanceOwnsRuleGroup(r ring.ReadRing, g *rulespb.RuleGroupDesc, instanceAddr string) (bool, error) { | ||
func instanceOwnsRuleGroup(r ring.ReadRing, g *rulespb.RuleGroupDesc, disabledRuleGroups validation.DisabledRuleGroups, instanceAddr string) (bool, error) { | ||
|
||
hash := tokenForGroup(g) | ||
|
||
rlrs, err := r.Get(hash, RingOp, nil, nil, nil) | ||
if err != nil { | ||
return false, errors.Wrap(err, "error reading ring to verify rule group ownership") | ||
} | ||
|
||
return rlrs.Instances[0].Addr == instanceAddr, nil | ||
ownsRuleGroup := rlrs.Instances[0].Addr == instanceAddr | ||
if ownsRuleGroup && ruleGroupDisabled(g, disabledRuleGroups) { | ||
return false, &DisabledRuleGroupErr{Message: fmt.Sprintf("rule group %s, namespace %s, user %s is disabled", g.Name, g.Namespace, g.User)} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Should this return an error? i see why u did that (u wanna catch on the caller layer) but is weird to return an error on something expected. But im ok with it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree this is a little bit weird... But seems like a workaround here as Go doesn't have a good way to capture such things... |
||
} | ||
|
||
return ownsRuleGroup, nil | ||
} | ||
|
||
func (r *Ruler) ServeHTTP(w http.ResponseWriter, req *http.Request) { | ||
|
@@ -533,7 +558,26 @@ func (r *Ruler) listRules(ctx context.Context) (result map[string]rulespb.RuleGr | |
} | ||
|
||
func (r *Ruler) listRulesNoSharding(ctx context.Context) (map[string]rulespb.RuleGroupList, error) { | ||
return r.store.ListAllRuleGroups(ctx) | ||
allRuleGroups, err := r.store.ListAllRuleGroups(ctx) | ||
if err != nil { | ||
return nil, err | ||
} | ||
for userID, groups := range allRuleGroups { | ||
disabledRuleGroupsForUser := r.limits.DisabledRuleGroups(userID) | ||
if len(disabledRuleGroupsForUser) == 0 { | ||
continue | ||
} | ||
Comment on lines
+567
to
+569
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could do this check right in the beginning of the method and return the original There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have to loop thru and check if there are overrides for each user, correct? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh my bad... this is for all users! ok |
||
filteredGroupsForUser := rulespb.RuleGroupList{} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: should we use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We won't know how many rule groups are disabled until the loop is complete |
||
for _, group := range groups { | ||
if !ruleGroupDisabled(group, disabledRuleGroupsForUser) { | ||
filteredGroupsForUser = append(filteredGroupsForUser, group) | ||
} else { | ||
level.Info(r.logger).Log("msg", "rule group disabled", "name", group.Name, "namespace", group.Namespace, "user", group.User) | ||
} | ||
} | ||
allRuleGroups[userID] = filteredGroupsForUser | ||
} | ||
return allRuleGroups, nil | ||
} | ||
|
||
func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulespb.RuleGroupList, error) { | ||
|
@@ -544,7 +588,7 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp | |
|
||
filteredConfigs := make(map[string]rulespb.RuleGroupList) | ||
for userID, groups := range configs { | ||
filtered := filterRuleGroups(userID, groups, r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors) | ||
filtered := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors) | ||
if len(filtered) > 0 { | ||
filteredConfigs[userID] = filtered | ||
} | ||
|
@@ -602,7 +646,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp | |
return errors.Wrapf(err, "failed to fetch rule groups for user %s", userID) | ||
} | ||
|
||
filtered := filterRuleGroups(userID, groups, userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors) | ||
filtered := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors) | ||
if len(filtered) == 0 { | ||
continue | ||
} | ||
|
@@ -624,15 +668,21 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp | |
// | ||
// Reason why this function is not a method on Ruler is to make sure we don't accidentally use r.ring, | ||
// but only ring passed as parameter. | ||
func filterRuleGroups(userID string, ruleGroups []*rulespb.RuleGroupDesc, ring ring.ReadRing, instanceAddr string, log log.Logger, ringCheckErrors prometheus.Counter) []*rulespb.RuleGroupDesc { | ||
func filterRuleGroups(userID string, ruleGroups []*rulespb.RuleGroupDesc, disabledRuleGroups validation.DisabledRuleGroups, ring ring.ReadRing, instanceAddr string, log log.Logger, ringCheckErrors prometheus.Counter) []*rulespb.RuleGroupDesc { | ||
// Prune the rule group to only contain rules that this ruler is responsible for, based on ring. | ||
var result []*rulespb.RuleGroupDesc | ||
for _, g := range ruleGroups { | ||
owned, err := instanceOwnsRuleGroup(ring, g, instanceAddr) | ||
owned, err := instanceOwnsRuleGroup(ring, g, disabledRuleGroups, instanceAddr) | ||
if err != nil { | ||
ringCheckErrors.Inc() | ||
level.Error(log).Log("msg", "failed to check if the ruler replica owns the rule group", "user", userID, "namespace", g.Namespace, "group", g.Name, "err", err) | ||
continue | ||
switch e := err.(type) { | ||
case *DisabledRuleGroupErr: | ||
level.Info(log).Log("msg", e.Message) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it too verbose to log it every time during filtering? And this doesn't feel like an error to me There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To give visibility, I think it needs to be either Info or Warn There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case, all the instance will log this right even if that instance doesn't own the group? I think this will produce too much logs for large clusters. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. doesn't seems to be an error to me +1 |
||
continue | ||
default: | ||
ringCheckErrors.Inc() | ||
level.Error(log).Log("msg", "failed to check if the ruler replica owns the rule group", "user", userID, "namespace", g.Namespace, "group", g.Name, "err", err) | ||
continue | ||
} | ||
} | ||
|
||
if owned { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should default be []?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we document this format somehow?
We have others "complex configs" like https://cortexmetrics.io/docs/configuration/configuration-file/#memcached_config
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will submit another PR for this. Existing doc generator does not handle slices of structs well