Skip to content

Remove APIEnableRulesBackup config and use Ring.ReplicationFactor ins… #5901

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Changelog

## master / unreleased
* [CHANGE] Ruler: Remove `experimental.ruler.api-enable-rules-backup` flag and use `ruler.ring.replication-factor` to check if rules backup is enabled

## 1.17.0 in progress

Expand Down
9 changes: 0 additions & 9 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -4271,15 +4271,6 @@ ring:
# CLI flag: -experimental.ruler.enable-api
[enable_api: <boolean> | default = false]

# EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers
# with default state (state before any evaluation) and send this copy in list
# API requests as backup in case the ruler who owns the rule fails to send its
# rules. This allows the rules API to handle ruler outage by returning rules
# with default state. Ring replication-factor needs to be set to 2 or more for
# this to be useful.
# CLI flag: -experimental.ruler.api-enable-rules-backup
[api_enable_rules_backup: <boolean> | default = false]

# EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API
# response. If there are duplicate rules the rule with the latest evaluation
# timestamp will be kept.
Expand Down
9 changes: 4 additions & 5 deletions integration/ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ func TestRulerAPIShardingWithAPIRulesBackupEnabled(t *testing.T) {
testRulerAPIWithSharding(t, true)
}

func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
func testRulerAPIWithSharding(t *testing.T, enableRulesBackup bool) {
const numRulesGroups = 100

random := rand.New(rand.NewSource(time.Now().UnixNano()))
Expand Down Expand Up @@ -459,9 +459,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
// Enable the bucket index so we can skip the initial bucket scan.
"-blocks-storage.bucket-store.bucket-index.enabled": "true",
}
if enableAPIRulesBackup {
if enableRulesBackup {
overrides["-ruler.ring.replication-factor"] = "3"
overrides["-experimental.ruler.api-enable-rules-backup"] = "true"
}
rulerFlags := mergeFlags(
BlocksStorageFlags(),
Expand Down Expand Up @@ -556,8 +555,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
},
}
// For each test case, fetch the rules with configured filters, and ensure the results match.
if enableAPIRulesBackup {
err := ruler2.Kill() // if api-enable-rules-backup is enabled the APIs should be able to handle a ruler going down
if enableRulesBackup {
err := ruler2.Kill() // if rules backup is enabled the APIs should be able to handle a ruler going down
require.NoError(t, err)
}
for name, tc := range testCases {
Expand Down
2 changes: 1 addition & 1 deletion pkg/ruler/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, eva
registry: reg,
logger: logger,
}
if cfg.APIEnableRulesBackup {
if cfg.RulesBackupEnabled() {
m.rulesBackupManager = newRulesBackupManager(cfg, logger, reg)
}
return m, nil
Expand Down
4 changes: 3 additions & 1 deletion pkg/ruler/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,9 @@ func TestBackupRules(t *testing.T) {
1 * time.Millisecond,
}
ruleManagerFactory := RuleManagerFactory(nil, waitDurations)
m, err := NewDefaultMultiTenantManager(Config{RulePath: dir, APIEnableRulesBackup: true}, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger())
config := Config{RulePath: dir}
config.Ring.ReplicationFactor = 3
m, err := NewDefaultMultiTenantManager(config, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger())
require.NoError(t, err)

const user1 = "testUser"
Expand Down
30 changes: 17 additions & 13 deletions pkg/ruler/ruler.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,8 @@ type Config struct {
Ring RingConfig `yaml:"ring"`
FlushCheckPeriod time.Duration `yaml:"flush_period"`

EnableAPI bool `yaml:"enable_api"`
APIEnableRulesBackup bool `yaml:"api_enable_rules_backup"`
APIDeduplicateRules bool `yaml:"api_deduplicate_rules"`
EnableAPI bool `yaml:"enable_api"`
APIDeduplicateRules bool `yaml:"api_deduplicate_rules"`

EnabledTenants flagext.StringSliceCSV `yaml:"enabled_tenants"`
DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`
Expand Down Expand Up @@ -200,7 +199,6 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.")
f.StringVar(&cfg.RulePath, "ruler.rule-path", "/rules", "file path to store temporary rule files for the prometheus rule managers")
f.BoolVar(&cfg.EnableAPI, "experimental.ruler.enable-api", false, "Enable the ruler api")
f.BoolVar(&cfg.APIEnableRulesBackup, "experimental.ruler.api-enable-rules-backup", false, "EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers with default state (state before any evaluation) and send this copy in list API requests as backup in case the ruler who owns the rule fails to send its rules. This allows the rules API to handle ruler outage by returning rules with default state. Ring replication-factor needs to be set to 2 or more for this to be useful.")
f.BoolVar(&cfg.APIDeduplicateRules, "experimental.ruler.api-deduplicate-rules", false, "EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API response. If there are duplicate rules the rule with the latest evaluation timestamp will be kept.")
f.DurationVar(&cfg.OutageTolerance, "ruler.for-outage-tolerance", time.Hour, `Max time to tolerate outage for restoring "for" state of alert.`)
f.DurationVar(&cfg.ForGracePeriod, "ruler.for-grace-period", 10*time.Minute, `Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.`)
Expand All @@ -217,6 +215,12 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
cfg.RingCheckPeriod = 5 * time.Second
}

func (cfg *Config) RulesBackupEnabled() bool {
// If the replication factor is greater the 1, only the first replica is responsible for evaluating the rule,
// the rest of the replica will store the rule groups as backup only for API HA.
return cfg.Ring.ReplicationFactor > 1
}

// MultiTenantManager is the interface of interaction with a Manager that is tenant aware.
type MultiTenantManager interface {
// SyncRuleGroups is used to sync the Manager with rules from the RuleStore.
Expand Down Expand Up @@ -581,7 +585,7 @@ func (r *Ruler) syncRules(ctx context.Context, reason string) {
// This will also delete local group files for users that are no longer in 'configs' map.
r.manager.SyncRuleGroups(ctx, loadedConfigs)

if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
r.manager.BackUpRuleGroups(ctx, backupConfigs)
}
}
Expand All @@ -604,7 +608,7 @@ func (r *Ruler) loadRuleGroups(ctx context.Context) (map[string]rulespb.RuleGrou
if err != nil {
level.Warn(r.logger).Log("msg", "failed to load some rules owned by this ruler", "count", len(ownedConfigs)-len(loadedOwnedConfigs), "err", err)
}
if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
loadedBackupConfigs, err := r.store.LoadRuleGroups(ctx, backupConfigs)
if err != nil {
level.Warn(r.logger).Log("msg", "failed to load some rules backed up by this ruler", "count", len(backupConfigs)-len(loadedBackupConfigs), "err", err)
Expand Down Expand Up @@ -685,7 +689,7 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp
if len(owned) > 0 {
ownedConfigs[userID] = owned
}
if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
backup := filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
if len(backup) > 0 {
backedUpConfigs[userID] = backup
Expand Down Expand Up @@ -748,7 +752,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp

filterOwned := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
var filterBackup []*rulespb.RuleGroupDesc
if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
filterBackup = filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
}
if len(filterOwned) == 0 && len(filterBackup) == 0 {
Expand Down Expand Up @@ -1121,7 +1125,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
)

zoneByAddress := make(map[string]string)
if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
for _, ruler := range rulers.Instances {
zoneByAddress[ruler.Addr] = ruler.Zone
}
Expand All @@ -1146,9 +1150,9 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
if err != nil {
level.Error(r.logger).Log("msg", "unable to retrieve rules from ruler", "addr", addr, "err", err)
r.rulerGetRulesFailures.WithLabelValues(addr).Inc()
// If APIEnableRulesBackup is enabled and there are enough rulers replicating the rules, we should
// If rules backup is enabled and there are enough rulers replicating the rules, we should
// be able to handle failures.
if r.cfg.APIEnableRulesBackup && len(jobs) >= r.cfg.Ring.ReplicationFactor {
if r.cfg.RulesBackupEnabled() && len(jobs) >= r.cfg.Ring.ReplicationFactor {
mtx.Lock()
failedZones[zoneByAddress[addr]] = struct{}{}
errCount += 1
Expand All @@ -1168,7 +1172,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
return nil
})

if err == nil && (r.cfg.APIEnableRulesBackup || r.cfg.APIDeduplicateRules) {
if err == nil && (r.cfg.RulesBackupEnabled() || r.cfg.APIDeduplicateRules) {
merged = mergeGroupStateDesc(merged)
}

Expand All @@ -1183,7 +1187,7 @@ func (r *Ruler) Rules(ctx context.Context, in *RulesRequest) (*RulesResponse, er
return nil, fmt.Errorf("no user id found in context")
}

groupDescs, err := r.getLocalRules(userID, *in, r.cfg.APIEnableRulesBackup)
groupDescs, err := r.getLocalRules(userID, *in, r.cfg.RulesBackupEnabled())
if err != nil {
return nil, err
}
Expand Down
5 changes: 4 additions & 1 deletion pkg/ruler/ruler_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,10 @@ func GetReplicationSetForListRule(r ring.ReadRing, cfg *RingConfig) (ring.Replic
// to 0, and then we update them whether zone-awareness is enabled or not.
maxErrors := 0
maxUnavailableZones := 0
if cfg.ZoneAwarenessEnabled {
// Because ring's Get method returns a number of ruler equal to the replication factor even if there is only 1 zone
// and ZoneAwarenessEnabled, we can consider that ZoneAwarenessEnabled is disabled if there is only 1 zone since
// rules are still replicated to rulers in the same zone.
if cfg.ZoneAwarenessEnabled && len(ringZones) > 1 {
numReplicatedZones := min(len(ringZones), r.ReplicationFactor())
// Given that quorum is not required, we only need at least one of the zone to be healthy to succeed. But we
// also need to handle case when RF < number of zones.
Expand Down
17 changes: 17 additions & 0 deletions pkg/ruler/ruler_ring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,23 @@ func TestGetReplicationSetForListRule(t *testing.T) {
"z2": {},
},
},
"should succeed on 1 unhealthy instances in RF=3 zone replication enabled but only 1 zone": {
ringInstances: map[string]ring.InstanceDesc{
"instance-1": {Addr: "127.0.0.1", State: ring.ACTIVE, Timestamp: now.Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-1", "z1", 128, true), Zone: "z1"},
"instance-2": {Addr: "127.0.0.2", State: ring.ACTIVE, Timestamp: now.Add(-10 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-2", "z1", 128, true), Zone: "z1"},
"instance-3": {Addr: "127.0.0.3", State: ring.ACTIVE, Timestamp: now.Add(-20 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-3", "z1", 128, true), Zone: "z1"},
"instance-4": {Addr: "127.0.0.4", State: ring.PENDING, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-4", "z1", 128, true), Zone: "z1"},
},
ringHeartbeatTimeout: time.Minute,
ringReplicationFactor: 3,
expectedSet: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3"},
enableAZReplication: true,
expectedFailedZones: map[string]struct{}{
"z1": {},
},
expectedMaxUnavailableZones: 0,
expectedMaxError: 1,
},
}

for testName, testData := range tests {
Expand Down
Loading
Loading