diff --git a/CHANGELOG.md b/CHANGELOG.md index 458d5b8d1d5..5e66b25fb9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## master / unreleased +* [CHANGE] Ruler: Remove `experimental.ruler.api-enable-rules-backup` flag and use `ruler.ring.replication-factor` to check if rules backup is enabled ## 1.17.0 in progress diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 57909431da0..30bbee288c9 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -4271,15 +4271,6 @@ ring: # CLI flag: -experimental.ruler.enable-api [enable_api: | default = false] -# EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers -# with default state (state before any evaluation) and send this copy in list -# API requests as backup in case the ruler who owns the rule fails to send its -# rules. This allows the rules API to handle ruler outage by returning rules -# with default state. Ring replication-factor needs to be set to 2 or more for -# this to be useful. -# CLI flag: -experimental.ruler.api-enable-rules-backup -[api_enable_rules_backup: | default = false] - # EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API # response. If there are duplicate rules the rule with the latest evaluation # timestamp will be kept. 
diff --git a/integration/ruler_test.go b/integration/ruler_test.go index 81e2440c76e..be325835950 100644 --- a/integration/ruler_test.go +++ b/integration/ruler_test.go @@ -402,7 +402,7 @@ func TestRulerAPIShardingWithAPIRulesBackupEnabled(t *testing.T) { testRulerAPIWithSharding(t, true) } -func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) { +func testRulerAPIWithSharding(t *testing.T, enableRulesBackup bool) { const numRulesGroups = 100 random := rand.New(rand.NewSource(time.Now().UnixNano())) @@ -459,9 +459,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) { // Enable the bucket index so we can skip the initial bucket scan. "-blocks-storage.bucket-store.bucket-index.enabled": "true", } - if enableAPIRulesBackup { + if enableRulesBackup { overrides["-ruler.ring.replication-factor"] = "3" - overrides["-experimental.ruler.api-enable-rules-backup"] = "true" } rulerFlags := mergeFlags( BlocksStorageFlags(), @@ -556,8 +555,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) { }, } // For each test case, fetch the rules with configured filters, and ensure the results match. 
- if enableAPIRulesBackup { - err := ruler2.Kill() // if api-enable-rules-backup is enabled the APIs should be able to handle a ruler going down + if enableRulesBackup { + err := ruler2.Kill() // if rules backup is enabled the APIs should be able to handle a ruler going down require.NoError(t, err) } for name, tc := range testCases { diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 57d2d2907fe..eb0e5ce9e47 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -116,7 +116,7 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, eva registry: reg, logger: logger, } - if cfg.APIEnableRulesBackup { + if cfg.RulesBackupEnabled() { m.rulesBackupManager = newRulesBackupManager(cfg, logger, reg) } return m, nil diff --git a/pkg/ruler/manager_test.go b/pkg/ruler/manager_test.go index d88d47e6613..3d539c913d2 100644 --- a/pkg/ruler/manager_test.go +++ b/pkg/ruler/manager_test.go @@ -262,7 +262,9 @@ func TestBackupRules(t *testing.T) { 1 * time.Millisecond, } ruleManagerFactory := RuleManagerFactory(nil, waitDurations) - m, err := NewDefaultMultiTenantManager(Config{RulePath: dir, APIEnableRulesBackup: true}, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger()) + config := Config{RulePath: dir} + config.Ring.ReplicationFactor = 3 + m, err := NewDefaultMultiTenantManager(config, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger()) require.NoError(t, err) const user1 = "testUser" diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 19497db85e5..2fbfca361cc 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -130,9 +130,8 @@ type Config struct { Ring RingConfig `yaml:"ring"` FlushCheckPeriod time.Duration `yaml:"flush_period"` - EnableAPI bool `yaml:"enable_api"` - APIEnableRulesBackup bool `yaml:"api_enable_rules_backup"` - APIDeduplicateRules bool `yaml:"api_deduplicate_rules"` + EnableAPI bool `yaml:"enable_api"` + APIDeduplicateRules bool `yaml:"api_deduplicate_rules"` EnabledTenants 
flagext.StringSliceCSV `yaml:"enabled_tenants"` DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"` @@ -200,7 +199,6 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.") f.StringVar(&cfg.RulePath, "ruler.rule-path", "/rules", "file path to store temporary rule files for the prometheus rule managers") f.BoolVar(&cfg.EnableAPI, "experimental.ruler.enable-api", false, "Enable the ruler api") - f.BoolVar(&cfg.APIEnableRulesBackup, "experimental.ruler.api-enable-rules-backup", false, "EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers with default state (state before any evaluation) and send this copy in list API requests as backup in case the ruler who owns the rule fails to send its rules. This allows the rules API to handle ruler outage by returning rules with default state. Ring replication-factor needs to be set to 2 or more for this to be useful.") f.BoolVar(&cfg.APIDeduplicateRules, "experimental.ruler.api-deduplicate-rules", false, "EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API response. If there are duplicate rules the rule with the latest evaluation timestamp will be kept.") f.DurationVar(&cfg.OutageTolerance, "ruler.for-outage-tolerance", time.Hour, `Max time to tolerate outage for restoring "for" state of alert.`) f.DurationVar(&cfg.ForGracePeriod, "ruler.for-grace-period", 10*time.Minute, `Minimum duration between alert and restored "for" state. 
This is maintained only for alerts with configured "for" time greater than grace period.`) @@ -217,6 +215,12 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { cfg.RingCheckPeriod = 5 * time.Second } +func (cfg *Config) RulesBackupEnabled() bool { + // If the replication factor is greater than 1, only the first replica is responsible for evaluating the rule, + // the rest of the replicas will store the rule groups as backup only for API HA. + return cfg.Ring.ReplicationFactor > 1 +} + // MultiTenantManager is the interface of interaction with a Manager that is tenant aware. type MultiTenantManager interface { // SyncRuleGroups is used to sync the Manager with rules from the RuleStore. @@ -581,7 +585,7 @@ func (r *Ruler) syncRules(ctx context.Context, reason string) { // This will also delete local group files for users that are no longer in 'configs' map. r.manager.SyncRuleGroups(ctx, loadedConfigs) - if r.cfg.APIEnableRulesBackup { + if r.cfg.RulesBackupEnabled() { r.manager.BackUpRuleGroups(ctx, backupConfigs) } } @@ -604,7 +608,7 @@ func (r *Ruler) loadRuleGroups(ctx context.Context) (map[string]rulespb.RuleGrou if err != nil { level.Warn(r.logger).Log("msg", "failed to load some rules owned by this ruler", "count", len(ownedConfigs)-len(loadedOwnedConfigs), "err", err) } - if r.cfg.APIEnableRulesBackup { + if r.cfg.RulesBackupEnabled() { loadedBackupConfigs, err := r.store.LoadRuleGroups(ctx, backupConfigs) if err != nil { level.Warn(r.logger).Log("msg", "failed to load some rules backed up by this ruler", "count", len(backupConfigs)-len(loadedBackupConfigs), "err", err) @@ -685,7 +689,7 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp if len(owned) > 0 { ownedConfigs[userID] = owned } - if r.cfg.APIEnableRulesBackup { + if r.cfg.RulesBackupEnabled() { backup := filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors) if len(backup) > 0 {
backedUpConfigs[userID] = backup @@ -748,7 +752,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp filterOwned := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors) var filterBackup []*rulespb.RuleGroupDesc - if r.cfg.APIEnableRulesBackup { + if r.cfg.RulesBackupEnabled() { filterBackup = filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors) } if len(filterOwned) == 0 && len(filterBackup) == 0 { @@ -1121,7 +1125,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest ) zoneByAddress := make(map[string]string) - if r.cfg.APIEnableRulesBackup { + if r.cfg.RulesBackupEnabled() { for _, ruler := range rulers.Instances { zoneByAddress[ruler.Addr] = ruler.Zone } @@ -1146,9 +1150,9 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest if err != nil { level.Error(r.logger).Log("msg", "unable to retrieve rules from ruler", "addr", addr, "err", err) r.rulerGetRulesFailures.WithLabelValues(addr).Inc() - // If APIEnableRulesBackup is enabled and there are enough rulers replicating the rules, we should + // If rules backup is enabled and there are enough rulers replicating the rules, we should // be able to handle failures. 
- if r.cfg.APIEnableRulesBackup && len(jobs) >= r.cfg.Ring.ReplicationFactor { + if r.cfg.RulesBackupEnabled() && len(jobs) >= r.cfg.Ring.ReplicationFactor { mtx.Lock() failedZones[zoneByAddress[addr]] = struct{}{} errCount += 1 @@ -1168,7 +1172,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest return nil }) - if err == nil && (r.cfg.APIEnableRulesBackup || r.cfg.APIDeduplicateRules) { + if err == nil && (r.cfg.RulesBackupEnabled() || r.cfg.APIDeduplicateRules) { merged = mergeGroupStateDesc(merged) } @@ -1183,7 +1187,7 @@ func (r *Ruler) Rules(ctx context.Context, in *RulesRequest) (*RulesResponse, er return nil, fmt.Errorf("no user id found in context") } - groupDescs, err := r.getLocalRules(userID, *in, r.cfg.APIEnableRulesBackup) + groupDescs, err := r.getLocalRules(userID, *in, r.cfg.RulesBackupEnabled()) if err != nil { return nil, err } diff --git a/pkg/ruler/ruler_ring.go b/pkg/ruler/ruler_ring.go index 2658d6e8e32..cac36e270d8 100644 --- a/pkg/ruler/ruler_ring.go +++ b/pkg/ruler/ruler_ring.go @@ -144,7 +144,10 @@ func GetReplicationSetForListRule(r ring.ReadRing, cfg *RingConfig) (ring.Replic // to 0, and then we update them whether zone-awareness is enabled or not. maxErrors := 0 maxUnavailableZones := 0 - if cfg.ZoneAwarenessEnabled { + // Because ring's Get method returns a number of rulers equal to the replication factor even if there is only 1 zone + // and ZoneAwarenessEnabled is set, we can consider that ZoneAwarenessEnabled is disabled if there is only 1 zone since + // rules are still replicated to rulers in the same zone. + if cfg.ZoneAwarenessEnabled && len(ringZones) > 1 { numReplicatedZones := min(len(ringZones), r.ReplicationFactor()) // Given that quorum is not required, we only need at least one of the zone to be healthy to succeed. But we // also need to handle case when RF < number of zones.
diff --git a/pkg/ruler/ruler_ring_test.go b/pkg/ruler/ruler_ring_test.go index f9bfaeb03eb..95f80099128 100644 --- a/pkg/ruler/ruler_ring_test.go +++ b/pkg/ruler/ruler_ring_test.go @@ -180,6 +180,23 @@ func TestGetReplicationSetForListRule(t *testing.T) { "z2": {}, }, }, + "should succeed on 1 unhealthy instances in RF=3 zone replication enabled but only 1 zone": { + ringInstances: map[string]ring.InstanceDesc{ + "instance-1": {Addr: "127.0.0.1", State: ring.ACTIVE, Timestamp: now.Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-1", "z1", 128, true), Zone: "z1"}, + "instance-2": {Addr: "127.0.0.2", State: ring.ACTIVE, Timestamp: now.Add(-10 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-2", "z1", 128, true), Zone: "z1"}, + "instance-3": {Addr: "127.0.0.3", State: ring.ACTIVE, Timestamp: now.Add(-20 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-3", "z1", 128, true), Zone: "z1"}, + "instance-4": {Addr: "127.0.0.4", State: ring.PENDING, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-4", "z1", 128, true), Zone: "z1"}, + }, + ringHeartbeatTimeout: time.Minute, + ringReplicationFactor: 3, + expectedSet: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3"}, + enableAZReplication: true, + expectedFailedZones: map[string]struct{}{ + "z1": {}, + }, + expectedMaxUnavailableZones: 0, + expectedMaxError: 1, + }, } for testName, testData := range tests { diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 22d52aa6495..6932a8e342a 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -349,7 +349,7 @@ func TestGetRules(t *testing.T) { rulerStateMap map[string]ring.InstanceState rulerAZMap map[string]string expectedError error - enableAPIRulesBackup bool + replicationFactor int enableZoneAwareReplication bool } @@ -473,6 +473,11 @@ func TestGetRules(t *testing.T) { "ruler2": "b", "ruler3": "c", } + rulerAZSingleZone := map[string]string{ + 
"ruler1": "a", + "ruler2": "a", + "ruler3": "a", + } expectedRules := expectedRulesMap{ "ruler1": map[string]rulespb.RuleGroupList{ @@ -541,7 +546,7 @@ func TestGetRules(t *testing.T) { "user2": 9, "user3": 3, }, - enableAPIRulesBackup: true, + replicationFactor: 3, expectedClientCallCount: len(expectedRules), }, "Shuffle Sharding and ShardSize = 2 with Rule Type Filter": { @@ -633,11 +638,11 @@ func TestGetRules(t *testing.T) { expectedClientCallCount: 0, }, "Shuffle Sharding and ShardSize = 3 with API Rules backup enabled": { - sharding: true, - shuffleShardSize: 3, - shardingStrategy: util.ShardingStrategyShuffle, - rulerStateMap: rulerStateMapAllActive, - enableAPIRulesBackup: true, + sharding: true, + shuffleShardSize: 3, + shardingStrategy: util.ShardingStrategyShuffle, + rulerStateMap: rulerStateMapAllActive, + replicationFactor: 3, rulesRequest: RulesRequest{ Type: recordingRuleFilter, }, @@ -649,11 +654,11 @@ func TestGetRules(t *testing.T) { expectedClientCallCount: 3, }, "Shuffle Sharding and ShardSize = 3 with API Rules backup enabled and one ruler is in Pending state": { - sharding: true, - shuffleShardSize: 3, - shardingStrategy: util.ShardingStrategyShuffle, - rulerStateMap: rulerStateMapOnePending, - enableAPIRulesBackup: true, + sharding: true, + shuffleShardSize: 3, + shardingStrategy: util.ShardingStrategyShuffle, + rulerStateMap: rulerStateMapOnePending, + replicationFactor: 3, rulesRequest: RulesRequest{ Type: recordingRuleFilter, }, @@ -665,11 +670,11 @@ func TestGetRules(t *testing.T) { expectedClientCallCount: 2, // one of the ruler is pending, so we don't expect that ruler to be called }, "Shuffle Sharding and ShardSize = 3 with API Rules backup enabled and two ruler is in Pending state": { - sharding: true, - shuffleShardSize: 3, - shardingStrategy: util.ShardingStrategyShuffle, - rulerStateMap: rulerStateMapTwoPending, - enableAPIRulesBackup: true, + sharding: true, + shuffleShardSize: 3, + shardingStrategy: util.ShardingStrategyShuffle, 
+ rulerStateMap: rulerStateMapTwoPending, + replicationFactor: 3, rulesRequest: RulesRequest{ Type: recordingRuleFilter, }, @@ -682,7 +687,7 @@ func TestGetRules(t *testing.T) { enableZoneAwareReplication: true, rulerStateMap: rulerStateMapAllActive, rulerAZMap: rulerAZEvenSpread, - enableAPIRulesBackup: true, + replicationFactor: 3, rulesRequest: RulesRequest{ Type: recordingRuleFilter, }, @@ -700,7 +705,25 @@ func TestGetRules(t *testing.T) { enableZoneAwareReplication: true, rulerStateMap: rulerStateMapOnePending, rulerAZMap: rulerAZEvenSpread, - enableAPIRulesBackup: true, + replicationFactor: 3, + rulesRequest: RulesRequest{ + Type: recordingRuleFilter, + }, + expectedCount: map[string]int{ + "user1": 3, + "user2": 5, + "user3": 1, + }, + expectedClientCallCount: 2, // one of the ruler is pending, so we don't expect that ruler to be called + }, + "Shuffle Sharding and ShardSize = 3 and AZ replication with API Rules backup enabled and one ruler in pending state and rulers are in same az": { + sharding: true, + shuffleShardSize: 3, + shardingStrategy: util.ShardingStrategyShuffle, + enableZoneAwareReplication: true, + rulerStateMap: rulerStateMapOnePending, + rulerAZMap: rulerAZSingleZone, + replicationFactor: 3, rulesRequest: RulesRequest{ Type: recordingRuleFilter, }, @@ -718,7 +741,7 @@ func TestGetRules(t *testing.T) { enableZoneAwareReplication: true, rulerStateMap: rulerStateMapTwoPending, rulerAZMap: rulerAZEvenSpread, - enableAPIRulesBackup: true, + replicationFactor: 3, rulesRequest: RulesRequest{ Type: recordingRuleFilter, }, @@ -741,7 +764,6 @@ func TestGetRules(t *testing.T) { cfg.ShardingStrategy = tc.shardingStrategy cfg.EnableSharding = tc.sharding - cfg.APIEnableRulesBackup = tc.enableAPIRulesBackup cfg.Ring = RingConfig{ InstanceID: id, @@ -751,8 +773,8 @@ func TestGetRules(t *testing.T) { }, ReplicationFactor: 1, } - if tc.enableAPIRulesBackup { - cfg.Ring.ReplicationFactor = 3 + if tc.replicationFactor > 0 { + cfg.Ring.ReplicationFactor = 
tc.replicationFactor cfg.Ring.ZoneAwarenessEnabled = tc.enableZoneAwareReplication } if tc.enableZoneAwareReplication { @@ -877,7 +899,7 @@ func TestGetRules(t *testing.T) { numberOfRulers := len(rulerAddrMap) require.Equal(t, totalConfiguredRules*numberOfRulers, totalLoadedRules) } - if tc.enableAPIRulesBackup && tc.sharding && tc.expectedError == nil { + if tc.replicationFactor > 1 && tc.sharding && tc.expectedError == nil { // all rules should be backed up require.Equal(t, totalConfiguredRules, len(ruleBackupCount)) var hasUnhealthyRuler bool @@ -896,7 +918,7 @@ func TestGetRules(t *testing.T) { } } } else { - // If APIEnableRulesBackup is disabled, rulers should not back up any rules + // If rules backup is disabled, rulers should not back up any rules require.Equal(t, 0, len(ruleBackupCount)) } }) @@ -983,7 +1005,6 @@ func TestGetRulesFromBackup(t *testing.T) { cfg.ShardingStrategy = util.ShardingStrategyShuffle cfg.EnableSharding = true - cfg.APIEnableRulesBackup = true cfg.EvaluationInterval = 5 * time.Minute cfg.Ring = RingConfig{ @@ -1141,16 +1162,15 @@ func TestSharding(t *testing.T) { type expectedRulesMap map[string]map[string]rulespb.RuleGroupList type testCase struct { - sharding bool - shardingStrategy string - enableAPIRulesBackup bool - replicationFactor int - shuffleShardSize int - setupRing func(*ring.Desc) - enabledUsers []string - disabledUsers []string - expectedRules expectedRulesMap - expectedBackupRules expectedRulesMap + sharding bool + shardingStrategy string + replicationFactor int + shuffleShardSize int + setupRing func(*ring.Desc) + enabledUsers []string + disabledUsers []string + expectedRules expectedRulesMap + expectedBackupRules expectedRulesMap } const ( @@ -1524,12 +1544,11 @@ func TestSharding(t *testing.T) { }, "shuffle sharding, three rulers, shard size 2, enable api backup": { - sharding: true, - replicationFactor: 2, - shardingStrategy: util.ShardingStrategyShuffle, - enableAPIRulesBackup: true, - shuffleShardSize: 2, - 
enabledUsers: []string{user1}, + sharding: true, + replicationFactor: 2, + shardingStrategy: util.ShardingStrategyShuffle, + shuffleShardSize: 2, + enabledUsers: []string{user1}, setupRing: func(desc *ring.Desc) { desc.AddIngester(ruler1, ruler1Addr, "", sortTokens([]uint32{userToken(user1, 0) + 1, user1Group1Token + 1}), ring.ACTIVE, time.Now()) @@ -1566,9 +1585,8 @@ func TestSharding(t *testing.T) { setupRuler := func(id string, host string, port int, forceRing *ring.Ring) *Ruler { store := newMockRuleStore(allRules, nil) cfg := Config{ - EnableSharding: tc.sharding, - APIEnableRulesBackup: tc.enableAPIRulesBackup, - ShardingStrategy: tc.shardingStrategy, + EnableSharding: tc.sharding, + ShardingStrategy: tc.shardingStrategy, Ring: RingConfig{ InstanceID: id, InstanceAddr: host, @@ -1657,7 +1675,7 @@ func TestSharding(t *testing.T) { require.Equal(t, tc.expectedRules, expected) - if !tc.enableAPIRulesBackup { + if tc.replicationFactor <= 1 { require.Equal(t, 0, len(expectedBackup[ruler1])) require.Equal(t, 0, len(expectedBackup[ruler2])) require.Equal(t, 0, len(expectedBackup[ruler3]))