Skip to content

Commit 4c7f4c5

Browse files
Minimize missed rule group evaluations
Signed-off-by: Anand Rajagopal <[email protected]>
1 parent 8df8246 commit 4c7f4c5

File tree

11 files changed

+1073
-170
lines changed

11 files changed

+1073
-170
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* [CHANGE] Querier: Remove `-querier.at-modifier-enabled` flag. #6157
1414
* [CHANGE] Tracing: Remove deprecated `oltp_endpoint` config entirely. #6158
1515
* [CHANGE] Store Gateway: Enable store gateway zone stable shuffle sharding by default. #6161
16+
* [FEATURE] Ruler: Minimize rule group missed evaluations via `-ruler.enable-ha-evaluation` flag. #6129
1617
* [FEATURE] Ingester/Distributor: Experimental: Enable native histogram ingestion via `-blocks-storage.tsdb.enable-native-histograms` flag. #5986 #6010 #6020
1718
* [FEATURE] Querier: Enable querying native histogram chunks. #5944 #6031
1819
* [FEATURE] Query Frontend: Support native histogram in query frontend response. #5996 #6043

docs/configuration/config-file-reference.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4336,6 +4336,10 @@ ring:
43364336
# CLI flag: -ruler.ring.final-sleep
43374337
[final_sleep: <duration> | default = 0s]
43384338
4339+
# Keep the instance in the ring on shutdown.
4340+
# CLI flag: -ruler.ring.keep-instance-in-the-ring-on-shutdown
4341+
[keep_instance_in_the_ring_on_shutdown: <boolean> | default = false]
4342+
43394343
# Period with which to attempt to flush rule groups.
43404344
# CLI flag: -ruler.flush-period
43414345
[flush_period: <duration> | default = 1m]
@@ -4370,6 +4374,14 @@ ring:
43704374
# Disable the rule_group label on exported metrics
43714375
# CLI flag: -ruler.disable-rule-group-label
43724376
[disable_rule_group_label: <boolean> | default = false]
4377+
4378+
# Enable high availability for rule evaluation
4379+
# CLI flag: -ruler.enable-ha-evaluation
4380+
[enable_ha_evaluation: <boolean> | default = false]
4381+
4382+
# Timeout for fan-out calls to other rulers when listing rules
4383+
# CLI flag: -ruler.list-rules-fanout-timeout
4384+
[list_rules_fanout_timeout: <duration> | default = 2m]
43734385
```
43744386

43754387
### `ruler_storage_config`

integration/ruler_test.go

Lines changed: 112 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -978,112 +978,144 @@ func TestRulerDisablesRuleGroups(t *testing.T) {
978978
})
979979
}
980980

981-
func TestRulerKeepFiring(t *testing.T) {
981+
func TestRulerHA(t *testing.T) {
982+
const numRulesGroups = 20
983+
984+
random := rand.New(rand.NewSource(time.Now().UnixNano()))
982985
s, err := e2e.NewScenario(networkName)
983986
require.NoError(t, err)
984987
defer s.Close()
985988

989+
// Generate multiple rule groups, with 1 rule each.
990+
ruleGroups := make([]rulefmt.RuleGroup, numRulesGroups)
991+
expectedNames := make([]string, numRulesGroups)
992+
alertCount := 0
993+
evalInterval, _ := model.ParseDuration("5s")
994+
for i := 0; i < numRulesGroups; i++ {
995+
num := random.Intn(10)
996+
var ruleNode yaml.Node
997+
var exprNode yaml.Node
998+
999+
ruleNode.SetString(fmt.Sprintf("rule_%d", i))
1000+
exprNode.SetString(strconv.Itoa(i))
1001+
ruleName := fmt.Sprintf("test_%d", i)
1002+
1003+
expectedNames[i] = ruleName
1004+
1005+
if num%2 == 0 {
1006+
alertCount++
1007+
ruleGroups[i] = rulefmt.RuleGroup{
1008+
Name: ruleName,
1009+
Interval: evalInterval,
1010+
Rules: []rulefmt.RuleNode{{
1011+
Alert: ruleNode,
1012+
Expr: exprNode,
1013+
}},
1014+
}
1015+
} else {
1016+
ruleGroups[i] = rulefmt.RuleGroup{
1017+
Name: ruleName,
1018+
Interval: evalInterval,
1019+
Rules: []rulefmt.RuleNode{{
1020+
Record: ruleNode,
1021+
Expr: exprNode,
1022+
}},
1023+
}
1024+
}
1025+
}
1026+
9861027
// Start dependencies.
9871028
consul := e2edb.NewConsul()
988-
minio := e2edb.NewMinio(9000, bucketName, rulestoreBucketName)
1029+
minio := e2edb.NewMinio(9000, rulestoreBucketName)
9891030
require.NoError(t, s.StartAndWaitReady(consul, minio))
9901031

9911032
// Configure the ruler.
992-
flags := mergeFlags(
1033+
overrides := map[string]string{
1034+
// Since we're not going to run any rule, we don't need the
1035+
// store-gateway to be configured to a valid address.
1036+
"-querier.store-gateway-addresses": "localhost:12345",
1037+
// Enable the bucket index so we can skip the initial bucket scan.
1038+
"-blocks-storage.bucket-store.bucket-index.enabled": "true",
1039+
"-ruler.ring.replication-factor": "2",
1040+
"-ruler.enable-ha-evaluation": "true",
1041+
"-ruler.poll-interval": "5s",
1042+
}
1043+
1044+
rulerFlags := mergeFlags(
9931045
BlocksStorageFlags(),
9941046
RulerFlags(),
995-
map[string]string{
996-
// Since we're not going to run any rule (our only rule is invalid), we don't need the
997-
// store-gateway to be configured to a valid address.
998-
"-querier.store-gateway-addresses": "localhost:12345",
999-
// Enable the bucket index so we can skip the initial bucket scan.
1000-
"-blocks-storage.bucket-store.bucket-index.enabled": "true",
1001-
// Evaluate rules often, so that we don't need to wait for metrics to show up.
1002-
"-ruler.evaluation-interval": "2s",
1003-
"-ruler.poll-interval": "2s",
1004-
// No delay
1005-
"-ruler.evaluation-delay-duration": "0",
1006-
1007-
"-blocks-storage.tsdb.block-ranges-period": "1h",
1008-
"-blocks-storage.bucket-store.sync-interval": "1s",
1009-
"-blocks-storage.tsdb.retention-period": "2h",
1047+
RulerShardingFlags(consul.NetworkHTTPEndpoint()),
1048+
overrides,
1049+
)
10101050

1011-
// We run single ingester only, no replication.
1012-
"-distributor.replication-factor": "1",
1051+
// Start rulers.
1052+
ruler1 := e2ecortex.NewRuler("ruler-1", consul.NetworkHTTPEndpoint(), rulerFlags, "")
1053+
ruler2 := e2ecortex.NewRuler("ruler-2", consul.NetworkHTTPEndpoint(), rulerFlags, "")
1054+
ruler3 := e2ecortex.NewRuler("ruler-3", consul.NetworkHTTPEndpoint(), rulerFlags, "")
1055+
rulers := e2ecortex.NewCompositeCortexService(ruler1, ruler2, ruler3)
1056+
require.NoError(t, s.StartAndWaitReady(ruler1, ruler2, ruler3))
10131057

1014-
"-querier.max-fetched-chunks-per-query": "50",
1015-
},
1016-
)
1058+
// Upload rule groups to one of the rulers.
1059+
c, err := e2ecortex.NewClient("", "", "", ruler1.HTTPEndpoint(), "user-1")
1060+
require.NoError(t, err)
1061+
namespaceNames := []string{"test1", "test2", "test3", "test4", "test5"}
1062+
namespaceNameCount := make([]int, len(namespaceNames))
1063+
nsRand := rand.New(rand.NewSource(time.Now().UnixNano()))
1064+
for _, ruleGroup := range ruleGroups {
1065+
index := nsRand.Intn(len(namespaceNames))
1066+
namespaceNameCount[index] = namespaceNameCount[index] + 1
1067+
require.NoError(t, c.SetRuleGroup(ruleGroup, namespaceNames[index]))
1068+
}
10171069

1018-
const namespace = "test"
1019-
const user = "user"
1070+
// Wait until rulers have loaded all rules.
1071+
require.NoError(t, rulers.WaitSumMetricsWithOptions(e2e.Equals(numRulesGroups), []string{"cortex_prometheus_rule_group_rules"}, e2e.WaitMissingMetrics))
10201072

1021-
distributor := e2ecortex.NewDistributor("distributor", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
1022-
ruler := e2ecortex.NewRuler("ruler", consul.NetworkHTTPEndpoint(), flags, "")
1023-
ingester := e2ecortex.NewIngester("ingester", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
1024-
require.NoError(t, s.StartAndWaitReady(distributor, ingester, ruler))
1073+
ruler1SyncTotal, err := ruler1.SumMetrics([]string{"cortex_ruler_sync_rules_total"})
1074+
require.NoError(t, err)
1075+
ruler3SyncTotal, err := ruler3.SumMetrics([]string{"cortex_ruler_sync_rules_total"})
1076+
require.NoError(t, err)
10251077

1026-
// Wait until both the distributor and ruler have updated the ring. The querier will also watch
1027-
// the store-gateway ring if blocks sharding is enabled.
1028-
require.NoError(t, distributor.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
1029-
require.NoError(t, ruler.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
1078+
err = consul.Kill() // kill consul so the rulers will operate with the tokens/instances they already have
1079+
require.NoError(t, err)
10301080

1031-
c, err := e2ecortex.NewClient(distributor.HTTPEndpoint(), "", "", ruler.HTTPEndpoint(), user)
1081+
err = ruler2.Kill()
10321082
require.NoError(t, err)
10331083

1034-
expression := "vector(1) > 0" // Alert will fire
1035-
groupName := "rule_group_1"
1036-
ruleName := "rule_keep_firing"
1084+
// wait for another sync
1085+
require.NoError(t, ruler1.WaitSumMetrics(e2e.Greater(ruler1SyncTotal[0]), "cortex_ruler_sync_rules_total"))
1086+
require.NoError(t, ruler3.WaitSumMetrics(e2e.Greater(ruler3SyncTotal[0]), "cortex_ruler_sync_rules_total"))
10371087

1038-
require.NoError(t, c.SetRuleGroup(alertRuleWithKeepFiringFor(groupName, ruleName, expression, model.Duration(10*time.Second)), namespace))
1088+
rulers = e2ecortex.NewCompositeCortexService(ruler1, ruler3)
1089+
require.NoError(t, rulers.WaitSumMetricsWithOptions(e2e.Equals(numRulesGroups), []string{"cortex_prometheus_rule_group_rules"}, e2e.WaitMissingMetrics))
10391090

1040-
m := ruleGroupMatcher(user, namespace, groupName)
1091+
t.Log(ruler1.SumMetrics([]string{"cortex_prometheus_rule_group_rules"}))
1092+
t.Log(ruler3.SumMetrics([]string{"cortex_prometheus_rule_group_rules"}))
10411093

1042-
// Wait until ruler has loaded the group.
1043-
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_rule_group_rules"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
1044-
// Wait until rule group has tried to evaluate the rule.
1045-
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
1094+
c3, err := e2ecortex.NewClient("", "", "", ruler3.HTTPEndpoint(), "user-1")
1095+
require.NoError(t, err)
10461096

1047-
groups, err := c.GetPrometheusRules(e2ecortex.RuleFilter{
1048-
RuleNames: []string{ruleName},
1049-
})
1097+
ruler1Rules, err := c.GetRuleGroups()
10501098
require.NoError(t, err)
1051-
require.NotEmpty(t, groups)
1052-
require.Equal(t, 1, len(groups[0].Rules))
1053-
alert := parseAlertFromRule(t, groups[0].Rules[0])
1054-
require.Equal(t, float64(10), alert.KeepFiringFor)
1055-
require.Equal(t, 1, len(alert.Alerts))
1056-
require.Empty(t, alert.Alerts[0].KeepFiringSince) //Alert expression not resolved, keepFiringSince should be empty
1057-
1058-
expression = "vector(1) > 1" // Resolve, should keep firing for set duration
1059-
ts := time.Now()
1060-
require.NoError(t, c.SetRuleGroup(alertRuleWithKeepFiringFor(groupName, ruleName, expression, model.Duration(10*time.Second)), namespace))
1061-
// Wait until rule group has tried to evaluate the rule.
1062-
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(5), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
1063-
1064-
updatedGroups, err := c.GetPrometheusRules(e2ecortex.RuleFilter{
1065-
RuleNames: []string{ruleName},
1066-
})
1099+
1100+
ruler3Rules, err := c3.GetRuleGroups()
10671101
require.NoError(t, err)
1068-
require.NotEmpty(t, updatedGroups)
1069-
require.Equal(t, 1, len(updatedGroups[0].Rules))
1070-
1071-
alert = parseAlertFromRule(t, updatedGroups[0].Rules[0])
1072-
require.Equal(t, "firing", alert.State)
1073-
require.Equal(t, float64(10), alert.KeepFiringFor)
1074-
require.Equal(t, 1, len(alert.Alerts))
1075-
require.NotEmpty(t, alert.Alerts[0].KeepFiringSince)
1076-
require.Greater(t, alert.Alerts[0].KeepFiringSince.UnixNano(), ts.UnixNano(), "KeepFiringSince value should be after expression is resolved")
1077-
1078-
time.Sleep(10 * time.Second) // Sleep beyond keepFiringFor time
1079-
updatedGroups, err = c.GetPrometheusRules(e2ecortex.RuleFilter{
1080-
RuleNames: []string{ruleName},
1081-
})
1102+
1103+
ruleCount := 0
1104+
countFunc := func(ruleGroups map[string][]rulefmt.RuleGroup) {
1105+
for _, v := range ruleGroups {
1106+
ruleCount += len(v)
1107+
}
1108+
}
1109+
1110+
countFunc(ruler1Rules)
1111+
require.Equal(t, numRulesGroups, ruleCount)
1112+
ruleCount = 0
1113+
countFunc(ruler3Rules)
1114+
require.Equal(t, numRulesGroups, ruleCount)
1115+
1116+
results, err := c.GetPrometheusRules(e2ecortex.RuleFilter{})
10821117
require.NoError(t, err)
1083-
require.NotEmpty(t, updatedGroups)
1084-
require.Equal(t, 1, len(updatedGroups[0].Rules))
1085-
alert = parseAlertFromRule(t, updatedGroups[0].Rules[0])
1086-
require.Equal(t, 0, len(alert.Alerts)) // alert should be resolved once keepFiringFor time expires
1118+
require.Equal(t, numRulesGroups, len(results))
10871119
}
10881120

10891121
func parseAlertFromRule(t *testing.T, rules interface{}) *alertingRule {

pkg/ruler/client_pool_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414

1515
"github.com/cortexproject/cortex/pkg/util/flagext"
1616
"github.com/cortexproject/cortex/pkg/util/grpcclient"
17+
"github.com/cortexproject/cortex/pkg/util/services"
1718
)
1819

1920
func Test_newRulerClientFactory(t *testing.T) {
@@ -63,6 +64,12 @@ func Test_newRulerClientFactory(t *testing.T) {
6364

6465
type mockRulerServer struct{}
6566

67+
func (m *mockRulerServer) LivenessCheck(ctx context.Context, request *LivenessCheckRequest) (*LivenessCheckResponse, error) {
68+
return &LivenessCheckResponse{
69+
State: int32(services.Running),
70+
}, nil
71+
}
72+
6673
func (m *mockRulerServer) Rules(context.Context, *RulesRequest) (*RulesResponse, error) {
6774
return &RulesResponse{}, nil
6875
}

pkg/ruler/merger_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ func TestMergeGroupStateDesc(t *testing.T) {
9898

9999
for name, tc := range testCases {
100100
t.Run(name, func(t *testing.T) {
101-
out := mergeGroupStateDesc(tc.input)
101+
out := dedupStateDesc(tc.input)
102102
slices.SortFunc(out, func(a, b *GroupStateDesc) int {
103103
fileCompare := strings.Compare(a.Group.Namespace, b.Group.Namespace)
104104
if fileCompare != 0 {

0 commit comments

Comments
 (0)