@@ -978,112 +978,144 @@ func TestRulerDisablesRuleGroups(t *testing.T) {
978
978
})
979
979
}
980
980
981
- func TestRulerKeepFiring (t * testing.T ) {
981
+ func TestRulerHA (t * testing.T ) {
982
+ const numRulesGroups = 20
983
+
984
+ random := rand .New (rand .NewSource (time .Now ().UnixNano ()))
982
985
s , err := e2e .NewScenario (networkName )
983
986
require .NoError (t , err )
984
987
defer s .Close ()
985
988
989
+ // Generate multiple rule groups, with 1 rule each.
990
+ ruleGroups := make ([]rulefmt.RuleGroup , numRulesGroups )
991
+ expectedNames := make ([]string , numRulesGroups )
992
+ alertCount := 0
993
+ evalInterval , _ := model .ParseDuration ("5s" )
994
+ for i := 0 ; i < numRulesGroups ; i ++ {
995
+ num := random .Intn (10 )
996
+ var ruleNode yaml.Node
997
+ var exprNode yaml.Node
998
+
999
+ ruleNode .SetString (fmt .Sprintf ("rule_%d" , i ))
1000
+ exprNode .SetString (strconv .Itoa (i ))
1001
+ ruleName := fmt .Sprintf ("test_%d" , i )
1002
+
1003
+ expectedNames [i ] = ruleName
1004
+
1005
+ if num % 2 == 0 {
1006
+ alertCount ++
1007
+ ruleGroups [i ] = rulefmt.RuleGroup {
1008
+ Name : ruleName ,
1009
+ Interval : evalInterval ,
1010
+ Rules : []rulefmt.RuleNode {{
1011
+ Alert : ruleNode ,
1012
+ Expr : exprNode ,
1013
+ }},
1014
+ }
1015
+ } else {
1016
+ ruleGroups [i ] = rulefmt.RuleGroup {
1017
+ Name : ruleName ,
1018
+ Interval : evalInterval ,
1019
+ Rules : []rulefmt.RuleNode {{
1020
+ Record : ruleNode ,
1021
+ Expr : exprNode ,
1022
+ }},
1023
+ }
1024
+ }
1025
+ }
1026
+
986
1027
// Start dependencies.
987
1028
consul := e2edb .NewConsul ()
988
- minio := e2edb .NewMinio (9000 , bucketName , rulestoreBucketName )
1029
+ minio := e2edb .NewMinio (9000 , rulestoreBucketName )
989
1030
require .NoError (t , s .StartAndWaitReady (consul , minio ))
990
1031
991
1032
// Configure the ruler.
992
- flags := mergeFlags (
1033
+ overrides := map [string ]string {
1034
+ // Since we're not going to run any rule, we don't need the
1035
+ // store-gateway to be configured to a valid address.
1036
+ "-querier.store-gateway-addresses" : "localhost:12345" ,
1037
+ // Enable the bucket index so we can skip the initial bucket scan.
1038
+ "-blocks-storage.bucket-store.bucket-index.enabled" : "true" ,
1039
+ "-ruler.ring.replication-factor" : "2" ,
1040
+ "-ruler.enable-ha-evaluation" : "true" ,
1041
+ "-ruler.poll-interval" : "5s" ,
1042
+ }
1043
+
1044
+ rulerFlags := mergeFlags (
993
1045
BlocksStorageFlags (),
994
1046
RulerFlags (),
995
- map [string ]string {
996
- // Since we're not going to run any rule (our only rule is invalid), we don't need the
997
- // store-gateway to be configured to a valid address.
998
- "-querier.store-gateway-addresses" : "localhost:12345" ,
999
- // Enable the bucket index so we can skip the initial bucket scan.
1000
- "-blocks-storage.bucket-store.bucket-index.enabled" : "true" ,
1001
- // Evaluate rules often, so that we don't need to wait for metrics to show up.
1002
- "-ruler.evaluation-interval" : "2s" ,
1003
- "-ruler.poll-interval" : "2s" ,
1004
- // No delay
1005
- "-ruler.evaluation-delay-duration" : "0" ,
1006
-
1007
- "-blocks-storage.tsdb.block-ranges-period" : "1h" ,
1008
- "-blocks-storage.bucket-store.sync-interval" : "1s" ,
1009
- "-blocks-storage.tsdb.retention-period" : "2h" ,
1047
+ RulerShardingFlags (consul .NetworkHTTPEndpoint ()),
1048
+ overrides ,
1049
+ )
1010
1050
1011
- // We run single ingester only, no replication.
1012
- "-distributor.replication-factor" : "1" ,
1051
+ // Start rulers.
1052
+ ruler1 := e2ecortex .NewRuler ("ruler-1" , consul .NetworkHTTPEndpoint (), rulerFlags , "" )
1053
+ ruler2 := e2ecortex .NewRuler ("ruler-2" , consul .NetworkHTTPEndpoint (), rulerFlags , "" )
1054
+ ruler3 := e2ecortex .NewRuler ("ruler-3" , consul .NetworkHTTPEndpoint (), rulerFlags , "" )
1055
+ rulers := e2ecortex .NewCompositeCortexService (ruler1 , ruler2 , ruler3 )
1056
+ require .NoError (t , s .StartAndWaitReady (ruler1 , ruler2 , ruler3 ))
1013
1057
1014
- "-querier.max-fetched-chunks-per-query" : "50" ,
1015
- },
1016
- )
1058
+ // Upload rule groups to one of the rulers.
1059
+ c , err := e2ecortex .NewClient ("" , "" , "" , ruler1 .HTTPEndpoint (), "user-1" )
1060
+ require .NoError (t , err )
1061
+ namespaceNames := []string {"test1" , "test2" , "test3" , "test4" , "test5" }
1062
+ namespaceNameCount := make ([]int , len (namespaceNames ))
1063
+ nsRand := rand .New (rand .NewSource (time .Now ().UnixNano ()))
1064
+ for _ , ruleGroup := range ruleGroups {
1065
+ index := nsRand .Intn (len (namespaceNames ))
1066
+ namespaceNameCount [index ] = namespaceNameCount [index ] + 1
1067
+ require .NoError (t , c .SetRuleGroup (ruleGroup , namespaceNames [index ]))
1068
+ }
1017
1069
1018
- const namespace = "test"
1019
- const user = "user"
1070
+ // Wait until rulers have loaded all rules.
1071
+ require . NoError ( t , rulers . WaitSumMetricsWithOptions ( e2e . Equals ( numRulesGroups ), [] string { "cortex_prometheus_rule_group_rules" }, e2e . WaitMissingMetrics ))
1020
1072
1021
- distributor := e2ecortex . NewDistributor ( "distributor" , e2ecortex . RingStoreConsul , consul . NetworkHTTPEndpoint (), flags , "" )
1022
- ruler := e2ecortex . NewRuler ( "ruler" , consul . NetworkHTTPEndpoint (), flags , "" )
1023
- ingester := e2ecortex . NewIngester ( "ingester" , e2ecortex . RingStoreConsul , consul . NetworkHTTPEndpoint (), flags , "" )
1024
- require .NoError (t , s . StartAndWaitReady ( distributor , ingester , ruler ) )
1073
+ ruler1SyncTotal , err := ruler1 . SumMetrics ([] string { "cortex_ruler_sync_rules_total" } )
1074
+ require . NoError ( t , err )
1075
+ ruler3SyncTotal , err := ruler3 . SumMetrics ([] string { "cortex_ruler_sync_rules_total" } )
1076
+ require .NoError (t , err )
1025
1077
1026
- // Wait until both the distributor and ruler have updated the ring. The querier will also watch
1027
- // the store-gateway ring if blocks sharding is enabled.
1028
- require .NoError (t , distributor .WaitSumMetrics (e2e .Equals (512 ), "cortex_ring_tokens_total" ))
1029
- require .NoError (t , ruler .WaitSumMetrics (e2e .Equals (512 ), "cortex_ring_tokens_total" ))
1078
+ err = consul .Kill () // kill consul so the rulers will operate with the tokens/instances they already have
1079
+ require .NoError (t , err )
1030
1080
1031
- c , err := e2ecortex . NewClient ( distributor . HTTPEndpoint (), "" , "" , ruler . HTTPEndpoint (), user )
1081
+ err = ruler2 . Kill ( )
1032
1082
require .NoError (t , err )
1033
1083
1034
- expression := "vector(1) > 0" // Alert will fire
1035
- groupName := "rule_group_1"
1036
- ruleName := "rule_keep_firing"
1084
+ // wait for another sync
1085
+ require . NoError ( t , ruler1 . WaitSumMetrics ( e2e . Greater ( ruler1SyncTotal [ 0 ]), "cortex_ruler_sync_rules_total" ))
1086
+ require . NoError ( t , ruler3 . WaitSumMetrics ( e2e . Greater ( ruler3SyncTotal [ 0 ]), "cortex_ruler_sync_rules_total" ))
1037
1087
1038
- require .NoError (t , c .SetRuleGroup (alertRuleWithKeepFiringFor (groupName , ruleName , expression , model .Duration (10 * time .Second )), namespace ))
1088
+ rulers = e2ecortex .NewCompositeCortexService (ruler1 , ruler3 )
1089
+ require .NoError (t , rulers .WaitSumMetricsWithOptions (e2e .Equals (numRulesGroups ), []string {"cortex_prometheus_rule_group_rules" }, e2e .WaitMissingMetrics ))
1039
1090
1040
- m := ruleGroupMatcher (user , namespace , groupName )
1091
+ t .Log (ruler1 .SumMetrics ([]string {"cortex_prometheus_rule_group_rules" }))
1092
+ t .Log (ruler3 .SumMetrics ([]string {"cortex_prometheus_rule_group_rules" }))
1041
1093
1042
- // Wait until ruler has loaded the group.
1043
- require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .Equals (1 ), []string {"cortex_prometheus_rule_group_rules" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
1044
- // Wait until rule group has tried to evaluate the rule.
1045
- require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (1 ), []string {"cortex_prometheus_rule_evaluations_total" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
1094
+ c3 , err := e2ecortex .NewClient ("" , "" , "" , ruler3 .HTTPEndpoint (), "user-1" )
1095
+ require .NoError (t , err )
1046
1096
1047
- groups , err := c .GetPrometheusRules (e2ecortex.RuleFilter {
1048
- RuleNames : []string {ruleName },
1049
- })
1097
+ ruler1Rules , err := c .GetRuleGroups ()
1050
1098
require .NoError (t , err )
1051
- require .NotEmpty (t , groups )
1052
- require .Equal (t , 1 , len (groups [0 ].Rules ))
1053
- alert := parseAlertFromRule (t , groups [0 ].Rules [0 ])
1054
- require .Equal (t , float64 (10 ), alert .KeepFiringFor )
1055
- require .Equal (t , 1 , len (alert .Alerts ))
1056
- require .Empty (t , alert .Alerts [0 ].KeepFiringSince ) //Alert expression not resolved, keepFiringSince should be empty
1057
-
1058
- expression = "vector(1) > 1" // Resolve, should keep firing for set duration
1059
- ts := time .Now ()
1060
- require .NoError (t , c .SetRuleGroup (alertRuleWithKeepFiringFor (groupName , ruleName , expression , model .Duration (10 * time .Second )), namespace ))
1061
- // Wait until rule group has tried to evaluate the rule.
1062
- require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (5 ), []string {"cortex_prometheus_rule_evaluations_total" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
1063
-
1064
- updatedGroups , err := c .GetPrometheusRules (e2ecortex.RuleFilter {
1065
- RuleNames : []string {ruleName },
1066
- })
1099
+
1100
+ ruler3Rules , err := c3 .GetRuleGroups ()
1067
1101
require .NoError (t , err )
1068
- require .NotEmpty (t , updatedGroups )
1069
- require .Equal (t , 1 , len (updatedGroups [0 ].Rules ))
1070
-
1071
- alert = parseAlertFromRule (t , updatedGroups [0 ].Rules [0 ])
1072
- require .Equal (t , "firing" , alert .State )
1073
- require .Equal (t , float64 (10 ), alert .KeepFiringFor )
1074
- require .Equal (t , 1 , len (alert .Alerts ))
1075
- require .NotEmpty (t , alert .Alerts [0 ].KeepFiringSince )
1076
- require .Greater (t , alert .Alerts [0 ].KeepFiringSince .UnixNano (), ts .UnixNano (), "KeepFiringSince value should be after expression is resolved" )
1077
-
1078
- time .Sleep (10 * time .Second ) // Sleep beyond keepFiringFor time
1079
- updatedGroups , err = c .GetPrometheusRules (e2ecortex.RuleFilter {
1080
- RuleNames : []string {ruleName },
1081
- })
1102
+
1103
+ ruleCount := 0
1104
+ countFunc := func (ruleGroups map [string ][]rulefmt.RuleGroup ) {
1105
+ for _ , v := range ruleGroups {
1106
+ ruleCount += len (v )
1107
+ }
1108
+ }
1109
+
1110
+ countFunc (ruler1Rules )
1111
+ require .Equal (t , numRulesGroups , ruleCount )
1112
+ ruleCount = 0
1113
+ countFunc (ruler3Rules )
1114
+ require .Equal (t , numRulesGroups , ruleCount )
1115
+
1116
+ results , err := c .GetPrometheusRules (e2ecortex.RuleFilter {})
1082
1117
require .NoError (t , err )
1083
- require .NotEmpty (t , updatedGroups )
1084
- require .Equal (t , 1 , len (updatedGroups [0 ].Rules ))
1085
- alert = parseAlertFromRule (t , updatedGroups [0 ].Rules [0 ])
1086
- require .Equal (t , 0 , len (alert .Alerts )) // alert should be resolved once keepFiringFor time expires
1118
+ require .Equal (t , numRulesGroups , len (results ))
1087
1119
}
1088
1120
1089
1121
func parseAlertFromRule (t * testing.T , rules interface {}) * alertingRule {
0 commit comments