@@ -576,8 +576,8 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 		require.Equal(t, 200, res.StatusCode)
 	}
 
-	totalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"})
-	require.NoError(t, err)
+	matcher := labels.MustNewMatcher(labels.MatchEqual, "user", user)
+	var totalQueries = []float64{0}
 
 	// Verify that user-failures don't increase cortex_ruler_queries_failed_total
 	for groupName, expression := range map[string]string{
@@ -601,7 +601,7 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 			require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
 
 			// But these failures were not reported as "failed queries"
-			sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"})
+			sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher))
 			require.NoError(t, err)
 			require.Equal(t, float64(0), sum[0])
 
@@ -612,7 +612,7 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 			require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_group_rules"}, e2e.SkipMissingMetrics))
 
 			// Check that cortex_ruler_queries_total went up since last test.
-			newTotalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"})
+			newTotalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"}, e2e.WithLabelMatchers(matcher))
 			require.NoError(t, err)
 			require.Greater(t, newTotalQueries[0], totalQueries[0])
 
@@ -637,15 +637,119 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
 
 		// Still no failures.
-		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"})
+		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher))
 		require.NoError(t, err)
 		require.Equal(t, float64(0), sum[0])
 
 		// Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_queries_failed_total failures.
 		require.NoError(t, s.Stop(ingester))
 
 		// We should start getting "real" failures now.
-		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_queries_failed_total"}))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher)))
+	})
+}
+
+func TestRulerMetricsWhenIngesterFails(t *testing.T) {
+	s, err := e2e.NewScenario(networkName)
+	require.NoError(t, err)
+	defer s.Close()
+
+	// Start dependencies.
+	consul := e2edb.NewConsul()
+	minio := e2edb.NewMinio(9000, bucketName, rulestoreBucketName)
+	require.NoError(t, s.StartAndWaitReady(consul, minio))
+
+	const blockRangePeriod = 2 * time.Second
+	// Configure the ruler.
+	flags := mergeFlags(
+		BlocksStorageFlags(),
+		RulerFlags(),
+		map[string]string{
+			"-blocks-storage.tsdb.block-ranges-period":         blockRangePeriod.String(),
+			"-blocks-storage.tsdb.ship-interval":               "1s",
+			"-blocks-storage.bucket-store.sync-interval":       "1s",
+			"-blocks-storage.bucket-store.index-cache.backend": tsdb.IndexCacheBackendInMemory,
+			"-blocks-storage.tsdb.retention-period":            ((blockRangePeriod * 2) - 1).String(),
+
+			// Enable the bucket index so we can skip the initial bucket scan.
+			"-blocks-storage.bucket-store.bucket-index.enabled": "false",
+			// Evaluate rules often, so that we don't need to wait for metrics to show up.
+			"-ruler.evaluation-interval": "2s",
+			"-ruler.poll-interval":       "2s",
+			// No delay
+			"-ruler.evaluation-delay-duration": "0",
+
+			// We run single ingester only, no replication.
+			"-distributor.replication-factor": "1",
+
+			// Very low limit so that ruler hits it.
+			"-querier.max-fetched-chunks-per-query": "15",
+			"-querier.query-store-after":            (1 * time.Second).String(),
+			"-querier.query-ingesters-within":       (2 * time.Second).String(),
+		},
+	)
+
+	const namespace = "test"
+	const user = "user"
+
+	storeGateway := e2ecortex.NewStoreGateway("store-gateway-1", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+
+	flags = mergeFlags(flags, map[string]string{
+		"-querier.store-gateway-addresses": storeGateway.NetworkGRPCEndpoint(),
+	})
+
+	distributor := e2ecortex.NewDistributor("distributor", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+	ruler := e2ecortex.NewRuler("ruler", consul.NetworkHTTPEndpoint(), flags, "")
+	ingester := e2ecortex.NewIngester("ingester", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+	require.NoError(t, s.StartAndWaitReady(distributor, ingester, ruler, storeGateway))
+
+	// Wait until both the distributor and ruler have updated the ring. The querier will also watch
+	// the store-gateway ring if blocks sharding is enabled.
+	require.NoError(t, distributor.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
+	require.NoError(t, ruler.WaitSumMetrics(e2e.Equals(1024), "cortex_ring_tokens_total"))
+
+	c, err := e2ecortex.NewClient(distributor.HTTPEndpoint(), "", "", ruler.HTTPEndpoint(), user)
+	require.NoError(t, err)
+
+	matcher := labels.MustNewMatcher(labels.MatchEqual, "user", user)
+	expression := "absent(sum_over_time(metric{}[2s] offset 1h))"
+
+	// Now let's upload a non-failing rule, and make sure that it works.
+	t.Run("real_error", func(t *testing.T) {
+		const groupName = "good_rule"
+
+		var ruleEvalCount float64
+		ruleGroup := ruleGroupWithRule(groupName, "rule", expression)
+		ruleGroup.Interval = 2
+		require.NoError(t, c.SetRuleGroup(ruleGroup, namespace))
+		m := ruleGroupMatcher(user, namespace, groupName)
+
+		// Wait until ruler has loaded the group.
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_rule_group_rules"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		// Wait until rule group has tried to evaluate the rule, and succeeded.
+		ruleEvalCount++
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(ruleEvalCount), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_write_requests_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_ruler_write_requests_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+
+		// Wait until the TSDB head is compacted and shipped to the storage.
+		// The shipped block contains the 1st series, while the 2nd series is in the head.
+		require.NoError(t, ingester.WaitSumMetrics(e2e.Equals(1), "cortex_ingester_shipper_uploads_total"))
+
+		// Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_write_requests_failed_total failures.
+		require.NoError(t, s.Stop(ingester))
+		ruleEvalCount++
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(ruleEvalCount), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(2), []string{"cortex_ruler_write_requests_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_write_requests_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
 	})
 }