
Commit f443580

Add a user label to ruler metrics so they can be tracked at a per-user level
Signed-off-by: Anand Rajagopal <[email protected]>
1 parent 04566a5 commit f443580
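
The diff below follows the standard Prometheus client_golang pattern for per-tenant metrics: replace a plain Counter with a CounterVec keyed by a `user` label, then bind that label once per tenant. A minimal, self-contained sketch of the pattern (metric and variable names here are illustrative, not taken from the Cortex codebase) looks roughly like this:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	reg := prometheus.NewRegistry()

	// Before the change: a single counter aggregated across all tenants.
	// After: a counter vector with a "user" label, so each tenant gets its own series.
	writesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Name: "example_write_requests_total",
		Help: "Number of write requests, partitioned by user.",
	}, []string{"user"})

	// A per-tenant factory can bind the label once and hand out a plain
	// prometheus.Counter, so downstream code does not need to know about labels.
	userWrites := writesVec.WithLabelValues("user-1")
	userWrites.Inc()

	// Gather the registered metric families to show the per-user series.
	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		fmt.Println(mf.GetName(), len(mf.GetMetric()), "series")
	}
}

Each distinct `user` value produces its own time series under the same metric name, which is what lets these ruler metrics be tracked per tenant.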

File tree

3 files changed: +124 -14 lines


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 # Changelog
 
 ## master / unreleased
+* [CHANGE] Ruler: Added user label to `cortex_ruler_write_requests_total`, `cortex_ruler_write_requests_failed_total`, `cortex_ruler_queries_total`, and `cortex_ruler_queries_failed_total` metrics. #5312
 * [CHANGE] Alertmanager: Validating new fields on the PagerDuty AM config. #5290
 * [CHANGE] Ingester: Creating label `native-histogram-sample` on the `cortex_discarded_samples_total` to keep track of discarded native histogram samples. #5289
 * [FEATURE] Store Gateway: Add `max_downloaded_bytes_per_request` to limit max bytes to download per store gateway request.

integration/ruler_test.go

Lines changed: 110 additions & 6 deletions
@@ -576,8 +576,8 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 		require.Equal(t, 200, res.StatusCode)
 	}
 
-	totalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"})
-	require.NoError(t, err)
+	matcher := labels.MustNewMatcher(labels.MatchEqual, "user", user)
+	var totalQueries = []float64{0}
 
 	// Verify that user-failures don't increase cortex_ruler_queries_failed_total
 	for groupName, expression := range map[string]string{
@@ -601,7 +601,7 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
 
 		// But these failures were not reported as "failed queries"
-		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"})
+		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher))
 		require.NoError(t, err)
 		require.Equal(t, float64(0), sum[0])
 
@@ -612,7 +612,7 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_group_rules"}, e2e.SkipMissingMetrics))
 
 		// Check that cortex_ruler_queries_total went up since last test.
-		newTotalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"})
+		newTotalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"}, e2e.WithLabelMatchers(matcher))
 		require.NoError(t, err)
 		require.Greater(t, newTotalQueries[0], totalQueries[0])
 
@@ -637,15 +637,119 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
 
 		// Still no failures.
-		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"})
+		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher))
 		require.NoError(t, err)
 		require.Equal(t, float64(0), sum[0])
 
 		// Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_queries_failed_total failures.
 		require.NoError(t, s.Stop(ingester))
 
 		// We should start getting "real" failures now.
-		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_queries_failed_total"}))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher)))
+	})
+}
+
+func TestRulerMetricsWhenIngesterFails(t *testing.T) {
+	s, err := e2e.NewScenario(networkName)
+	require.NoError(t, err)
+	defer s.Close()
+
+	// Start dependencies.
+	consul := e2edb.NewConsul()
+	minio := e2edb.NewMinio(9000, bucketName, rulestoreBucketName)
+	require.NoError(t, s.StartAndWaitReady(consul, minio))
+
+	const blockRangePeriod = 2 * time.Second
+	// Configure the ruler.
+	flags := mergeFlags(
+		BlocksStorageFlags(),
+		RulerFlags(),
+		map[string]string{
+			"-blocks-storage.tsdb.block-ranges-period":         blockRangePeriod.String(),
+			"-blocks-storage.tsdb.ship-interval":               "1s",
+			"-blocks-storage.bucket-store.sync-interval":       "1s",
+			"-blocks-storage.bucket-store.index-cache.backend": tsdb.IndexCacheBackendInMemory,
+			"-blocks-storage.tsdb.retention-period":            ((blockRangePeriod * 2) - 1).String(),
+
+			// Enable the bucket index so we can skip the initial bucket scan.
+			"-blocks-storage.bucket-store.bucket-index.enabled": "false",
+			// Evaluate rules often, so that we don't need to wait for metrics to show up.
+			"-ruler.evaluation-interval": "2s",
+			"-ruler.poll-interval":       "2s",
+			// No delay
+			"-ruler.evaluation-delay-duration": "0",
+
+			// We run single ingester only, no replication.
+			"-distributor.replication-factor": "1",
+
+			// Very low limit so that ruler hits it.
+			"-querier.max-fetched-chunks-per-query": "15",
+			"-querier.query-store-after":            (1 * time.Second).String(),
+			"-querier.query-ingesters-within":       (2 * time.Second).String(),
+		},
+	)
+
+	const namespace = "test"
+	const user = "user"
+
+	storeGateway := e2ecortex.NewStoreGateway("store-gateway-1", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+
+	flags = mergeFlags(flags, map[string]string{
+		"-querier.store-gateway-addresses": storeGateway.NetworkGRPCEndpoint(),
+	})
+
+	distributor := e2ecortex.NewDistributor("distributor", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+	ruler := e2ecortex.NewRuler("ruler", consul.NetworkHTTPEndpoint(), flags, "")
+	ingester := e2ecortex.NewIngester("ingester", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+	require.NoError(t, s.StartAndWaitReady(distributor, ingester, ruler, storeGateway))
+
+	// Wait until both the distributor and ruler have updated the ring. The querier will also watch
+	// the store-gateway ring if blocks sharding is enabled.
+	require.NoError(t, distributor.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
+	require.NoError(t, ruler.WaitSumMetrics(e2e.Equals(1024), "cortex_ring_tokens_total"))
+
+	c, err := e2ecortex.NewClient(distributor.HTTPEndpoint(), "", "", ruler.HTTPEndpoint(), user)
+	require.NoError(t, err)
+
+	matcher := labels.MustNewMatcher(labels.MatchEqual, "user", user)
+	expression := "absent(sum_over_time(metric{}[2s] offset 1h))"
+
+	// Now let's upload a non-failing rule, and make sure that it works.
+	t.Run("real_error", func(t *testing.T) {
+		const groupName = "good_rule"
+
+		var ruleEvalCount float64
+		ruleGroup := ruleGroupWithRule(groupName, "rule", expression)
+		ruleGroup.Interval = 2
+		require.NoError(t, c.SetRuleGroup(ruleGroup, namespace))
+		m := ruleGroupMatcher(user, namespace, groupName)
+
+		// Wait until ruler has loaded the group.
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_rule_group_rules"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		// Wait until rule group has tried to evaluate the rule, and succeeded.
+		ruleEvalCount++
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(ruleEvalCount), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_write_requests_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_ruler_write_requests_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+
+		// Wait until the TSDB head is compacted and shipped to the storage.
+		// The shipped block contains the 1st series, while the 2nd series is in the head.
+		require.NoError(t, ingester.WaitSumMetrics(e2e.Equals(1), "cortex_ingester_shipper_uploads_total"))
+
+		// Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_write_requests_failed_total failures.
+		require.NoError(t, s.Stop(ingester))
+		ruleEvalCount++
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(ruleEvalCount), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(2), []string{"cortex_ruler_write_requests_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_write_requests_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
 	})
 }
 
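
The test filters the summed metrics down to this tenant's series with a Prometheus label matcher on the new `user` label. As a rough illustration of what that matcher does on its own (a sketch assuming only the upstream prometheus/prometheus model/labels package, outside the Cortex e2e harness):

package main

import (
	"fmt"

	"github.com/prometheus/prometheus/model/labels"
)

func main() {
	// The integration test builds an equality matcher on the new "user" label
	// and passes it to the e2e metric helpers; the matcher itself is just this.
	matcher := labels.MustNewMatcher(labels.MatchEqual, "user", "user")

	// Matches reports whether a given label value satisfies the matcher,
	// which is how series belonging to other tenants get filtered out of the sums.
	fmt.Println(matcher.Matches("user"))  // true
	fmt.Println(matcher.Matches("other")) // false
}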

pkg/ruler/compat.go

Lines changed: 13 additions & 8 deletions
@@ -251,23 +251,23 @@ type RulesManager interface {
 type ManagerFactory func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager
 
 func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engine v1.QueryEngine, overrides RulesLimits, reg prometheus.Registerer) ManagerFactory {
-	totalWrites := promauto.With(reg).NewCounter(prometheus.CounterOpts{
+	totalWritesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
 		Name: "cortex_ruler_write_requests_total",
 		Help: "Number of write requests to ingesters.",
-	})
-	failedWrites := promauto.With(reg).NewCounter(prometheus.CounterOpts{
+	}, []string{"user"})
+	failedWritesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
 		Name: "cortex_ruler_write_requests_failed_total",
 		Help: "Number of failed write requests to ingesters.",
-	})
+	}, []string{"user"})
 
-	totalQueries := promauto.With(reg).NewCounter(prometheus.CounterOpts{
+	totalQueriesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
 		Name: "cortex_ruler_queries_total",
 		Help: "Number of queries executed by ruler.",
-	})
-	failedQueries := promauto.With(reg).NewCounter(prometheus.CounterOpts{
+	}, []string{"user"})
+	failedQueriesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
 		Name: "cortex_ruler_queries_failed_total",
 		Help: "Number of failed queries by ruler.",
-	})
+	}, []string{"user"})
 	var rulerQuerySeconds *prometheus.CounterVec
 	if cfg.EnableQueryStats {
 		rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
@@ -287,6 +287,11 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi
 			queryTime = rulerQuerySeconds.WithLabelValues(userID)
 		}
 
+		failedQueries := failedQueriesVec.WithLabelValues(userID)
+		totalQueries := totalQueriesVec.WithLabelValues(userID)
+		totalWrites := totalWritesVec.WithLabelValues(userID)
+		failedWrites := failedWritesVec.WithLabelValues(userID)
+
 		return rules.NewManager(&rules.ManagerOptions{
 			Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites),
 			Queryable:  q,
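
A detail worth noting in the compat.go change: `WithLabelValues(userID)` returns a plain `prometheus.Counter`, so callers such as `NewPusherAppendable` keep their existing signatures. A hedged sketch of that drop-in compatibility, with hypothetical names standing in for the Cortex functions:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// recordWrite stands in for code like NewPusherAppendable that only needs a
// plain prometheus.Counter and has no knowledge of labels.
func recordWrite(total prometheus.Counter) {
	total.Inc()
}

func main() {
	totalWritesVec := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "example_write_requests_total",
		Help: "Write requests, partitioned by user.",
	}, []string{"user"})

	// Binding the label once yields a prometheus.Counter, so the rest of the
	// call chain compiles unchanged after the switch to a vector.
	totalWrites := totalWritesVec.WithLabelValues("tenant-a")
	recordWrite(totalWrites)

	fmt.Println("incremented per-user counter for tenant-a")
}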
