diff --git a/CHANGELOG.md b/CHANGELOG.md index 41479ba2c92..dff4bdda084 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -56,9 +56,18 @@ * [ENHANCEMENT] Redis Cache: Added `idle_timeout`, `wait_on_pool_exhaustion` and `max_conn_lifetime` options to redis cache configuration. #2550 * [ENHANCEMENT] WAL: the experimental tag has been removed on the WAL in ingesters. * [ENHANCEMENT] Use newer AWS API for paginated queries - removes 'Deprecated' message from logfiles. #2452 -* [ENHANCEMENT] Experimental TSDB: added the following metrics to the ingester: #2580 +* [ENHANCEMENT] Experimental TSDB: added the following metrics to the ingester: #2580 #2583 * `cortex_ingester_tsdb_appender_add_duration_seconds` * `cortex_ingester_tsdb_appender_commit_duration_seconds` + * `cortex_ingester_tsdb_refcache_purge_duration_seconds` + * `cortex_ingester_tsdb_compactions_total` + * `cortex_ingester_tsdb_compaction_duration_seconds` + * `cortex_ingester_tsdb_wal_fsync_duration_seconds` + * `cortex_ingester_tsdb_wal_page_flushes_total` + * `cortex_ingester_tsdb_wal_completed_pages_total` + * `cortex_ingester_tsdb_wal_truncations_failed_total` + * `cortex_ingester_tsdb_wal_truncations_total` + * `cortex_ingester_tsdb_wal_writes_failed_total` * [BUGFIX] Ruler: Ensure temporary rule files with special characters are properly mapped and cleaned up. #2506 * [BUGFIX] Fixes #2411, Ensure requests are properly routed to the prometheus api embedded in the query if `-server.path-prefix` is set. #2372 * [BUGFIX] Experimental TSDB: fixed chunk data corruption when querying back series using the experimental blocks storage. #2400 diff --git a/pkg/ingester/ingester_v2.go b/pkg/ingester/ingester_v2.go index 4ba7506f046..93b17d31097 100644 --- a/pkg/ingester/ingester_v2.go +++ b/pkg/ingester/ingester_v2.go @@ -79,6 +79,7 @@ type TSDBState struct { walReplayTime prometheus.Histogram appenderAddDuration prometheus.Histogram appenderCommitDuration prometheus.Histogram + refCachePurgeDuration prometheus.Histogram } // NewV2 returns a new Ingester that uses prometheus block storage instead of chunk storage @@ -126,6 +127,11 @@ func NewV2(cfg Config, clientConfig client.Config, limits *validation.Overrides, Help: "The total time it takes for a push request to commit samples appended to TSDB.", Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}, }), + refCachePurgeDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{ + Name: "cortex_ingester_tsdb_refcache_purge_duration_seconds", + Help: "The total time it takes to purge the TSDB series reference cache for a single tenant.", + Buckets: prometheus.DefBuckets, + }), }, } @@ -228,9 +234,13 @@ func (i *Ingester) updateLoop(ctx context.Context) error { case <-refCachePurgeTicker.C: for _, userID := range i.getTSDBUsers() { userDB := i.getTSDB(userID) - if userDB != nil { - userDB.refCache.Purge(time.Now().Add(-cortex_tsdb.DefaultRefCacheTTL)) + if userDB == nil { + continue } + + startTime := time.Now() + userDB.refCache.Purge(startTime.Add(-cortex_tsdb.DefaultRefCacheTTL)) + i.TSDBState.refCachePurgeDuration.Observe(time.Since(startTime).Seconds()) } case <-ctx.Done(): return nil diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 2c763ae8aba..a68c1b3245c 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -218,13 +218,22 @@ func newIngesterMetrics(r prometheus.Registerer, createMetricsConflictingWithTSD // TSDB metrics collector. Each tenant has its own registry, that TSDB code uses. type tsdbMetrics struct { - // We aggregate metrics from individual TSDB registries into - // a single set of counters, which are exposed as Cortex metrics. + // Metrics aggregated from Thanos shipper. dirSyncs *prometheus.Desc // sum(thanos_shipper_dir_syncs_total) dirSyncFailures *prometheus.Desc // sum(thanos_shipper_dir_sync_failures_total) uploads *prometheus.Desc // sum(thanos_shipper_uploads_total) uploadFailures *prometheus.Desc // sum(thanos_shipper_upload_failures_total) + // Metrics aggregated from TSDB. + tsdbCompactionsTotal *prometheus.Desc + tsdbCompactionDuration *prometheus.Desc + tsdbFsyncDuration *prometheus.Desc + tsdbPageFlushes *prometheus.Desc + tsdbPageCompletions *prometheus.Desc + tsdbTruncateFail *prometheus.Desc + tsdbTruncateTotal *prometheus.Desc + tsdbWritesFailed *prometheus.Desc + // These two metrics replace metrics in ingesterMetrics, as we count them differently memSeriesCreatedTotal *prometheus.Desc memSeriesRemovedTotal *prometheus.Desc @@ -253,6 +262,38 @@ func newTSDBMetrics(r prometheus.Registerer) *tsdbMetrics { "cortex_ingester_shipper_upload_failures_total", "TSDB: Total number of block upload failures", nil, nil), + tsdbCompactionsTotal: prometheus.NewDesc( + "cortex_ingester_tsdb_compactions_total", + "Total number of TSDB compactions that were executed.", + nil, nil), + tsdbCompactionDuration: prometheus.NewDesc( + "cortex_ingester_tsdb_compaction_duration_seconds", + "Duration of TSDB compaction runs.", + nil, nil), + tsdbFsyncDuration: prometheus.NewDesc( + "cortex_ingester_tsdb_wal_fsync_duration_seconds", + "Duration of TSDB WAL fsync.", + nil, nil), + tsdbPageFlushes: prometheus.NewDesc( + "cortex_ingester_tsdb_wal_page_flushes_total", + "Total number of TSDB WAL page flushes.", + nil, nil), + tsdbPageCompletions: prometheus.NewDesc( + "cortex_ingester_tsdb_wal_completed_pages_total", + "Total number of TSDB WAL completed pages.", + nil, nil), + tsdbTruncateFail: prometheus.NewDesc( + "cortex_ingester_tsdb_wal_truncations_failed_total", + "Total number of TSDB WAL truncations that failed.", + nil, nil), + tsdbTruncateTotal: prometheus.NewDesc( + "cortex_ingester_tsdb_wal_truncations_total", + "Total number of TSDB WAL truncations attempted.", + nil, nil), + tsdbWritesFailed: prometheus.NewDesc( + "cortex_ingester_tsdb_wal_writes_failed_total", + "Total number of TSDB WAL writes that failed.", + nil, nil), memSeriesCreatedTotal: prometheus.NewDesc(memSeriesCreatedTotalName, memSeriesCreatedTotalHelp, []string{"user"}, nil), memSeriesRemovedTotal: prometheus.NewDesc(memSeriesRemovedTotalName, memSeriesRemovedTotalHelp, []string{"user"}, nil), @@ -269,6 +310,16 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) { out <- sm.dirSyncFailures out <- sm.uploads out <- sm.uploadFailures + + out <- sm.tsdbCompactionsTotal + out <- sm.tsdbCompactionDuration + out <- sm.tsdbFsyncDuration + out <- sm.tsdbPageFlushes + out <- sm.tsdbPageCompletions + out <- sm.tsdbTruncateFail + out <- sm.tsdbTruncateTotal + out <- sm.tsdbWritesFailed + out <- sm.memSeriesCreatedTotal out <- sm.memSeriesRemovedTotal } @@ -282,6 +333,15 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCounters(out, sm.uploads, "thanos_shipper_uploads_total") data.SendSumOfCounters(out, sm.uploadFailures, "thanos_shipper_upload_failures_total") + data.SendSumOfCounters(out, sm.tsdbCompactionsTotal, "prometheus_tsdb_compactions_total") + data.SendSumOfHistograms(out, sm.tsdbCompactionDuration, "prometheus_tsdb_compaction_duration_seconds") + data.SendSumOfSummaries(out, sm.tsdbFsyncDuration, "prometheus_tsdb_wal_fsync_duration_seconds") + data.SendSumOfCounters(out, sm.tsdbPageFlushes, "prometheus_tsdb_wal_page_flushes_total") + data.SendSumOfCounters(out, sm.tsdbPageCompletions, "prometheus_tsdb_wal_completed_pages_total") + data.SendSumOfCounters(out, sm.tsdbTruncateFail, "prometheus_tsdb_wal_truncations_failed_total") + data.SendSumOfCounters(out, sm.tsdbTruncateTotal, "prometheus_tsdb_wal_truncations_total") + data.SendSumOfCounters(out, sm.tsdbWritesFailed, "prometheus_tsdb_wal_writes_failed_total") + data.SendSumOfCountersPerUser(out, sm.memSeriesCreatedTotal, "prometheus_tsdb_head_series_created_total") data.SendSumOfCountersPerUser(out, sm.memSeriesRemovedTotal, "prometheus_tsdb_head_series_removed_total") } diff --git a/pkg/ingester/metrics_test.go b/pkg/ingester/metrics_test.go index 466730881fe..942146b4435 100644 --- a/pkg/ingester/metrics_test.go +++ b/pkg/ingester/metrics_test.go @@ -40,6 +40,54 @@ func TestTSDBMetrics(t *testing.T) { # 4*(12345 + 85787 + 999) cortex_ingester_shipper_upload_failures_total 396524 + # HELP cortex_ingester_tsdb_compactions_total Total number of TSDB compactions that were executed. + # TYPE cortex_ingester_tsdb_compactions_total counter + cortex_ingester_tsdb_compactions_total 693917 + + # HELP cortex_ingester_tsdb_compaction_duration_seconds Duration of TSDB compaction runs. + # TYPE cortex_ingester_tsdb_compaction_duration_seconds histogram + cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="1"} 0 + cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="2"} 0 + cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="4"} 0 + cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="8"} 0 + cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="16"} 3 + cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="32"} 3 + cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="64"} 3 + cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="128"} 3 + cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="256"} 3 + cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="512"} 3 + cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="+Inf"} 3 + cortex_ingester_tsdb_compaction_duration_seconds_sum 27 + cortex_ingester_tsdb_compaction_duration_seconds_count 3 + + # HELP cortex_ingester_tsdb_wal_fsync_duration_seconds Duration of TSDB WAL fsync. + # TYPE cortex_ingester_tsdb_wal_fsync_duration_seconds summary + cortex_ingester_tsdb_wal_fsync_duration_seconds{quantile="0.5"} 30 + cortex_ingester_tsdb_wal_fsync_duration_seconds{quantile="0.9"} 30 + cortex_ingester_tsdb_wal_fsync_duration_seconds{quantile="0.99"} 30 + cortex_ingester_tsdb_wal_fsync_duration_seconds_sum 30 + cortex_ingester_tsdb_wal_fsync_duration_seconds_count 3 + + # HELP cortex_ingester_tsdb_wal_page_flushes_total Total number of TSDB WAL page flushes. + # TYPE cortex_ingester_tsdb_wal_page_flushes_total counter + cortex_ingester_tsdb_wal_page_flushes_total 1090441 + + # HELP cortex_ingester_tsdb_wal_completed_pages_total Total number of TSDB WAL completed pages. + # TYPE cortex_ingester_tsdb_wal_completed_pages_total counter + cortex_ingester_tsdb_wal_completed_pages_total 1189572 + + # HELP cortex_ingester_tsdb_wal_truncations_failed_total Total number of TSDB WAL truncations that failed. + # TYPE cortex_ingester_tsdb_wal_truncations_failed_total counter + cortex_ingester_tsdb_wal_truncations_failed_total 1288703 + + # HELP cortex_ingester_tsdb_wal_truncations_total Total number of TSDB WAL truncations attempted. + # TYPE cortex_ingester_tsdb_wal_truncations_total counter + cortex_ingester_tsdb_wal_truncations_total 1387834 + + # HELP cortex_ingester_tsdb_wal_writes_failed_total Total number of TSDB WAL writes that failed. + # TYPE cortex_ingester_tsdb_wal_writes_failed_total counter + cortex_ingester_tsdb_wal_writes_failed_total 1486965 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. # TYPE cortex_ingester_memory_series_created_total counter # 5 * (12345, 85787 and 999 respectively) @@ -60,7 +108,7 @@ func TestTSDBMetrics(t *testing.T) { func populateTSDBMetrics(base float64) *prometheus.Registry { r := prometheus.NewRegistry() - // shipper + // Thanos shipper. dirSyncs := promauto.With(r).NewCounter(prometheus.CounterOpts{ Name: "thanos_shipper_dir_syncs_total", Help: "Total number of dir syncs", @@ -96,5 +144,55 @@ func populateTSDBMetrics(base float64) *prometheus.Registry { }) seriesRemoved.Add(6 * base) + ran := promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_compactions_total", + Help: "Total number of compactions that were executed for the partition.", + }) + ran.Add(7 * base) + + duration := promauto.With(r).NewHistogram(prometheus.HistogramOpts{ + Name: "prometheus_tsdb_compaction_duration_seconds", + Help: "Duration of compaction runs", + Buckets: prometheus.ExponentialBuckets(1, 2, 10), + }) + duration.Observe(9) + + fsyncDuration := promauto.With(r).NewSummary(prometheus.SummaryOpts{ + Name: "prometheus_tsdb_wal_fsync_duration_seconds", + Help: "Duration of WAL fsync.", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + }) + fsyncDuration.Observe(10) + + pageFlushes := promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_page_flushes_total", + Help: "Total number of page flushes.", + }) + pageFlushes.Add(11 * base) + + pageCompletions := promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_completed_pages_total", + Help: "Total number of completed pages.", + }) + pageCompletions.Add(12 * base) + + truncateFail := promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_truncations_failed_total", + Help: "Total number of WAL truncations that failed.", + }) + truncateFail.Add(13 * base) + + truncateTotal := promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_truncations_total", + Help: "Total number of WAL truncations attempted.", + }) + truncateTotal.Add(14 * base) + + writesFailed := promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_writes_failed_total", + Help: "Total number of WAL writes that failed.", + }) + writesFailed.Add(15 * base) + return r }