Skip to content

Add new TSDB metrics from Prometheus #5624

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
* [ENHANCEMENT] QueryFrontend: Add metric for number of series requests. #5373
* [ENHANCEMENT] Store Gateway: Add histogram metrics for total time spent fetching series and chunks per request. #5573
* [ENHANCEMENT] Store Gateway: Check context in multi level cache. Add `cortex_store_multilevel_index_cache_fetch_duration_seconds` and `cortex_store_multilevel_index_cache_backfill_duration_seconds` to measure fetch and backfill latency. #5596
* [ENHANCEMENT] Ingester: Added new ingester TSDB metrics `cortex_ingester_tsdb_head_samples_appended_total`, `cortex_ingester_tsdb_head_out_of_order_samples_appended_total`, `cortex_ingester_tsdb_snapshot_replay_error_total`, `cortex_ingester_tsdb_sample_ooo_delta` and `cortex_ingester_tsdb_mmap_chunks_total`. #5624
* [BUGFIX] Ruler: Validate that a rule group can be safely converted back to rule group YAML from its protobuf message. #5265
* [BUGFIX] Querier: Convert gRPC `ResourceExhausted` status code from store gateway to 422 limit error. #5286
* [BUGFIX] Alertmanager: Route web-ui requests to the alertmanager distributor when sharding is enabled. #5293
Expand Down
35 changes: 35 additions & 0 deletions pkg/ingester/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,11 @@ type tsdbMetrics struct {
tsdbChunksRemovedTotal *prometheus.Desc
tsdbMmapChunkCorruptionTotal *prometheus.Desc
tsdbChunkwriteQueueOperationsTotal *prometheus.Desc
tsdbSamplesAppended *prometheus.Desc
tsdbOutOfOrderSamplesAppended *prometheus.Desc
tsdbSnapshotReplayErrorTotal *prometheus.Desc
tsdbOOOHistogram *prometheus.Desc
tsdbMmapChunksTotal *prometheus.Desc

tsdbExemplarsTotal *prometheus.Desc
tsdbExemplarsInStorage *prometheus.Desc
Expand Down Expand Up @@ -429,6 +434,26 @@ func newTSDBMetrics(r prometheus.Registerer) *tsdbMetrics {
"cortex_ingester_tsdb_checkpoint_creations_total",
"Total number of TSDB checkpoint creations attempted.",
nil, nil),
tsdbSamplesAppended: prometheus.NewDesc(
"cortex_ingester_tsdb_head_samples_appended_total",
"Total number of appended samples.",
[]string{"type", "user"}, nil),
tsdbOutOfOrderSamplesAppended: prometheus.NewDesc(
"cortex_ingester_tsdb_head_out_of_order_samples_appended_total",
"Total number of appended out of order samples.",
[]string{"user"}, nil),
tsdbSnapshotReplayErrorTotal: prometheus.NewDesc(
"cortex_ingester_tsdb_snapshot_replay_error_total",
"Total number snapshot replays that failed.",
nil, nil),
tsdbOOOHistogram: prometheus.NewDesc(
"cortex_ingester_tsdb_sample_ooo_delta",
"Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).",
nil, nil),
tsdbMmapChunksTotal: prometheus.NewDesc(
"cortex_ingester_tsdb_mmap_chunks_total",
"Total number of chunks that were memory-mapped.",
nil, nil),

// The most useful exemplar metrics are per-user. The rest
// are global to reduce metrics overhead.
Expand Down Expand Up @@ -497,6 +522,11 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) {
out <- sm.tsdbReloadsFailed
out <- sm.tsdbTimeRetentionCount
out <- sm.tsdbBlocksBytes
out <- sm.tsdbSamplesAppended
out <- sm.tsdbOutOfOrderSamplesAppended
out <- sm.tsdbSnapshotReplayErrorTotal
out <- sm.tsdbOOOHistogram
out <- sm.tsdbMmapChunksTotal
out <- sm.checkpointDeleteFail
out <- sm.checkpointDeleteTotal
out <- sm.checkpointCreationFail
Expand Down Expand Up @@ -547,6 +577,11 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCounters(out, sm.tsdbReloadsFailed, "prometheus_tsdb_reloads_failures_total")
data.SendSumOfCounters(out, sm.tsdbTimeRetentionCount, "prometheus_tsdb_time_retentions_total")
data.SendSumOfGaugesPerUser(out, sm.tsdbBlocksBytes, "prometheus_tsdb_storage_blocks_bytes")
data.SendSumOfCountersPerUserWithLabels(out, sm.tsdbSamplesAppended, "prometheus_tsdb_head_samples_appended_total", "type")
data.SendSumOfCountersPerUser(out, sm.tsdbOutOfOrderSamplesAppended, "prometheus_tsdb_head_out_of_order_samples_appended_total")
data.SendSumOfCounters(out, sm.tsdbSnapshotReplayErrorTotal, "prometheus_tsdb_snapshot_replay_error_total")
data.SendSumOfHistograms(out, sm.tsdbOOOHistogram, "prometheus_tsdb_sample_ooo_delta")
data.SendSumOfGauges(out, sm.tsdbMmapChunksTotal, "prometheus_tsdb_mmap_chunks_total")
data.SendSumOfCounters(out, sm.checkpointDeleteFail, "prometheus_tsdb_checkpoint_deletions_failed_total")
data.SendSumOfCounters(out, sm.checkpointDeleteTotal, "prometheus_tsdb_checkpoint_deletions_total")
data.SendSumOfCounters(out, sm.checkpointCreationFail, "prometheus_tsdb_checkpoint_creations_failed_total")
Expand Down
98 changes: 93 additions & 5 deletions pkg/ingester/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,16 @@ func TestTSDBMetrics(t *testing.T) {
# TYPE cortex_ingester_tsdb_head_gc_duration_seconds summary
cortex_ingester_tsdb_head_gc_duration_seconds_sum 9
cortex_ingester_tsdb_head_gc_duration_seconds_count 3

# HELP cortex_ingester_tsdb_head_out_of_order_samples_appended_total Total number of appended out of order samples.
# TYPE cortex_ingester_tsdb_head_out_of_order_samples_appended_total counter
cortex_ingester_tsdb_head_out_of_order_samples_appended_total{user="user1"} 102
cortex_ingester_tsdb_head_out_of_order_samples_appended_total{user="user2"} 102
cortex_ingester_tsdb_head_out_of_order_samples_appended_total{user="user3"} 102
# HELP cortex_ingester_tsdb_head_samples_appended_total Total number of appended samples.
# TYPE cortex_ingester_tsdb_head_samples_appended_total counter
cortex_ingester_tsdb_head_samples_appended_total{type="user1",user="float"} 101
cortex_ingester_tsdb_head_samples_appended_total{type="user2",user="float"} 101
cortex_ingester_tsdb_head_samples_appended_total{type="user3",user="float"} 101
# HELP cortex_ingester_tsdb_checkpoint_deletions_failed_total Total number of TSDB checkpoint deletions that failed.
# TYPE cortex_ingester_tsdb_checkpoint_deletions_failed_total counter
cortex_ingester_tsdb_checkpoint_deletions_failed_total 1586096
Expand Down Expand Up @@ -167,15 +176,31 @@ func TestTSDBMetrics(t *testing.T) {
# HELP cortex_ingester_tsdb_mmap_chunk_corruptions_total Total number of memory-mapped TSDB chunk corruptions.
# TYPE cortex_ingester_tsdb_mmap_chunk_corruptions_total counter
cortex_ingester_tsdb_mmap_chunk_corruptions_total 2577406

# HELP cortex_ingester_tsdb_mmap_chunks_total Total number of chunks that were memory-mapped.
# TYPE cortex_ingester_tsdb_mmap_chunks_total gauge
cortex_ingester_tsdb_mmap_chunks_total 0
# HELP cortex_ingester_tsdb_blocks_loaded Number of currently loaded data blocks
# TYPE cortex_ingester_tsdb_blocks_loaded gauge
cortex_ingester_tsdb_blocks_loaded 15

# HELP cortex_ingester_tsdb_reloads_total Number of times the database reloaded block data from disk.
# TYPE cortex_ingester_tsdb_reloads_total counter
cortex_ingester_tsdb_reloads_total 30

# HELP cortex_ingester_tsdb_sample_ooo_delta Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).
# TYPE cortex_ingester_tsdb_sample_ooo_delta histogram
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="600"} 0
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="1800"} 3
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="3600"} 3
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="7200"} 3
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="10800"} 3
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="21600"} 3
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="43200"} 3
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="+Inf"} 3
cortex_ingester_tsdb_sample_ooo_delta_sum 2700
cortex_ingester_tsdb_sample_ooo_delta_count 3
# HELP cortex_ingester_tsdb_snapshot_replay_error_total Total number snapshot replays that failed.
# TYPE cortex_ingester_tsdb_snapshot_replay_error_total counter
cortex_ingester_tsdb_snapshot_replay_error_total 309
# HELP cortex_ingester_tsdb_reloads_failures_total Number of times the database failed to reloadBlocks block data from disk.
# TYPE cortex_ingester_tsdb_reloads_failures_total counter
cortex_ingester_tsdb_reloads_failures_total 21
Expand Down Expand Up @@ -318,6 +343,14 @@ func TestTSDBMetricsWithRemoval(t *testing.T) {
# TYPE cortex_ingester_tsdb_head_gc_duration_seconds summary
cortex_ingester_tsdb_head_gc_duration_seconds_sum 9
cortex_ingester_tsdb_head_gc_duration_seconds_count 3
# HELP cortex_ingester_tsdb_head_out_of_order_samples_appended_total Total number of appended out of order samples.
# TYPE cortex_ingester_tsdb_head_out_of_order_samples_appended_total counter
cortex_ingester_tsdb_head_out_of_order_samples_appended_total{user="user1"} 102
cortex_ingester_tsdb_head_out_of_order_samples_appended_total{user="user2"} 102
# HELP cortex_ingester_tsdb_head_samples_appended_total Total number of appended samples.
# TYPE cortex_ingester_tsdb_head_samples_appended_total counter
cortex_ingester_tsdb_head_samples_appended_total{type="user1",user="float"} 101
cortex_ingester_tsdb_head_samples_appended_total{type="user2",user="float"} 101

# HELP cortex_ingester_tsdb_checkpoint_deletions_failed_total Total number of TSDB checkpoint deletions that failed.
# TYPE cortex_ingester_tsdb_checkpoint_deletions_failed_total counter
Expand Down Expand Up @@ -377,15 +410,31 @@ func TestTSDBMetricsWithRemoval(t *testing.T) {
# HELP cortex_ingester_tsdb_mmap_chunk_corruptions_total Total number of memory-mapped TSDB chunk corruptions.
# TYPE cortex_ingester_tsdb_mmap_chunk_corruptions_total counter
cortex_ingester_tsdb_mmap_chunk_corruptions_total 2577406

# HELP cortex_ingester_tsdb_mmap_chunks_total Total number of chunks that were memory-mapped.
# TYPE cortex_ingester_tsdb_mmap_chunks_total gauge
cortex_ingester_tsdb_mmap_chunks_total 0
# HELP cortex_ingester_tsdb_blocks_loaded Number of currently loaded data blocks
# TYPE cortex_ingester_tsdb_blocks_loaded gauge
cortex_ingester_tsdb_blocks_loaded 10

# HELP cortex_ingester_tsdb_reloads_total Number of times the database reloaded block data from disk.
# TYPE cortex_ingester_tsdb_reloads_total counter
cortex_ingester_tsdb_reloads_total 30

# HELP cortex_ingester_tsdb_sample_ooo_delta Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).
# TYPE cortex_ingester_tsdb_sample_ooo_delta histogram
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="600"} 0
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="1800"} 3
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="3600"} 3
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="7200"} 3
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="10800"} 3
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="21600"} 3
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="43200"} 3
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="+Inf"} 3
cortex_ingester_tsdb_sample_ooo_delta_sum 2700
cortex_ingester_tsdb_sample_ooo_delta_count 3
# HELP cortex_ingester_tsdb_snapshot_replay_error_total Total number snapshot replays that failed.
# TYPE cortex_ingester_tsdb_snapshot_replay_error_total counter
cortex_ingester_tsdb_snapshot_replay_error_total 309
# HELP cortex_ingester_tsdb_reloads_failures_total Number of times the database failed to reloadBlocks block data from disk.
# TYPE cortex_ingester_tsdb_reloads_failures_total counter
cortex_ingester_tsdb_reloads_failures_total 21
Expand Down Expand Up @@ -608,6 +657,45 @@ func populateTSDBMetrics(base float64) *prometheus.Registry {
})
gcDuration.Observe(3)

samplesAppended := promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "prometheus_tsdb_head_samples_appended_total",
Help: "Total number of appended samples.",
}, []string{"type"})
samplesAppended.WithLabelValues("float").Add(101)

outOfOrderSamplesAppended := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_head_out_of_order_samples_appended_total",
Help: "Total number of appended out of order samples.",
})
outOfOrderSamplesAppended.Add(102)

snapshotReplayErrorTotal := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_snapshot_replay_error_total",
Help: "Total number snapshot replays that failed.",
})
snapshotReplayErrorTotal.Add(103)

oooHistogram := promauto.With(r).NewHistogram(prometheus.HistogramOpts{
Name: "prometheus_tsdb_sample_ooo_delta",
Help: "Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).",
Buckets: []float64{
60 * 10, // 10 min
60 * 30, // 30 min
60 * 60, // 60 min
60 * 60 * 2, // 2h
60 * 60 * 3, // 3h
60 * 60 * 6, // 6h
60 * 60 * 12, // 12h
},
})
oooHistogram.Observe(60 * 15)

mmapChunksTotal := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_mmap_chunks_total",
Help: "Total number of chunks that were memory-mapped.",
})
mmapChunksTotal.Add(104)

loadedBlocks := promauto.With(r).NewGauge(prometheus.GaugeOpts{
Name: "prometheus_tsdb_blocks_loaded",
Help: "Number of currently loaded data blocks",
Expand Down