Skip to content

Commit 5ae2a62

Browse files
authored
Expose TSDB checkpoint metrics from ingesters (#2589)
* Expose TSDB checkpoint metrics from ingesters
  Signed-off-by: Marco Pracucci <[email protected]>
* Updated CHANGELOG
  Signed-off-by: Marco Pracucci <[email protected]>
* Renamed checkpoint metrics
  Signed-off-by: Marco Pracucci <[email protected]>
1 parent 34327cb commit 5ae2a62

File tree

3 files changed: 89 additions and 17 deletions.

CHANGELOG.md

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,7 @@
5656
* [ENHANCEMENT] Redis Cache: Added `idle_timeout`, `wait_on_pool_exhaustion` and `max_conn_lifetime` options to redis cache configuration. #2550
5757
* [ENHANCEMENT] WAL: the experimental tag has been removed on the WAL in ingesters.
5858
* [ENHANCEMENT] Use newer AWS API for paginated queries - removes 'Deprecated' message from logfiles. #2452
59-
* [ENHANCEMENT] Experimental TSDB: added the following metrics to the ingester: #2580 #2583
59+
* [ENHANCEMENT] Experimental TSDB: added the following metrics to the ingester: #2580 #2583 #2589
6060
* `cortex_ingester_tsdb_appender_add_duration_seconds`
6161
* `cortex_ingester_tsdb_appender_commit_duration_seconds`
6262
* `cortex_ingester_tsdb_refcache_purge_duration_seconds`
@@ -68,6 +68,10 @@
6868
* `cortex_ingester_tsdb_wal_truncations_failed_total`
6969
* `cortex_ingester_tsdb_wal_truncations_total`
7070
* `cortex_ingester_tsdb_wal_writes_failed_total`
71+
* `cortex_ingester_tsdb_checkpoint_deletions_failed_total`
72+
* `cortex_ingester_tsdb_checkpoint_deletions_total`
73+
* `cortex_ingester_tsdb_checkpoint_creations_failed_total`
74+
* `cortex_ingester_tsdb_checkpoint_creations_total`
7175
* [ENHANCEMENT] Experimental TSDB: added metrics useful to alert on critical conditions of the blocks storage: #2573
7276
* `cortex_compactor_last_successful_run_timestamp_seconds`
7377
* `cortex_querier_blocks_last_successful_sync_timestamp_seconds` (when store-gateway is disabled)

pkg/ingester/metrics.go

Lines changed: 40 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -225,14 +225,18 @@ type tsdbMetrics struct {
225225
uploadFailures *prometheus.Desc // sum(thanos_shipper_upload_failures_total)
226226

227227
// Metrics aggregated from TSDB.
228-
tsdbCompactionsTotal *prometheus.Desc
229-
tsdbCompactionDuration *prometheus.Desc
230-
tsdbFsyncDuration *prometheus.Desc
231-
tsdbPageFlushes *prometheus.Desc
232-
tsdbPageCompletions *prometheus.Desc
233-
tsdbTruncateFail *prometheus.Desc
234-
tsdbTruncateTotal *prometheus.Desc
235-
tsdbWritesFailed *prometheus.Desc
228+
tsdbCompactionsTotal *prometheus.Desc
229+
tsdbCompactionDuration *prometheus.Desc
230+
tsdbFsyncDuration *prometheus.Desc
231+
tsdbPageFlushes *prometheus.Desc
232+
tsdbPageCompletions *prometheus.Desc
233+
tsdbTruncateFail *prometheus.Desc
234+
tsdbTruncateTotal *prometheus.Desc
235+
tsdbWritesFailed *prometheus.Desc
236+
checkpointDeleteFail *prometheus.Desc
237+
checkpointDeleteTotal *prometheus.Desc
238+
checkpointCreationFail *prometheus.Desc
239+
checkpointCreationTotal *prometheus.Desc
236240

237241
// These two metrics replace metrics in ingesterMetrics, as we count them differently
238242
memSeriesCreatedTotal *prometheus.Desc
@@ -248,19 +252,19 @@ func newTSDBMetrics(r prometheus.Registerer) *tsdbMetrics {
248252

249253
dirSyncs: prometheus.NewDesc(
250254
"cortex_ingester_shipper_dir_syncs_total",
251-
"TSDB: Total number of dir syncs",
255+
"Total number of TSDB dir syncs",
252256
nil, nil),
253257
dirSyncFailures: prometheus.NewDesc(
254258
"cortex_ingester_shipper_dir_sync_failures_total",
255-
"TSDB: Total number of failed dir syncs",
259+
"Total number of failed TSDB dir syncs",
256260
nil, nil),
257261
uploads: prometheus.NewDesc(
258262
"cortex_ingester_shipper_uploads_total",
259-
"TSDB: Total number of uploaded blocks",
263+
"Total number of uploaded TSDB blocks",
260264
nil, nil),
261265
uploadFailures: prometheus.NewDesc(
262266
"cortex_ingester_shipper_upload_failures_total",
263-
"TSDB: Total number of block upload failures",
267+
"Total number of TSDB block upload failures",
264268
nil, nil),
265269
tsdbCompactionsTotal: prometheus.NewDesc(
266270
"cortex_ingester_tsdb_compactions_total",
@@ -294,6 +298,22 @@ func newTSDBMetrics(r prometheus.Registerer) *tsdbMetrics {
294298
"cortex_ingester_tsdb_wal_writes_failed_total",
295299
"Total number of TSDB WAL writes that failed.",
296300
nil, nil),
301+
checkpointDeleteFail: prometheus.NewDesc(
302+
"cortex_ingester_tsdb_checkpoint_deletions_failed_total",
303+
"Total number of TSDB checkpoint deletions that failed.",
304+
nil, nil),
305+
checkpointDeleteTotal: prometheus.NewDesc(
306+
"cortex_ingester_tsdb_checkpoint_deletions_total",
307+
"Total number of TSDB checkpoint deletions attempted.",
308+
nil, nil),
309+
checkpointCreationFail: prometheus.NewDesc(
310+
"cortex_ingester_tsdb_checkpoint_creations_failed_total",
311+
"Total number of TSDB checkpoint creations that failed.",
312+
nil, nil),
313+
checkpointCreationTotal: prometheus.NewDesc(
314+
"cortex_ingester_tsdb_checkpoint_creations_total",
315+
"Total number of TSDB checkpoint creations attempted.",
316+
nil, nil),
297317

298318
memSeriesCreatedTotal: prometheus.NewDesc(memSeriesCreatedTotalName, memSeriesCreatedTotalHelp, []string{"user"}, nil),
299319
memSeriesRemovedTotal: prometheus.NewDesc(memSeriesRemovedTotalName, memSeriesRemovedTotalHelp, []string{"user"}, nil),
@@ -319,6 +339,10 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) {
319339
out <- sm.tsdbTruncateFail
320340
out <- sm.tsdbTruncateTotal
321341
out <- sm.tsdbWritesFailed
342+
out <- sm.checkpointDeleteFail
343+
out <- sm.checkpointDeleteTotal
344+
out <- sm.checkpointCreationFail
345+
out <- sm.checkpointCreationTotal
322346

323347
out <- sm.memSeriesCreatedTotal
324348
out <- sm.memSeriesRemovedTotal
@@ -341,6 +365,10 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) {
341365
data.SendSumOfCounters(out, sm.tsdbTruncateFail, "prometheus_tsdb_wal_truncations_failed_total")
342366
data.SendSumOfCounters(out, sm.tsdbTruncateTotal, "prometheus_tsdb_wal_truncations_total")
343367
data.SendSumOfCounters(out, sm.tsdbWritesFailed, "prometheus_tsdb_wal_writes_failed_total")
368+
data.SendSumOfCounters(out, sm.checkpointDeleteFail, "prometheus_tsdb_checkpoint_deletions_failed_total")
369+
data.SendSumOfCounters(out, sm.checkpointDeleteTotal, "prometheus_tsdb_checkpoint_deletions_total")
370+
data.SendSumOfCounters(out, sm.checkpointCreationFail, "prometheus_tsdb_checkpoint_creations_failed_total")
371+
data.SendSumOfCounters(out, sm.checkpointCreationTotal, "prometheus_tsdb_checkpoint_creations_total")
344372

345373
data.SendSumOfCountersPerUser(out, sm.memSeriesCreatedTotal, "prometheus_tsdb_head_series_created_total")
346374
data.SendSumOfCountersPerUser(out, sm.memSeriesRemovedTotal, "prometheus_tsdb_head_series_removed_total")

pkg/ingester/metrics_test.go

Lines changed: 44 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -20,22 +20,22 @@ func TestTSDBMetrics(t *testing.T) {
2020
tsdbMetrics.setRegistryForUser("user3", populateTSDBMetrics(999))
2121

2222
err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(`
23-
# HELP cortex_ingester_shipper_dir_syncs_total TSDB: Total number of dir syncs
23+
# HELP cortex_ingester_shipper_dir_syncs_total Total number of TSDB dir syncs
2424
# TYPE cortex_ingester_shipper_dir_syncs_total counter
2525
# 12345 + 85787 + 999
2626
cortex_ingester_shipper_dir_syncs_total 99131
2727
28-
# HELP cortex_ingester_shipper_dir_sync_failures_total TSDB: Total number of failed dir syncs
28+
# HELP cortex_ingester_shipper_dir_sync_failures_total Total number of failed TSDB dir syncs
2929
# TYPE cortex_ingester_shipper_dir_sync_failures_total counter
3030
# 2*(12345 + 85787 + 999)
3131
cortex_ingester_shipper_dir_sync_failures_total 198262
3232
33-
# HELP cortex_ingester_shipper_uploads_total TSDB: Total number of uploaded blocks
33+
# HELP cortex_ingester_shipper_uploads_total Total number of uploaded TSDB blocks
3434
# TYPE cortex_ingester_shipper_uploads_total counter
3535
# 3*(12345 + 85787 + 999)
3636
cortex_ingester_shipper_uploads_total 297393
3737
38-
# HELP cortex_ingester_shipper_upload_failures_total TSDB: Total number of block upload failures
38+
# HELP cortex_ingester_shipper_upload_failures_total Total number of TSDB block upload failures
3939
# TYPE cortex_ingester_shipper_upload_failures_total counter
4040
# 4*(12345 + 85787 + 999)
4141
cortex_ingester_shipper_upload_failures_total 396524
@@ -88,6 +88,22 @@ func TestTSDBMetrics(t *testing.T) {
8888
# TYPE cortex_ingester_tsdb_wal_writes_failed_total counter
8989
cortex_ingester_tsdb_wal_writes_failed_total 1486965
9090
91+
# HELP cortex_ingester_tsdb_checkpoint_deletions_failed_total Total number of TSDB checkpoint deletions that failed.
92+
# TYPE cortex_ingester_tsdb_checkpoint_deletions_failed_total counter
93+
cortex_ingester_tsdb_checkpoint_deletions_failed_total 1586096
94+
95+
# HELP cortex_ingester_tsdb_checkpoint_deletions_total Total number of TSDB checkpoint deletions attempted.
96+
# TYPE cortex_ingester_tsdb_checkpoint_deletions_total counter
97+
cortex_ingester_tsdb_checkpoint_deletions_total 1685227
98+
99+
# HELP cortex_ingester_tsdb_checkpoint_creations_failed_total Total number of TSDB checkpoint creations that failed.
100+
# TYPE cortex_ingester_tsdb_checkpoint_creations_failed_total counter
101+
cortex_ingester_tsdb_checkpoint_creations_failed_total 1784358
102+
103+
# HELP cortex_ingester_tsdb_checkpoint_creations_total Total number of TSDB checkpoint creations attempted.
104+
# TYPE cortex_ingester_tsdb_checkpoint_creations_total counter
105+
cortex_ingester_tsdb_checkpoint_creations_total 1883489
106+
91107
# HELP cortex_ingester_memory_series_created_total The total number of series that were created per user.
92108
# TYPE cortex_ingester_memory_series_created_total counter
93109
# 5 * (12345, 85787 and 999 respectively)
@@ -194,5 +210,29 @@ func populateTSDBMetrics(base float64) *prometheus.Registry {
194210
})
195211
writesFailed.Add(15 * base)
196212

213+
checkpointDeleteFail := promauto.With(r).NewCounter(prometheus.CounterOpts{
214+
Name: "prometheus_tsdb_checkpoint_deletions_failed_total",
215+
Help: "Total number of checkpoint deletions that failed.",
216+
})
217+
checkpointDeleteFail.Add(16 * base)
218+
219+
checkpointDeleteTotal := promauto.With(r).NewCounter(prometheus.CounterOpts{
220+
Name: "prometheus_tsdb_checkpoint_deletions_total",
221+
Help: "Total number of checkpoint deletions attempted.",
222+
})
223+
checkpointDeleteTotal.Add(17 * base)
224+
225+
checkpointCreationFail := promauto.With(r).NewCounter(prometheus.CounterOpts{
226+
Name: "prometheus_tsdb_checkpoint_creations_failed_total",
227+
Help: "Total number of checkpoint creations that failed.",
228+
})
229+
checkpointCreationFail.Add(18 * base)
230+
231+
checkpointCreationTotal := promauto.With(r).NewCounter(prometheus.CounterOpts{
232+
Name: "prometheus_tsdb_checkpoint_creations_total",
233+
Help: "Total number of checkpoint creations attempted.",
234+
})
235+
checkpointCreationTotal.Add(19 * base)
236+
197237
return r
198238
}

0 commit comments

Comments
 (0)