Skip to content

Commit c3727b3

Browse files
committed
Address feedback
Signed-off-by: Goutham Veeramachaneni <[email protected]>
1 parent 950221f commit c3727b3

File tree

2 files changed

+31
-27
lines changed

2 files changed

+31
-27
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* [CHANGE] Experimental Delete Series: `/api/v1/admin/tsdb/delete_series` and `/api/v1/admin/tsdb/cancel_delete_request` purger APIs to return status code `204` instead of `200` for success. #2946
1313
* [CHANGE] Histogram `cortex_memcache_request_duration_seconds` `method` label value changes from `Memcached.Get` to `Memcached.GetBatched` for batched lookups, and is not reported for non-batched lookups (label value `Memcached.GetMulti` remains, and had exactly the same value as `Get` in non-batched lookups). The same change applies to tracing spans. #3046
1414
* [CHANGE] TLS server validation is now enabled by default. A new parameter `tls_insecure_skip_verify` can be set to true to optionally skip validation. #3030
15+
* [CHANGE] `cortex_ruler_config_update_failures_total` has been removed in favor of `cortex_ruler_config_last_reload_successful`. #3056
1516
* [ENHANCEMENT] Add support for Azure storage in China, Germany and US Government environments. #2988
1617
* [ENHANCEMENT] Query-tee: added a small tolerance to floating point sample values comparison. #2994
1718
* [ENHANCEMENT] Query-tee: add support for doing a passthrough of requests to preferred backend for unregistered routes. #3018

pkg/ruler/manager.go

Lines changed: 30 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -20,24 +20,6 @@ import (
2020
"github.com/cortexproject/cortex/pkg/util"
2121
)
2222

23-
var (
24-
configUpdatesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
25-
Namespace: "cortex",
26-
Name: "ruler_config_updates_total",
27-
Help: "Total number of config updates triggered by a user",
28-
}, []string{"user"})
29-
configUpdateFailuresTotal = promauto.NewCounterVec(prometheus.CounterOpts{
30-
Namespace: "cortex",
31-
Name: "ruler_config_update_failures_total",
32-
Help: "Total number of config update failures triggered by a user",
33-
}, []string{"user", "reason"})
34-
userManagerFailed = promauto.NewGaugeVec(prometheus.GaugeOpts{
35-
Namespace: "cortex",
36-
Name: "ruler_manager_failed",
37-
Help: "Boolean set to 1 whenever the Ruler manager failed to start for a user.",
38-
}, []string{"user"})
39-
)
40-
4123
type DefaultMultiTenantManager struct {
4224
cfg Config
4325
notifierCfg *config.Config
@@ -55,9 +37,12 @@ type DefaultMultiTenantManager struct {
5537
notifiersMtx sync.Mutex
5638
notifiers map[string]*rulerNotifier
5739

58-
managersTotal prometheus.Gauge
59-
registry prometheus.Registerer
60-
logger log.Logger
40+
managersTotal prometheus.Gauge
41+
lastReloadSuccessful *prometheus.GaugeVec
42+
lastReloadSuccessfulTimestamp *prometheus.GaugeVec
43+
configUpdatesTotal *prometheus.CounterVec
44+
registry prometheus.Registerer
45+
logger log.Logger
6146
}
6247

6348
func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) {
@@ -84,6 +69,21 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg
8469
Name: "ruler_managers_total",
8570
Help: "Total number of managers registered and running in the ruler",
8671
}),
72+
lastReloadSuccessful: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
73+
Namespace: "cortex",
74+
Name: "ruler_config_last_reload_successful",
75+
Help: "Boolean set to 1 whenever the last configuration reload attempt was successful.",
76+
}, []string{"user"}),
77+
lastReloadSuccessfulTimestamp: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
78+
Namespace: "cortex",
79+
Name: "ruler_config_last_reload_successful_seconds",
80+
Help: "Timestamp of the last successful configuration reload.",
81+
}, []string{"user"}),
82+
configUpdatesTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
83+
Namespace: "cortex",
84+
Name: "ruler_config_updates_total",
85+
Help: "Total number of config updates triggered by a user",
86+
}, []string{"user"}),
8787
registry: reg,
8888
logger: logger,
8989
}, nil
@@ -104,6 +104,9 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou
104104
if _, exists := ruleGroups[userID]; !exists {
105105
go mngr.Stop()
106106
delete(r.userManagers, userID)
107+
r.lastReloadSuccessful.DeleteLabelValues(userID)
108+
r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID)
109+
r.configUpdatesTotal.DeleteLabelValues(userID)
107110
level.Info(r.logger).Log("msg", "deleting rule manager", "user", userID)
108111
}
109112
}
@@ -118,19 +121,19 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
118121
// have been updated
119122
update, files, err := r.mapper.MapRules(user, groups.Formatted())
120123
if err != nil {
124+
r.lastReloadSuccessful.WithLabelValues(user).Set(0)
121125
level.Error(r.logger).Log("msg", "unable to map rule files", "user", user, "err", err)
122126
return
123127
}
124128

125129
if update {
126130
level.Debug(r.logger).Log("msg", "updating rules", "user", "user")
127-
configUpdatesTotal.WithLabelValues(user).Inc()
131+
r.configUpdatesTotal.WithLabelValues(user).Inc()
128132
manager, exists := r.userManagers[user]
129133
if !exists {
130134
manager, err = r.newManager(ctx, user)
131135
if err != nil {
132-
configUpdateFailuresTotal.WithLabelValues(user, "rule-manager-creation-failure").Inc()
133-
userManagerFailed.WithLabelValues(user).Set(1)
136+
r.lastReloadSuccessful.WithLabelValues(user).Set(0)
134137
level.Error(r.logger).Log("msg", "unable to create rule manager", "user", user, "err", err)
135138
return
136139
}
@@ -141,13 +144,13 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
141144
}
142145
err = manager.Update(r.cfg.EvaluationInterval, files, nil)
143146
if err != nil {
144-
configUpdateFailuresTotal.WithLabelValues(user, "rules-update-failure").Inc()
145-
userManagerFailed.WithLabelValues(user).Set(1)
147+
r.lastReloadSuccessful.WithLabelValues(user).Set(0)
146148
level.Error(r.logger).Log("msg", "unable to update rule manager", "user", user, "err", err)
147149
return
148150
}
149151

150-
userManagerFailed.WithLabelValues(user).Set(0)
152+
r.lastReloadSuccessful.WithLabelValues(user).Set(1)
153+
r.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
151154
}
152155
}
153156

0 commit comments

Comments
 (0)