Skip to content

Commit 6c2705f

Browse files
utam0k, roboquat
authored and committed
observability: Ring the phone only when data loss occurs with GitpodWsDaemonCrashLooping
1 parent c0ef19f commit 6c2705f

File tree

1 file changed

+15
-1
lines changed
  • operations/observability/mixins/workspace/rules/components/ws-daemon

1 file changed

+15
-1
lines changed

operations/observability/mixins/workspace/rules/components/ws-daemon/alerts.libsonnet

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
{
1313
alert: 'GitpodWsDaemonCrashLooping',
1414
labels: {
15-
severity: 'critical',
15+
severity: 'warning',
1616
},
1717
annotations: {
1818
runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWsDaemonCrashLooping.md',
@@ -23,6 +23,20 @@
2323
increase(kube_pod_container_status_restarts_total{container="ws-daemon"}[10m]) > 0
2424
|||,
2525
},
26+
{
27+
alert: 'BackupFailureBecauseOfGitpodWsDaemonCrash',
28+
labels: {
29+
severity: 'critical',
30+
},
31+
annotations: {
32+
runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWsDaemonCrashLooping.md',
33+
summary: 'Increase the number of backup failure because of ws-daemon is crashlooping.',
34+
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes.',
35+
},
36+
expr: |||
37+
sum(increase(kube_pod_container_status_restarts_total{container="ws-daemon"}[10m])) > 0 AND sum(increase(gitpod_ws_manager_workspace_backups_failure_total{type="REGULAR"}[10m])) > 0
38+
|||,
39+
},
2640
{
2741
alert: 'GitpodWsDaemonExcessiveGC',
2842
labels: {

0 commit comments

Comments
 (0)