# Patch summary: moves the Stripe webhook failure alert between rule files.
#   - public-api.yaml: adds alert GitpodStripeWebhookFailures (for: 10m).
#   - usage.yaml: removes alert GitpodUsageStripeWebhookFailures (for: 30m).
# The expr, labels and annotations of the added and removed alert are the
# same; only the alert name and the `for` duration differ. The expr is true
# when the request count for handler "/stripe/invoices/webhook" with a code
# matching "5.*" increased over the trailing 30m window.
# NOTE(review): the added alert's runbook_url still points at
# GitpodUsageStripeWebhookFailures.md (the old alert name) - confirm the
# runbook was intentionally not renamed.
# NOTE(review): the hunk headers declare multi-line hunks (+30,14 / -57,17)
# but each hunk body appears on a single line in this copy - line breaks
# look collapsed; confirm against the original patch before applying.
diff --git a/operations/observability/mixins/meta/rules/public-api.yaml b/operations/observability/mixins/meta/rules/public-api.yaml index 987ce989caf062..25a634e3e696d4 100644 --- a/operations/observability/mixins/meta/rules/public-api.yaml +++ b/operations/observability/mixins/meta/rules/public-api.yaml @@ -30,3 +30,14 @@ spec: runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/PublicAPI_ServiceReturningServerErrors.md summary: PublicAPI serves multiple different Services and RPC. There have been failing requests due to server errors. Investigation required. description: Service {{ $labels.package }}.{{ $labels.call }} has returned {{ printf "%.2f" $value }} server errors in the last 10 minutes. + + - alert: GitpodStripeWebhookFailures + expr: sum(increase(gitpod_http_request_duration_seconds_count{handler="/stripe/invoices/webhook", code=~"5.*"}[30m])) > 0 + for: 10m + labels: + severity: warning + team: webapp + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodUsageStripeWebhookFailures.md + summary: Detected {{ printf "%.2f" $value }} errors handling Stripe webhook. + description: Stripe is sending us webhooks but we are failing to handle them. Inconsistent usage data very likely. diff --git a/operations/observability/mixins/meta/rules/usage.yaml b/operations/observability/mixins/meta/rules/usage.yaml index 03cfaa3b099fc1..ae0d07b0b24245 100644 --- a/operations/observability/mixins/meta/rules/usage.yaml +++ b/operations/observability/mixins/meta/rules/usage.yaml @@ -57,17 +57,6 @@ spec: summary: Usage reconciliation has not run successfully for {{ printf "%.2f" $value }} seconds. Usage data is stale. description: We have not executed scheduled usage reconciliation for {{ printf "%.2f" $value }} seconds. We expect the data to update every 15 minutes to avoid stale usage records and stale invoices. 
- - alert: GitpodUsageStripeWebhookFailures - expr: sum(increase(gitpod_http_request_duration_seconds_count{handler="/stripe/invoices/webhook", code=~"5.*"}[30m])) > 0 - for: 30m - labels: - severity: warning - team: webapp - annotations: - runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodUsageStripeWebhookFailures.md - summary: Detected {{ printf "%.2f" $value }} errors handling Stripe webhook. - description: Stripe is sending us webhooks but we are failing to handle them. Inconsistent usage data very likely. - - alert: UsageHighCPUUsage # Reasoning: high rates of CPU consumption should only be temporary. expr: avg(rate(container_cpu_usage_seconds_total{container!="POD", pod=~"usage-.*"}[5m])) by (cluster) > 0.2