Skip to content

Commit f899f9b

Browse files
committed
s/kube_node_info/kube_node_role
Use kube_node_role{role="control-plane"} (see [1] and [2]) to estimate if the cluster is HA or not.

* [1]: https://github.com/search?q=repo%3Akubernetes%2Fkube-state-metrics%20kube_node_role&type=code
* [2]: https://kubernetes.io/docs/reference/labels-annotations-taints/#node-role-kubernetes-io-control-plane

Also drop any thresholds as they would lead to false positives.
1 parent e118942 commit f899f9b

File tree

2 files changed

+22
-22
lines changed

2 files changed

+22
-22
lines changed

alerts/resource_alerts.libsonnet

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@ local utils = import '../lib/utils.libsonnet';
3737
if $._config.showMultiCluster then {
3838
expr: |||
3939
(sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
40-
0.85 * sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) > 0
40+
sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) > 0
4141
and
42-
count by (cluster) (max by (cluster, node) (kube_node_info)) == 1)
42+
count by (cluster) (max by (cluster, node) (kube_node_role{role="control-plane"})) < 3)
4343
or
4444
(sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
4545
(sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
@@ -54,9 +54,9 @@ local utils = import '../lib/utils.libsonnet';
5454
} else {
5555
expr: |||
5656
(sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
57-
0.85 * sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) > 0
57+
sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) > 0
5858
and
59-
count(max by (node) (kube_node_info)) == 1)
59+
count(max by (node) (kube_node_role{role="control-plane"})) < 3)
6060
or
6161
(sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
6262
(sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
@@ -82,9 +82,9 @@ local utils = import '../lib/utils.libsonnet';
8282
if $._config.showMultiCluster then {
8383
expr: |||
8484
(sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
85-
0.85 * sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) > 0
85+
sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) > 0
8686
and
87-
count by (cluster) (max by (cluster, node) (kube_node_info)) == 1)
87+
count by (cluster) (max by (cluster, node) (kube_node_role{role="control-plane"})) < 3)
8888
or
8989
(sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
9090
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
@@ -99,9 +99,9 @@ local utils = import '../lib/utils.libsonnet';
9999
} else {
100100
expr: |||
101101
(sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
102-
0.85 * sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) > 0
102+
sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) > 0
103103
and
104-
count(max by (node) (kube_node_info)) == 1)
104+
count(max by (node) (kube_node_role{role="control-plane"})) < 3)
105105
or
106106
(sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
107107
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -

tests/tests.yaml

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1425,15 +1425,15 @@ tests:
14251425
summary: "StatefulSet has not matched the expected number of replicas."
14261426

14271427
- name: KubeCPUOvercommit alert (single-node)
1428-
- interval: 1m
1428+
interval: 1m
14291429
input_series:
14301430
- series: 'namespace_cpu:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="default"}'
14311431
values: '1x10'
14321432
- series: 'namespace_cpu:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="kube-system"}'
14331433
values: '1x10'
14341434
- series: 'kube_node_status_allocatable{cluster="kubernetes", node="n1", resource="cpu", job="kube-state-metrics"}'
14351435
values: '1.9x10' # This value was seen on a 2x vCPU node
1436-
- series: 'kube_node_info{cluster="kubernetes", node="n1", job="kube-state-metrics"}'
1436+
- series: 'kube_node_role{ cluster="kubernetes", node="n1", role="control-plane"}'
14371437
values: '1x10'
14381438
alert_rule_test:
14391439
- eval_time: 9m
@@ -1444,12 +1444,12 @@ tests:
14441444
- exp_labels:
14451445
severity: warning
14461446
exp_annotations:
1447-
description: Cluster has overcommitted CPU resource requests for Pods by 0.385 CPU shares and cannot tolerate node failure.
1447+
description: Cluster has overcommitted CPU resource requests for Pods by 0.10000000000000009 CPU shares and cannot tolerate node failure.
14481448
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
14491449
summary: Cluster has overcommitted CPU resource requests.
14501450

14511451
- name: KubeCPUOvercommit alert (multi-node)
1452-
- interval: 1m
1452+
interval: 1m
14531453
input_series:
14541454
- series: 'namespace_cpu:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="default"}'
14551455
values: '2x10'
@@ -1459,9 +1459,9 @@ tests:
14591459
values: '1.9x10' # This value was seen on a 2x vCPU node
14601460
- series: 'kube_node_status_allocatable{cluster="kubernetes", node="n2", resource="cpu", job="kube-state-metrics"}'
14611461
values: '1.9x10'
1462-
- series: 'kube_node_info{cluster="kubernetes", node="n1", job="kube-state-metrics"}'
1462+
- series: 'kube_node_role{ cluster="kubernetes", node="n1", role="control-plane"}'
14631463
values: '1x10'
1464-
- series: 'kube_node_info{cluster="kubernetes", node="n2", job="kube-state-metrics"}'
1464+
- series: 'kube_node_role{ cluster="kubernetes", node="n2", role="control-plane"}'
14651465
values: '1x10'
14661466
alert_rule_test:
14671467
- eval_time: 9m
@@ -1472,20 +1472,20 @@ tests:
14721472
- exp_labels:
14731473
severity: warning
14741474
exp_annotations:
1475-
description: Cluster has overcommitted CPU resource requests for Pods by 2.1 CPU shares and cannot tolerate node failure.
1475+
description: Cluster has overcommitted CPU resource requests for Pods by 0.20000000000000018 CPU shares and cannot tolerate node failure.
14761476
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
14771477
summary: Cluster has overcommitted CPU resource requests.
14781478

14791479
- name: KubeMemoryOvercommit alert (single-node)
1480-
- interval: 1m
1480+
interval: 1m
14811481
input_series:
14821482
- series: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="default"}'
14831483
values: '1000000000x10' # 1 GB
14841484
- series: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="kube-system"}'
14851485
values: '1000000000x10'
14861486
- series: 'kube_node_status_allocatable{cluster="kubernetes", node="n1", resource="memory", job="kube-state-metrics"}'
14871487
values: '1000000000x10'
1488-
- series: 'kube_node_info{cluster="kubernetes", node="n1", job="kube-state-metrics"}'
1488+
- series: 'kube_node_role{cluster="kubernetes", node="n1", role="control-plane"}'
14891489
values: '1x10'
14901490
alert_rule_test:
14911491
- eval_time: 9m
@@ -1496,12 +1496,12 @@ tests:
14961496
- exp_labels:
14971497
severity: warning
14981498
exp_annotations:
1499-
description: Cluster has overcommitted memory resource requests for Pods by 1.15G bytes and cannot tolerate node failure.
1499+
description: Cluster has overcommitted memory resource requests for Pods by 1G bytes and cannot tolerate node failure.
15001500
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
15011501
summary: Cluster has overcommitted memory resource requests.
15021502

15031503
- name: KubeMemoryOvercommit alert (multi-node)
1504-
- interval: 1m
1504+
interval: 1m
15051505
input_series:
15061506
- series: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes", namespace="default"}'
15071507
values: '2000000000x10' # 2 GB
@@ -1511,9 +1511,9 @@ tests:
15111511
values: '1000000000x10'
15121512
- series: 'kube_node_status_allocatable{cluster="kubernetes", node="n2", resource="memory", job="kube-state-metrics"}'
15131513
values: '1000000000x10'
1514-
- series: 'kube_node_info{cluster="kubernetes", node="n1", job="kube-state-metrics"}'
1514+
- series: 'kube_node_role{cluster="kubernetes", node="n1", role="control-plane"}'
15151515
values: '1x10'
1516-
- series: 'kube_node_info{cluster="kubernetes", node="n2", job="kube-state-metrics"}'
1516+
- series: 'kube_node_role{cluster="kubernetes", node="n2", role="control-plane"}'
15171517
values: '1x10'
15181518
alert_rule_test:
15191519
- eval_time: 9m
@@ -1524,6 +1524,6 @@ tests:
15241524
- exp_labels:
15251525
severity: warning
15261526
exp_annotations:
1527-
description: Cluster has overcommitted memory resource requests for Pods by 3G bytes and cannot tolerate node failure.
1527+
description: Cluster has overcommitted memory resource requests for Pods by 2G bytes and cannot tolerate node failure.
15281528
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
15291529
summary: Cluster has overcommitted memory resource requests.

0 commit comments

Comments
 (0)