Trying to fix more alerts
This commit is contained in:
parent
8ebefa8c93
commit
c195a6df1c
|
|
@ -3,17 +3,17 @@ kind: PrometheusRule
|
||||||
metadata:
|
metadata:
|
||||||
name: cluster-node-alerts
|
name: cluster-node-alerts
|
||||||
labels:
|
labels:
|
||||||
app: prometheus-stack
|
app: kube-prometheus-stack
|
||||||
release: prometheus-stack
|
app.kubernetes.io/instance: {{ .Release.Name }}
|
||||||
spec:
|
spec:
|
||||||
groups:
|
groups:
|
||||||
- name: node.alerts
|
- name: node.alerts
|
||||||
rules:
|
rules:
|
||||||
- alert: NodeHighCPU
|
- alert: NodeHighCPU
|
||||||
expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 20
|
expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 > 20
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "High CPU usage on node {{ $labels.instance }}"
|
summary: "High CPU usage on node {{ "{{" }} $labels.instance {{ "}}" }}"
|
||||||
description: "Node {{ $labels.instance }} has CPU usage above 20% (current value: {{ $value | printf \"%.2f\" }}%)"
|
description: "Node {{ "{{" }} $labels.instance {{ "}}" }} has CPU usage above 20% (current value: {{ "{{" }} $value | printf \"%.2f\" {{ "}}" }}%)"
|
||||||
|
|
|
||||||
|
|
@ -50,8 +50,8 @@ data:
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(node_cpu_seconds_total{mode!=\"idle\"}) by (workload) / sum(machine_cpu_cores) by (workload) * 100",
|
"expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (workload) / sum(machine_cpu_cores) by (workload) * 100",
|
||||||
"legendFormat": "{{workload}} CPU %",
|
"legendFormat": "{{ "{{" }}workload{{ "}}" }} CPU %",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
@ -73,7 +73,7 @@ data:
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) by (workload) / sum(node_memory_MemTotal_bytes) by (workload) * 100",
|
"expr": "sum(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) by (workload) / sum(node_memory_MemTotal_bytes) by (workload) * 100",
|
||||||
"legendFormat": "{{workload}} Mem %",
|
"legendFormat": "{{ "{{" }}workload{{ "}}" }} Mem %",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
@ -87,7 +87,7 @@ data:
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(kube_pod_container_status_restarts_total[5m])) by (namespace, pod, node) > 0",
|
"expr": "sum(rate(kube_pod_container_status_restarts_total[5m])) by (namespace, pod, node) > 0",
|
||||||
"legendFormat": "{{namespace}}/{{pod}} ({{node}})",
|
"legendFormat": "{{ "{{" }}namespace{{ "}}" }}/{{ "{{" }}pod{{ "}}" }} ({{ "{{" }}node{{ "}}" }})",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
|
||||||
|
|
@ -3,8 +3,8 @@ kind: PrometheusRule
|
||||||
metadata:
|
metadata:
|
||||||
name: cluster-pod-alerts
|
name: cluster-pod-alerts
|
||||||
labels:
|
labels:
|
||||||
app: prometheus-stack
|
app: kube-prometheus-stack
|
||||||
release: prometheus-stack
|
app.kubernetes.io/instance: {{ .Release.Name }}
|
||||||
spec:
|
spec:
|
||||||
groups:
|
groups:
|
||||||
- name: pod.alerts
|
- name: pod.alerts
|
||||||
|
|
@ -16,7 +16,7 @@ spec:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Pod OOMKilled"
|
summary: "Pod OOMKilled"
|
||||||
description: "Container {{ $labels.container }} in pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) was OOMKilled."
|
description: "Container {{ "{{" }} $labels.container {{ "}}" }} in pod {{ "{{" }} $labels.pod {{ "}}" }} (namespace {{ "{{" }} $labels.namespace {{ "}}" }}) was OOMKilled."
|
||||||
|
|
||||||
- alert: PodCrashLoopBackOff
|
- alert: PodCrashLoopBackOff
|
||||||
expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0
|
expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0
|
||||||
|
|
@ -25,4 +25,4 @@ spec:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Pod in CrashLoopBackOff"
|
summary: "Pod in CrashLoopBackOff"
|
||||||
description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is in CrashLoopBackOff."
|
description: "Pod {{ "{{" }} $labels.pod {{ "}}" }} in namespace {{ "{{" }} $labels.namespace {{ "}}" }} is in CrashLoopBackOff."
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue