46 lines
1.7 KiB
YAML
46 lines
1.7 KiB
YAML
# PrometheusRule (Prometheus Operator CRD) defining node-level alerts.
#
# This file is a Helm template: Helm renders its own actions first and the
# result is then parsed as YAML. Annotation templates that Prometheus must
# expand at alert time ($labels, $value) are emitted through quoted-brace
# string literals so that Helm passes the braces through verbatim instead of
# trying to evaluate them itself.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cluster-node-alerts
  labels:
    app: kube-prometheus-stack
    # Quoted so that a release name such as "123" or "true" is still rendered
    # as a YAML string; Kubernetes label values must be strings.
    app.kubernetes.io/instance: {{ .Release.Name | quote }}
spec:
  groups:
    - name: node.alerts
      rules:
        # Pipeline smoke test.
        # NOTE(review): vector(1) always returns a sample and `for` is 0m, so
        # this alert fires continuously at critical severity for as long as
        # the rule is loaded -- it is not "manually triggered" in the sense
        # the description suggests. Confirm it is meant to ship enabled.
        - alert: InternalTestAlert
          expr: vector(1)
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Internal Alert Pipeline Test"
            description: "This alert is manually triggered to verify the Slack alerting pipeline."

        # Fires for every instance that exports node_cpu_seconds_total but has
        # no series carrying a non-empty `workload` label: the left-hand side
        # lists all instances, `unless` removes those that do have the label.
        - alert: NodeMissingWorkloadLabel
          expr: |
            count by (instance) (node_cpu_seconds_total) unless count by (instance) (node_cpu_seconds_total{workload=~".+"})
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node missing workload label on metrics"
            description: "Metrics for instance {{ "{{" }} $labels.instance {{ "}}" }} are missing the 'workload' label, which is required for NodeHighCPU alerts."

        # CPU utilisation in percent, computed as
        # (1 - idle rate averaged over the cpu and mode labels) * 100.
        # Two thresholds are combined with `or`:
        #   - workload="media"  -> .Values.cpuThresholdMedia (default 65)
        #   - workload!="media" -> .Values.cpuThresholdOther (default 80);
        #     this branch also matches series with no workload label at all,
        #     because != matches an empty label value.
        # Helm's `default` treats 0 as unset, so a threshold of 0 falls back
        # to the default value.
        - alert: MediaNodeHighCPU
          expr: |
            (
              (1 - avg without (cpu, mode) (rate(node_cpu_seconds_total{mode="idle", workload="media"}[1m]))) * 100 > {{ .Values.cpuThresholdMedia | default 65 }}
            )
            or
            (
              (1 - avg without (cpu, mode) (rate(node_cpu_seconds_total{mode="idle", workload!="media"}[1m]))) * 100 > {{ .Values.cpuThresholdOther | default 80 }}
            )
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: "High CPU usage on node {{ "{{" }} $labels.instance {{ "}}" }}"
            # The backslash-escaped quotes are YAML escapes inside the
            # double-quoted scalar; Prometheus receives printf "%.2f".
            description: "Node {{ "{{" }} $labels.instance {{ "}}" }} (workload: {{ "{{" }} $labels.workload {{ "}}" }}) has CPU usage above threshold (current value: {{ "{{" }} $value | printf \"%.2f\" {{ "}}" }}%)"