{{- /*
  PrometheusRule with node-level alerts driven by node_cpu_seconds_total.

  Optional chart values read by this template:
    cpuThresholdMedia  CPU busy percentage that fires the alert for series
                       labelled workload="media" (default 65).
    cpuThresholdOther  CPU busy percentage for every other series (default 80).
    cpuRateWindow      Range used by rate() in the CPU expression (default "1m").

  Note: the `default` function treats 0 and "" as unset, so those values fall
  back to the defaults listed above.
*/ -}}
{{- $cpuRateWindow := .Values.cpuRateWindow | default "1m" -}}
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cluster-node-alerts
  labels:
    app: kube-prometheus-stack
    # Quoted so that release names such as "123" or "true" stay strings;
    # Kubernetes label values must be strings.
    app.kubernetes.io/instance: {{ .Release.Name | quote }}
spec:
  groups:
    - name: node.alerts
      rules:
        # Fires for every instance that exposes node_cpu_seconds_total but has
        # no series with a non-empty "workload" label.
        - alert: NodeMissingWorkloadLabel
          expr: |
            count by (instance) (node_cpu_seconds_total)
            unless
            count by (instance) (node_cpu_seconds_total{workload=~".+"})
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node missing workload label on metrics"
            description: "Metrics for instance {{ "{{" }} $labels.instance {{ "}}" }} are missing the 'workload' label, which is required for the MediaNodeHighCPU alert."

        # Despite its name, this alert covers all nodes: the first branch
        # applies the media threshold, the second applies the general
        # threshold to everything else. The workload!="media" matcher also
        # selects series that have no workload label at all, so unlabelled
        # nodes are evaluated against the general threshold.
        # CPU busy % is computed as (1 - average idle rate across CPUs) * 100.
        # The rate window should span several scrape intervals; widen
        # cpuRateWindow if the scrape interval approaches the window length.
        - alert: MediaNodeHighCPU
          expr: |
            (
              (1 - avg without (cpu, mode) (rate(node_cpu_seconds_total{mode="idle", workload="media"}[{{ $cpuRateWindow }}]))) * 100 > {{ .Values.cpuThresholdMedia | default 65 }}
            )
            or
            (
              (1 - avg without (cpu, mode) (rate(node_cpu_seconds_total{mode="idle", workload!="media"}[{{ $cpuRateWindow }}]))) * 100 > {{ .Values.cpuThresholdOther | default 80 }}
            )
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: "High CPU usage on node {{ "{{" }} $labels.instance {{ "}}" }}"
            description: "Node {{ "{{" }} $labels.instance {{ "}}" }} (workload: {{ "{{" }} $labels.workload {{ "}}" }}) has CPU usage above threshold (current value: {{ "{{" }} $value | printf \"%.2f\" {{ "}}" }}%)"