# video-iac/k8s/monitoring/templates/node-alerts.yaml

# PrometheusRule resource that carries the node-level alerting rules below.
# NOTE(review): the labels are presumably what the Prometheus instance's rule
# selector matches on — verify against the Prometheus custom resource.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cluster-node-alerts
  labels:
    app: kube-prometheus-stack
    # Quoted so a release name that looks like a number or boolean still
    # renders as a string, which Kubernetes label values must be.
    app.kubernetes.io/instance: {{ .Release.Name | quote }}
spec:
  # `groups` must sit exactly one level (2 spaces) under `spec`; its list items
  # use the flush style, i.e. the dash is at the same column as the key.
  groups:
  - name: node.alerts
    rules:
    # Pipeline smoke-test alert. `vector(1)` always yields a sample and
    # `for: 0m` adds no pending delay, so this fires for as long as the rule
    # is loaded.
    # NOTE(review): the description says "manually triggered", yet nothing in
    # this rule gates it — confirm it is meant to ship enabled at critical
    # severity.
    - alert: InternalTestAlert
      expr: vector(1)
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Internal Alert Pipeline Test
        description: >-
          This alert is manually triggered to verify the Slack alerting pipeline.

    # Fires for any instance that exports node_cpu_seconds_total but has no
    # series carrying a non-empty `workload` label: the left-hand side counts
    # every instance, and `unless` drops those that also appear on the right.
    - alert: NodeMissingWorkloadLabel
      expr: |
        count by (instance) (node_cpu_seconds_total)
        unless
        count by (instance) (node_cpu_seconds_total{workload=~".+"})
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Node missing workload label on metrics"
        # The {{ "{{" }} ... {{ "}}" }} pairs make Helm emit literal braces so
        # that Prometheus, not Helm, expands $labels.instance.
        description: "Metrics for instance {{ "{{" }} $labels.instance {{ "}}" }} are missing the 'workload' label, which the MediaNodeHighCPU alert uses to select its CPU threshold."

    # CPU-saturation alert with a per-workload threshold. Each branch converts
    # the 1m idle-CPU rate, averaged over the `cpu` and `mode` labels, into a
    # busy percentage and compares it with a Helm-supplied limit:
    #   workload="media"   -> .Values.cpuThresholdMedia (default 65)
    #   workload!="media"  -> .Values.cpuThresholdOther (default 80)
    - alert: MediaNodeHighCPU
      expr: |
        (
          (1 - avg without (cpu, mode) (
            rate(node_cpu_seconds_total{mode="idle", workload="media"}[1m])
          )) * 100
          > {{ .Values.cpuThresholdMedia | default 65 }}
        )
        or
        (
          (1 - avg without (cpu, mode) (
            rate(node_cpu_seconds_total{mode="idle", workload!="media"}[1m])
          )) * 100
          > {{ .Values.cpuThresholdOther | default 80 }}
        )
      for: 1m
      labels:
        severity: warning
      annotations:
        # The {{ "{{" }} ... {{ "}}" }} pairs make Helm emit literal braces so
        # that Prometheus expands $labels / $value at alert time.
        summary: "High CPU usage on node {{ "{{" }} $labels.instance {{ "}}" }}"
        # Single-quoted so the printf format string needs no backslash escapes.
        description: 'Node {{ "{{" }} $labels.instance {{ "}}" }} (workload: {{ "{{" }} $labels.workload {{ "}}" }}) has CPU usage above threshold (current value: {{ "{{" }} $value | printf "%.2f" {{ "}}" }}%)'