# PrometheusRule (prometheus-operator CRD) carrying the chart's alerts:
#   - JkvideoHighCPU       rendered when .Values.jkvideo.cpuAlerts.enabled
#   - TurnProberJobFailed  rendered when .Values.turnProber.enabled
#   - E2EProberJobFailed   rendered when .Values.e2eProber.enabled
# Sequences of {{ "{{" }} ... {{ "}}" }} emit literal braces so that Prometheus,
# not Helm, expands $labels.* when the alert fires.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "probers.fullname" . }}-alerts
  labels:
    {{- include "probers.labels" . | nindent 4 }}
    {{- /* Only emit extra labels when some are configured: an unset or empty
           value would render as `null` / `{}` and break the labels mapping. */}}
    {{- with .Values.monitoring.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  groups:
    - name: jkvideo.alerts
      rules:
        {{- if .Values.jkvideo.cpuAlerts.enabled }}
        # Alert: High CPU Utilization
        - alert: JkvideoHighCPU
          # The query computes the CPU usage/limit ratio per container, then uses
          # vector matching (* on(...) group_left()) to keep only the pods whose
          # Kubernetes labels match .Values.jkvideo.podLabels.
          expr: |
            (
              # 1. CPU usage rate per container (cAdvisor metrics)
              sum by (pod, namespace, container) (
                rate(container_cpu_usage_seconds_total{
                  namespace="{{ .Values.jkvideo.namespace }}",
                  container!="",
                  image!=""
                }[5m])
              )
              /
              # 2. CPU limits per container (kube-state-metrics)
              sum by (pod, namespace, container) (
                kube_pod_container_resource_limits{
                  namespace="{{ .Values.jkvideo.namespace }}",
                  resource="cpu"
                }
              )
            )
            # 3. Filter by Kubernetes labels using vector matching against kube_pod_labels
            * on (pod, namespace) group_left() (
              kube_pod_labels{
                namespace="{{ .Values.jkvideo.namespace }}"
                {{- range $key, $value := .Values.jkvideo.podLabels }},
                label_{{ include "probers.sanitizePrometheusLabel" $key }}="{{ $value }}"
                {{- end }}
              }
            )
            > {{ .Values.jkvideo.cpuAlerts.threshold }}
          # Quoted so a numeric-looking duration is still rendered as a string.
          for: {{ .Values.jkvideo.cpuAlerts.for | quote }}
          labels:
            severity: warning
          annotations:
            summary: "Jkvideo Pod {{ "{{" }} $labels.pod {{ "}}" }} High CPU Utilization"
            # mulf (float multiply) rather than mul: Sprig's mul is integer-only and
            # would truncate a fractional threshold such as 0.8 to 0 before scaling.
            description: "CPU utilization for container {{ "{{" }} $labels.container {{ "}}" }} in pod {{ "{{" }} $labels.pod {{ "}}" }} has exceeded {{ .Values.jkvideo.cpuAlerts.threshold | mulf 100 }}% of the limit."
        {{- end }}
        {{- if .Values.turnProber.enabled }}
        # Alert: TURN Prober Failure
        - alert: TurnProberJobFailed
          # Fires when a Job spawned by the TURN prober CronJob reports a failure
          # (metric from kube-state-metrics). The regex matches the job name
          # generated by the CronJob controller, which appends a suffix.
          expr: kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-turn-.*"} > 0
          for: 0m  # Alert immediately
          labels:
            severity: critical
          annotations:
            summary: "TURN Server Probe Failed"
            description: "The CronJob testing the TURN server failed. Check the logs of the failed job ({{ "{{" }} $labels.job_name {{ "}}" }})."
        {{- end }}
        {{- if .Values.e2eProber.enabled }}
        # Alert: E2E Prober Failure
        - alert: E2EProberJobFailed
          # Same pattern as the TURN prober alert, matched against the "-e2e-" job names.
          expr: kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-e2e-.*"} > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Mediasoup E2E Session Probe Failed"
            description: "The CronJob simulating a 2-party session failed. Check the logs of the failed job ({{ "{{" }} $labels.job_name {{ "}}" }})."
        {{- end }}