video-iac/k8s/probers/templates/prometheusrule-alerts.yaml

77 lines
3.2 KiB
YAML

# PrometheusRule carrying the alerting rules for the jkvideo workload and its
# prober CronJobs. Each rule below is rendered only when its feature flag in
# values is enabled.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "probers.fullname" . }}-alerts
  # Chart labels come first, followed by the optional extra labels from
  # .Values.monitoring.labels. The extra labels are wrapped in "with" so that an
  # empty or unset map renders nothing, instead of a stray empty-map or null
  # token that would break the labels mapping.
  labels:
    {{- include "probers.labels" . | nindent 4 }}
    {{- with .Values.monitoring.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  groups:
    - name: jkvideo.alerts
      rules:
{{- if .Values.jkvideo.cpuAlerts.enabled }}
        # Alert: High CPU Utilization
        # Rendered only when jkvideo.cpuAlerts.enabled is set in values.
        - alert: JkvideoHighCPU
          # This query calculates the CPU usage/limit ratio per container and then
          # uses vector matching (* on(...) group_left()) to filter the results to
          # only the pods matching the specific labels defined in values.yaml.
          # The "#" lines inside the block scalar are part of the expression text
          # and are treated as comments by PromQL.
          expr: |
            (
              # 1. Calculate CPU usage rate per container (cAdvisor metrics)
              sum by (pod, namespace, container) (
                rate(container_cpu_usage_seconds_total{
                  namespace="{{ .Values.jkvideo.namespace }}",
                  container!="", image!=""
                }[5m])
              )
              /
              # 2. Get CPU limits per container (kube-state-metrics)
              sum by (pod, namespace, container) (
                kube_pod_container_resource_limits{
                  namespace="{{ .Values.jkvideo.namespace }}",
                  resource="cpu"
                }
              )
            )
            # 3. Filter by Kubernetes labels using vector matching against kube_pod_labels
            * on (pod, namespace) group_left()
            (
              kube_pod_labels{
                namespace="{{ .Values.jkvideo.namespace }}"
                {{- range $key, $value := .Values.jkvideo.podLabels }},
                label_{{ include "probers.sanitizePrometheusLabel" $key }}="{{ $value }}"
                {{- end }}
              }
            ) > {{ .Values.jkvideo.cpuAlerts.threshold }}
          for: {{ .Values.jkvideo.cpuAlerts.for }}
          labels:
            severity: warning
          annotations:
            summary: "Jkvideo Pod {{ "{{" }} $labels.pod {{ "}}" }} High CPU Utilization"
            # The threshold is a ratio (it is compared against usage/limit in expr),
            # so the percentage is computed with the float helper mulf. The integer
            # helper mul would truncate a fractional ratio such as 0.8 to 0 and
            # render "0%".
            # NOTE(review): mulf needs a Helm release that bundles Sprig 3.2 or
            # newer; confirm against the chart's minimum supported Helm version.
            description: "CPU utilization for container {{ "{{" }} $labels.container {{ "}}" }} in pod {{ "{{" }} $labels.pod {{ "}}" }} has exceeded {{ mulf .Values.jkvideo.cpuAlerts.threshold 100 }}% of the limit."
{{- end }}
{{- if .Values.turnProber.enabled }}
        # Alert: TURN Prober Failure
        # Rendered only when turnProber.enabled is set in values.
        # Raised when the kube-state-metrics job-failure metric is above zero for a
        # Job belonging to the TURN prober CronJob. Jobs are matched by name prefix
        # because the CronJob controller appends a suffix to each Job it creates.
        - alert: TurnProberJobFailed
          expr: kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-turn-.*"} > 0
          # No pending period: alert immediately.
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "TURN Server Probe Failed"
            # The backtick raw string keeps Helm from evaluating the Prometheus
            # template, so it reaches Alertmanager unexpanded.
            description: "The CronJob testing the TURN server failed. Check the logs of the failed job ({{ `{{ $labels.job_name }}` }})."
{{- end }}
{{- if .Values.e2eProber.enabled }}
        # Alert: E2E Prober Failure
        # Rendered only when e2eProber.enabled is set in values.
        # Raised when the job-failure metric is above zero for any Job whose name
        # starts with the chart fullname followed by "-e2e-".
        - alert: E2EProberJobFailed
          expr: kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-e2e-.*"} > 0
          # No pending period: the alert fires on the first evaluation that matches.
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Mediasoup E2E Session Probe Failed"
            # The backtick raw string keeps Helm from evaluating the Prometheus
            # template, so it reaches Alertmanager unexpanded.
            description: "The CronJob simulating a 2-party session failed. Check the logs of the failed job ({{ `{{ $labels.job_name }}` }})."
{{- end }}