77 lines
3.2 KiB
YAML
77 lines
3.2 KiB
YAML
# PrometheusRule holding the alerting rules of this chart: container CPU
# saturation for jkvideo pods and failures of the TURN / E2E prober CronJobs.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "probers.fullname" . }}-alerts
  labels:
    {{- include "probers.labels" . | nindent 4 }}
    # Extra labels from values (presumably what the Prometheus ruleSelector
    # matches on - confirm against the Prometheus CR). Guarded with `with` so
    # that an unset or empty map renders nothing instead of a stray
    # `null` / `{}` line, which would make this mapping invalid YAML.
    {{- with .Values.monitoring.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  groups:
    - name: jkvideo.alerts
      # Each rule below is emitted only when its feature flag is enabled.
      rules:
{{- if .Values.jkvideo.cpuAlerts.enabled }}
        # Alert: High CPU Utilization
        # Fires when a container's CPU usage, expressed as a fraction of its CPU
        # limit, stays above the cpuAlerts.threshold value for the cpuAlerts.for
        # duration. The threshold is a ratio (e.g. 0.8), not a percentage.
        - alert: JkvideoHighCPU
          # This query calculates the CPU usage/limit ratio per container and then
          # uses vector matching (* on(...) group_left()) to filter the results to
          # only the pods matching the specific labels defined in values.yaml.
          expr: |
            (
              # 1. Calculate CPU usage rate per container (cAdvisor metrics)
              sum by (pod, namespace, container) (
                rate(container_cpu_usage_seconds_total{
                  namespace="{{ .Values.jkvideo.namespace }}",
                  container!="", image!=""
                }[5m])
              )
              /
              # 2. Get CPU limits per container (kube-state-metrics)
              sum by (pod, namespace, container) (
                kube_pod_container_resource_limits{
                  namespace="{{ .Values.jkvideo.namespace }}",
                  resource="cpu"
                }
              )
            )
            # 3. Filter by Kubernetes labels using vector matching against kube_pod_labels
            * on (pod, namespace) group_left()
            (
              kube_pod_labels{
                namespace="{{ .Values.jkvideo.namespace }}"
                {{- range $key, $value := .Values.jkvideo.podLabels }},
                label_{{ include "probers.sanitizePrometheusLabel" $key }}="{{ $value }}"
                {{- end }}
              }
            ) > {{ .Values.jkvideo.cpuAlerts.threshold }}
          for: {{ .Values.jkvideo.cpuAlerts.for }}
          labels:
            severity: warning
          annotations:
            # The escaped double braces keep the $labels lookups out of Helm's
            # hands so they reach Prometheus as alert-template expressions.
            summary: "Jkvideo Pod {{ "{{" }} $labels.pod {{ "}}" }} High CPU Utilization"
            # mulf (float multiply) is required here: Sprig's mul casts its
            # operands to int64, which turns a fractional threshold such as 0.8
            # into 0 and would render "0%" in the alert text.
            description: "CPU utilization for container {{ "{{" }} $labels.container {{ "}}" }} in pod {{ "{{" }} $labels.pod {{ "}}" }} has exceeded {{ .Values.jkvideo.cpuAlerts.threshold | mulf 100 }}% of the limit."
{{- end }}
{{- if .Values.turnProber.enabled }}
        # Alert: TURN Prober Failure
        # Raised as soon as any Job whose name starts with the chart fullname
        # followed by "-turn-" reports a failed pod.
        - alert: TurnProberJobFailed
          # kube_job_status_failed comes from kube-state-metrics. The CronJob
          # controller appends a suffix to every Job it creates, hence the
          # trailing wildcard in the job_name matcher.
          expr: |-
            kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-turn-.*"} > 0
          # Zero hold time: alert immediately on the first failing evaluation.
          for: 0m
          annotations:
            summary: "TURN Server Probe Failed"
            description: "The CronJob testing the TURN server failed. Check the logs of the failed job ({{ "{{" }} $labels.job_name {{ "}}" }})."
          labels:
            severity: critical
{{- end }}
{{- if .Values.e2eProber.enabled }}
        # Alert: E2E Prober Failure
        # Raised as soon as any Job whose name starts with the chart fullname
        # followed by "-e2e-" reports a failed pod.
        - alert: E2EProberJobFailed
          # Matches on the kube_job_status_failed metric; the trailing wildcard
          # in the job_name matcher accepts whatever follows the "-e2e-" prefix.
          expr: |-
            kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-e2e-.*"} > 0
          # Zero hold time: no waiting period before the alert fires.
          for: 0m
          annotations:
            summary: "Mediasoup E2E Session Probe Failed"
            description: "The CronJob simulating a 2-party session failed. Check the logs of the failed job ({{ "{{" }} $labels.job_name {{ "}}" }})."
          labels:
            severity: critical
{{- end }}