# Source: video-iac/k8s/webrtc-be/templates/alerts.yaml
# (file-viewer metadata at time of capture: 44 lines, 1.7 KiB, YAML)

# PrometheusRule (prometheus-operator CRD) declaring the alerts for the
# webrtc-be workload.
#
# This file is a Helm template: the double-curly-brace actions in the
# loki_link annotations are expanded by Helm (from .Values.grafana.externalUrl)
# before the result is parsed as YAML. Do not write literal double curly
# braces in comments here - Helm evaluates them even inside YAML comments.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: webrtc-be-log-alerts
  labels:
    app: webrtc-be
    # NOTE(review): presumably the label the prometheus-stack operator's
    # ruleSelector matches on - confirm against the operator configuration.
    release: prometheus-stack
spec:
  groups:
    - name: webrtc-be.alerts
      rules:
        # Counts log lines containing the substring "error" (the |= filter is
        # a case-sensitive substring match) over the last 5 minutes and alerts
        # when the count is non-zero for 1 minute.
        #
        # NOTE(review): this expression is LogQL, not PromQL. Prometheus itself
        # cannot evaluate it, so this rule only works if the resource is loaded
        # into a Loki ruler. Confirm the consumer; if both Prometheus and Loki
        # read this resource, consider moving this rule into its own
        # PrometheusRule with a distinct selector label.
        - alert: WebrtcBeError
          expr: 'sum(count_over_time({container="webrtc-be", namespace="webrtc-be"} |= "error" [5m])) > 0'
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Errors found in webrtc-be logs"
            description: "The webrtc-be container is logging errors. Please check the logs."
            # Folded block scalar (>-) keeps the quotes, brackets and
            # backslashes of the embedded JSON literal without YAML escaping.
            # The URL must stay on a single line: a line break inside a folded
            # scalar would be turned into a space in the rendered link.
            # NOTE(review): the JSON in the "left" parameter is not URL-encoded
            # and contains a space; some alert receivers may truncate the link
            # there - verify in the target notification channel.
            loki_link: >-
              {{ .Values.grafana.externalUrl }}/explore?orgId=1&left=["now-1h","now","Loki",{"expr":"{container=\"webrtc-be\", namespace=\"webrtc-be\"}"}]

        # Alerts when the restart counter of the webrtc-be container increased
        # within the last 5 minutes.
        - alert: WebrtcBeCrashed
          expr: 'increase(kube_pod_container_status_restarts_total{container="webrtc-be", namespace="webrtc-be"}[5m]) > 0'
          # Instant alert - no 'for' duration
          labels:
            severity: critical
          annotations:
            summary: "webrtc-be crashed"
            description: "The webrtc-be pod has crashed. Please check the logs."
            # Same Explore link as the other rules; kept on one line so the
            # folded scalar does not insert spaces into the URL.
            loki_link: >-
              {{ .Values.grafana.externalUrl }}/explore?orgId=1&left=["now-1h","now","Loki",{"expr":"{container=\"webrtc-be\", namespace=\"webrtc-be\"}"}]

        # Alerts when the webrtc-be deployment has reported zero available
        # replicas for 1 minute.
        # NOTE(review): the comparison yields no result when the metric series
        # does not exist at all (e.g. deployment deleted or the exporter not
        # scraped), so that situation does not fire this alert - confirm
        # whether an absent() guard is wanted.
        - alert: WebrtcBeDown
          expr: 'kube_deployment_status_replicas_available{deployment="webrtc-be", namespace="webrtc-be"} == 0'
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "webrtc-be is down"
            description: "The webrtc-be service has been unavailable for more than 1 minute."
            # Same Explore link as the other rules; kept on one line so the
            # folded scalar does not insert spaces into the URL.
            loki_link: >-
              {{ .Values.grafana.externalUrl }}/explore?orgId=1&left=["now-1h","now","Loki",{"expr":"{container=\"webrtc-be\", namespace=\"webrtc-be\"}"}]