diff --git a/k8s/applications/templates/probers.yaml b/k8s/applications/templates/probers.yaml
new file mode 100644
index 0000000..7e1825d
--- /dev/null
+++ b/k8s/applications/templates/probers.yaml
@@ -0,0 +1,27 @@
+# Argo CD Application that deploys the k8s/probers Helm chart into the
+# "probers" namespace via the in-cluster API server address.
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: probers
+spec:
+  destination:
+    name: ''
+    namespace: probers
+    server: 'https://kubernetes.default.svc'
+  source:
+    helm:
+      valueFiles:
+        # Picks the per-environment overlay, e.g. values-staging.yaml.
+        - values-{{ .Values.environment }}.yaml
+    path: k8s/probers
+    repoURL: 'git@bitbucket.org:jamkazam/video-iac.git'
+    # Quoted so an empty or number-like branch name is still rendered as a string.
+    targetRevision: {{ .Values.gitBranch | quote }}
+  project: default
+  syncPolicy:
+    syncOptions:
+      - CreateNamespace=true
+    automated:
+      prune: true
+      selfHeal: true
diff --git a/k8s/probers/Chart.yaml b/k8s/probers/Chart.yaml
new file mode 100644
index 0000000..f0380eb
--- /dev/null
+++ b/k8s/probers/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: probers
+description: A Helm chart for various probers; in particular webrtc_be probing
+type: application
+version: 0.1.0
+appVersion: "1.0.0"
diff --git a/k8s/probers/templates/_helpers.tpl b/k8s/probers/templates/_helpers.tpl
new file mode 100644
index 0000000..f0ec6c9
--- /dev/null
+++ b/k8s/probers/templates/_helpers.tpl
@@ -0,0 +1,36 @@
+{{/*
+Create a default fully qualified app name.
+Uses .Values.fullnameOverride when set; otherwise the release name, or
+"<release>-<name>" when the release name does not already contain the chart
+name (or .Values.nameOverride). The result is truncated to 63 characters and
+any trailing "-" is removed.
+*/}}
+{{- define "probers.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Common labels: helm.sh/chart (chart name and version) and app.kubernetes.io/managed-by.
+*/}}
+{{- define "probers.labels" -}}
+helm.sh/chart: {{ printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Sanitize Kubernetes label keys for Prometheus (kube-state-metrics compatibility).
+Replaces the characters '.', '/', and '-' with '_'.
+Example: app.kubernetes.io/name -> app_kubernetes_io_name
+*/}}
+{{- define "probers.sanitizePrometheusLabel" -}}
+{{- . | replace "." "_" | replace "/" "_" | replace "-" "_" -}}
+{{- end -}}
diff --git a/k8s/probers/templates/e2e-prober-cronjob.yaml b/k8s/probers/templates/e2e-prober-cronjob.yaml
new file mode 100644
index 0000000..05ce200
--- /dev/null
+++ b/k8s/probers/templates/e2e-prober-cronjob.yaml
@@ -0,0 +1,32 @@
+{{- /*
+CronJob that runs the end-to-end tester image on .Values.e2eProber.schedule.
+Rendered only when .Values.e2eProber.enabled is true.
+*/ -}}
+{{- if .Values.e2eProber.enabled }}
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: {{ include "probers.fullname" . }}-e2e
+  labels:
+    {{- include "probers.labels" . | nindent 4 }}
+spec:
+  schedule: "{{ .Values.e2eProber.schedule }}"
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 1
+  failedJobsHistoryLimit: 3
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          restartPolicy: OnFailure
+          containers:
+          - name: e2e-tester
+            image: "{{ .Values.e2eProber.image.repository }}:{{ .Values.e2eProber.image.tag }}"
+            imagePullPolicy: {{ .Values.e2eProber.image.pullPolicy }}
+            # Assuming the image's entrypoint executes the test suite (e.g., `npm test` or `pytest`)
+            env:
+            - name: BACKEND_URL
+              value: {{ .Values.e2eProber.backendUrl | quote }}
+            resources:
+              {{- toYaml .Values.e2eProber.resources | nindent 14 }}
+{{- end }}
diff --git a/k8s/probers/templates/prometheusrule-alerts.yaml b/k8s/probers/templates/prometheusrule-alerts.yaml
new file mode 100644
index 0000000..2b35c1d
--- /dev/null
+++ b/k8s/probers/templates/prometheusrule-alerts.yaml
@@ -0,0 +1,82 @@
+{{- /*
+PrometheusRule with one alert per enabled feature: container CPU usage relative
+to its CPU limit for the jkvideo pods, and failed Jobs of the TURN / E2E CronJobs.
+*/ -}}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ include "probers.fullname" . }}-alerts
+  labels:
+    {{- include "probers.labels" . | nindent 4 }}
+    {{- toYaml .Values.monitoring.labels | nindent 4 }}
+spec:
+  groups:
+    - name: jkvideo.alerts
+      rules:
+        {{- if .Values.jkvideo.cpuAlerts.enabled }}
+        # Alert: High CPU Utilization
+        - alert: JkvideoHighCPU
+          # This query calculates the CPU usage/limit ratio per container and then uses vector matching
+          # (* on(...) group_left()) to filter the results to only the pods matching the specific labels defined in values.yaml.
+          expr: |
+            (
+              # 1. Calculate CPU usage rate per container (cAdvisor metrics)
+              sum by (pod, namespace, container) (
+                rate(container_cpu_usage_seconds_total{
+                  namespace="{{ .Values.jkvideo.namespace }}",
+                  container!="", image!=""
+                }[5m])
+              )
+              /
+              # 2. Get CPU limits per container (kube-state-metrics)
+              sum by (pod, namespace, container) (
+                kube_pod_container_resource_limits{
+                  namespace="{{ .Values.jkvideo.namespace }}",
+                  resource="cpu"
+                }
+              )
+            )
+            # 3. Filter by Kubernetes labels using vector matching against kube_pod_labels
+            * on (pod, namespace) group_left()
+            (
+              kube_pod_labels{
+                namespace="{{ .Values.jkvideo.namespace }}"
+                {{- range $key, $value := .Values.jkvideo.podLabels }},
+                label_{{ include "probers.sanitizePrometheusLabel" $key }}="{{ $value }}"
+                {{- end }}
+              }
+            ) > {{ .Values.jkvideo.cpuAlerts.threshold }}
+          for: {{ .Values.jkvideo.cpuAlerts.for | quote }}
+          labels:
+            severity: warning
+          annotations:
+            summary: "Jkvideo Pod {{ "{{" }} $labels.pod {{ "}}" }} High CPU Utilization"
+            # mulf (float multiply) is used because the integer "mul" truncates 0.7 to 0.
+            description: "CPU utilization for container {{ "{{" }} $labels.container {{ "}}" }} in pod {{ "{{" }} $labels.pod {{ "}}" }} has exceeded {{ mulf 100 .Values.jkvideo.cpuAlerts.threshold }}% of the limit."
+        {{- end }}
+
+        {{- if .Values.turnProber.enabled }}
+        # Alert: TURN Prober Failure
+        - alert: TurnProberJobFailed
+          # Alert if the CronJob fails (metric from kube-state-metrics).
+          # The regex matches the job name generated by the CronJob controller (which appends a suffix).
+          expr: kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-turn-.*"} > 0
+          for: 0m  # Alert immediately
+          labels:
+            severity: critical
+          annotations:
+            summary: "TURN Server Probe Failed"
+            description: "The CronJob testing the TURN server failed. Check the logs of the failed job ({{ "{{" }} $labels.job_name {{ "}}" }})."
+        {{- end }}
+
+        {{- if .Values.e2eProber.enabled }}
+        # Alert: E2E Prober Failure
+        - alert: E2EProberJobFailed
+          expr: kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-e2e-.*"} > 0
+          for: 0m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Mediasoup E2E Session Probe Failed"
+            description: "The CronJob simulating a 2-party session failed. Check the logs of the failed job ({{ "{{" }} $labels.job_name {{ "}}" }})."
+        {{- end }}
diff --git a/k8s/probers/templates/turn-probers-cronjob.yaml b/k8s/probers/templates/turn-probers-cronjob.yaml
new file mode 100644
index 0000000..9ef2d0c
--- /dev/null
+++ b/k8s/probers/templates/turn-probers-cronjob.yaml
@@ -0,0 +1,69 @@
+{{- /*
+CronJob that probes the TURN server with turnutils_uclient_ars on
+.Values.turnProber.schedule. Rendered only when .Values.turnProber.enabled is true.
+*/ -}}
+{{- if .Values.turnProber.enabled }}
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: {{ include "probers.fullname" . }}-turn
+  labels:
+    {{- include "probers.labels" . | nindent 4 }}
+spec:
+  schedule: "{{ .Values.turnProber.schedule }}"
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 1
+  failedJobsHistoryLimit: 3
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          {{- with .Values.turnProber.imagePullSecrets }}
+          imagePullSecrets:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          restartPolicy: OnFailure
+          containers:
+            - name: turnutils-client
+              image: "{{ .Values.turnProber.image.repository }}:{{ .Values.turnProber.image.tag }}"
+              imagePullPolicy: {{ .Values.turnProber.image.pullPolicy }}
+              # Connection settings are passed as environment variables and quoted in
+              # the script so special characters in them are never parsed by the shell.
+              env:
+                - name: TURN_TARGET
+                  value: {{ .Values.turnProber.target | quote }}
+                - name: TURN_PORT
+                  value: {{ .Values.turnProber.port | quote }}
+                - name: TURN_USERNAME
+                  value: {{ .Values.turnProber.username | quote }}
+                - name: TURN_PASSWORD
+                  value: {{ .Values.turnProber.password | quote }}
+              command:
+                - /bin/sh
+                - -c
+                - |
+                  set -e
+                  echo "Probing TURN server $TURN_TARGET via custom prober"
+
+                  # Run the custom client
+                  turnutils_uclient_ars -c -n 2000 -v -H 1 -R 1 \
+                    -u "$TURN_USERNAME" \
+                    -w "$TURN_PASSWORD" \
+                    -p "$TURN_PORT" \
+                    "$TURN_TARGET"
+
+
+                  # ORIGINAL PROBE (coturn stock invocation)
+                  # Run the client
+                  # -y: client-to-client mode (self-test, verifies relay functionality)
+                  # -n 5: Send 5 messages
+                  # -v: Verbose (useful for debugging logs)
+                  # Exits 0 on success, non-zero on failure
+                  #turnutils_uclient -v $PROTOCOL_FLAG -n 5 -y \
+                  # -u "$TURN_USERNAME" \
+                  # -w "$TURN_PASSWORD" \
+                  # "$TURN_TARGET"
+
+
+                  echo "TURN probe successful."
+{{- end }}
diff --git a/k8s/probers/values-production.yaml b/k8s/probers/values-production.yaml
new file mode 100644
index 0000000..8aceb06
--- /dev/null
+++ b/k8s/probers/values-production.yaml
@@ -0,0 +1,22 @@
+# Production specific values
+environment: "production"
+
+jkvideo:
+  # In production, we should be more conservative with alerts
+  cpuAlerts:
+    threshold: 0.9
+    for: "10m"
+
+turnProber:
+  target: "turn.video.jamkazam.com"
+  # Reminder: In a real environment, these should be managed via Kubernetes secrets
+  # username: "prod-user"
+  # password: "use-a-secret"
+
+e2eProber:
+  # NOTE(review): image.repository still defaults to the placeholder in
+  # values.yaml; confirm a pullable image exists before relying on this probe.
+  enabled: true  # Enabling for production
+  backendUrl: "https://webrtc-be.video.jamkazam.com"
+  image:
+    tag: "stable-v1.2.3"  # Example of a stable tag for production
diff --git a/k8s/probers/values-staging.yaml b/k8s/probers/values-staging.yaml
new file mode 100644
index 0000000..b1609bd
--- /dev/null
+++ b/k8s/probers/values-staging.yaml
@@ -0,0 +1,22 @@
+# Staging specific values
+environment: "staging"
+
+jkvideo:
+  # In staging, we can be more aggressive with alerts
+  cpuAlerts:
+    threshold: 0.75
+    for: "2m"
+
+turnProber:
+  target: "turn.staging.video.jamkazam.com"
+  # Reminder: In a real environment, these should be managed via Kubernetes secrets
+  # username: "staging-user"
+  # password: "use-a-secret"
+
+e2eProber:
+  # NOTE(review): image.repository still defaults to the placeholder in
+  # values.yaml; confirm a pullable image exists before relying on this probe.
+  enabled: true  # Enabling for staging
+  backendUrl: "https://webrtc-be.staging.video.jamkazam.com"
+  image:
+    tag: "staging-latest"
diff --git a/k8s/probers/values.yaml b/k8s/probers/values.yaml
new file mode 100644
index 0000000..9006acb
--- /dev/null
+++ b/k8s/probers/values.yaml
@@ -0,0 +1,55 @@
+# Default environment
+environment: "staging"
+
+# Common configuration for monitoring integration
+monitoring:
+  # Labels to apply to PrometheusRules so Prometheus Operator can discover them.
+  labels:
+    release: prometheus-stack
+
+# jkvideo Backend Details
+jkvideo:
+  namespace: "webrtc-be"
+  podLabels:
+    app.kubernetes.io/name: "webrtc-be"
+    component: "worker"
+  cpuAlerts:
+    enabled: true
+    threshold: 0.7
+    for: "5m"
+
+# TURN Server Probing Configuration
+turnProber:
+  enabled: true
+  schedule: "*/5 * * * *"
+  image:
+    repository: gcr.io/tough-craft-276813/coturn
+    tag: "latest"
+    pullPolicy: Always
+  imagePullSecrets:
+    - name: gcr-json-key
+  target: "turn.staging.video.jamkazam.com"
+  port: 3478
+  # NOTE(review): plaintext credentials committed to the repository; consider
+  # sourcing them from a Kubernetes Secret instead.
+  username: "smoketest"
+  password: "foolishcharmer"
+  # NOTE(review): not referenced by the chart templates in this change.
+  protocol: "udp"
+
+# E2E Session Simulation Configuration
+e2eProber:
+  enabled: false
+  schedule: "*/15 * * * *"
+  backendUrl: "https://webrtc-be.staging.video.jamkazam.com"
+  image:
+    repository: "your-registry/mediasoup-e2e-tester"
+    tag: "latest"
+    pullPolicy: IfNotPresent
+  resources:
+    requests:
+      cpu: 500m
+      memory: 1Gi
+    limits:
+      cpu: 1000m
+      memory: 2Gi
diff --git a/scripts/helm-update-probers b/scripts/helm-update-probers
new file mode 100755
index 0000000..7089568
--- /dev/null
+++ b/scripts/helm-update-probers
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# Install or upgrade the "probers" release from k8s/probers with production values.
+# Can be run from any directory; the chart is located relative to this script.
+set -euo pipefail
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+pushd "$SCRIPT_DIR/../k8s/probers" > /dev/null
+
+helm upgrade --install probers . --values values-production.yaml
+
+popd > /dev/null
+
+# helm upgrade webrtc-be . --namespace webrtc-be -f values-staging.yaml
diff --git a/scripts/helm-update-probers-staging b/scripts/helm-update-probers-staging
new file mode 100755
index 0000000..1e400ea
--- /dev/null
+++ b/scripts/helm-update-probers-staging
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# Install or upgrade the "probers" release from k8s/probers with staging values.
+# Can be run from any directory; the chart is located relative to this script.
+set -euo pipefail
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+pushd "$SCRIPT_DIR/../k8s/probers" > /dev/null
+
+helm upgrade --install probers . --values values-staging.yaml
+
+popd > /dev/null
+
+# helm upgrade webrtc-be . --namespace webrtc-be -f values-staging.yaml