Probers 1st attempt
This commit is contained in:
parent
8c271b9b7b
commit
15139dec05
|
|
@ -0,0 +1,23 @@
|
||||||
|
apiVersion: argoproj.io/v1alpha1
|
||||||
|
kind: Application
|
||||||
|
metadata:
|
||||||
|
name: probers
|
||||||
|
spec:
|
||||||
|
destination:
|
||||||
|
name: ''
|
||||||
|
namespace: probers
|
||||||
|
server: 'https://kubernetes.default.svc'
|
||||||
|
source:
|
||||||
|
helm:
|
||||||
|
valueFiles:
|
||||||
|
- values-{{ .Values.environment }}.yaml
|
||||||
|
path: k8s/probers
|
||||||
|
repoURL: 'git@bitbucket.org:jamkazam/video-iac.git'
|
||||||
|
# Quoted so a branch name that looks like a number/boolean (or an empty value) still renders as a YAML string.
targetRevision: {{ .Values.gitBranch | quote }}
|
||||||
|
project: default
|
||||||
|
syncPolicy:
|
||||||
|
syncOptions:
|
||||||
|
- CreateNamespace=true
|
||||||
|
automated:
|
||||||
|
prune: true
|
||||||
|
selfHeal: true
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
apiVersion: v2
|
||||||
|
name: probers
|
||||||
|
description: A Helm chart for various probers; in particular webrtc_be probing
|
||||||
|
type: application
|
||||||
|
version: 0.1.0
|
||||||
|
appVersion: "1.0.0"
|
||||||
|
|
@ -0,0 +1,32 @@
|
||||||
|
{{/*
|
||||||
|
Create a default fully qualified app name.
|
||||||
|
*/}}
|
||||||
|
{{- define "probers.fullname" -}}
|
||||||
|
{{- if .Values.fullnameOverride }}
|
||||||
|
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
|
||||||
|
{{- else }}
|
||||||
|
{{- $name := default .Chart.Name .Values.nameOverride }}
|
||||||
|
{{- if contains $name .Release.Name }}
|
||||||
|
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
|
||||||
|
{{- else }}
|
||||||
|
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{/*
|
||||||
|
Common labels
|
||||||
|
*/}}
|
||||||
|
{{- define "probers.labels" -}}
|
||||||
|
helm.sh/chart: {{ printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
|
||||||
|
app.kubernetes.io/managed-by: {{ .Release.Service }}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{/*
|
||||||
|
Sanitize Kubernetes label keys for Prometheus (kube-state-metrics compatibility).
|
||||||
|
Replaces characters like '.', '/', and '-' with '_'.
|
||||||
|
Example: app.kubernetes.io/name -> app_kubernetes_io_name
|
||||||
|
*/}}
|
||||||
|
{{- define "probers.sanitizePrometheusLabel" -}}
|
||||||
|
{{- . | replace "." "_" | replace "/" "_" | replace "-" "_" -}}
|
||||||
|
{{- end -}}
|
||||||
|
|
@ -0,0 +1,28 @@
|
||||||
|
{{- if .Values.e2eProber.enabled }}
|
||||||
|
apiVersion: batch/v1
|
||||||
|
kind: CronJob
|
||||||
|
metadata:
|
||||||
|
name: {{ include "probers.fullname" . }}-e2e
|
||||||
|
labels:
|
||||||
|
{{- include "probers.labels" . | nindent 4 }}
|
||||||
|
spec:
|
||||||
|
schedule: "{{ .Values.e2eProber.schedule }}"
|
||||||
|
concurrencyPolicy: Forbid
|
||||||
|
successfulJobsHistoryLimit: 1
|
||||||
|
failedJobsHistoryLimit: 3
|
||||||
|
jobTemplate:
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
restartPolicy: OnFailure
|
||||||
|
containers:
|
||||||
|
- name: e2e-tester
|
||||||
|
image: "{{ .Values.e2eProber.image.repository }}:{{ .Values.e2eProber.image.tag }}"
|
||||||
|
imagePullPolicy: {{ .Values.e2eProber.image.pullPolicy }}
|
||||||
|
# Assuming the image's entrypoint executes the test suite (e.g., `npm test` or `pytest`)
|
||||||
|
env:
|
||||||
|
- name: BACKEND_URL
|
||||||
|
value: {{ .Values.e2eProber.backendUrl | quote }}
|
||||||
|
resources:
|
||||||
|
{{- toYaml .Values.e2eProber.resources | nindent 14 }}
|
||||||
|
{{- end }}
|
||||||
|
|
@ -0,0 +1,77 @@
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: PrometheusRule
|
||||||
|
metadata:
|
||||||
|
name: {{ include "probers.fullname" . }}-alerts
|
||||||
|
labels:
|
||||||
|
{{- include "probers.labels" . | nindent 4 }}
|
||||||
|
{{- toYaml .Values.monitoring.labels | nindent 4 }}
|
||||||
|
spec:
|
||||||
|
groups:
|
||||||
|
- name: jkvideo.alerts
|
||||||
|
rules:
|
||||||
|
{{- if .Values.jkvideo.cpuAlerts.enabled }}
|
||||||
|
# Alert: High CPU Utilization
|
||||||
|
- alert: JkvideoHighCPU
|
||||||
|
# This query calculates the CPU usage/limit ratio per container and then uses vector matching
|
||||||
|
# (* on(...) group_left()) to filter the results to only the pods matching the specific labels defined in values.yaml.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
# 1. Calculate CPU usage rate per container (cAdvisor metrics)
|
||||||
|
sum by (pod, namespace, container) (
|
||||||
|
rate(container_cpu_usage_seconds_total{
|
||||||
|
namespace="{{ .Values.jkvideo.namespace }}",
|
||||||
|
container!="", image!=""
|
||||||
|
}[5m])
|
||||||
|
)
|
||||||
|
/
|
||||||
|
# 2. Get CPU limits per container (kube-state-metrics)
|
||||||
|
sum by (pod, namespace, container) (
|
||||||
|
kube_pod_container_resource_limits{
|
||||||
|
namespace="{{ .Values.jkvideo.namespace }}",
|
||||||
|
resource="cpu"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# 3. Filter by Kubernetes labels using vector matching against kube_pod_labels
|
||||||
|
* on (pod, namespace) group_left()
|
||||||
|
(
|
||||||
|
kube_pod_labels{
|
||||||
|
namespace="{{ .Values.jkvideo.namespace }}"
|
||||||
|
{{- range $key, $value := .Values.jkvideo.podLabels }},
|
||||||
|
label_{{ include "probers.sanitizePrometheusLabel" $key }}="{{ $value }}"
|
||||||
|
{{- end }}
|
||||||
|
}
|
||||||
|
) > {{ .Values.jkvideo.cpuAlerts.threshold }}
|
||||||
|
for: {{ .Values.jkvideo.cpuAlerts.for }}
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Jkvideo Pod {{ "{{" }} $labels.pod {{ "}}" }} High CPU Utilization"
|
||||||
|
# mulf (float multiply) is required: Sprig's integer mul truncates a fractional threshold such as 0.9 to 0, which rendered "0%".
description: "CPU utilization for container {{ "{{" }} $labels.container {{ "}}" }} in pod {{ "{{" }} $labels.pod {{ "}}" }} has exceeded {{ .Values.jkvideo.cpuAlerts.threshold | mulf 100 }}% of the limit."
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{- if .Values.turnProber.enabled }}
|
||||||
|
# Alert: TURN Prober Failure
|
||||||
|
- alert: TurnProberJobFailed
|
||||||
|
# Alert if the CronJob fails (metric from kube-state-metrics).
|
||||||
|
# The regex matches the job name generated by the CronJob controller (which appends a suffix).
|
||||||
|
expr: kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-turn-.*"} > 0
|
||||||
|
for: 0m # Alert immediately
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "TURN Server Probe Failed"
|
||||||
|
description: "The CronJob testing the TURN server failed. Check the logs of the failed job ({{ "{{" }} $labels.job_name {{ "}}" }})."
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{- if .Values.e2eProber.enabled }}
|
||||||
|
# Alert: E2E Prober Failure
|
||||||
|
- alert: E2EProberJobFailed
|
||||||
|
expr: kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-e2e-.*"} > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Mediasoup E2E Session Probe Failed"
|
||||||
|
description: "The CronJob simulating a 2-party session failed. Check the logs of the failed job ({{ "{{" }} $labels.job_name {{ "}}" }})."
|
||||||
|
{{- end }}
|
||||||
|
|
@ -0,0 +1,52 @@
|
||||||
|
{{- if .Values.turnProber.enabled }}
|
||||||
|
apiVersion: batch/v1
|
||||||
|
kind: CronJob
|
||||||
|
metadata:
|
||||||
|
name: {{ include "probers.fullname" . }}-turn
|
||||||
|
labels:
|
||||||
|
{{- include "probers.labels" . | nindent 4 }}
|
||||||
|
spec:
|
||||||
|
schedule: "{{ .Values.turnProber.schedule }}"
|
||||||
|
concurrencyPolicy: Forbid
|
||||||
|
successfulJobsHistoryLimit: 1
|
||||||
|
failedJobsHistoryLimit: 3
|
||||||
|
jobTemplate:
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
imagePullSecrets:
|
||||||
|
{{ toYaml .Values.turnProber.imagePullSecrets | indent 12 }}
|
||||||
|
restartPolicy: OnFailure
|
||||||
|
containers:
|
||||||
|
- name: turnutils-client
|
||||||
|
image: "{{ .Values.turnProber.image.repository }}:{{ .Values.turnProber.image.tag }}"
|
||||||
|
imagePullPolicy: {{ .Values.turnProber.image.pullPolicy }}
|
||||||
|
command:
|
||||||
|
- /bin/sh
|
||||||
|
- -c
|
||||||
|
- |
|
||||||
|
set -e
|
||||||
|
echo "Probing TURN server {{ .Values.turnProber.target }} via custom prober"
|
||||||
|
|
||||||
|
# Run the custom client
|
||||||
|
turnutils_uclient_ars -c -n 2000 -v -H 1 -R 1 \
|
||||||
|
-u {{ .Values.turnProber.username | squote }} \
|
||||||
|
-w {{ .Values.turnProber.password | squote }} \
|
||||||
|
-p {{ .Values.turnProber.port }} \
|
||||||
|
{{ .Values.turnProber.target }}
|
||||||
|
|
||||||
|
|
||||||
|
# ORIGINAL PROBE (coturn stock invocation)
|
||||||
|
# Run the client
|
||||||
|
# -y: client-to-client mode (self-test, verifies relay functionality)
|
||||||
|
# -n 5: Send 5 messages
|
||||||
|
# -v: Verbose (useful for debugging logs)
|
||||||
|
# Exits 0 on success, non-zero on failure
|
||||||
|
#turnutils_uclient -v $PROTOCOL_FLAG -n 5 -y \
|
||||||
|
# -u {{ .Values.turnProber.username }} \
|
||||||
|
# -w {{ .Values.turnProber.password }} \
|
||||||
|
# {{ .Values.turnProber.target }}
|
||||||
|
|
||||||
|
|
||||||
|
echo "TURN probe successful."
|
||||||
|
{{- end }}
|
||||||
|
|
@ -0,0 +1,20 @@
|
||||||
|
# Production specific values
|
||||||
|
environment: "production"
|
||||||
|
|
||||||
|
jkvideo:
|
||||||
|
# In production, we should be more conservative with alerts
|
||||||
|
cpuAlerts:
|
||||||
|
threshold: 0.9
|
||||||
|
for: "10m"
|
||||||
|
|
||||||
|
turnProber:
|
||||||
|
target: "turn.video.jamkazam.com"
|
||||||
|
# Reminder: In a real environment, these should be managed via Kubernetes secrets
|
||||||
|
# username: "prod-user"
|
||||||
|
# password: "use-a-secret"
|
||||||
|
|
||||||
|
e2eProber:
|
||||||
|
enabled: true # Enabling for production
|
||||||
|
backendUrl: "https://webrtc-be.video.jamkazam.com"
|
||||||
|
image:
|
||||||
|
tag: "stable-v1.2.3" # Example of a stable tag for production
|
||||||
|
|
@ -0,0 +1,20 @@
|
||||||
|
# Staging specific values
|
||||||
|
environment: "staging"
|
||||||
|
|
||||||
|
jkvideo:
|
||||||
|
# In staging, we can be more aggressive with alerts
|
||||||
|
cpuAlerts:
|
||||||
|
threshold: 0.75
|
||||||
|
for: "2m"
|
||||||
|
|
||||||
|
turnProber:
|
||||||
|
target: "turn.staging.video.jamkazam.com"
|
||||||
|
# Reminder: In a real environment, these should be managed via Kubernetes secrets
|
||||||
|
# username: "staging-user"
|
||||||
|
# password: "use-a-secret"
|
||||||
|
|
||||||
|
e2eProber:
|
||||||
|
enabled: true # Enabling for staging
|
||||||
|
backendUrl: "https://webrtc-be.staging.video.jamkazam.com"
|
||||||
|
image:
|
||||||
|
tag: "staging-latest"
|
||||||
|
|
@ -0,0 +1,52 @@
|
||||||
|
# Default environment
|
||||||
|
environment: "staging"
|
||||||
|
|
||||||
|
# Common configuration for monitoring integration
|
||||||
|
monitoring:
|
||||||
|
# Labels to apply to PrometheusRules so Prometheus Operator can discover them.
|
||||||
|
labels:
|
||||||
|
release: prometheus-stack
|
||||||
|
|
||||||
|
# jkvideo Backend Details
|
||||||
|
jkvideo:
|
||||||
|
namespace: "webrtc-be"
|
||||||
|
podLabels:
|
||||||
|
app.kubernetes.io/name: "webrtc-be"
|
||||||
|
component: "worker"
|
||||||
|
cpuAlerts:
|
||||||
|
enabled: true
|
||||||
|
threshold: 0.7
|
||||||
|
for: "5m"
|
||||||
|
|
||||||
|
# TURN Server Probing Configuration
|
||||||
|
turnProber:
|
||||||
|
enabled: true
|
||||||
|
schedule: "*/5 * * * *"
|
||||||
|
image:
|
||||||
|
repository: gcr.io/tough-craft-276813/coturn
|
||||||
|
tag: "latest"
|
||||||
|
pullPolicy: Always
|
||||||
|
imagePullSecrets:
|
||||||
|
- name: gcr-json-key
|
||||||
|
target: "turn.staging.video.jamkazam.com"
|
||||||
|
port: 3478
|
||||||
|
username: "smoketest"
|
||||||
|
password: "foolishcharmer"
|
||||||
|
protocol: "udp"
|
||||||
|
|
||||||
|
# E2E Session Simulation Configuration
|
||||||
|
e2eProber:
|
||||||
|
enabled: false
|
||||||
|
schedule: "*/15 * * * *"
|
||||||
|
backendUrl: "https://webrtc-be.staging.video.jamkazam.com"
|
||||||
|
image:
|
||||||
|
repository: "your-registry/mediasoup-e2e-tester"
|
||||||
|
tag: "latest"
|
||||||
|
pullPolicy: IfNotPresent
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 500m
|
||||||
|
memory: 1Gi
|
||||||
|
limits:
|
||||||
|
cpu: 1000m
|
||||||
|
memory: 2Gi
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
# run in k8s/probers folder (the chart directory)
|
||||||
|
helm upgrade --install probers . --values values-production.yaml
|
||||||
|
|
||||||
|
# helm upgrade webrtc-be . --namespace webrtc-be -f values-staging.yaml
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
# can be run from any folder: the script changes into k8s/probers (relative to its own location) before invoking helm
|
||||||
|
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||||
|
|
||||||
|
# Quote the path (it may contain spaces) and stop if the chart directory is missing,
# so helm never runs against whatever directory the caller happened to be in.
pushd "$SCRIPT_DIR/../k8s/probers" > /dev/null || exit 1
|
||||||
|
|
||||||
|
helm upgrade --install probers . --values values-staging.yaml
|
||||||
|
|
||||||
|
popd
|
||||||
|
|
||||||
|
# helm upgrade webrtc-be . --namespace webrtc-be -f values-staging.yaml
|
||||||
Loading…
Reference in New Issue