From 5b502f93b47cd3cda523229057abef343c8d3242 Mon Sep 17 00:00:00 2001 From: Seth Call Date: Sat, 13 Dec 2025 14:53:22 -0600 Subject: [PATCH] Preparing for webrtc-be crash alerts --- k8s/monitoring/README.md | 19 +++++++++++ k8s/monitoring/values-production.yaml | 45 +++++++++++++++++++++++++-- k8s/monitoring/values-staging.yaml | 45 +++++++++++++++++++++++++-- k8s/webrtc-be/templates/alerts.yaml | 2 ++ scripts/loki-port-forward.sh | 8 +++++ scripts/loki-query.sh | 13 ++++++++ 6 files changed, 128 insertions(+), 4 deletions(-) create mode 100644 k8s/monitoring/README.md create mode 100755 scripts/loki-port-forward.sh create mode 100755 scripts/loki-query.sh diff --git a/k8s/monitoring/README.md b/k8s/monitoring/README.md new file mode 100644 index 0000000..d7d737a --- /dev/null +++ b/k8s/monitoring/README.md @@ -0,0 +1,19 @@ +# Monitoring and Alerting + +## Slack Webhook Configuration + +The Slack notifications use a specific Incoming Webhook URL structure: +`https://hooks.slack.com/services/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` + +These ID components represent: +- **T0L5RA3E0**: Slack Workspace ID (e.g., JamKazam) +- **B01SM8RC346**: Bot/App Configuration ID (unique to the specific "Incoming Webhook" integration created in the Slack app management) +- **XDDOrcPE7eAXJPMCvc5FxIva**: The Secret Token for authentication. Treat the full URL as a credential: anyone who has it can post to the channel, so keep this repository private and regenerate the webhook if the URL is ever exposed. + +### Updating the Webhook +If you need to change the channel or regenerate the URL: +1. Go to [Slack App Management](https://api.slack.com/apps). +2. Select the relevant App (e.g., "Monitoring" or "Incoming Webhooks"). +3. Navigate to **Incoming Webhooks**. +4. Generate a new Webhook URL for the desired channel. +5. Update the URL in `values-production.yaml` and `values-staging.yaml`. 
diff --git a/k8s/monitoring/values-production.yaml b/k8s/monitoring/values-production.yaml index 6aa6af8..c84dbab 100644 --- a/k8s/monitoring/values-production.yaml +++ b/k8s/monitoring/values-production.yaml @@ -148,15 +148,56 @@ kube-prometheus-stack: repeat_interval: 12h receiver: 'null' routes: - - match: + - match: alertname: WebrtcBeCrashed - receiver: 'email-alerts' + receiver: 'email-and-slack-notifications' + - match: + alertname: WebrtcBeError + receiver: 'email-and-slack-notifications' receivers: - name: 'null' - name: 'email-alerts' email_configs: - to: 'alerts@jamkazam.com' send_resolved: true + - name: 'slack-notifications' + slack_configs: + - api_url: 'https://hooks.slack.com/services/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' + channel: '#monitoring-alerts' + send_resolved: true + title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification' + text: >- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} + - name: 'email-and-slack-notifications' + email_configs: + - to: 'alerts@jamkazam.com' + send_resolved: true + headers: + Subject: '[PRODUCTION] {{ .Status | toUpper }} - {{ range .Alerts }}{{ .Annotations.summary }} {{ end }}' + html: '{{ template "email.default.html" . }}
<br/>
<a href="{{ .ExternalURL }}">View in Alertmanager</a>
{{ range .Alerts }}{{ if .Annotations.loki_link }}View Logs in Loki{{ end }}{{ end }}' + slack_configs: + - api_url: 'https://hooks.slack.com/services/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' + channel: '#monitoring-alerts' + send_resolved: true + title: '[PRODUCTION] [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification' + text: >- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ if .Annotations.loki_link }} + *Logs:* <{{ .Annotations.loki_link }}|View in Loki> + {{ end }} + {{ end }} + *Source:* <{{ .ExternalURL }}|Alertmanager> grafana: persistence: enabled: true diff --git a/k8s/monitoring/values-staging.yaml b/k8s/monitoring/values-staging.yaml index 2a693c3..70558e5 100644 --- a/k8s/monitoring/values-staging.yaml +++ b/k8s/monitoring/values-staging.yaml @@ -148,15 +148,56 @@ kube-prometheus-stack: repeat_interval: 12h receiver: 'null' routes: - - match: + - match: alertname: WebrtcBeCrashed - receiver: 'email-alerts' + receiver: 'email-and-slack-notifications' + - match: + alertname: WebrtcBeError + receiver: 'email-and-slack-notifications' receivers: - name: 'null' - name: 'email-alerts' email_configs: - to: 'alerts@jamkazam.com' send_resolved: true + - name: 'slack-notifications' + slack_configs: + - api_url: 'https://hooks.slack.com/services/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' + channel: '#monitoring-alerts' + send_resolved: true + title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification' + text: >- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + 
{{ end }} + - name: 'email-and-slack-notifications' + email_configs: + - to: 'alerts@jamkazam.com' + send_resolved: true + headers: + Subject: '[STAGING] {{ .Status | toUpper }} - {{ range .Alerts }}{{ .Annotations.summary }} {{ end }}' + html: '{{ template "email.default.html" . }}
<br/>
<a href="{{ .ExternalURL }}">View in Alertmanager</a>
{{ range .Alerts }}{{ if .Annotations.loki_link }}View Logs in Loki{{ end }}{{ end }}' + slack_configs: + - api_url: 'https://hooks.slack.com/services/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' + channel: '#monitoring-alerts' + send_resolved: true + title: '[STAGING] [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification' + text: >- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ if .Annotations.loki_link }} + *Logs:* <{{ .Annotations.loki_link }}|View in Loki> + {{ end }} + {{ end }} + *Source:* <{{ .ExternalURL }}|Alertmanager> grafana: persistence: enabled: true diff --git a/k8s/webrtc-be/templates/alerts.yaml b/k8s/webrtc-be/templates/alerts.yaml index aa7e916..8f66a1f 100644 --- a/k8s/webrtc-be/templates/alerts.yaml +++ b/k8s/webrtc-be/templates/alerts.yaml @@ -27,3 +27,5 @@ spec: annotations: summary: "webrtc-be crashed" description: "The webrtc-be pod has crashed. Please check the logs." + loki_link: >- + {{ .Values.grafana.externalUrl }}/explore?orgId=1&left=["now-1h","now","Loki",{"expr":"{container=\"webrtc-be\", namespace=\"webrtc-be\"}"}] diff --git a/scripts/loki-port-forward.sh b/scripts/loki-port-forward.sh new file mode 100755 index 0000000..4c67e2f --- /dev/null +++ b/scripts/loki-port-forward.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# Port forward Loki service to localhost:3101 +# Usage: ./scripts/loki-port-forward.sh +# Keep this running in a separate terminal. +# Once running, you can use ./scripts/loki-query.sh to inspect logs. + +echo "Port forwarding Loki to http://localhost:3101..." 
+kubectl -n loki port-forward svc/loki 3101:3100 diff --git a/scripts/loki-query.sh b/scripts/loki-query.sh new file mode 100755 index 0000000..1f3cb5c --- /dev/null +++ b/scripts/loki-query.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Query Loki for recent logs of a specific pod regex +# Usage: ./scripts/loki-query.sh [pod_regex] +# Example: ./scripts/loki-query.sh "webrtc-be-.*" + +POD_REGEX="${1:-webrtc-be-.*}" + +echo "Querying Loki for pod regex: ${POD_REGEX}" +echo "Checking labels (namespace, cluster, etc)..." + +curl -G -s "http://localhost:3101/loki/api/v1/query_range" \ + --data-urlencode "query={pod=~\"${POD_REGEX}\"}" \ + --data-urlencode "limit=1" | jq '.data.result[0].stream'