infra: add staging runner reaper cronjob

This commit is contained in:
Seth Call 2026-03-14 11:04:31 -06:00
parent 5da9e05f94
commit 386c00eeb8
2 changed files with 194 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
# Argo CD Application that deploys the runner-reaper manifests.
# It tracks the `develop` branch of the video-iac repository and renders only
# runner-reaper.yaml from k8s/jam-cloud-infra into the jam-cloud-infra
# namespace of the cluster Argo CD itself runs in.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: runner-reaper
  namespace: argocd
  finalizers:
    # Cascade deletion: removing this Application also removes what it manages.
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  source:
    repoURL: 'git@bitbucket.org:jamkazam/video-iac.git'
    targetRevision: develop
    path: k8s/jam-cloud-infra
    directory:
      # Restrict the directory source to this single manifest file.
      include: 'runner-reaper.yaml'
  destination:
    server: https://kubernetes.default.svc
    namespace: jam-cloud-infra
  syncPolicy:
    automated:
      # Delete cluster resources that are no longer present in git.
      prune: true
      # Revert manual changes made directly in the cluster.
      selfHeal: true

View File

@@ -0,0 +1,172 @@
# Slack incoming-webhook URL consumed by the runner-reaper CronJob, which maps
# the slackWebhookUrl key to its SLACK_WEBHOOK_URL environment variable.
#
# NOTE(review): this is a live credential committed to git in plaintext.
# Rotate the webhook and deliver it through a sealed/external secret instead;
# the value is left untouched here only so the manifest keeps working as-is.
apiVersion: v1
kind: Secret
metadata:
  name: runner-reaper-secret
  namespace: jam-cloud-infra
type: Opaque
stringData:
  slackWebhookUrl: 'https://hooks.slack.com/services/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'
---
# Identity the runner-reaper CronJob pods run as; the Role and ClusterRole
# bindings named runner-reaper grant permissions to this account.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: runner-reaper
  namespace: jam-cloud-infra
---
# Namespaced permissions for the reaper: read and delete pods in
# jam-cloud-infra.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: runner-reaper
  namespace: jam-cloud-infra
rules:
  # "" is the core API group, which is where pods live.
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
      - list
      - delete
---
# Grants the namespaced runner-reaper Role to the runner-reaper
# ServiceAccount in jam-cloud-infra.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: runner-reaper
  namespace: jam-cloud-infra
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: runner-reaper
subjects:
  - kind: ServiceAccount
    name: runner-reaper
    namespace: jam-cloud-infra
---
# Cluster-wide read access to nodes. Nodes are not namespaced, so this cannot
# be expressed with a namespaced Role.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: runner-reaper
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
---
# Grants the runner-reaper ClusterRole to the runner-reaper ServiceAccount
# from the jam-cloud-infra namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: runner-reaper
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: runner-reaper
subjects:
  - kind: ServiceAccount
    name: runner-reaper
    namespace: jam-cloud-infra
---
# Shell script executed by the runner-reaper CronJob, which mounts this
# ConfigMap as a volume and runs reap-stale-runners.sh.
apiVersion: v1
kind: ConfigMap
metadata:
  name: runner-reaper-script
  namespace: jam-cloud-infra
data:
  reap-stale-runners.sh: |
    #!/bin/sh
    # Delete act-runner pods (label app=act-runner) that look stuck, and report
    # each deletion to Slack.
    #
    # A pod is deleted when any of these holds:
    #   - it has no assigned node
    #   - its assigned node no longer exists
    #   - its assigned node is not Ready
    #   - it has been not-Ready for at least STALE_SECONDS
    #
    # Environment:
    #   NAMESPACE          namespace to scan        (default: jam-cloud-infra)
    #   STALE_SECONDS      unready grace period     (default: 180)
    #   SLACK_WEBHOOK_URL  Slack incoming webhook   (default: empty = log only)
    #
    # Requires kubectl, jq and curl on PATH.
    set -eu

    NAMESPACE="${NAMESPACE:-jam-cloud-infra}"
    STALE_SECONDS="${STALE_SECONDS:-180}"
    SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}"

    # Print a message to stdout and, if a webhook is configured, post it to Slack.
    send_slack() {
      message="$1"
      echo "$message"
      if [ -n "$SLACK_WEBHOOK_URL" ]; then
        # Let jq build the JSON so quotes, backslashes and newlines in the
        # message are all escaped correctly.
        payload="$(jq -cn --arg text "$message" '{text: $text}')"
        # Best effort: a Slack failure or timeout must not stop the reaper.
        curl -fsS --max-time 10 -X POST -H 'Content-type: application/json' \
          --data "$payload" "$SLACK_WEBHOOK_URL" >/dev/null || true
      fi
    }

    now_epoch="$(date +%s)"
    pods_json="$(kubectl get pods -n "$NAMESPACE" -l app=act-runner -o json)"
    pod_count="$(printf '%s' "$pods_json" | jq '.items | length')"
    if [ "$pod_count" -eq 0 ]; then
      send_slack "🚨 runner-reaper: no act-runner pods found in $NAMESPACE"
      exit 0
    fi

    # One compact JSON object per line, one line per pod.
    printf '%s' "$pods_json" | jq -c '.items[]' | while IFS= read -r pod; do
      pod_name="$(printf '%s' "$pod" | jq -r '.metadata.name')"
      node_name="$(printf '%s' "$pod" | jq -r '.spec.nodeName // ""')"
      phase="$(printf '%s' "$pod" | jq -r '.status.phase // "Unknown"')"
      ready="$(printf '%s' "$pod" | jq -r '[.status.conditions[]? | select(.type=="Ready")][0].status // "Unknown"')"
      reason=""

      if [ -z "$node_name" ]; then
        reason="pod has no assigned node"
      else
        # With --ignore-not-found a missing node yields empty output and exit 0,
        # so a non-zero exit is a genuine API/RBAC error. Skip the pod then
        # instead of deleting a runner that may be perfectly healthy.
        if ! node_json="$(kubectl get node "$node_name" --ignore-not-found -o json)"; then
          echo "runner-reaper: could not look up node $node_name; leaving pod $pod_name alone" >&2
          continue
        fi
        if [ -z "$node_json" ]; then
          reason="assigned node $node_name is missing"
        else
          node_ready="$(printf '%s' "$node_json" | jq -r '[.status.conditions[]? | select(.type=="Ready")][0].status // "Unknown"')"
          if [ "$node_ready" != "True" ]; then
            reason="assigned node $node_name is not Ready"
          fi
        fi
      fi

      if [ -z "$reason" ] && [ "$ready" != "True" ]; then
        # Measure from the Ready condition's last transition, falling back to
        # the pod start time, so a long-running pod is not reaped the instant
        # it turns unready. jq parses the RFC 3339 timestamp itself, which
        # avoids relying on the image's `date -d`; a missing or unparsable
        # timestamp yields 0 and the pod is left alone.
        unready_since="$(printf '%s' "$pod" | jq -r '
          ([.status.conditions[]? | select(.type=="Ready")][0].lastTransitionTime
            // .status.startTime // "")
          | try fromdateiso8601 catch 0')"
        if [ "$unready_since" -gt 0 ]; then
          age="$((now_epoch - unready_since))"
          if [ "$age" -ge "$STALE_SECONDS" ]; then
            reason="pod has been unready for ${age}s"
          fi
        fi
      fi

      if [ -n "$reason" ]; then
        send_slack "🚨 runner-reaper: deleting stale act-runner pod $pod_name in $NAMESPACE ($reason, phase=$phase, ready=$ready)"
        # --wait=false: request deletion and move on without blocking the loop.
        kubectl delete pod -n "$NAMESPACE" "$pod_name" --wait=false >/dev/null
      fi
    done
---
# Runs the reaper script every two minutes as the runner-reaper ServiceAccount.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: runner-reaper
  namespace: jam-cloud-infra
spec:
  schedule: "*/2 * * * *"
  # Never start a run while the previous one is still active.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      # The schedule already retries every two minutes, so do not also retry
      # inside a Job; with restartPolicy Never each retry would leave another
      # failed pod behind.
      backoffLimit: 0
      # Because concurrencyPolicy is Forbid, a hung kubectl/curl call would
      # otherwise block every later run. Terminate the Job after 5 minutes.
      activeDeadlineSeconds: 300
      template:
        spec:
          serviceAccountName: runner-reaper
          restartPolicy: Never
          containers:
            - name: runner-reaper
              image: alpine/k8s:1.33.4
              command: ["/bin/sh", "-c"]
              args:
                # NOTE(review): installing curl and jq on every run needs
                # network access to the Alpine package mirrors; confirm whether
                # the image already ships both and drop the install if so.
                - |
                  apk add --no-cache curl jq >/dev/null
                  /scripts/reap-stale-runners.sh
              env:
                - name: NAMESPACE
                  value: jam-cloud-infra
                - name: STALE_SECONDS
                  value: "180"
                - name: SLACK_WEBHOOK_URL
                  valueFrom:
                    secretKeyRef:
                      name: runner-reaper-secret
                      key: slackWebhookUrl
              volumeMounts:
                - name: script
                  mountPath: /scripts
          volumes:
            - name: script
              configMap:
                name: runner-reaper-script
                # Octal r-xr-xr-x so the mounted script is executable.
                defaultMode: 0555