infra: add staging runner reaper cronjob
This commit is contained in:
parent
5da9e05f94
commit
386c00eeb8
|
|
@ -0,0 +1,22 @@
|
|||
# Argo CD Application that syncs the runner-reaper manifests into the
# jam-cloud-infra namespace of the cluster Argo CD itself runs in.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: runner-reaper
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default

  # Only runner-reaper.yaml from the k8s/jam-cloud-infra directory is rendered.
  source:
    repoURL: 'git@bitbucket.org:jamkazam/video-iac.git'
    targetRevision: develop
    path: k8s/jam-cloud-infra
    directory:
      include: 'runner-reaper.yaml'

  destination:
    server: https://kubernetes.default.svc
    namespace: jam-cloud-infra

  # Fully automated sync: remove resources dropped from Git and revert
  # manual drift in the cluster.
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
|
||||
|
|
@ -0,0 +1,172 @@
|
|||
# NOTE(security): the value below is a live Slack incoming-webhook URL that is
# committed to Git in plaintext. Rotate the webhook and deliver the new value
# out of band (for example a SealedSecret or ExternalSecret) instead of
# keeping the credential in this manifest.
apiVersion: v1
kind: Secret
metadata:
  name: runner-reaper-secret
  namespace: jam-cloud-infra
type: Opaque
stringData:
  # Injected into the runner-reaper CronJob container as SLACK_WEBHOOK_URL.
  # Quoted so tooling never retypes or reformats the credential.
  slackWebhookUrl: 'https://hooks.slack.com/services/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'
|
||||
---
|
||||
# Identity the runner-reaper CronJob pods run as; the Role and ClusterRole
# bindings in this file grant it its permissions.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: runner-reaper
  namespace: jam-cloud-infra
|
||||
---
|
||||
# Namespaced permissions for the reaper script: it lists the act-runner pods
# and deletes the ones it considers stale.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: runner-reaper
  namespace: jam-cloud-infra
rules:
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
      - list
      - delete
|
||||
---
|
||||
# Grants the namespaced runner-reaper Role to the runner-reaper ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: runner-reaper
  namespace: jam-cloud-infra
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: runner-reaper
subjects:
  - kind: ServiceAccount
    name: runner-reaper
    namespace: jam-cloud-infra
|
||||
---
|
||||
# Cluster-scoped read access to nodes; the reaper script looks up the node a
# pod is assigned to in order to check its Ready condition.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: runner-reaper
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
|
||||
---
|
||||
# Grants the runner-reaper ClusterRole (node reads) to the runner-reaper
# ServiceAccount in jam-cloud-infra.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: runner-reaper
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: runner-reaper
subjects:
  - kind: ServiceAccount
    name: runner-reaper
    namespace: jam-cloud-infra
|
||||
---
|
||||
# Holds the reaper script that the runner-reaper CronJob mounts at /scripts.
apiVersion: v1
kind: ConfigMap
metadata:
  name: runner-reaper-script
  namespace: jam-cloud-infra
data:
  reap-stale-runners.sh: |
    #!/bin/sh
    # Deletes act-runner pods that are unscheduled, sit on a missing or
    # NotReady node, or have been unready for at least STALE_SECONDS, and
    # reports every action to Slack when a webhook is configured.
    #
    # Environment:
    #   NAMESPACE          namespace to scan (default: jam-cloud-infra)
    #   STALE_SECONDS      unready grace period in seconds (default: 180)
    #   SLACK_WEBHOOK_URL  optional Slack incoming-webhook URL
    #
    # Requires kubectl, jq and curl on PATH.
    set -eu

    NAMESPACE="${NAMESPACE:-jam-cloud-infra}"
    STALE_SECONDS="${STALE_SECONDS:-180}"
    SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}"

    # Print a message to stdout and, best effort, post it to Slack.
    send_slack() {
      message="$1"
      echo "$message"
      if [ -n "$SLACK_WEBHOOK_URL" ]; then
        # jq builds the JSON so quotes, backslashes and newlines in the
        # message are all escaped correctly.
        payload="$(jq -cn --arg text "$message" '{text: $text}')" || return 0
        curl -fsS -X POST -H 'Content-type: application/json' --data "$payload" "$SLACK_WEBHOOK_URL" >/dev/null || true
      fi
    }

    now_epoch="$(date +%s)"
    pods_json="$(kubectl get pods -n "$NAMESPACE" -l app=act-runner -o json)"
    pod_count="$(printf '%s' "$pods_json" | jq '.items | length')"

    if [ "$pod_count" -eq 0 ]; then
      send_slack "🚨 runner-reaper: no act-runner pods found in $NAMESPACE"
      exit 0
    fi

    # One compact JSON object per pod, one pod per line.
    printf '%s' "$pods_json" | jq -c '.items[]' | while IFS= read -r pod; do
      pod_name="$(printf '%s' "$pod" | jq -r '.metadata.name')"
      node_name="$(printf '%s' "$pod" | jq -r '.spec.nodeName // ""')"
      phase="$(printf '%s' "$pod" | jq -r '.status.phase // "Unknown"')"
      ready="$(printf '%s' "$pod" | jq -r '[.status.conditions[]? | select(.type=="Ready")][0].status // "Unknown"')"

      # Epoch second at which the pod entered its current Ready state,
      # falling back to the pod start time; 0 means "unknown". The timestamp
      # is parsed by jq rather than `date -d`, whose accepted formats differ
      # between GNU and BusyBox.
      unready_since="$(printf '%s' "$pod" | jq -r '([.status.conditions[]? | select(.type=="Ready")][0].lastTransitionTime // .status.startTime // "") | if . == "" then 0 else (try fromdateiso8601 catch 0) end')" || unready_since=0
      unready_since="${unready_since:-0}"

      reason=""

      if [ -z "$node_name" ]; then
        reason="pod has no assigned node"
      else
        # --ignore-not-found gives empty output with exit 0 only when the
        # node is really gone. Any other failure (API outage, RBAC) must not
        # be mistaken for a missing node, so that pod is skipped this round.
        if ! node_json="$(kubectl get node "$node_name" --ignore-not-found -o json)"; then
          echo "runner-reaper: could not query node $node_name; skipping pod $pod_name" >&2
          continue
        fi
        if [ -z "$node_json" ]; then
          reason="assigned node $node_name is missing"
        else
          node_ready="$(printf '%s' "$node_json" | jq -r '[.status.conditions[]? | select(.type=="Ready")][0].status // "Unknown"')"
          if [ "$node_ready" != "True" ]; then
            reason="assigned node $node_name is not Ready"
          fi
        fi
      fi

      # Healthy node but unready pod: reap only once it has stayed unready
      # for the whole grace period.
      if [ -z "$reason" ] && [ "$ready" != "True" ] && [ "$unready_since" -gt 0 ]; then
        age="$((now_epoch - unready_since))"
        if [ "$age" -ge "$STALE_SECONDS" ]; then
          reason="pod has been unready for ${age}s"
        fi
      fi

      if [ -n "$reason" ]; then
        send_slack "🚨 runner-reaper: deleting stale act-runner pod $pod_name in $NAMESPACE ($reason, phase=$phase, ready=$ready)"
        kubectl delete pod -n "$NAMESPACE" "$pod_name" --wait=false >/dev/null
      fi
    done
|
||||
---
|
||||
# Runs the reaper script from the runner-reaper-script ConfigMap every two
# minutes under the runner-reaper ServiceAccount.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: runner-reaper
  namespace: jam-cloud-infra
spec:
  schedule: "*/2 * * * *"
  # Never overlap runs; a still-active Job makes the next schedule skip.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      # With Forbid, a hung or endlessly retried Job would block every later
      # run. Fail fast and let the next schedule be the retry.
      backoffLimit: 0
      activeDeadlineSeconds: 300
      template:
        spec:
          serviceAccountName: runner-reaper
          restartPolicy: Never
          containers:
            - name: runner-reaper
              image: alpine/k8s:1.33.4
              command: ["/bin/sh", "-c"]
              args:
                - |
                  # Install curl/jq only when the image lacks them, so a
                  # normal run does not depend on the package mirror.
                  if ! command -v curl >/dev/null 2>&1 || ! command -v jq >/dev/null 2>&1; then
                    apk add --no-cache curl jq >/dev/null
                  fi
                  /scripts/reap-stale-runners.sh
              env:
                - name: NAMESPACE
                  value: jam-cloud-infra
                - name: STALE_SECONDS
                  value: "180"
                - name: SLACK_WEBHOOK_URL
                  valueFrom:
                    secretKeyRef:
                      name: runner-reaper-secret
                      key: slackWebhookUrl
              volumeMounts:
                - name: script
                  mountPath: /scripts
          volumes:
            - name: script
              configMap:
                name: runner-reaper-script
                # Octal r-xr-xr-x so the mounted script is executable; this
                # field must stay an integer, so it is not quoted.
                defaultMode: 0555
|
||||
Loading…
Reference in New Issue