diff --git a/k8s/monitoring/templates/node-alerts.yaml b/k8s/monitoring/templates/node-alerts.yaml
new file mode 100644
index 0000000..cbec3a7
--- /dev/null
+++ b/k8s/monitoring/templates/node-alerts.yaml
@@ -0,0 +1,24 @@
+# Node-level alerting rules.
+# NOTE(review): this file sits under a templates/ directory; if it is rendered
+# by Helm, the Prometheus template actions in the annotations must be escaped -- confirm.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: cluster-node-alerts
+  labels:
+    app: prometheus-stack
+    release: prometheus-stack
+spec:
+  groups:
+    - name: node.alerts
+      rules:
+        - alert: NodeHighCPU
+          # Average non-idle CPU across all cores of a node, as a percentage.
+          # Threshold raised from 20% (would fire near-constantly) to 80%.
+          expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
+          for: 2m
+          labels:
+            severity: warning
+          annotations:
+            summary: "High CPU usage on node {{ $labels.instance }}"
+            description: "Node {{ $labels.instance }} has CPU usage above 80% (current value: {{ $value | printf \"%.2f\" }}%)"
diff --git a/k8s/monitoring/templates/node-pool-health-dashboard.yaml b/k8s/monitoring/templates/node-pool-health-dashboard.yaml
new file mode 100644
index 0000000..44bf705
--- /dev/null
+++ b/k8s/monitoring/templates/node-pool-health-dashboard.yaml
@@ -0,0 +1,110 @@
+# NOTE(review): legendFormat uses Grafana curly-brace templating; if this file
+# is rendered by Helm, those braces must be escaped -- confirm.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: node-pool-health-dashboard
+  labels:
+    grafana_dashboard: "1"
+data:
+  node-pool-health.json: |
+    {
+      "annotations": {
+        "list": []
+      },
+      "editable": true,
+      "gnetId": null,
+      "graphTooltip": 0,
+      "hideControls": false,
+      "id": null,
+      "iteration": 1583185057230,
+      "links": [],
+      "panels": [
+        {
+          "datasource": "Prometheus",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 70 },
+                  { "color": "red", "value": 90 }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "gridPos": { "h": 4, "w": 12, "x": 0, "y": 0 },
+          "id": 1,
+          "options": {
+            "orientation": "auto",
+            "reduceOptions": {
+              "calcs": ["mean"],
+              "fields": "",
+              "values": false
+            }
+          },
+          "targets": [
+            {
+              "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (workload) / sum(machine_cpu_cores) by (workload) * 100",
+              "legendFormat": "{{workload}} CPU %",
+              "refId": "A"
+            }
+          ],
+          "title": "CPU Utilization by Node Pool",
+          "type": "stat"
+        },
+        {
+          "datasource": "Prometheus",
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 4, "w": 12, "x": 12, "y": 0 },
+          "id": 2,
+          "options": {
+            "reduceOptions": { "calcs": ["mean"] }
+          },
+          "targets": [
+            {
+              "expr": "sum(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) by (workload) / sum(node_memory_MemTotal_bytes) by (workload) * 100",
+              "legendFormat": "{{workload}} Mem %",
+              "refId": "A"
+            }
+          ],
+          "title": "Memory Utilization by Node Pool",
+          "type": "stat"
+        },
+        {
+          "datasource": "Prometheus",
+          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 },
+          "id": 3,
+          "targets": [
+            {
+              "expr": "sum(increase(kube_pod_container_status_restarts_total[5m])) by (namespace, pod) > 0",
+              "legendFormat": "{{namespace}}/{{pod}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Pod Restarts (Last 5m)",
+          "type": "timeseries"
+        }
+      ],
+      "refresh": "10s",
+      "schemaVersion": 30,
+      "style": "dark",
+      "tags": ["kubernetes", "infrastructure"],
+      "time": {
+        "from": "now-1h",
+        "to": "now"
+      },
+      "title": "Node Pool Health",
+      "uid": "node-pool-health"
+    }
diff --git a/k8s/monitoring/templates/pod-alerts.yaml b/k8s/monitoring/templates/pod-alerts.yaml
new file mode 100644
index 0000000..8e8e698
--- /dev/null
+++ b/k8s/monitoring/templates/pod-alerts.yaml
@@ -0,0 +1,31 @@
+# Pod-level alerting rules.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: cluster-pod-alerts
+  labels:
+    app: prometheus-stack
+    release: prometheus-stack
+spec:
+  groups:
+    - name: pod.alerts
+      rules:
+        - alert: PodOOMKilled
+          # 0/1 series per container whose most recent termination reason was
+          # OOMKilled, hence "for: 0m" (alert immediately on the state flag).
+          expr: kube_pod_container_status_terminated_reason{reason="OOMKilled"} > 0
+          for: 0m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Pod OOMKilled"
+            description: "Container {{ $labels.container }} in pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) was OOMKilled."
+
+        - alert: PodCrashLoopBackOff
+          expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Pod in CrashLoopBackOff"
+            description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is in CrashLoopBackOff."