oops add the rest

2026-01-06 07:54:07 -06:00 · 2026-01-06 07:54:07 -06:00 · 8f739be584
parent 50d60758b0
commit 8f739be584
3 changed files with 155 additions and 0 deletions
--- a/k8s/monitoring/templates/node-alerts.yaml
+++ b/k8s/monitoring/templates/node-alerts.yaml
@ -0,0 +1,19 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule  # node-level alerting rules (Prometheus Operator CRD)
+metadata:
+  name: cluster-node-alerts
+  labels:  # presumably what the Prometheus rule selector matches on — confirm
+    app: prometheus-stack
+    release: prometheus-stack
+spec:
+  groups:
+    - name: node.alerts
+      rules:
+        - alert: NodeHighCPU  # non-idle CPU %, averaged per instance over a 2m rate window
+          expr: '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 20'
+          for: 2m  # must stay above the threshold for 2m before firing
+          labels:
+            severity: warning  # NOTE(review): 20% is a low bar for "high CPU" — confirm it is intended
+          annotations:  # NOTE(review): double-brace templates below need escaping if Helm renders this dir — confirm
+            summary: 'High CPU usage on node {{ $labels.instance }}'
+            description: 'Node {{ $labels.instance }} has CPU usage above 20% (current value: {{ $value | printf "%.2f" }}%)'
--- a/k8s/monitoring/templates/node-pool-health-dashboard.yaml
+++ b/k8s/monitoring/templates/node-pool-health-dashboard.yaml
@ -0,0 +1,108 @@
+apiVersion: v1
+kind: ConfigMap  # carries the "Node Pool Health" Grafana dashboard as a JSON document
+metadata:
+  name: node-pool-health-dashboard
+  labels:
+    grafana_dashboard: "1"  # presumably the label a Grafana dashboard sidecar watches for — confirm
+data:  # NOTE(review): legendFormat uses double-brace placeholders; escape them if Helm renders this dir — confirm
+  node-pool-health.json: |
+    {
+      "annotations": {
+        "list": []
+      },
+      "editable": true,
+      "gnetId": null,
+      "graphTooltip": 0,
+      "hideControls": false,
+      "id": null,
+      "iteration": 1583185057230,
+      "links": [],
+      "panels": [
+        {
+          "datasource": "Prometheus",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  { "color": "green", "value": null },
+                  { "color": "yellow", "value": 70 },
+                  { "color": "red", "value": 90 }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "gridPos": { "h": 4, "w": 12, "x": 0, "y": 0 },
+          "id": 1,
+          "options": {
+            "orientation": "auto",
+            "reduceOptions": {
+              "calcs": ["mean"],
+              "fields": "",
+              "values": false
+            }
+          },
+          "targets": [
+            {
+              "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (workload) / sum(machine_cpu_cores) by (workload) * 100",
+              "legendFormat": "{{workload}} CPU %",
+              "refId": "A"
+            }
+          ],
+          "title": "CPU Utilization by Node Pool",
+          "type": "stat"
+        },
+        {
+          "datasource": "Prometheus",
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            }
+          },
+          "gridPos": { "h": 4, "w": 12, "x": 12, "y": 0 },
+          "id": 2,
+          "options": {
+            "reduceOptions": { "calcs": ["mean"] }
+          },
+          "targets": [
+            {
+              "expr": "sum(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) by (workload) / sum(node_memory_MemTotal_bytes) by (workload) * 100",
+              "legendFormat": "{{workload}} Mem %",
+              "refId": "A"
+            }
+          ],
+          "title": "Memory Utilization by Node Pool",
+          "type": "stat"
+        },
+        {
+          "datasource": "Prometheus",
+          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 },
+          "id": 3,
+          "targets": [
+            {
+              "expr": "sum(increase(kube_pod_container_status_restarts_total[5m])) by (namespace, pod, node) > 0",
+              "legendFormat": "{{namespace}}/{{pod}} ({{node}})",
+              "refId": "A"
+            }
+          ],
+          "title": "Pod Restarts (Last 5m)",
+          "type": "timeseries"
+        }
+      ],
+      "refresh": "10s",
+      "schemaVersion": 30,
+      "style": "dark",
+      "tags": ["kubernetes", "infrastructure"],
+      "time": {
+        "from": "now-1h",
+        "to": "now"
+      },
+      "title": "Node Pool Health",
+      "uid": "node-pool-health"
+    }
--- a/k8s/monitoring/templates/pod-alerts.yaml
+++ b/k8s/monitoring/templates/pod-alerts.yaml
@ -0,0 +1,28 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule  # pod-level alerting rules (Prometheus Operator CRD)
+metadata:
+  name: cluster-pod-alerts
+  labels:  # presumably what the Prometheus rule selector matches on — confirm
+    app: prometheus-stack
+    release: prometheus-stack
+spec:
+  groups:
+  - name: pod.alerts
+    rules:
+    - alert: PodOOMKilled  # OOM-killed containers restart at once, so match the last termination reason plus a recent restart
+      expr: '(increase(kube_pod_container_status_restarts_total[10m]) > 0) and ignoring (reason) (kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1)'
+      for: 0m  # fire immediately; the 10m restart window keeps the alert active after the kill
+      labels:
+        severity: critical
+      annotations:  # NOTE(review): double-brace templates below need escaping if Helm renders this dir — confirm
+        summary: "Pod OOMKilled"
+        description: "Container {{ $labels.container }} in pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) was OOMKilled."
+
+    - alert: PodCrashLoopBackOff  # container is waiting with reason CrashLoopBackOff
+      expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0
+      for: 2m  # must persist for 2m before firing
+      labels:
+        severity: critical
+      annotations:  # NOTE(review): double-brace templates below need escaping if Helm renders this dir — confirm
+        summary: "Pod in CrashLoopBackOff"
+        description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is in CrashLoopBackOff."