# ConfigMap carrying the "Node Pool Health" Grafana dashboard (uid: node-pool-health).
#
# The `grafana_dashboard: "1"` label is presumably what a Grafana dashboard
# sidecar selects on to load this ConfigMap -- confirm against the sidecar's
# label configuration in this deployment.
#
# Panels defined in the JSON payload:
#   id 1 (stat)       CPU utilization in percent, grouped by the `workload` label,
#                     with explicit green / yellow (70) / red (90) thresholds.
#   id 2 (stat)       Memory utilization in percent, grouped by the `workload` label.
#                     Only the unit is set; no explicit thresholds are declared.
#   id 3 (timeseries) Container restarts counted over a 5m window per
#                     namespace/pod/node, showing only series greater than zero.
#                     increase() is used so the plotted value is a restart count
#                     for the window rather than a per-second rate.
#
# NOTE(review): the CPU and memory queries assume node_cpu_seconds_total,
#   machine_cpu_cores and node_memory_* all carry a `workload` label identifying
#   the node pool -- this depends on scrape relabeling not visible here; confirm.
# NOTE(review): stock kube-state-metrics does not appear to expose a `node` label
#   on kube_pod_container_status_restarts_total; if no relabeling adds it, the
#   node part of the legend will render empty -- verify, or join with kube_pod_info.
#
# The legendFormat values wrap Grafana's double-curly-brace placeholders in Helm
# string literals so that Helm emits them verbatim instead of evaluating them.
# Do not add comments inside the JSON payload below: it must stay strict JSON.
apiVersion: v1
kind: ConfigMap
metadata:
  name: node-pool-health-dashboard
  labels:
    grafana_dashboard: "1"
data:
  node-pool-health.json: |
    {
      "annotations": {
        "list": []
      },
      "editable": true,
      "gnetId": null,
      "graphTooltip": 0,
      "hideControls": false,
      "id": null,
      "iteration": 1583185057230,
      "links": [],
      "panels": [
        {
          "datasource": "Prometheus",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "yellow",
                    "value": 70
                  },
                  {
                    "color": "red",
                    "value": 90
                  }
                ]
              },
              "unit": "percent"
            },
            "overrides": []
          },
          "gridPos": {
            "h": 4,
            "w": 12,
            "x": 0,
            "y": 0
          },
          "id": 1,
          "options": {
            "orientation": "auto",
            "reduceOptions": {
              "calcs": ["mean"],
              "fields": "",
              "values": false
            }
          },
          "targets": [
            {
              "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (workload) / sum(machine_cpu_cores) by (workload) * 100",
              "legendFormat": "{{ "{{" }}workload{{ "}}" }} CPU %",
              "refId": "A"
            }
          ],
          "title": "CPU Utilization by Node Pool",
          "type": "stat"
        },
        {
          "datasource": "Prometheus",
          "fieldConfig": {
            "defaults": {
              "unit": "percent"
            }
          },
          "gridPos": {
            "h": 4,
            "w": 12,
            "x": 12,
            "y": 0
          },
          "id": 2,
          "options": {
            "reduceOptions": {
              "calcs": ["mean"]
            }
          },
          "targets": [
            {
              "expr": "sum(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) by (workload) / sum(node_memory_MemTotal_bytes) by (workload) * 100",
              "legendFormat": "{{ "{{" }}workload{{ "}}" }} Mem %",
              "refId": "A"
            }
          ],
          "title": "Memory Utilization by Node Pool",
          "type": "stat"
        },
        {
          "datasource": "Prometheus",
          "gridPos": {
            "h": 8,
            "w": 24,
            "x": 0,
            "y": 4
          },
          "id": 3,
          "targets": [
            {
              "expr": "sum(increase(kube_pod_container_status_restarts_total[5m])) by (namespace, pod, node) > 0",
              "legendFormat": "{{ "{{" }}namespace{{ "}}" }}/{{ "{{" }}pod{{ "}}" }} ({{ "{{" }}node{{ "}}" }})",
              "refId": "A"
            }
          ],
          "title": "Pod Restarts (Last 5m)",
          "type": "timeseries"
        }
      ],
      "refresh": "10s",
      "schemaVersion": 30,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "time": {
        "from": "now-1h",
        "to": "now"
      },
      "title": "Node Pool Health",
      "uid": "node-pool-health"
    }