109 lines
3.1 KiB
YAML
109 lines
3.1 KiB
YAML
# ConfigMap that ships the "Node Pool Health" Grafana dashboard as a single
# JSON document under the data key node-pool-health.json.
#
# Dashboard contents (all panels query the datasource named "Prometheus"):
#   1. stat       - CPU utilization per "workload" label, as a percentage.
#   2. stat       - memory utilization per "workload" label, as a percentage.
#   3. timeseries - container restarts per namespace/pod/node over a 5m window.
#
# Panel 3 uses increase() rather than rate(): rate() yields a per-second
# average, while the panel title promises the restart count for the last 5m.
#
# NOTE(review): the legendFormat values wrap Grafana's label placeholders in
# quoted template actions, which implies this file is rendered by a
# Go-template engine (presumably Helm) before it is applied - confirm.
# Keep double curly braces out of comments for the same reason.
# NOTE(review): panel titles say "Node Pool" but the queries group by the
# "workload" label; presumably that label identifies the pool - confirm.
# NOTE(review): panel 3 groups by "node"; verify that
# kube_pod_container_status_restarts_total carries a node label in this
# environment, otherwise that part of the legend renders empty.
apiVersion: v1
kind: ConfigMap
metadata:
  name: node-pool-health-dashboard
  labels:
    # Quoted so the label value stays a string instead of an integer.
    # Presumably the discovery label watched by a Grafana dashboard
    # sidecar - confirm against the Grafana deployment.
    grafana_dashboard: "1"
data:
  # Literal block scalar: the JSON below is stored verbatim. JSON has no
  # comment syntax, so nothing inside the scalar may be annotated.
  node-pool-health.json: |
    {
      "annotations": {
        "list": []
      },
      "editable": true,
      "gnetId": null,
      "graphTooltip": 0,
      "hideControls": false,
      "id": null,
      "iteration": 1583185057230,
      "links": [],
      "panels": [
        {
          "datasource": "Prometheus",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            },
            "overrides": []
          },
          "gridPos": { "h": 4, "w": 12, "x": 0, "y": 0 },
          "id": 1,
          "options": {
            "orientation": "auto",
            "reduceOptions": {
              "calcs": ["mean"],
              "fields": "",
              "values": false
            }
          },
          "targets": [
            {
              "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (workload) / sum(machine_cpu_cores) by (workload) * 100",
              "legendFormat": "{{ "{{" }}workload{{ "}}" }} CPU %",
              "refId": "A"
            }
          ],
          "title": "CPU Utilization by Node Pool",
          "type": "stat"
        },
        {
          "datasource": "Prometheus",
          "fieldConfig": {
            "defaults": {
              "unit": "percent"
            }
          },
          "gridPos": { "h": 4, "w": 12, "x": 12, "y": 0 },
          "id": 2,
          "options": {
            "reduceOptions": { "calcs": ["mean"] }
          },
          "targets": [
            {
              "expr": "sum(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) by (workload) / sum(node_memory_MemTotal_bytes) by (workload) * 100",
              "legendFormat": "{{ "{{" }}workload{{ "}}" }} Mem %",
              "refId": "A"
            }
          ],
          "title": "Memory Utilization by Node Pool",
          "type": "stat"
        },
        {
          "datasource": "Prometheus",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 },
          "id": 3,
          "targets": [
            {
              "expr": "sum(increase(kube_pod_container_status_restarts_total[5m])) by (namespace, pod, node) > 0",
              "legendFormat": "{{ "{{" }}namespace{{ "}}" }}/{{ "{{" }}pod{{ "}}" }} ({{ "{{" }}node{{ "}}" }})",
              "refId": "A"
            }
          ],
          "title": "Pod Restarts (Last 5m)",
          "type": "timeseries"
        }
      ],
      "refresh": "10s",
      "schemaVersion": 30,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "time": {
        "from": "now-1h",
        "to": "now"
      },
      "title": "Node Pool Health",
      "uid": "node-pool-health"
    }