oops add the rest
This commit is contained in:
parent
50d60758b0
commit
8f739be584
|
|
@ -0,0 +1,19 @@
|
|||
---
# Node-level alerting rules, delivered as a Prometheus Operator PrometheusRule.
# NOTE(review): the app/release labels are presumably what the Prometheus
# instance's rule selector matches on — confirm against the Prometheus CR.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cluster-node-alerts
  labels:
    app: prometheus-stack
    release: prometheus-stack
spec:
  groups:
    - name: node.alerts
      rules:
        # Fires when the per-instance non-idle CPU share (100 minus the
        # 2-minute average idle rate, as a percentage) stays above 20 for
        # two minutes.
        - alert: NodeHighCPU
          # Folded scalar: the line break below becomes a single space, so the
          # expression handed to Prometheus is one line.
          expr: >-
            100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100)
            > 20
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: 'High CPU usage on node {{ $labels.instance }}'
            # $value is piped into printf so the reading is rendered with two
            # decimal places.
            description: 'Node {{ $labels.instance }} has CPU usage above 20% (current value: {{ $value | printf "%.2f" }}%)'
|
||||
|
|
@ -0,0 +1,108 @@
|
|||
---
# Grafana dashboard "Node Pool Health" (uid: node-pool-health), shipped as a
# ConfigMap holding the dashboard JSON.
# NOTE(review): the grafana_dashboard: "1" label is presumably what a Grafana
# dashboard sidecar selects on — confirm against the sidecar configuration.
#
# Panels:
#   1. CPU Utilization by Node Pool    (stat) — busy-CPU rate / core count * 100
#   2. Memory Utilization by Node Pool (stat) — (total - available) / total * 100
#   3. Pod Restarts (Last 5m)    (timeseries) — restart-count increase over 5m
#
# node_cpu_seconds_total is a cumulative counter, so panel 1 takes rate() over
# 5m before dividing by the core count; dividing the raw counter would yield an
# ever-growing number rather than a percentage.
# Panel 3 uses increase() so the plotted value is a restart count over the 5m
# window named in its title, not a per-second rate.
#
# NOTE(review): panels 1 and 2 aggregate by a `workload` label and panel 3 by a
# `node` label; both are assumed to be present on the underlying series
# (e.g. via scrape relabeling) — confirm, otherwise the legends will be empty.
apiVersion: v1
kind: ConfigMap
metadata:
  name: node-pool-health-dashboard
  labels:
    grafana_dashboard: "1"
data:
  # Literal block scalar: everything below is the dashboard JSON verbatim.
  # JSON has no comment syntax, so all commentary lives above this key.
  node-pool-health.json: |
    {
      "annotations": {
        "list": []
      },
      "editable": true,
      "gnetId": null,
      "graphTooltip": 0,
      "hideControls": false,
      "id": null,
      "iteration": 1583185057230,
      "links": [],
      "panels": [
        {
          "datasource": "Prometheus",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            },
            "overrides": []
          },
          "gridPos": { "h": 4, "w": 12, "x": 0, "y": 0 },
          "id": 1,
          "options": {
            "orientation": "auto",
            "reduceOptions": {
              "calcs": ["mean"],
              "fields": "",
              "values": false
            }
          },
          "targets": [
            {
              "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (workload) / sum(machine_cpu_cores) by (workload) * 100",
              "legendFormat": "{{workload}} CPU %",
              "refId": "A"
            }
          ],
          "title": "CPU Utilization by Node Pool",
          "type": "stat"
        },
        {
          "datasource": "Prometheus",
          "fieldConfig": {
            "defaults": {
              "unit": "percent"
            }
          },
          "gridPos": { "h": 4, "w": 12, "x": 12, "y": 0 },
          "id": 2,
          "options": {
            "reduceOptions": { "calcs": ["mean"] }
          },
          "targets": [
            {
              "expr": "sum(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) by (workload) / sum(node_memory_MemTotal_bytes) by (workload) * 100",
              "legendFormat": "{{workload}} Mem %",
              "refId": "A"
            }
          ],
          "title": "Memory Utilization by Node Pool",
          "type": "stat"
        },
        {
          "datasource": "Prometheus",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 },
          "id": 3,
          "targets": [
            {
              "expr": "sum(increase(kube_pod_container_status_restarts_total[5m])) by (namespace, pod, node) > 0",
              "legendFormat": "{{namespace}}/{{pod}} ({{node}})",
              "refId": "A"
            }
          ],
          "title": "Pod Restarts (Last 5m)",
          "type": "timeseries"
        }
      ],
      "refresh": "10s",
      "schemaVersion": 30,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "time": {
        "from": "now-1h",
        "to": "now"
      },
      "title": "Node Pool Health",
      "uid": "node-pool-health"
    }
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
---
# Pod-level alerting rules, delivered as a Prometheus Operator PrometheusRule.
# NOTE(review): the app/release labels are presumably what the Prometheus
# instance's rule selector matches on — confirm against the Prometheus CR.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cluster-pod-alerts
  labels:
    app: prometheus-stack
    release: prometheus-stack
spec:
  groups:
    - name: pod.alerts
      rules:
        # Fires immediately (for: 0m) when a container's most recent
        # termination reason is OOMKilled AND it has restarted within the last
        # 10 minutes.
        #
        # The *_last_terminated_reason series is used because the plain
        # *_terminated_reason series only reports containers that are currently
        # in the terminated state; a container that is OOM-killed and restarted
        # straight away would usually be missed between scrapes. The restart
        # gate keeps the alert from firing forever on a stale last-termination
        # reason.
        # NOTE(review): requires kube-state-metrics to expose
        # kube_pod_container_status_last_terminated_reason — confirm the
        # deployed version does.
        - alert: PodOOMKilled
          expr: |-
            kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1
            and on (namespace, pod, container)
            increase(kube_pod_container_status_restarts_total[10m]) > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Pod OOMKilled"
            description: "Container {{ $labels.container }} in pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) was OOMKilled."

        # Fires when a container has been waiting with reason CrashLoopBackOff
        # continuously for two minutes.
        - alert: PodCrashLoopBackOff
          expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Pod in CrashLoopBackOff"
            description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is in CrashLoopBackOff."
|
||||
Loading…
Reference in New Issue