Added alertmanager
This commit is contained in:
parent
c218ba73ee
commit
1cd2337453
|
@ -1,5 +1,7 @@
|
||||||
version: "3.9"
|
version: "3.9"
|
||||||
|
|
||||||
services:
|
services:
|
||||||
|
|
||||||
grafana:
|
grafana:
|
||||||
image: grafana/grafana:8.0.6-ubuntu
|
image: grafana/grafana:8.0.6-ubuntu
|
||||||
ports:
|
ports:
|
||||||
|
@ -7,6 +9,7 @@ services:
|
||||||
volumes:
|
volumes:
|
||||||
- grafana-data:/var/lib/grafana
|
- grafana-data:/var/lib/grafana
|
||||||
- grafana-configs:/etc/grafana
|
- grafana-configs:/etc/grafana
|
||||||
|
|
||||||
prometheus:
|
prometheus:
|
||||||
image: prom/prometheus:v2.28.1
|
image: prom/prometheus:v2.28.1
|
||||||
ports:
|
ports:
|
||||||
|
@ -14,6 +17,15 @@ services:
|
||||||
volumes:
|
volumes:
|
||||||
- prom-data:/prometheus
|
- prom-data:/prometheus
|
||||||
- prom-configs:/etc/prometheus
|
- prom-configs:/etc/prometheus
|
||||||
|
- ./rules.yml:/etc/prometheus/rules.yml
|
||||||
|
|
||||||
|
alertmanager:
|
||||||
|
image: prom/alertmanager:v0.22.2
|
||||||
|
ports:
|
||||||
|
- "9093:9093"
|
||||||
|
volumes:
|
||||||
|
- alert-configs:/etc/alertmanager
|
||||||
|
|
||||||
node-exporter:
|
node-exporter:
|
||||||
image: prom/node-exporter:v1.2.0
|
image: prom/node-exporter:v1.2.0
|
||||||
ports:
|
ports:
|
||||||
|
@ -27,8 +39,10 @@ services:
|
||||||
- '--path.sysfs=/host/sys'
|
- '--path.sysfs=/host/sys'
|
||||||
- '--collector.filesystem.mount-points-exclude'
|
- '--collector.filesystem.mount-points-exclude'
|
||||||
- '^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
|
- '^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
grafana-data:
|
grafana-data:
|
||||||
grafana-configs:
|
grafana-configs:
|
||||||
prom-data:
|
prom-data:
|
||||||
prom-configs:
|
prom-configs:
|
||||||
|
alert-configs:
|
|
@ -0,0 +1,71 @@
|
||||||
|
groups:
|
||||||
|
- name: Prometheus self-monitoring
|
||||||
|
rules:
|
||||||
|
- alert: Prometheus target missing
|
||||||
|
expr: 'up == 0'
|
||||||
|
annotations:
|
||||||
|
description: A Prometheus target has disappeared. An exporter might be crashed.
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: Prometheus AlertManager config not synced
|
||||||
|
expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
|
||||||
|
annotations:
|
||||||
|
description: Configurations of AlertManager cluster instances are out of sync
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: Prometheus target scraping slow
|
||||||
|
expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} > 60'
|
||||||
|
annotations:
|
||||||
|
description: Prometheus is scraping exporters slowly
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
for: 5m
|
||||||
|
- name: Host and hardware
|
||||||
|
rules:
|
||||||
|
- alert: Host out of memory
|
||||||
|
expr: 'node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10'
|
||||||
|
annotations:
|
||||||
|
description: Node memory is filling up (< 10% left)
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
for: 2m
|
||||||
|
- alert: Host out of disk space
|
||||||
|
expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
|
||||||
|
annotations:
|
||||||
|
description: Disk is almost full (< 10% left)
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
for: 2m
|
||||||
|
- alert: Host out of inodes
|
||||||
|
expr: 'node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0'
|
||||||
|
annotations:
|
||||||
|
description: Disk is almost running out of available inodes (< 10% left)
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
for: 2m
|
||||||
|
- alert: Host high CPU load
|
||||||
|
expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80'
|
||||||
|
annotations:
|
||||||
|
description: CPU load is > 80%
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: Host physical component too hot
|
||||||
|
expr: 'node_hwmon_temp_celsius > 75'
|
||||||
|
annotations:
|
||||||
|
description: "Physical hardware component too hot"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
for: 5m
|
||||||
|
- alert: Host OOM kill detected
|
||||||
|
expr: 'increase(node_vmstat_oom_kill[1m]) > 0'
|
||||||
|
annotations:
|
||||||
|
description: OOM kill detected
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: Host clock skew
|
||||||
|
expr: '(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)'
|
||||||
|
annotations:
|
||||||
|
description: 'Clock skew detected. Clock is out of sync.'
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
for: 2m
|
Loading…
Reference in New Issue