From 1cd23374538b29bd032bd34f4714dd0239ad48b0 Mon Sep 17 00:00:00 2001
From: digitalstudium
Date: Fri, 30 Jul 2021 14:08:09 +0300
Subject: [PATCH] Added alertmanager

---
Reviewer notes (text between the "---" marker and the first diff header is
ignored by `git am` and is not part of the commit message):

* docker-compose.yml: adds an `alertmanager` service (image
  prom/alertmanager:v0.22.2, port 9093 published, `alert-configs` named
  volume mounted at /etc/alertmanager) and bind-mounts ./rules.yml into the
  `prometheus` service at /etc/prometheus/rules.yml. The remaining additions
  are blank separator lines between services.
* rules.yml: new rule file with two groups, "Prometheus self-monitoring"
  (3 alerts) and "Host and hardware" (7 alerts).
* NOTE(review): this patch does not touch prometheus.yml (presumably kept in
  the `prom-configs` volume). Confirm it lists rules.yml under `rule_files`
  and has an `alerting` section pointing at alertmanager:9093, otherwise the
  rules are never evaluated or delivered.
* NOTE(review): both files are committed without a trailing newline (see the
  "\ No newline at end of file" markers below); consider fixing in a
  follow-up commit.
* NOTE(review): line breaks and indentation of this patch were reconstructed
  from a whitespace-collapsed copy. Hunk line counts were verified against
  the @@ headers, but indentation widths are assumed (2 spaces) - verify the
  context lines against the tree before applying.

 docker-compose.yml | 14 +++++++++
 rules.yml          | 71 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 rules.yml

diff --git a/docker-compose.yml b/docker-compose.yml
index 3737d12..ee61c20 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,7 @@
 version: "3.9"
+
 services:
+
   grafana:
     image: grafana/grafana:8.0.6-ubuntu
     ports:
@@ -7,6 +9,7 @@ services:
     volumes:
       - grafana-data:/var/lib/grafana
       - grafana-configs:/etc/grafana
+
   prometheus:
     image: prom/prometheus:v2.28.1
     ports:
@@ -14,6 +17,15 @@ services:
     volumes:
       - prom-data:/prometheus
       - prom-configs:/etc/prometheus
+      - ./rules.yml:/etc/prometheus/rules.yml
+
+  alertmanager:
+    image: prom/alertmanager:v0.22.2
+    ports:
+      - "9093:9093"
+    volumes:
+      - alert-configs:/etc/alertmanager
+
   node-exporter:
     image: prom/node-exporter:v1.2.0
     ports:
@@ -27,8 +39,10 @@ services:
       - '--path.sysfs=/host/sys'
       - '--collector.filesystem.mount-points-exclude'
       - '^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
+
 volumes:
   grafana-data:
   grafana-configs:
   prom-data:
   prom-configs:
+  alert-configs:
\ No newline at end of file
diff --git a/rules.yml b/rules.yml
new file mode 100644
index 0000000..ef58c1c
--- /dev/null
+++ b/rules.yml
@@ -0,0 +1,71 @@
+groups:
+- name: Prometheus self-monitoring
+  rules:
+  - alert: Prometheus target missing
+    expr: 'up == 0'
+    annotations:
+      description: A Prometheus target has disappeared. An exporter might be crashed.
+    labels:
+      severity: critical
+  - alert: Prometheus AlertManager config not synced
+    expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
+    annotations:
+      description: Configurations of AlertManager cluster instances are out of sync
+    labels:
+      severity: warning
+  - alert: Prometheus target scraping slow
+    expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} > 60'
+    annotations:
+      description: Prometheus is scraping exporters slowly
+    labels:
+      severity: warning
+    for: 5m
+- name: Host and hardware
+  rules:
+  - alert: Host out of memory
+    expr: 'node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10'
+    annotations:
+      description: Node memory is filling up (< 10% left)
+    labels:
+      severity: warning
+    for: 2m
+  - alert: Host out of disk space
+    expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
+    annotations:
+      description: Disk is almost full (< 10% left)
+    labels:
+      severity: warning
+    for: 2m
+  - alert: Host out of inodes
+    expr: 'node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0'
+    annotations:
+      description: Disk is almost running out of available inodes (< 10% left)
+    labels:
+      severity: warning
+    for: 2m
+  - alert: Host high CPU load
+    expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80'
+    annotations:
+      description: CPU load is > 80%
+    labels:
+      severity: warning
+  - alert: Host physical component too hot
+    expr: 'node_hwmon_temp_celsius > 75'
+    annotations:
+      description: "Physical hardware component too hot"
+    labels:
+      severity: warning
+    for: 5m
+  - alert: Host OOM kill detected
+    expr: 'increase(node_vmstat_oom_kill[1m]) > 0'
+    annotations:
+      description: OOM kill detected
+    labels:
+      severity: warning
+  - alert: Host clock skew
+    expr: '(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)'
+    annotations:
+      description: 'Clock skew detected. Clock is out of sync.'
+    labels:
+      severity: warning
+    for: 2m
\ No newline at end of file