diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
index afc37b654ce6cfe2dfc39ed1be6cca44a741df13..b6cb79c2cd9d4340b8d5a4558f70754984d0c5ea 100644
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -1,74 +1,236 @@
 {{ ansible_header | comment }}
-{# As this is also Jinja2 it will conflict without a raw block #}
-{# Depending of Prometheus Node exporter version, rules can change depending of version #}
+{# As this is also using brackets it will conflict without a raw block #}
 {% raw %}
+# Synced with https://awesome-prometheus-alerts.grep.to/rules.html on 2021-06-07
+# We remove descriptions as we only send summary on IRC.
+# UPS, APT and RADIUS configuration is made by Crans.
+# Keep summaries that start with {{ quoted: YAML parses a leading { as a flow mapping.
+
 groups:
 - name: alert.rules
   rules:
 
-  # Alert for any instance that is unreachable for >3 minutes.
-  - alert: InstanceDown
+  ##############################
+  # Prometheus self-monitoring #
+  ##############################
+
+  - alert: PrometheusJobMissing
+    expr: absent(up{job="prometheus"})
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus job missing (instance {{ $labels.instance }})
+
+  - alert: PrometheusTargetMissing
     expr: up == 0
-    for: 3m
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "{{ $labels.instance }} ({{ $labels.job }}) est manquant"
+
+  - alert: PrometheusConfigurationReloadFailure
+    expr: prometheus_config_last_reload_successful != 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+
+  - alert: PrometheusTooManyRestarts
+    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus too many restarts (instance {{ $labels.instance }})
+
+  - alert: PrometheusRuleEvaluationFailures
+    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+    for: 0m
     labels:
       severity: critical
     annotations:
-      summary: "{{ $labels.instance }} ({{ $labels.job }}) est invisible depuis plus de 3 minutes !"
+      summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
+
+  - alert: PrometheusTargetEmpty
+    expr: prometheus_sd_discovered_targets == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus target empty (instance {{ $labels.instance }})
+
+  # This already happened in 2021 at Crans
+  - alert: PrometheusTsdbCompactionsFailed
+    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
+
+  #####################
+  # Host and hardware #
+  #####################
 
   # Alert for out of memory
   # Do not take into account memory not used by apps
-  - alert: OutOfMemory
+  - alert: HostOutOfMemory
     expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_PageTables_bytes + node_memory_VmallocUsed_bytes + node_memory_SwapCached_bytes + node_memory_Slab_bytes) / node_memory_MemTotal_bytes * 100 < 10
-    for: 5m
+    for: 2m
     labels:
       severity: warning
     annotations:
-      summary: "Mémoire libre de {{ $labels.instance }} à {{ $value }}%."
+      summary: La mémoire vive de {{ $labels.instance }} arrive à saturation ({{ $value }}%)
 
-  # Alert for out of disk space
-  - alert: OutOfDiskSpace
-    expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
+  - alert: HostUnusualDiskReadRate
+    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
     for: 5m
     labels:
       severity: warning
     annotations:
-      summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ $value }}%."
+      summary: Host unusual disk read rate (instance {{ $labels.instance }})
+
+  - alert: HostUnusualDiskWriteRate
+    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual disk write rate (instance {{ $labels.instance }})
 
-  # Alert for out of inode space on disk
-  - alert: OutOfInodes
+  - alert: HostOutOfDiskSpace
+    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "{{ $labels.mountpoint }} sur {{ $labels.instance }} arrive à saturation ({{ $value }}%)"
+
+  - alert: HostDiskWillFillIn24Hours
+    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+
+  - alert: HostOutOfInodes
     expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
     for: 5m
     labels:
       severity: warning
     annotations:
-      summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."
+      summary: Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}
 
-  # Alert for high CPU usage
-  - alert: CpuBusy
+  - alert: HostHighCpuLoad
     expr: node_load5 > 9
     for: 10m
     labels:
       severity: warning
     annotations:
-      summary: "Charge sur {{ $labels.instance }} à {{ $value }}."
+      summary: Charge sur {{ $labels.instance }} à {{ $value }}
 
-  # Check mdadm software RAID
-  - alert: SoftwareRAIDDegraded
-    expr: node_md_disks-node_md_disks_active > 0
-    for: 3m
+  - alert: HostSystemdServiceCrashed
+    expr: node_systemd_unit_state{state="failed"} == 1
+    for: 0m
     labels:
       severity: warning
     annotations:
-      summary: "Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)."
+      summary: "{{ $labels.name }} a crashé sur {{ $labels.instance }}"
 
-  # Check systemd unit (> buster)
-  - alert: SystemdServiceFailed
-    expr: node_systemd_unit_state{state="failed"} == 1
-    for: 10m
+  # 0B is so hot
+  # En pratique c'est mauvais de tourner des disques trop chauds
+  - alert: HostPhysicalComponentTooHot
+    expr: node_hwmon_temp_celsius > 75
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host physical component too hot (instance {{ $labels.instance }})
+
+  - alert: HostNodeOvertemperatureAlarm
+    expr: node_hwmon_temp_crit_alarm_celsius == 1
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+
+  - alert: HostRaidDiskFailure
+    expr: node_md_disks{state="failed"} > 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Le RAID sur {{ $labels.instance }} a perdu {{ $value }} disque(s)
+
+  - alert: HostOomKillDetected
+    expr: increase(node_vmstat_oom_kill[1m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host OOM kill detected (instance {{ $labels.instance }})
+
+  - alert: HostEdacCorrectableErrorsDetected
+    expr: increase(node_edac_correctable_errors_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: info
+    annotations:
+      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+
+  - alert: HostEdacUncorrectableErrorsDetected
+    expr: node_edac_uncorrectable_errors_total > 0
+    for: 0m
     labels:
       severity: warning
     annotations:
-      summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
+      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+
+  # This happened in June 2021 at Crans
+  - alert: HostConntrackLimit
+    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host conntrack limit (instance {{ $labels.instance }})
+
+  ############
+  # Blackbox #
+  ############
+
+  - alert: BlackboxProbeFailed
+    expr: probe_success == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Blackbox probe failed (instance {{ $labels.instance }})
+
+  - alert: BlackboxSlowProbe
+    expr: avg_over_time(probe_duration_seconds[1m]) > 1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: Blackbox slow probe (instance {{ $labels.instance }})
+
+  - alert: BlackboxSslCertificateWillExpireSoon
+    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 20
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
+
+  #######
+  # UPS #
+  #######
 
   # Check UPS
   - alert: UpsOutputSourceChanged
@@ -77,8 +239,7 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: La source d'alimentation de {{ $labels.instance }} a changé !
 
   - alert: UpsBatteryStatusChanged
     expr: upsBatteryStatus != 2
@@ -86,8 +247,7 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: "L'état de la batterie de {{ $labels.instance }} a changé !"
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: L'état de la batterie de {{ $labels.instance }} a changé !
 
   - alert: UpsTemperatureWarning
     expr: (xupsEnvRemoteTemp < 10) or (xupsEnvRemoteTemp > 26)
@@ -95,8 +255,7 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C."
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
 
   - alert: UpsTemperatureCritical
     expr: (xupsEnvRemoteTemp < 0) or (xupsEnvRemoteTemp > 30)
@@ -104,8 +263,7 @@ groups:
     labels:
       severity: critical
     annotations:
-      summary: "La température autour de {{ $labels.instance }} est de {{ $value }}°C !"
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: La température autour de {{ $labels.instance }} est de {{ $value }}°C
 
   - alert: UpsHighHumidity
     expr: xupsEnvRemoteHumidity > 65
@@ -113,8 +271,7 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}%."
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
 
   - alert: UpsVeryHighHumidity
     expr: xupsEnvRemoteHumidity > 85
@@ -122,8 +279,7 @@ groups:
     labels:
       severity: critical
     annotations:
-      summary: "L'humidité autour de {{ $labels.instance }} est de {{ $value }}% !"
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: L'humidité autour de {{ $labels.instance }} est de {{ $value }}%
 
   - alert: UpsHighLoad
     expr: upsOutputPercentLoad > 70
@@ -131,8 +287,7 @@ groups:
     labels:
       severity: critical
     annotations:
-      summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: La charge de {{ $labels.instance }} est de {{ $value }}%
 
   - alert: UpsWrongInputVoltage
     expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
@@ -140,8 +295,7 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V
 
   - alert: UpsWrongOutputVoltage
     expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245)
@@ -149,8 +303,11 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
-      description: "https://grafana.crans.org/d/qtbg59mZz/alimentation"
+      summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
+
+  #########
+  # Other #
+  #########
 
   - alert: AptAutoremovePending
     expr: apt_autoremove_pending > 0
@@ -158,7 +315,15 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}."
+      summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}"
+
+  - alert: AptOrphans
+    expr: apt_orphans > 10
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "{{ $value }} paquet(s) APT sont orphelins sur {{ $labels.instance }}"
 
   - alert: MailqNotEmpty
     expr: postfix_mailq_length > 25
@@ -166,7 +331,7 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}."
+      summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}"
 
   - alert: NoRadiusLogin
     expr: rate(radiusd_access_ok[3m]) == 0
@@ -174,7 +339,7 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: "Personne ne vient taper le RADIUS."
+      summary: Personne ne vient taper le RADIUS
 
   - alert: TooManyReallocatedSectors
     expr: smartmon_reallocated_sector_ct_raw_value > 1e3
@@ -182,6 +347,6 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués."
+      summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués"
 
 {% endraw %}