Add a SystemdServiceFailed alert to the Prometheus alert rules template.

NOTE(review): this patch was recovered from a copy whose line breaks were
lost. The YAML indentation below is assumed (rule items at 2 spaces); confirm
it against the target file before applying.

diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
index 8017fb55a4fda5a1ba34d6a8fc1b3e96f0e3224b..d2fafda591b79e18b0da6bfebae5758f389dc4aa 100644
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -56,6 +56,16 @@ groups:
       summary: "Charge CPU élevée sur {{ $labels.instance }}"
       description: "La charge CPU (moyenne de 15mn) est élevée."
 
+  # Check systemd unit (> buster)
+  - alert: SystemdServiceFailed
+    expr: node_systemd_unit_state{state="failed"} == 1
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
+      description: "Le service {{ $labels.name }} a échoué."
+
   # NTP (need NTP plugin in node)
   # - alert: ntp_drifting
   #   expr: node_ntp_drift_seconds > 0.05