From c274db2ab572aacdb3958204eae7a48684d60b96 Mon Sep 17 00:00:00 2001
From: Alexandre Iooss <erdnaxe@crans.org>
Date: Sun, 9 Jun 2019 11:01:56 +0200
Subject: [PATCH] Monitor systemd services

---
Adds one alerting rule, SystemdServiceFailed, which fires with severity
"warning" once node_systemd_unit_state{state="failed"} == 1 has held for 10m.
The annotations reference the `name` and `instance` labels of the series.

NOTE(review): the line breaks and the YAML indentation inside the hunk were
reconstructed from a whitespace-collapsed copy of this patch; confirm the
indentation against the target file before applying.

 .../prometheus/templates/prometheus/alert.rules.yml.j2 | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
index 8017fb55..d2fafda5 100644
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -56,6 +56,16 @@ groups:
       summary: "Charge CPU élevée sur {{ $labels.instance }}"
       description: "La charge CPU (moyenne de 15mn) est élevée."
 
+  # Check systemd unit (> buster)
+  - alert: SystemdServiceFailed
+    expr: node_systemd_unit_state{state="failed"} == 1
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
+      description: "Le service {{ $labels.name }} a échoué."
+
   # NTP (need NTP plugin in node)
   # - alert: ntp_drifting
   #   expr: node_ntp_drift_seconds > 0.05
-- 
GitLab