From c274db2ab572aacdb3958204eae7a48684d60b96 Mon Sep 17 00:00:00 2001
From: Alexandre Iooss <erdnaxe@crans.org>
Date: Sun, 9 Jun 2019 11:01:56 +0200
Subject: [PATCH] Monitor systemd services

---
 .../prometheus/templates/prometheus/alert.rules.yml.j2 | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
index 8017fb55..d2fafda5 100644
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -56,6 +56,16 @@ groups:
       summary: "Charge CPU élevée sur {{ $labels.instance }}"
       description: "La charge CPU (moyenne de 15mn) est élevée."
 
+  # Alert when a systemd unit stays in the "failed" state for 10 minutes (presumably needs Debian buster or newer; confirm)
+  - alert: SystemdServiceFailed
+    expr: node_systemd_unit_state{state="failed"} == 1
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
+      description: "Le service {{ $labels.name }} a échoué."
+
   # NTP (need NTP plugin in node)
 #  - alert: ntp_drifting
 #    expr: node_ntp_drift_seconds > 0.05
-- 
GitLab