From de63fbb99e7561baa7fa52edea6f1b57f07df726 Mon Sep 17 00:00:00 2001
From: Alexandre Iooss <erdnaxe@crans.org>
Date: Thu, 25 Apr 2019 18:30:13 +0200
Subject: [PATCH] [monitoring] Prometheus Alertmanager

---
 monitoring.yml                                |  1 +
 .../prometheus-alertmanager/handlers/main.yml |  5 ++
 roles/prometheus-alertmanager/tasks/main.yml  | 14 +++++
 .../templates/prometheus/alertmanager.yml.j2  | 57 +++++++++++++++++++
 roles/prometheus/tasks/main.yml               |  6 ++
 .../templates/prometheus/prometheus.yml.j2    | 10 +++-
 6 files changed, 91 insertions(+), 2 deletions(-)
 create mode 100644 roles/prometheus-alertmanager/handlers/main.yml
 create mode 100644 roles/prometheus-alertmanager/tasks/main.yml
 create mode 100644 roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2

diff --git a/monitoring.yml b/monitoring.yml
index f807dcf4..cc19a6bd 100644
--- a/monitoring.yml
+++ b/monitoring.yml
@@ -23,6 +23,7 @@
           - localhost:9090
   roles:
     - prometheus
+    - prometheus-alertmanager
 
 # Monitor all hosts
 - hosts: all
diff --git a/roles/prometheus-alertmanager/handlers/main.yml b/roles/prometheus-alertmanager/handlers/main.yml
new file mode 100644
index 00000000..3ddbf930
--- /dev/null
+++ b/roles/prometheus-alertmanager/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart Prometheus Alertmanager
+  service:
+    name: prometheus-alertmanager
+    state: restarted
diff --git a/roles/prometheus-alertmanager/tasks/main.yml b/roles/prometheus-alertmanager/tasks/main.yml
new file mode 100644
index 00000000..b65a2955
--- /dev/null
+++ b/roles/prometheus-alertmanager/tasks/main.yml
@@ -0,0 +1,14 @@
+---
+- name: Install Prometheus Alertmanager
+  apt:
+    update_cache: true
+    name: prometheus-alertmanager
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Configure Prometheus Alertmanager
+  template:
+    src: prometheus/alertmanager.yml.j2
+    dest: /etc/prometheus/alertmanager.yml
+  notify: Restart Prometheus Alertmanager
diff --git a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
new file mode 100644
index 00000000..d7337c47
--- /dev/null
+++ b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
@@ -0,0 +1,57 @@
+# {{ ansible_managed }}
+
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: 'localhost:25'
+  smtp_from: 'alertmanager@crans.org'
+  #smtp_auth_username: 'alertmanager'
+  #smtp_auth_password: 'password'
+
+# The directory from which notification templates are read.
+templates: 
+- '/etc/prometheus/alertmanager_templates/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['alertname', 'cluster', 'service']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This way ensures that you get multiple alerts for the same group that start
+  # firing shortly after another are batched together on the first 
+  # notification.
+  group_wait: 30s
+
+  # When the first notification was sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If an alert has successfully been sent, wait 'repeat_interval' to
+  # resend them.
+  repeat_interval: 3h 
+
+  # A default receiver
+  receiver: team-roots-mails
+
+
+# Inhibition rules allow to mute a set of alerts given that another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is 
+# already critical.
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition if the alertname is the same.
+  equal: ['alertname', 'cluster', 'service']
+
+
+receivers:
+- name: 'team-roots-mails'
+  email_configs:
+  - to: 'roots@crans.org'
+
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index f1702b94..0dae5d37 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -13,6 +13,12 @@
     dest: /etc/prometheus/prometheus.yml
   notify: Restart Prometheus
 
+- name: Configure Prometheus alert rules
+  template:
+    src: prometheus/alert.rules.j2
+    dest: /etc/prometheus/alert.rules
+  notify: Restart Prometheus
+
 # We don't need to restart Prometheus when updating nodes
 - name: Configure Prometheus nodes
   copy:
diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2
index e96dedea..1844ad37 100644
--- a/roles/prometheus/templates/prometheus/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2
@@ -12,8 +12,14 @@ global:
 
 # Load and evaluate rules in this file every 'evaluation_interval' seconds.
 rule_files:
-  # - "first.rules"
-  # - "second.rules"
+  - "alert.rules"
+
+# Route alerts to Prometheus Alertmanager
+alerting:
+  alertmanagers:
+  - static_configs:
+    - targets:
+      - 'localhost:9093'
 
 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.
-- 
GitLab