From de63fbb99e7561baa7fa52edea6f1b57f07df726 Mon Sep 17 00:00:00 2001
From: Alexandre Iooss <erdnaxe@crans.org>
Date: Thu, 25 Apr 2019 18:30:13 +0200
Subject: [PATCH] [monitoring] Prometheus Alertmanager

---
Review notes (text in this section is not part of the commit message):

* What this patch does: adds a `prometheus-alertmanager` role (apt install,
  templated /etc/prometheus/alertmanager.yml, restart handler), enables it in
  monitoring.yml, and points Prometheus at Alertmanager on localhost:9093
  with a rule file named "alert.rules".
* NOTE(review): roles/prometheus/tasks/main.yml now renders
  prometheus/alert.rules.j2, but that template is not among the six files
  touched here. Confirm it already exists in the tree; if it does not, the
  new "Configure Prometheus alert rules" task has no source to render.
* NOTE(review): this patch was recovered from a whitespace-collapsed copy.
  Indentation, in particular the leading whitespace of context lines in
  monitoring.yml, roles/prometheus/tasks/main.yml and prometheus.yml.j2, is a
  best-effort reconstruction. If a hunk is rejected, retry with
  `git apply --ignore-whitespace`.
* NOTE(review): a few comments in alertmanager.yml.j2 were reworded (line
  counts unchanged), so the new-file blob will no longer hash to d7337c47.

 monitoring.yml                                |  1 +
 .../prometheus-alertmanager/handlers/main.yml |  5 ++
 roles/prometheus-alertmanager/tasks/main.yml  | 14 +++++
 .../templates/prometheus/alertmanager.yml.j2  | 57 +++++++++++++++++++
 roles/prometheus/tasks/main.yml               |  6 ++
 .../templates/prometheus/prometheus.yml.j2    | 10 +++-
 6 files changed, 91 insertions(+), 2 deletions(-)
 create mode 100644 roles/prometheus-alertmanager/handlers/main.yml
 create mode 100644 roles/prometheus-alertmanager/tasks/main.yml
 create mode 100644 roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2

diff --git a/monitoring.yml b/monitoring.yml
index f807dcf4..cc19a6bd 100644
--- a/monitoring.yml
+++ b/monitoring.yml
@@ -23,6 +23,7 @@
           - localhost:9090
   roles:
     - prometheus
+    - prometheus-alertmanager
 
 # Monitor all hosts
 - hosts: all
diff --git a/roles/prometheus-alertmanager/handlers/main.yml b/roles/prometheus-alertmanager/handlers/main.yml
new file mode 100644
index 00000000..3ddbf930
--- /dev/null
+++ b/roles/prometheus-alertmanager/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart Prometheus Alertmanager
+  service:
+    name: prometheus-alertmanager
+    state: restarted
diff --git a/roles/prometheus-alertmanager/tasks/main.yml b/roles/prometheus-alertmanager/tasks/main.yml
new file mode 100644
index 00000000..b65a2955
--- /dev/null
+++ b/roles/prometheus-alertmanager/tasks/main.yml
@@ -0,0 +1,14 @@
+---
+- name: Install Prometheus Alertmanager
+  apt:
+    update_cache: true
+    name: prometheus-alertmanager
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Configure Prometheus Alertmanager
+  template:
+    src: prometheus/alertmanager.yml.j2
+    dest: /etc/prometheus/alertmanager.yml
+  notify: Restart Prometheus Alertmanager
diff --git a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
new file mode 100644
index 00000000..d7337c47
--- /dev/null
+++ b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
@@ -0,0 +1,57 @@
+# {{ ansible_managed }}
+
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: 'localhost:25'
+  smtp_from: 'alertmanager@crans.org'
+  #smtp_auth_username: 'alertmanager'
+  #smtp_auth_password: 'password'
+
+# The directory from which notification templates are read.
+templates:
+- '/etc/prometheus/alertmanager_templates/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['alertname', 'cluster', 'service']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This ensures that multiple alerts for the same group that start
+  # firing shortly after one another are batched together in the first
+  # notification.
+  group_wait: 30s
+
+  # When the first notification was sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If an alert has successfully been sent, wait 'repeat_interval' to
+  # resend it.
+  repeat_interval: 3h
+
+  # A default receiver
+  receiver: team-roots-mails
+
+
+# Inhibition rules mute a set of alerts while another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition only if alertname, cluster and service all match.
+  equal: ['alertname', 'cluster', 'service']
+
+
+receivers:
+- name: 'team-roots-mails'
+  email_configs:
+  - to: 'roots@crans.org'
+
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index f1702b94..0dae5d37 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -13,6 +13,12 @@
     dest: /etc/prometheus/prometheus.yml
   notify: Restart Prometheus
 
+- name: Configure Prometheus alert rules
+  template:
+    src: prometheus/alert.rules.j2
+    dest: /etc/prometheus/alert.rules
+  notify: Restart Prometheus
+
 # We don't need to restart Prometheus when updating nodes
 - name: Configure Prometheus nodes
   copy:
diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2
index e96dedea..1844ad37 100644
--- a/roles/prometheus/templates/prometheus/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2
@@ -12,8 +12,14 @@ global:
 
 # Load and evaluate rules in this file every 'evaluation_interval' seconds.
 rule_files:
-  # - "first.rules"
-  # - "second.rules"
+  - "alert.rules"
+
+# Route alerts to Prometheus Alertmanager
+alerting:
+  alertmanagers:
+  - static_configs:
+    - targets:
+      - 'localhost:9093'
 
 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.
-- 
GitLab