diff --git a/monitoring.yml b/monitoring.yml index f807dcf4e96955336b422827292089f72daa32eb..cc19a6bdc4863fe0e2bfedbe683aea4f1f0b7440 100644 --- a/monitoring.yml +++ b/monitoring.yml @@ -23,6 +23,7 @@ - localhost:9090 roles: - prometheus + - prometheus-alertmanager # Monitor all hosts - hosts: all diff --git a/roles/prometheus-alertmanager/handlers/main.yml b/roles/prometheus-alertmanager/handlers/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..3ddbf930e81851f247d02d4c64798e6518904c44 --- /dev/null +++ b/roles/prometheus-alertmanager/handlers/main.yml @@ -0,0 +1,5 @@ +--- +- name: Restart Prometheus Alertmanager + service: + name: prometheus-alertmanager + state: restarted diff --git a/roles/prometheus-alertmanager/tasks/main.yml b/roles/prometheus-alertmanager/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..b65a2955309aad88df61dbe886ba85c7c3ffb63e --- /dev/null +++ b/roles/prometheus-alertmanager/tasks/main.yml @@ -0,0 +1,14 @@ +--- +- name: Install Prometheus Alertmanager + apt: + update_cache: true + name: prometheus-alertmanager + register: apt_result + retries: 3 + until: apt_result is succeeded + +- name: Configure Prometheus Alertmanager + template: + src: prometheus/alertmanager.yml.j2 + dest: /etc/prometheus/alertmanager.yml + notify: Restart Prometheus Alertmanager diff --git a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 new file mode 100644 index 0000000000000000000000000000000000000000..d7337c47d2aa225a63af74c75b3089143fe66b35 --- /dev/null +++ b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 @@ -0,0 +1,57 @@ +# {{ ansible_managed }} + +global: + # The smarthost and SMTP sender used for mail notifications. 
+ smtp_smarthost: 'localhost:25' + smtp_from: 'alertmanager@crans.org' + #smtp_auth_username: 'alertmanager' + #smtp_auth_password: 'password' + +# The directory from which notification templates are read. +templates: +- '/etc/prometheus/alertmanager_templates/*.tmpl' + +# The root route on which each incoming alert enters. +route: + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + group_by: ['alertname', 'cluster', 'service'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This ensures that multiple alerts for the same group that start + # firing shortly after one another are batched together in the first + # notification. + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' before + # resending it. + repeat_interval: 3h + + # A default receiver + receiver: team-roots-mails + + +# Inhibition rules mute a set of alerts while another alert is +# firing. +# We use this to mute any warning-level notifications if the same alert is +# already critical. +inhibit_rules: +- source_match: + severity: 'critical' + target_match: + severity: 'warning' + # Apply inhibition only if alertname, cluster and service all match. 
+ equal: ['alertname', 'cluster', 'service'] + + +receivers: +- name: 'team-roots-mails' + email_configs: + - to: 'roots@crans.org' + diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index f1702b943e0577864b294584b777d8863789f4a6..0dae5d37a0fd15eb2d54f7d56235ebac31431723 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -13,6 +13,12 @@ dest: /etc/prometheus/prometheus.yml notify: Restart Prometheus +- name: Configure Prometheus alert rules + template: + src: prometheus/alert.rules.j2 + dest: /etc/prometheus/alert.rules + notify: Restart Prometheus + # We don't need to restart Prometheus when updating nodes - name: Configure Prometheus nodes copy: diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2 index e96dedea41e91aed019a0d6ccfb37968cbecfbf4..1844ad3798b7608c0e45b4bf829f8f02354d6cc3 100644 --- a/roles/prometheus/templates/prometheus/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2 @@ -12,8 +12,14 @@ global: # Load and evaluate rules in this file every 'evaluation_interval' seconds. rule_files: - # - "first.rules" - # - "second.rules" + - "alert.rules" + +# Route alerts to Prometheus Alertmanager +alerting: + alertmanagers: + - static_configs: + - targets: + - 'localhost:9093' # A scrape configuration containing exactly one endpoint to scrape: # Here it's Prometheus itself.