From f7347e41d2ce799c760936535d9c6efccb3580c1 Mon Sep 17 00:00:00 2001 From: shirenn <shirenn@crans.org> Date: Sun, 23 May 2021 18:18:35 +0200 Subject: [PATCH] [monitoring] PEPCRANS + monitoring of cachan infra --- group_vars/all/prometheus_node_exporter.yaml | 3 + group_vars/grafana.yml | 7 + group_vars/nginx.yml | 3 + group_vars/prometheus.yml | 13 ++ host_vars/airbus.cachan-adm.crans.org.yml | 3 + host_vars/fyre.cachan-adm.crans.org.yml | 116 ++++++++++++ host_vars/gulp.cachan-adm.crans.org.yml | 3 + host_vars/monitoring.adm.crans.org.yml | 111 +++++++++++- host_vars/omnomnom.cachan-adm.crans.org.yml | 3 + host_vars/re2o-ldap.cachan-adm.crans.org.yml | 3 + host_vars/re2o.cachan-adm.crans.org.yml | 3 + host_vars/rodauh.cachan-adm.crans.org.yml | 6 + .../cachan.yml | 3 + host_vars/terenez.cachan-adm.crans.org.yml | 6 + host_vars/unifi.cachan-adm.crans.org.yml | 3 + host_vars/zephir.cachan-adm.crans.org.yml | 3 + hosts | 14 +- plays/monitoring.yml | 92 +++------- roles/ninjabot/tasks/main.yml | 5 + .../templates/ninjabot/ninjabot.json.j2 | 1 + .../systemd/system/ninjabot.service.j2 | 2 +- .../prometheus-nginx-exporter/tasks/main.yml | 2 +- roles/prometheus-node-exporter/tasks/main.yml | 30 +--- .../templates/prometheus/snmp.yml.j2 | 10 +- roles/prometheus/tasks/main.yml | 27 +-- .../templates/prometheus/alert.rules.yml.j2 | 38 ++-- .../templates/prometheus/django.rules.yml.j2 | 106 ----------- .../templates/prometheus/prometheus.yml.j2 | 167 ++---------------- 28 files changed, 381 insertions(+), 402 deletions(-) create mode 100644 group_vars/all/prometheus_node_exporter.yaml create mode 100644 group_vars/grafana.yml create mode 100644 group_vars/prometheus.yml create mode 100644 host_vars/fyre.cachan-adm.crans.org.yml create mode 100644 roles/ninjabot/templates/ninjabot/ninjabot.json.j2 delete mode 100644 roles/prometheus/templates/prometheus/django.rules.yml.j2 diff --git a/group_vars/all/prometheus_node_exporter.yaml b/group_vars/all/prometheus_node_exporter.yaml new file mode 100644 index 00000000..39212fa9 --- /dev/null +++ b/group_vars/all/prometheus_node_exporter.yaml @@ -0,0 +1,3 @@ +--- +glob_prometheus_node_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}" diff --git a/group_vars/grafana.yml b/group_vars/grafana.yml new file mode 100644 index 00000000..1cb40d06 --- /dev/null +++ b/group_vars/grafana.yml @@ -0,0 +1,7 @@ +--- +glob_grafana: + root_url: https://grafana.crans.org + icon: crans_icon_white.svg + ldap_base: "{{ glob_ldap.base }}" + ldap_master_ipv4: "{{ glob_ldap.servers[0] }}" + ldap_user_tree: "ou=passwd,{{ glob_ldap.base }}" diff --git a/group_vars/nginx.yml b/group_vars/nginx.yml index e2868541..8d61b273 100644 --- a/group_vars/nginx.yml +++ b/group_vars/nginx.yml @@ -30,3 +30,6 @@ glob_nginx: - "172.16.0.0/16" - "fd00:0:0:10::/64" deploy_robots_file: false + +glob_prometheus_nginx_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}" diff --git a/group_vars/prometheus.yml b/group_vars/prometheus.yml new file mode 100644 index 00000000..64642c8a --- /dev/null +++ b/group_vars/prometheus.yml @@ -0,0 +1,13 @@ +--- +glob_prometheus: {} + +glob_snmp_exporter: + procurve_password: "{{ vault.snmp_procurve_password }}" + unifi_password: "{{ vault.snmp_unifi_password }}" + +glob_ninjabot: + config: + nick: Prometheus + server: irc.adm.crans.org + port: 6667 + channel: "#monitoring" diff --git a/host_vars/airbus.cachan-adm.crans.org.yml b/host_vars/airbus.cachan-adm.crans.org.yml index b8d9e214..c8d0ef2e 100644 --- a/host_vars/airbus.cachan-adm.crans.org.yml +++ b/host_vars/airbus.cachan-adm.crans.org.yml @@ -18,3 +18,6 @@ loc_borg: remote: - borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }} ssh_options: "" + +glob_prometheus_node_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" diff --git a/host_vars/fyre.cachan-adm.crans.org.yml b/host_vars/fyre.cachan-adm.crans.org.yml new file mode 100644 index 00000000..2f6cfaee --- /dev/null +++ b/host_vars/fyre.cachan-adm.crans.org.yml @@ -0,0 +1,116 @@ +--- +interfaces: + adm: ens18 + +loc_home_nounou: + ip: 172.17.10.9 + mountpoint: /rpool/home + +loc_ldap: + servers: + - 172.17.10.9 + base: 'dc=crans,dc=org' + +loc_ntp_client: + servers: + - terenez.cachan-adm.crans.org + +debian_mirror: http://172.17.10.202/debian + +loc_mirror: + name: mirror.cachan-adm.crans.org + ip: "{{ query('ldap','ip','terenez','cachan-adm') | ipv4 | first }}" + +loc_borg: + remote: + - borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }} + ssh_options: "" + +glob_prometheus_node_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" + +glob_snmp_exporter: + procurve_password: "{{ vault.snmp_procurve_password }}" + unifi_password: "{{ vault.snmp_unifi_password }}" + +loc_ninjabot: + config: + nick: fyre + server: irc.adm.crans.org + port: 6667 + channel: "#monitoring" + +loc_prometheus: + node: + file: targets_node.json + targets: "{{ groups['server'] | select('match', '^.*\\.cachan-adm\\.crans\\.org$') | list | sort }}" + config: + - job_name: servers + file_sd_configs: + - files: + - '/etc/prometheus/targets_node.json' + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - source_labels: [__param_target] + target_label: __address__ + replacement: '$1:9100' + + ups_snmp: + file: targets_ups_snmp.json + targets: + - pulsar.cachan-adm.crans.org # 0B + - quasar.cachan-adm.crans.org # 4J + config: + - job_name: ups_snmp + file_sd_configs: + - files: + - '/etc/prometheus/targets_ups_snmp.json' + metrics_path: /snmp + params: + module: [eatonups] + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: 127.0.0.1:9116 + + unifi_snmp: + file: targets_unifi_snmp.json + targets: "{{ groups['crans_unifi'] | list | sort }}" + config: + - job_name: unifi_snmp + file_sd_configs: + - files: + - '/etc/prometheus/targets_unifi_snmp.json' + metrics_path: /snmp + params: + module: [ubiquiti_unifi] + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: 127.0.0.1:9116 + + nginx: + file: targets_nginx.json + targets: + - rodauh.cachan-adm.crans.org + - terenez.cachan-adm.crans.org + config: + - job_name: nginx + file_sd_configs: + - files: + - '/etc/prometheus/targets_nginx.json' + relabel_configs: + - source_labels: [__address__] + target_label: instance + - source_labels: [instance] + target_label: __address__ + replacement: '$1:9117' diff --git a/host_vars/gulp.cachan-adm.crans.org.yml b/host_vars/gulp.cachan-adm.crans.org.yml index 251c4bdd..a3b8517c 100644 --- a/host_vars/gulp.cachan-adm.crans.org.yml +++ b/host_vars/gulp.cachan-adm.crans.org.yml @@ -25,3 +25,6 @@ loc_borg: to_exclude: - /var/lib/lxcfs ssh_options: "" + +glob_prometheus_node_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" diff --git a/host_vars/monitoring.adm.crans.org.yml b/host_vars/monitoring.adm.crans.org.yml index ab6e12d6..ecdb2719 100644 --- a/host_vars/monitoring.adm.crans.org.yml +++ b/host_vars/monitoring.adm.crans.org.yml @@ -1,4 +1,113 @@ interfaces: adm: eth0 srv_nat: eth1 - infra: eth2 + +loc_prometheus: + node: + file: targets_node.json + targets: "{{ groups['server'] | select('match', '^.*\\.adm\\.crans\\.org$') | list | sort }}" + config: + - job_name: servers + file_sd_configs: + - files: + - '/etc/prometheus/targets_node.json' + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - source_labels: [__param_target] + target_label: __address__ + replacement: '$1:9100' + + nginx: + file: targets_nginx.json + targets: + - hodaur.adm.crans.org + - charybde.adm.crans.org + config: + - job_name: nginx + file_sd_configs: + - files: + - '/etc/prometheus/targets_nginx.json' + relabel_configs: + - source_labels: [__address__] + target_label: instance + - source_labels: [instance] + target_label: __address__ + replacement: '$1:9117' + + blackbox: + file: targets_blackbox.json + targets: + - https://crans.org/ + - https://www.crans.org/ + - https://webirc.crans.org/ + - https://jitsi.crans.org/ + - https://ftps.crans.org/ + - http://ftp.crans.org/ + - https://grafana.crans.org/ + - https://roundcube.crans.org/ + - https://zero.crans.org/ + - https://wiki.crans.org/PageAccueil + - https://framadate.crans.org/ + - https://pad.crans.org/ + - https://lists.crans.org/ + - https://cas.crans.org/ + - https://ethercalc.crans.org/ + - https://phabricator.crans.org/ + - https://webmail.crans.org/horde/login.php + - https://gitlab.crans.org/ + - https://perso.crans.org/crans/ + - https://install-party.crans.org/ + - https://intranet.crans.org/ + - https://owncloud.crans.org/ + config: + - job_name: blackbox + file_sd_configs: + - files: + - '/etc/prometheus/targets_blackbox.json' + metrics_path: /probe + params: + module: [http_2xx] # Look for a HTTP 200 response. + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: 127.0.0.1:9115 + + mtail: + file: targets_mtail.json + targets: + - tealc.adm.crans.org + config: + - job_name: mtail + static_configs: + - targets: ["tealc.adm.crans.org"] + relabel_configs: + - source_labels: [__address__] + target_label: instance + - source_labels: [instance] + target_label: __address__ + replacement: '$1:3903' + + +# apache: +# targets: +# config: +# - job_name: apache +# file_sd_configs: +# - files: +# - '/etc/prometheus/targets_apache.json' +# relabel_configs: +# - source_labels: [__address__] +# target_label: instance +# - source_labels: [instance] +# target_label: __address__ +# replacement: '$1:9117' + +# bird_targets: +# - routeur-sam.adm.crans.org + diff --git a/host_vars/omnomnom.cachan-adm.crans.org.yml b/host_vars/omnomnom.cachan-adm.crans.org.yml index d99be4be..3685560f 100644 --- a/host_vars/omnomnom.cachan-adm.crans.org.yml +++ b/host_vars/omnomnom.cachan-adm.crans.org.yml @@ -25,3 +25,6 @@ loc_borg: remote: - borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }} ssh_options: "" + +glob_prometheus_node_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" diff --git a/host_vars/re2o-ldap.cachan-adm.crans.org.yml b/host_vars/re2o-ldap.cachan-adm.crans.org.yml index b8d9e214..c8d0ef2e 100644 --- a/host_vars/re2o-ldap.cachan-adm.crans.org.yml +++ b/host_vars/re2o-ldap.cachan-adm.crans.org.yml @@ -18,3 +18,6 @@ loc_borg: remote: - borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }} ssh_options: "" + +glob_prometheus_node_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" diff --git a/host_vars/re2o.cachan-adm.crans.org.yml b/host_vars/re2o.cachan-adm.crans.org.yml index 279c4f4d..c872b865 100644 --- a/host_vars/re2o.cachan-adm.crans.org.yml +++ b/host_vars/re2o.cachan-adm.crans.org.yml @@ -14,6 +14,9 @@ glob_ntp_client: debian_mirror: http://172.17.10.202/debian +glob_prometheus_node_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" + loc_borg: remote: - borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }} diff --git a/host_vars/rodauh.cachan-adm.crans.org.yml b/host_vars/rodauh.cachan-adm.crans.org.yml index 37b60a76..75e6a304 100644 --- a/host_vars/rodauh.cachan-adm.crans.org.yml +++ b/host_vars/rodauh.cachan-adm.crans.org.yml @@ -37,3 +37,9 @@ loc_borg: remote: - borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }} ssh_options: "" + +glob_prometheus_node_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" + +glob_prometheus_nginx_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" diff --git a/host_vars/routeur-gulp.cachan-adm.crans.org/cachan.yml b/host_vars/routeur-gulp.cachan-adm.crans.org/cachan.yml index 4144ad09..20e5538c 100644 --- a/host_vars/routeur-gulp.cachan-adm.crans.org/cachan.yml +++ b/host_vars/routeur-gulp.cachan-adm.crans.org/cachan.yml @@ -22,3 +22,6 @@ loc_borg: remote: - borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }} ssh_options: "" + +glob_prometheus_node_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" diff --git a/host_vars/terenez.cachan-adm.crans.org.yml b/host_vars/terenez.cachan-adm.crans.org.yml index b8d9e214..46732a20 100644 --- a/host_vars/terenez.cachan-adm.crans.org.yml +++ b/host_vars/terenez.cachan-adm.crans.org.yml @@ -18,3 +18,9 @@ loc_borg: remote: - borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }} ssh_options: "" + +glob_prometheus_node_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" + +glob_prometheus_nginx_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" diff --git a/host_vars/unifi.cachan-adm.crans.org.yml b/host_vars/unifi.cachan-adm.crans.org.yml index 55bd2d2f..75dc7ea5 100644 --- a/host_vars/unifi.cachan-adm.crans.org.yml +++ b/host_vars/unifi.cachan-adm.crans.org.yml @@ -23,3 +23,6 @@ loc_borg: remote: - borg@zephir.cachan-adm.crans.org:/backup/borg/{{ ansible_hostname }} ssh_options: "" + +glob_prometheus_node_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" diff --git a/host_vars/zephir.cachan-adm.crans.org.yml b/host_vars/zephir.cachan-adm.crans.org.yml index 0bf66658..f30a1b16 100644 --- a/host_vars/zephir.cachan-adm.crans.org.yml +++ b/host_vars/zephir.cachan-adm.crans.org.yml @@ -27,3 +27,6 @@ loc_borg: ssh_options: "" to_exclude: - /var/lib/backuppc + +glob_prometheus_node_exporter: + listen_addr: "{{ query('ldap', 'ip', ansible_hostname, 'cachan-adm') | ipv4 | first }}" diff --git a/hosts b/hosts index 30746559..df51e245 100644 --- a/hosts +++ b/hosts @@ -18,6 +18,9 @@ tealc.adm.crans.org tealc.adm.crans.org gulp.cachan-adm.crans.org +[blackbox] +monitoring.adm.crans.org + [bdd:children] virtu @@ -81,6 +84,9 @@ neree.adm.crans.org [gitlab] gitzly.adm.crans.org +[grafana] +monitoring.adm.crans.org + [horde] horde.adm.crans.org @@ -99,17 +105,16 @@ linx.adm.crans.org [mailman] mailman.adm.crans.org -[monitoring] +[prometheus] monitoring.adm.crans.org - -[nginx] -charybde.adm.crans.org +fyre.cachan-adm.crans.org [nginx:children] django_cas galene jitsi mailman +ntp_server re2o_front reverseproxy roundcube @@ -212,6 +217,7 @@ cas.adm.crans.org codichotomie.adm.crans.org ethercalc.adm.crans.org fluxx.adm.crans.org +fyre.cachan-adm.crans.org gitlab-ci.adm.crans.org gitzly.adm.crans.org hodaur.adm.crans.org diff --git a/plays/monitoring.yml b/plays/monitoring.yml index 0685ef51..70a418a3 100755 --- a/plays/monitoring.yml +++ b/plays/monitoring.yml @@ -1,81 +1,43 @@ #!/usr/bin/env ansible-playbook --- -# Deploy Prometheus and Grafana on monitoring server -- hosts: monitoring - vars: - # Prometheus targets.json - prometheus: - node_targets: "{{ groups['server'] | list | sort }}" - ups_snmp_targets: - - pulsar.adm.crans.org # 0B - - quasar.adm.crans.org # 4J - procurve_snmp_targets: - - batg-9.infra.crans.org - unifi_snmp_targets: "{{ groups['crans_unifi'] | list | sort }}" - blackbox_targets: - - https://crans.org/ - - https://www.crans.org/ - - https://webirc.crans.org/ - - https://jitsi.crans.org/ - - https://ftps.crans.org/ - - http://ftp.crans.org/ - - https://grafana.crans.org/ - - https://roundcube.crans.org/ - - https://zero.crans.org/ - - https://wiki.crans.org/PageAccueil - - https://framadate.crans.org/ - - https://pad.crans.org/ - - https://lists.crans.org/ - - https://cas.crans.org/ - - https://ethercalc.crans.org/ - - https://phabricator.crans.org/ - - https://webmail.crans.org/horde/login.php - - https://gitlab.crans.org/ - - https://perso.crans.org/crans/ - - https://install-party.crans.org/ - - https://intranet.crans.org/ - - https://owncloud.crans.org/ - nginx_targets: - - hodaur.adm.crans.org - - charybde.adm.crans.org - apache_targets: [] # [zamok.adm.crans.org] - bird_targets: - - routeur-sam.adm.crans.org - - snmp_procurve_password: "{{ vault.snmp_procurve_password }}" - snmp_unifi_password: "{{ vault.snmp_unifi_password }}" - grafana: - root_url: https://grafana.crans.org - icon: crans_icon_white.svg - ldap_base: "{{ glob_ldap.base }}" - ldap_master_ipv4: "{{ glob_ldap.servers[0] }}" - ldap_user_tree: "ou=passwd,{{ glob_ldap.base }}" +# Deploy Prometheus on monitoring server +- hosts: prometheus + vars: + prometheus: "{{ glob_prometheus | default({}) | combine(loc_prometheus | default({})) }}" + alertmanager: "{{ glob_alertmanager | default({}) | combine(loc_alertmanager | default({})) }}" + snmp_exporter: "{{ glob_snmp_exporter | default({}) | combine(loc_snmp_exporter | default({})) }}" + ninjabot: "{{ glob_ninjabot | default({}) | combine(loc_ninjabot | default({})) }}" roles: - prometheus - prometheus-alertmanager - prometheus-snmp-exporter - - prometheus-blackbox-exporter - ninjabot - - grafana + +# # Deploy Grafana on monitoring server +# - hosts: grafana +# vars: +# grafana: "{{ glob_grafana | default({}) | combine(loc_grafana | default({})) }}" +# roles: +# - grafana + +- hosts: blackbox + roles: + - prometheus-blackbox-exporter # Monitor all hosts - hosts: server vars: - adm_ipv4: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}" - roles: ["prometheus-node-exporter"] + prometheus_node_exporter: "{{ glob_prometheus_node_exporter | default({}) | combine(loc_prometheus_node_exporter | default({})) }}" + roles: + - prometheus-node-exporter # Export nginx metrics - hosts: nginx vars: - adm_ipv4: "{{ query('ldap', 'ip', ansible_hostname, 'adm') | ipv4 | first }}" - roles: ["prometheus-nginx-exporter"] - -# Export apache metrics -#- hosts: zamok.adm.crans.org -# vars: -# adm_ipv4: "{{ ansible_all_ipv4_addresses | ipaddr(adm_subnet) | first }}" -# roles: ["prometheus-apache-exporter"] + prometheus_nginx_exporter: "{{ glob_prometheus_nginx_exporter | default({}) | combine(loc_prometheus_nginx_exporter | default({})) }}" + roles: + - prometheus-nginx-exporter # Monitor mailq with a special text exporter #- hosts: redisdead.adm.crans.org @@ -85,7 +47,9 @@ - hosts: tealc.adm.crans.org vars: mirror: '{{ glob_mirror | default({}) | combine(loc_mirror | default({})) }}' - roles: ["mtail"] + roles: + - mtail - hosts: sputnik.adm.crans.org - roles: ["statping"] + roles: + - statping diff --git a/roles/ninjabot/tasks/main.yml b/roles/ninjabot/tasks/main.yml index 98bb4f7a..34d0184b 100644 --- a/roles/ninjabot/tasks/main.yml +++ b/roles/ninjabot/tasks/main.yml @@ -17,6 +17,11 @@ dest: /var/local/ninjabot version: master +- name: Deploy NinjaBot configuration + template: + src: ninjabot/ninjabot.json.j2 + dest: /var/local/ninjabot/ninjabot.json + - name: Deploy NinjaBot systemd unit template: src: systemd/system/ninjabot.service.j2 diff --git a/roles/ninjabot/templates/ninjabot/ninjabot.json.j2 b/roles/ninjabot/templates/ninjabot/ninjabot.json.j2 new file mode 100644 index 00000000..d0296ae2 --- /dev/null +++ b/roles/ninjabot/templates/ninjabot/ninjabot.json.j2 @@ -0,0 +1 @@ +{{ ninjabot.config | to_nice_json(indent=2) }} diff --git a/roles/ninjabot/templates/systemd/system/ninjabot.service.j2 b/roles/ninjabot/templates/systemd/system/ninjabot.service.j2 index 8c88045b..526a2962 100644 --- a/roles/ninjabot/templates/systemd/system/ninjabot.service.j2 +++ b/roles/ninjabot/templates/systemd/system/ninjabot.service.j2 @@ -8,7 +8,7 @@ Type=simple WorkingDirectory=/var/local/ninjabot User=nobody Group=nogroup -ExecStart=/usr/bin/python3 /var/local/ninjabot/main.py +ExecStart=/usr/bin/python3 /var/local/ninjabot/ninjabot.py Restart=always [Install] diff --git a/roles/prometheus-nginx-exporter/tasks/main.yml b/roles/prometheus-nginx-exporter/tasks/main.yml index c7e8f32c..0fb8001d 100644 --- a/roles/prometheus-nginx-exporter/tasks/main.yml +++ b/roles/prometheus-nginx-exporter/tasks/main.yml @@ -14,7 +14,7 @@ path: /etc/default/prometheus-nginx-exporter regexp: '^ARGS=' line: | - ARGS="-web.listen-address={{ adm_ipv4 }}:9117 -nginx.scrape-uri=http://[::1]:6424/stub_status" + ARGS="-web.listen-address={{ prometheus_nginx_exporter.listen_addr }}:9117 -nginx.scrape-uri=http://[::1]:6424/stub_status" notify: - Restart nginx - Restart prometheus-nginx-exporter diff --git a/roles/prometheus-node-exporter/tasks/main.yml b/roles/prometheus-node-exporter/tasks/main.yml index d8f2a1c2..a91d9d9e 100644 --- a/roles/prometheus-node-exporter/tasks/main.yml +++ b/roles/prometheus-node-exporter/tasks/main.yml @@ -7,21 +7,6 @@ register: apt_result retries: 3 until: apt_result is succeeded - when: - - ansible_lsb.codename != 'stretch' - -# Prometheus 2 node is in stretch-backports -- name: Install Prometheus node-exporter (stretch-backports) - apt: - update_cache: true - name: prometheus-node-exporter - install_recommends: false - default_release: stretch-backports - register: apt_result - retries: 3 - until: apt_result is succeeded - when: - - ansible_lsb.codename == 'stretch' - name: Install Prometheus node-exporter-collectors (bullseye) apt: @@ -45,7 +30,7 @@ path: /etc/default/prometheus-node-exporter regexp: '^ARGS=' line: | - ARGS="--web.listen-address={{ adm_ipv4 }}:9100" + ARGS="--web.listen-address={{ prometheus_node_exporter.listen_addr }}:9100" tags: restart-node-exporter # Install new APT textfile collector, it might be upstreamed one day @@ -57,15 +42,4 @@ owner: root group: root mode: 0755 - when: ansible_lsb.id == 'Debian' and ansible_distribution_release != "bullseye" - -# Install new APT textfile collector, it might be upstreamed one day -# https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/pull/35 -- name: Patch APT textfile collector - copy: - src: apt.sh - dest: /usr/share/prometheus-node-exporter-collectors/apt.sh - owner: root - group: root - mode: 0755 - when: ansible_lsb.id == 'Ubuntu' or ansible_distribution_release == "bullseye" + when: ansible_distribution_release != "bullseye" diff --git a/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2 b/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2 index aa30bc43..fb946b74 100644 --- a/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2 +++ b/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2 @@ -113,14 +113,14 @@ procurve_switch: version: 3 auth: # To create SNMPv3 user on HP procurve, execute: - # snmpv3 user snmp_prometheus auth sha {{ snmp_procurve_password }} priv aes {{ snmp_procurve_password }} + # snmpv3 user snmp_prometheus auth sha {{ snmp_exporter.procurve_password }} priv aes {{ snmp_exporter.procurve_password }} # snmpv3 group managerpriv user snmp_prometheus sec-model ver3 security_level: authPriv username: snmp_prometheus - password: {{ snmp_procurve_password }} + password: {{ snmp_exporter.procurve_password }} auth_protocol: SHA priv_protocol: AES - priv_password: {{ snmp_procurve_password }} + priv_password: {{ snmp_exporter.procurve_password }} ubiquiti_unifi: walk: @@ -475,7 +475,7 @@ ubiquiti_unifi: auth: security_level: authPriv username: snmp_prometheus - password: {{ snmp_unifi_password }} + password: {{ snmp_exporter.unifi_password }} auth_protocol: SHA priv_protocol: AES - priv_password: {{ snmp_unifi_password }} + priv_password: {{ snmp_exporter.unifi_password }} diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 72136715..2a9f54fe 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -16,35 +16,18 @@ - name: Configure Prometheus alert rules template: - src: "prometheus/{{ item }}.j2" - dest: "/etc/prometheus/{{ item }}" + src: prometheus/alert.rules.yml.j2 + dest: /etc/prometheus/alert.rules.yml mode: 0644 notify: Restart Prometheus - loop: - - alert.rules.yml - - django.rules.yml # We don't need to restart Prometheus when updating nodes - name: Configure Prometheus targets copy: - content: "{{ [{'targets': item.targets}] | to_nice_json }}\n" - dest: "/etc/prometheus/{{ item.file }}.json" + content: "{{ [{'targets': item.value.targets}] | to_nice_json }}\n" + dest: "/etc/prometheus/{{ item.value.file }}" mode: 0644 - loop: - - file: targets - targets: "{{ prometheus.node_targets }}" - - file: targets_ups_snmp - targets: "{{ prometheus.ups_snmp_targets }}" - - file: targets_procurve_snmp - targets: "{{ prometheus.procurve_snmp_targets }}" - - file: targets_unifi_snmp - targets: "{{ prometheus.unifi_snmp_targets }}" - - file: targets_nginx - targets: "{{ prometheus.nginx_targets }}" - - file: targets_apache - targets: "{{ prometheus.apache_targets }}" - - file: targets_blackbox - targets: "{{ prometheus.blackbox_targets }}" + loop: "{{ prometheus | dict2items }}" - name: Activate prometheus service systemd: diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 index d5017c6f..afc37b65 100644 --- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 @@ -144,7 +144,7 @@ groups: description: "https://grafana.crans.org/d/qtbg59mZz/alimentation" - alert: UpsWrongOutputVoltage - expr: (upsOutputVoltage < 225) or (upsOutputVoltage > 235) + expr: (upsOutputVoltage < 215) or (upsOutputVoltage > 245) for: 5m labels: severity: warning @@ -161,29 +161,27 @@ groups: summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}." - alert: MailqNotEmpty - expr: postfix_mailq_length > 5 + expr: postfix_mailq_length > 25 for: 1m labels: severity: warning annotations: summary: "{{ $value }} mails dans la mailq sur {{ $labels.instance }}." - # NTP (need NTP plugin in node) -# - alert: ntp_drifting -# expr: node_ntp_drift_seconds > 0.05 -# for: 3m -# labels: -# severity: critical -# annotations: -# summary: "Décalage NTP trop élevé sur {{ $labels.instance }}" -# description: "Le décalage NTP est trop élevé ({{ $value }} > 0.05)" - -# - alert: ntp_drifting -# expr: node_ntp_drift_seconds > 0.01 -# for: 1m -# labels: -# severity: warning -# annotations: -# summary: "Décalage NTP élevé sur {{ $labels.instance }}" -# description: "Le décalage NTP est élevé ({{ $value }} > 0.01)" + - alert: NoRadiusLogin + expr: rate(radiusd_access_ok[3m]) == 0 + for: 2m + labels: + severity: warning + annotations: + summary: "Personne ne vient taper le RADIUS." + + - alert: TooManyReallocatedSectors + expr: smartmon_reallocated_sector_ct_raw_value > 1e3 + for: 5m + labels: + severity: warning + annotations: + summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués." + {% endraw %} diff --git a/roles/prometheus/templates/prometheus/django.rules.yml.j2 b/roles/prometheus/templates/prometheus/django.rules.yml.j2 deleted file mode 100644 index 8131a71f..00000000 --- a/roles/prometheus/templates/prometheus/django.rules.yml.j2 +++ /dev/null @@ -1,106 +0,0 @@ -{{ ansible_header | comment }} -{# As this is also Jinja2 it will conflict without a raw block #} -{% raw %} -groups: -- name: django.rules - rules: - - record: job:django_http_requests_before_middlewares_total:sum_rate30s - expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job) - - record: job:django_http_requests_unknown_latency_total:sum_rate30s - expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job) - - record: job:django_http_ajax_requests_total:sum_rate30s - expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job) - - record: job:django_http_responses_before_middlewares_total:sum_rate30s - expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job) - - record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s - expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s])) - BY (job) - - record: job:django_http_requests_body_total_bytes:sum_rate30s - expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job) - - record: job:django_http_responses_streaming_total:sum_rate30s - expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job) - - record: job:django_http_responses_body_total_bytes:sum_rate30s - expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job) - - record: job:django_http_requests_total:sum_rate30s - expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job) - - record: job:django_http_requests_total_by_method:sum_rate30s - expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method) - - record: job:django_http_requests_total_by_transport:sum_rate30s - expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport) - - record: job:django_http_requests_total_by_view:sum_rate30s - expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, - view) - - record: job:django_http_requests_total_by_view_transport_method:sum_rate30s - expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, - view, transport, method) - - record: job:django_http_responses_total_by_templatename:sum_rate30s - expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename) - - record: job:django_http_responses_total_by_status:sum_rate30s - expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status) - - record: job:django_http_responses_total_by_charset:sum_rate30s - expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset) - - record: job:django_http_exceptions_total_by_type:sum_rate30s - expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type) - - record: job:django_http_exceptions_total_by_view:sum_rate30s - expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view) - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "50" - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "95" - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99" - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99.9" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "50" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "95" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99.9" - - record: job:django_model_inserts_total:sum_rate1m - expr: sum(rate(django_model_inserts_total[1m])) BY (job, model) - - record: job:django_model_updates_total:sum_rate1m - expr: sum(rate(django_model_updates_total[1m])) BY (job, model) - - record: job:django_model_deletes_total:sum_rate1m - expr: sum(rate(django_model_deletes_total[1m])) BY (job, model) - - record: job:django_db_new_connections_total:sum_rate30s - expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor) - - record: job:django_db_new_connection_errors_total:sum_rate30s - expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor) - - record: job:django_db_execute_total:sum_rate30s - expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor) - - record: job:django_db_execute_many_total:sum_rate30s - expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor) - - record: job:django_db_errors_total:sum_rate30s - expr: sum(rate(django_db_errors_total[30s])) BY (alias, vendor, type) - - record: job:django_migrations_applied_total:max - expr: max(django_migrations_applied_total) BY (job, connection) - - record: job:django_migrations_unapplied_total:max - expr: max(django_migrations_unapplied_total) BY (job, connection) -{% endraw %} diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2 index 5877a182..daa136c4 100644 --- a/roles/prometheus/templates/prometheus/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2 @@ -20,156 +20,23 @@ alerting: # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. rule_files: - "alert.rules.yml" # Monitoring alerts, this is the file you may be searching! - - "django.rules.yml" # Custom rules specific for Django project monitoring # A scrape configuration containing exactly one endpoint to scrape: # Here it's Prometheus itself. -scrape_configs: - # The .json in file_sd_configs is dynamically reloaded - - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - - - job_name: servers - file_sd_configs: - - files: - - '/etc/prometheus/targets.json' - relabel_configs: - # Do not put :9100 in instance name, rather here - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - source_labels: [__param_target] - target_label: __address__ - replacement: '$1:9100' - -{% if prometheus.ups_snmp_targets is defined %} - - job_name: ups_snmp - file_sd_configs: - - files: - - '/etc/prometheus/targets_ups_snmp.json' - metrics_path: /snmp - params: - module: [eatonups] - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: 127.0.0.1:9116 -{% endif %} - -{% if prometheus.procurve_snmp_targets is defined %} - - job_name: procurve_snmp - file_sd_configs: - - files: - - '/etc/prometheus/targets_procurve_snmp.json' - metrics_path: /snmp - params: - module: [procurve_switch] - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: 127.0.0.1:9116 -{% endif %} - -{% if prometheus.unifi_snmp_targets is defined %} - - job_name: unifi_snmp - file_sd_configs: - - files: - - '/etc/prometheus/targets_unifi_snmp.json' - metrics_path: /snmp - params: - module: [ubiquiti_unifi] - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: 127.0.0.1:9116 -{% endif %} - -{% if prometheus.nginx_targets is defined %} - - job_name: nginx - file_sd_configs: - - files: - - '/etc/prometheus/targets_nginx.json' - relabel_configs: - # Do not put :9117 in instance name, rather here - - source_labels: [__address__] - target_label: instance - - source_labels: [instance] - target_label: __address__ - replacement: '$1:9117' -{% endif %} - -{% if prometheus.apache_targets is defined %} - - job_name: apache - file_sd_configs: - - files: - - '/etc/prometheus/targets_apache.json' - relabel_configs: - # Do not put :9117 in instance name, rather here - - source_labels: [__address__] - target_label: instance - - source_labels: [instance] - target_label: __address__ - replacement: '$1:9117' -{% endif %} - -{% if prometheus.blackbox_targets is defined %} - - job_name: blackbox - file_sd_configs: - - files: - - '/etc/prometheus/targets_blackbox.json' - metrics_path: /probe - params: - module: [http_2xx] # Look for a HTTP 200 response. - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: 127.0.0.1:9115 -{% endif %} - - - job_name: mtail - static_configs: - - targets: ["tealc.adm.crans.org"] - relabel_configs: - # Do not put :3903 in instance name, rather here - - source_labels: [__address__] - target_label: instance - - source_labels: [instance] - target_label: __address__ - replacement: '$1:3903' - -{% if prometheus.bird_targets is defined %} - - job_name: bird - file_sd_configs: - - files: - - '/etc/prometheus/targets_bird.json' - relabel_configs: - # Do not put :3903 in instance name, rather here - - source_labels: [__address__] - target_label: instance - - source_labels: [instance] - target_label: __address__ - replacement: '$1:9324' -{% endif %} - - - job_name: django - scheme: https - static_configs: - - targets: [] -# Activate this line when the captive portal need monitoring -# - targets: ["portail-captif.crans.org:443"] +{{ + { + "scrape_configs": + [ + { + "job_name": "prometheus", + "static_configs" : [ + { + "targets": [ + "localhost:9090" + ] + } + ] + } + ] + (prometheus | json_query("*.config[0]")) + } | to_nice_yaml(indent=2) +}} -- GitLab