From f3c38819ef7d8753ea9f1729deff506c964de08e Mon Sep 17 00:00:00 2001 From: Alexandre Iooss <erdnaxe@crans.org> Date: Wed, 13 Jan 2021 16:37:48 +0100 Subject: [PATCH] Prometheus on bullseye --- plays/monitoring.yml | 11 +- roles/mtail/templates/mtail/radiusd.mtail.j2 | 6 +- .../templates/prometheus/alertmanager.yml.j2 | 4 - roles/prometheus-node-exporter/tasks/main.yml | 14 +- .../default/prometheus-node-exporter.j2 | 130 ------------------ .../templates/prometheus/snmp.yml.j2 | 14 +- roles/prometheus/tasks/main.yml | 60 +++----- .../templates/prometheus/alert.rules.yml.j2 | 2 +- .../templates/prometheus/prometheus.yml.j2 | 38 ++++- 9 files changed, 83 insertions(+), 196 deletions(-) delete mode 100644 roles/prometheus-node-exporter/templates/default/prometheus-node-exporter.j2 diff --git a/plays/monitoring.yml b/plays/monitoring.yml index d813a866..a046fb6b 100755 --- a/plays/monitoring.yml +++ b/plays/monitoring.yml @@ -6,9 +6,11 @@ # Prometheus targets.json prometheus: node_targets: "{{ groups['server'] | list | sort }}" - ups_snmp_targets: [] - # - pulsar.adm.crans.org # 0B - # - quasar.adm.crans.org # 4J + ups_snmp_targets: + - pulsar.adm.crans.org # 0B + - quasar.adm.crans.org # 4J + procurve_snmp_targets: + - batg-9.infra.crans.org unifi_snmp_targets: "{{ groups['crans_unifi'] | list | sort }}" blackbox_targets: - https://crans.org/ @@ -37,7 +39,10 @@ - hodaur.adm.crans.org - charybde.adm.crans.org apache_targets: [] # [zamok.adm.crans.org] + bird_targets: + - routeur-sam.adm.crans.org + snmp_procurve_password: "{{ vault_snmp_procurve_password }}" snmp_unifi_password: "{{ vault_snmp_unifi_password }}" grafana: diff --git a/roles/mtail/templates/mtail/radiusd.mtail.j2 b/roles/mtail/templates/mtail/radiusd.mtail.j2 index e7b090f0..9d1dc16f 100644 --- a/roles/mtail/templates/mtail/radiusd.mtail.j2 +++ b/roles/mtail/templates/mtail/radiusd.mtail.j2 @@ -35,7 +35,11 @@ def syslog { } # Bouh! 
- /Adherent non cotisant/ { + /Invalid connexion \(non-contributing user\)/ { radiusd_access_refused["Did not pay"]++ } + + /Invalid user/ { + radiusd_access_refused["Invalid user"]++ + } } diff --git a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 index 1b61324d..620ddee9 100644 --- a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 +++ b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 @@ -7,10 +7,6 @@ global: smtp_from: 'alertmanager@example.org' #smtp_auth_username: 'alertmanager' #smtp_auth_password: 'password' - # The auth token for Hipchat. - hipchat_auth_token: '1234556789' - # Alternative host for Hipchat. - hipchat_api_url: 'https://hipchat.foobar.org/' # The directory from which notification templates are read. templates: diff --git a/roles/prometheus-node-exporter/tasks/main.yml b/roles/prometheus-node-exporter/tasks/main.yml index b324f2fb..0e07c74d 100644 --- a/roles/prometheus-node-exporter/tasks/main.yml +++ b/roles/prometheus-node-exporter/tasks/main.yml @@ -29,12 +29,12 @@ enabled: true state: started -# Doesn't work on Debian Stretch with the old prometheus package - name: Make Prometheus node-exporter listen on adm only - template: - src: default/prometheus-node-exporter.j2 - dest: /etc/default/prometheus-node-exporter - notify: Restart prometheus-node-exporter + lineinfile: + path: /etc/default/prometheus-node-exporter + regexp: '^ARGS=' + line: | + ARGS="--web.listen-address={{ adm_ipv4 }}:9100" tags: restart-node-exporter # Install new APT textfile collector, it might be upstreamed one day @@ -46,7 +46,7 @@ owner: root group: root mode: 0755 - when: ansible_lsb.id == 'Debian' + when: ansible_lsb.id == 'Debian' and ansible_distribution_release != "bullseye" # Install new APT textfile collector, it might be upstreamed one day # 
https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/pull/35 @@ -57,4 +57,4 @@ owner: root group: root mode: 0755 - when: ansible_lsb.id == 'Ubuntu' + when: ansible_lsb.id == 'Ubuntu' or ansible_distribution_release == "bullseye" diff --git a/roles/prometheus-node-exporter/templates/default/prometheus-node-exporter.j2 b/roles/prometheus-node-exporter/templates/default/prometheus-node-exporter.j2 deleted file mode 100644 index 819d243a..00000000 --- a/roles/prometheus-node-exporter/templates/default/prometheus-node-exporter.j2 +++ /dev/null @@ -1,130 +0,0 @@ -{{ ansible_header | comment }} - -# Set the command-line arguments to pass to the server. -# Due to shell scaping, to pass backslashes for regexes, you need to double -# them (\\d for \d). If running under systemd, you need to double them again -# (\\\\d to mean \d), and escape newlines too. -ARGS="--web.listen-address={{ adm_ipv4 }}:9100" - -# Prometheus-node-exporter supports the following options: -# -# --collector.diskstats.ignored-devices="^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$" -# Regexp of devices to ignore for diskstats. -# --collector.filesystem.ignored-mount-points="^/(dev|proc|run|sys|mnt|media|var/lib/docker)($|/)" -# Regexp of mount points to ignore for filesystem -# collector. -# --collector.filesystem.ignored-fs-types="^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$" -# Regexp of filesystem types to ignore for -# filesystem collector. -# --collector.netdev.ignored-devices="^lo$" -# Regexp of net devices to ignore for netdev -# collector. -# --collector.netstat.fields="^(.*_(InErrors|InErrs)|Ip_Forwarding|Ip(6|Ext)_(InOctets|OutOctets)|Icmp6?_(InMsgs|OutMsgs)|TcpExt_(Listen.*|Syncookies.*)|Tcp_(ActiveOpens|PassiveOpens|RetransSegs|CurrEstab)|Udp6?_(InDatagrams|OutDatagrams|NoPorts))$" -# Regexp of fields to return for netstat -# collector. 
-# --collector.ntp.server="127.0.0.1" -# NTP server to use for ntp collector -# --collector.ntp.protocol-version=4 -# NTP protocol version -# --collector.ntp.server-is-local -# Certify that collector.ntp.server address is the -# same local host as this collector. -# --collector.ntp.ip-ttl=1 IP TTL to use while sending NTP query -# --collector.ntp.max-distance=3.46608s -# Max accumulated distance to the root -# --collector.ntp.local-offset-tolerance=1ms -# Offset between local clock and local ntpd time -# to tolerate -# --path.procfs="/proc" procfs mountpoint. -# --path.sysfs="/sys" sysfs mountpoint. -# --collector.qdisc.fixtures="" -# test fixtures to use for qdisc collector -# end-to-end testing -# --collector.runit.servicedir="/etc/service" -# Path to runit service directory. -# --collector.supervisord.url="http://localhost:9001/RPC2" -# XML RPC endpoint. -# --collector.systemd.unit-whitelist=".+" -# Regexp of systemd units to whitelist. Units must -# both match whitelist and not match blacklist to -# be included. -# --collector.systemd.unit-blacklist=".+(\\.device|\\.scope|\\.slice|\\.target)" -# Regexp of systemd units to blacklist. Units must -# both match whitelist and not match blacklist to -# be included. -# --collector.systemd.private -# Establish a private, direct connection to -# systemd without dbus. -# --collector.textfile.directory="/var/lib/prometheus/node-exporter" -# Directory to read text files with metrics from. -# --collector.vmstat.fields="^(oom_kill|pgpg|pswp|pg.*fault).*" -# Regexp of fields to return for vmstat collector. -# --collector.wifi.fixtures="" -# test fixtures to use for wifi collector metrics -# --collector.arp Enable the arp collector (default: enabled). -# --collector.bcache Enable the bcache collector (default: enabled). -# --collector.bonding Enable the bonding collector (default: enabled). -# --collector.buddyinfo Enable the buddyinfo collector (default: -# disabled). 
-# --collector.conntrack Enable the conntrack collector (default: -# enabled). -# --collector.cpu Enable the cpu collector (default: enabled). -# --collector.diskstats Enable the diskstats collector (default: -# enabled). -# --collector.drbd Enable the drbd collector (default: disabled). -# --collector.edac Enable the edac collector (default: enabled). -# --collector.entropy Enable the entropy collector (default: enabled). -# --collector.filefd Enable the filefd collector (default: enabled). -# --collector.filesystem Enable the filesystem collector (default: -# enabled). -# --collector.hwmon Enable the hwmon collector (default: enabled). -# --collector.infiniband Enable the infiniband collector (default: -# enabled). -# --collector.interrupts Enable the interrupts collector (default: -# disabled). -# --collector.ipvs Enable the ipvs collector (default: enabled). -# --collector.ksmd Enable the ksmd collector (default: disabled). -# --collector.loadavg Enable the loadavg collector (default: enabled). -# --collector.logind Enable the logind collector (default: disabled). -# --collector.mdadm Enable the mdadm collector (default: enabled). -# --collector.meminfo Enable the meminfo collector (default: enabled). -# --collector.meminfo_numa Enable the meminfo_numa collector (default: -# disabled). -# --collector.mountstats Enable the mountstats collector (default: -# disabled). -# --collector.netdev Enable the netdev collector (default: enabled). -# --collector.netstat Enable the netstat collector (default: enabled). -# --collector.nfs Enable the nfs collector (default: enabled). -# --collector.nfsd Enable the nfsd collector (default: enabled). -# --collector.ntp Enable the ntp collector (default: disabled). -# --collector.qdisc Enable the qdisc collector (default: disabled). -# --collector.runit Enable the runit collector (default: disabled). -# --collector.sockstat Enable the sockstat collector (default: -# enabled). 
-# --collector.stat Enable the stat collector (default: enabled). -# --collector.supervisord Enable the supervisord collector (default: -# disabled). -# --collector.systemd Enable the systemd collector (default: enabled). -# --collector.tcpstat Enable the tcpstat collector (default: -# disabled). -# --collector.textfile Enable the textfile collector (default: -# enabled). -# --collector.time Enable the time collector (default: enabled). -# --collector.uname Enable the uname collector (default: enabled). -# --collector.vmstat Enable the vmstat collector (default: enabled). -# --collector.wifi Enable the wifi collector (default: enabled). -# --collector.xfs Enable the xfs collector (default: enabled). -# --collector.zfs Enable the zfs collector (default: enabled). -# --collector.timex Enable the timex collector (default: enabled). -# --web.listen-address=":9100" -# Address on which to expose metrics and web -# interface. -# --web.telemetry-path="/metrics" -# Path under which to expose metrics. -# --log.level="info" Only log messages with the given severity or -# above. Valid levels: [debug, info, warn, error, -# fatal] -# --log.format="logger:stderr" -# Set the log target and format. 
Example: -# "logger:syslog?appname=bob&local=7" or -# "logger:stdout?json=true" diff --git a/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2 b/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2 index 8748744f..69770a1a 100644 --- a/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2 +++ b/roles/prometheus-snmp-exporter/templates/prometheus/snmp.yml.j2 @@ -1,8 +1,4 @@ {{ ansible_header | comment }} -# TODOlist : -# - Faire fonctionner le monitoring des switchs défini ici -# * Configurer tous les switchs avec un compte SNMPv3 -# * Mettre l'inventaire des switchs dans Ansible eatonups: walk: @@ -116,7 +112,15 @@ procurve_switch: type: gauge version: 3 auth: - username: prometheus + # To create SNMPv3 user on HP procurve, execute: + # snmpv3 user snmp_prometheus auth sha {{ snmp_procurve_password }} priv aes {{ snmp_procurve_password }} + # snmpv3 group managerpriv user snmp_prometheus sec-model ver3 + security_level: authPriv + username: snmp_prometheus + password: {{ snmp_procurve_password }} + auth_protocol: SHA + priv_protocol: AES + priv_password: {{ snmp_procurve_password }} ubiquiti_unifi: walk: diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 65a5fe05..72136715 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -25,50 +25,26 @@ - django.rules.yml # We don't need to restart Prometheus when updating nodes -- name: Configure Prometheus nodes +- name: Configure Prometheus targets copy: - content: "{{ [{'targets': prometheus.node_targets}] | to_nice_json }}" - dest: /etc/prometheus/targets.json + content: "{{ [{'targets': item.targets}] | to_nice_json }}\n" + dest: "/etc/prometheus/{{ item.file }}.json" mode: 0644 - -# We don't need to restart Prometheus when updating nodes -- name: Configure Prometheus UPS SNMP devices - copy: - content: "{{ [{'targets': prometheus.ups_snmp_targets}] | to_nice_json }}" - dest: /etc/prometheus/targets_ups_snmp.json - mode: 
0644 - -# We don't need to restart Prometheus when updating nodes -- name: Configure Prometheus Ubiquity Unifi SNMP devices - copy: - content: "{{ [{'targets': prometheus.unifi_snmp_targets}] | to_nice_json }}" - dest: /etc/prometheus/targets_unifi_snmp.json - mode: 0644 - when: prometheus.unifi_snmp_targets is defined - -# We don't need to restart Prometheus when updating nodes -- name: Configure Prometheus NGINX targets - copy: - content: "{{ [{'targets': prometheus.nginx_targets}] | to_nice_json }}" - dest: /etc/prometheus/targets_nginx.json - mode: 0644 - when: prometheus.nginx_targets is defined - -# We don't need to restart Prometheus when updating nodes -- name: Configure Prometheus Apache targets - copy: - content: "{{ [{'targets': prometheus.apache_targets}] | to_nice_json }}" - dest: /etc/prometheus/targets_apache.json - mode: 0644 - when: prometheus.apache_targets is defined - -# We don't need to restart Prometheus when updating nodes -- name: Configure Prometheus Blackbox targets - copy: - content: "{{ [{'targets': prometheus.blackbox_targets}] | to_nice_json }}" - dest: /etc/prometheus/targets_blackbox.json - mode: 0644 - when: prometheus.blackbox_targets is defined + loop: + - file: targets + targets: "{{ prometheus.node_targets }}" + - file: targets_ups_snmp + targets: "{{ prometheus.ups_snmp_targets }}" + - file: targets_procurve_snmp + targets: "{{ prometheus.procurve_snmp_targets }}" + - file: targets_unifi_snmp + targets: "{{ prometheus.unifi_snmp_targets }}" + - file: targets_nginx + targets: "{{ prometheus.nginx_targets }}" + - file: targets_apache + targets: "{{ prometheus.apache_targets }}" + - file: targets_blackbox + targets: "{{ prometheus.blackbox_targets }}" - name: Activate prometheus service systemd: diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 index 477ed057..d5017c6f 100644 --- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 +++ 
b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 @@ -45,7 +45,7 @@ groups: # Alert for high CPU usage - alert: CpuBusy - expr: node_load5{instance="zbee.adm.crans.org"} > 7 or node_load5{instance!="zbee.adm.crans.org"} > 5 + expr: node_load5 > 9 for: 10m labels: severity: warning diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2 index 7fe60352..5877a182 100644 --- a/roles/prometheus/templates/prometheus/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2 @@ -46,6 +46,7 @@ scrape_configs: target_label: __address__ replacement: '$1:9100' +{% if prometheus.ups_snmp_targets is defined %} - job_name: ups_snmp file_sd_configs: - files: @@ -60,6 +61,24 @@ scrape_configs: target_label: instance - target_label: __address__ replacement: 127.0.0.1:9116 +{% endif %} + +{% if prometheus.procurve_snmp_targets is defined %} + - job_name: procurve_snmp + file_sd_configs: + - files: + - '/etc/prometheus/targets_procurve_snmp.json' + metrics_path: /snmp + params: + module: [procurve_switch] + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: 127.0.0.1:9116 +{% endif %} {% if prometheus.unifi_snmp_targets is defined %} - job_name: unifi_snmp @@ -125,7 +144,7 @@ scrape_configs: - job_name: mtail static_configs: - - targets: ["tealc.adm.crans.org"] + - targets: ["tealc.adm.crans.org"] relabel_configs: # Do not put :3903 in instance name, rather here - source_labels: [__address__] @@ -134,10 +153,23 @@ scrape_configs: target_label: __address__ replacement: '$1:3903' +{% if prometheus.bird_targets is defined %} + - job_name: bird + file_sd_configs: + - files: + - '/etc/prometheus/targets_bird.json' + relabel_configs: + # Do not put :9324 in instance name, rather here + - source_labels: [__address__] + target_label: instance + -
source_labels: [instance] + target_label: __address__ + replacement: '$1:9324' +{% endif %} - job_name: django scheme: https static_configs: - - targets: [] + - targets: [] # Activate this line when the captive portal need monitoring -# - targets: ["portail-captif.crans.org:443"] +# - targets: ["portail-captif.crans.org:443"] -- GitLab