Skip to content
Snippets Groups Projects
Commit c2eab645 authored by shirenn's avatar shirenn 🌊 Committed by shirenn
Browse files

[prometheus] Add ilo_snmp scraping and tidy alert rules

parent e9bf2702
No related branches found
No related tags found
1 merge request!316Shirenn update
......@@ -7,7 +7,7 @@ glob_snmp_exporter:
glob_ninjabot:
config:
nick: Prometheus
nick: monitoring
server: irc.adm.crans.org
port: 6667
channel: "#monitoring"
......@@ -90,3 +90,23 @@ loc_prometheus:
- source_labels: [instance]
target_label: __address__
replacement: '$1:3903'
# Scrape job for the hosts of the `ilo_snmp` inventory group, queried on the
# /snmp path with the `ilo` module.
ilo_snmp:
file: targets_ilo_snmp.json
# Keep only the group members whose name ends in .adm.crans.org, sorted.
targets: "{{ groups['ilo_snmp'] | select('match', '^.*\\.adm\\.crans\\.org$') | list | sort }}"
config:
- job_name: ilo_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_ilo_snmp.json'
metrics_path: '/snmp'
params:
module:
- ilo
relabel_configs:
# Pass the discovered target address as the `target` URL parameter.
- source_labels: [__address__]
target_label: __param_target
# Expose that same target as the `instance` label of the resulting series.
- source_labels: [__param_target]
target_label: instance
# Send the scrape request itself to 127.0.0.1:9116
# (presumably the local SNMP exporter; confirm).
- replacement: '127.0.0.1:9116'
target_label: __address__
......@@ -480,6 +480,15 @@ marquis.infra.crans.org # manoir
mercure.infra.crans.org # 3m
#5m-5.infra.crans.org Déplacée au 2b
# iLO interfaces on the adm network. This group is read as groups['ilo_snmp']
# to build the target list of the Prometheus `ilo_snmp` job.
[ilo_snmp]
ilo-daniel.adm.crans.org
ilo-jack.adm.crans.org
ilo-odlyd.adm.crans.org
ilo-sam.adm.crans.org
ilo-stitch.adm.crans.org
ilo-thot.adm.crans.org
ilo-zamok.adm.crans.org
# everything at crans
[crans:children]
crans_server
......
......@@ -34,9 +34,14 @@
roles:
- rsyslog-client
# Apply the prometheus-node-exporter role to the `server` hosts.
# The role variable is the global settings merged with the host-local ones:
# either side falls back to an empty dict when undefined, and keys from the
# local dict override the global ones (it is the argument of `combine`).
- hosts: server
vars:
prometheus_node_exporter: "{{ glob_prometheus_node_exporter | default({}) | combine(loc_prometheus_node_exporter | default({})) }}"
roles:
- prometheus-node-exporter
- import_playbook: scripts.yml
- import_playbook: vm_setup.yml
- import_playbook: borgbackup_client.yml
- import_playbook: monitoring.yml
- import_playbook: network_interfaces.yml
- import_playbook: nullmailer.yml
......@@ -56,5 +56,3 @@ receivers:
webhook_configs:
- url: 'http://localhost:5000/'
send_resolved: true
- url: 'http://localhost:8000/'
send_resolved: true
......@@ -479,3 +479,226 @@ ubiquiti_unifi:
auth_protocol: SHA
priv_protocol: AES
priv_password: {{ snmp_exporter.unifi_password }}
# `ilo` module: OID subtrees to walk, all located under 1.3.6.1.4.1.232.
# Each entry below is matched by one or more definitions in `metrics`.
ilo:
walk:
- 1.3.6.1.4.1.232.6.2.14.4 # Resilient memory
- 1.3.6.1.4.1.232.6.2.15.3 # Power meter
- 1.3.6.1.4.1.232.6.2.16.1 # POST tests
- 1.3.6.1.4.1.232.6.2.17.1 # Battery
- 1.3.6.1.4.1.232.6.2.6.8.1.3 # Temperature sensors location
- 1.3.6.1.4.1.232.6.2.6.8.1.4 # Temperature sensors value
- 1.3.6.1.4.1.232.6.2.6.8.1.5 # Temperature sensors limit
- 1.3.6.1.4.1.232.6.2.6.8.1.6 # Temperature sensors condition
- 1.3.6.1.4.1.232.6.2.6.7.1.3 # Fans location
- 1.3.6.1.4.1.232.6.2.6.7.1.9 # Fans condition
- 1.3.6.1.4.1.232.6.2.9.3.1.5 # Power supply
- 1.3.6.1.4.1.232.9.2.2 # iLO
# Metric definitions for the walked OIDs. Metrics typed EnumAsStateSet or
# EnumAsInfo carry an `enum_values` table naming each integer value; metrics
# with `indexes` get one label per listed index.
metrics:
# Resilient memory, power meter and POST status (no indexes).
- name: cpqHeResilientMemCondition
oid: 1.3.6.1.4.1.232.6.2.14.4
type: EnumAsStateSet
help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4
enum_values:
1: other
2: ok
3: degraded
4: failed
- name: cpqHePowerMeterCurrReading
oid: 1.3.6.1.4.1.232.6.2.15.3
type: gauge
help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
- name: cpqHeHWBiosCondition
oid: 1.3.6.1.4.1.232.6.2.16.1
type: EnumAsStateSet
help: This value indicates an error has been detected during Pre-OS Test (POST)
or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
enum_values:
1: other
2: ok
3: degraded
4: failed
# Battery, labelled by chassis and battery index.
- name: cpqHeSysBatteryCondition
oid: 1.3.6.1.4.1.232.6.2.17.1
type: EnumAsStateSet
help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1
indexes:
- labelname: cpqHeSysBatteryChassis
type: gauge
- labelname: cpqHeSysBatteryIndex
type: gauge
enum_values:
1: other
2: ok
3: degraded
4: failed
# Temperature sensors, labelled by chassis and sensor index:
# location, current reading, shutdown threshold and condition.
- name: cpqHeTemperatureLocale
oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
type: EnumAsInfo
help: This specifies the location of the temperature sensor present in the system.
- 1.3.6.1.4.1.232.6.2.6.8.1.3
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
enum_values:
1: other
2: unknown
3: system
4: systemBoard
5: ioBoard
6: cpu
7: memory
8: storage
9: removableMedia
10: powerSupply
11: ambient
12: chassis
13: bridgeCard
- name: cpqHeTemperatureCelsius
oid: 1.3.6.1.4.1.232.6.2.6.8.1.4
type: gauge
help: This is the current temperature sensor reading in degrees celsius - 1.3.6.1.4.1.232.6.2.6.8.1.4
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
- name: cpqHeTemperatureThreshold
oid: 1.3.6.1.4.1.232.6.2.6.8.1.5
type: gauge
help: This is the shutdown threshold temperature sensor setting in degrees celsius
- 1.3.6.1.4.1.232.6.2.6.8.1.5
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
- name: cpqHeTemperatureCondition
oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
type: EnumAsStateSet
help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
enum_values:
1: other
2: ok
3: degraded
4: failed
# Fans, labelled by chassis and fan index: location and condition.
- name: cpqHeFltTolFanLocale
oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
type: EnumAsInfo
help: This specifies the location of the fan present in the system.
- 1.3.6.1.4.1.232.6.2.6.7.1.3
indexes:
- labelname: cpqHeFltTolFanChassis
type: gauge
- labelname: cpqHeFltTolFanIndex
type: gauge
enum_values:
1: other
2: unknown
3: system
4: systemBoard
5: ioBoard
6: cpu
7: memory
8: storage
9: removableMedia
10: powerSupply
11: ambient
12: chassis
13: bridgeCard
- name: cpqHeFltTolFanCondition
oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
type: EnumAsStateSet
help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
indexes:
- labelname: cpqHeFltTolFanChassis
type: gauge
- labelname: cpqHeFltTolFanIndex
type: gauge
enum_values:
1: other
2: ok
3: degraded
4: failed
# Power supplies, labelled by chassis and bay.
- name: cpqHeFltTolPowerSupplyStatus
oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
type: EnumAsStateSet
help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
indexes:
- labelname: cpqHeFltTolPowerSupplyChassis
type: gauge
- labelname: cpqHeFltTolPowerSupplyBay
type: gauge
enum_values:
1: noError
2: generalFailure
3: bistFailure
4: fanFailure
5: tempFailure
6: interlockOpen
7: epromFailed
8: vrefFailed
9: dacFailed
10: ramTestFailed
11: voltageChannelFailed
12: orringdiodeFailed
13: brownOut
14: giveupOnStartup
15: nvramInvalid
16: calibrationTableInvalid
17: noPowerInput
# iLO controller state (all under the walked 1.3.6.1.4.1.232.9.2.2 subtree).
- name: cpqSm2CntlrInterfaceStatus
oid: 1.3.6.1.4.1.232.9.2.2.17
type: EnumAsStateSet
help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
enum_values:
1: other
2: ok
3: notResponding
- name: cpqSm2CntlriLOSecurityOverrideSwitchState
oid: 1.3.6.1.4.1.232.9.2.2.27
type: EnumAsStateSet
help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
enum_values:
1: notSupported
2: set
3: notSet
- name: cpqSm2CntlrLicenseActive
oid: 1.3.6.1.4.1.232.9.2.2.30
type: EnumAsStateSet
help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
enum_values:
1: none
2: iloAdvanced
3: iloLight
4: iloAdvancedBlade
5: iloStandard
6: iloEssentials
7: iloScaleOut
8: iloAdvancedPremiumSecurity
- name: cpqSm2CntlrServerPowerState
oid: 1.3.6.1.4.1.232.9.2.2.32
type: EnumAsStateSet
help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
enum_values:
1: unknown
2: poweredOff
3: poweredOn
4: insufficientPowerOrPowerOnDenied
# SNMPv3 connection settings for the `ilo` module.
version: 3
# Reduce timeout to retry faster
timeout: 1s
auth:
  security_level: authPriv
  username: crans
  # Secrets are read from the snmp_exporter variables instead of being
  # committed in clear text, as the ubiquiti_unifi module already does.
  # NOTE(review): ilo_password and ilo_priv_password must be defined next to
  # unifi_password, and the credentials that were previously committed here
  # should be rotated on the iLOs.
  password: {{ snmp_exporter.ilo_password }}
  auth_protocol: SHA
  priv_protocol: AES
  priv_password: {{ snmp_exporter.ilo_priv_password }}
......@@ -23,7 +23,7 @@ groups:
- alert: PrometheusTargetMissing
expr: up == 0
for: 0m
for: 1m
labels:
severity: critical
annotations:
......@@ -84,22 +84,6 @@ groups:
annotations:
summary: La mémoire vive de {{ $labels.instance }} arrive à saturation ({{ $value }}%)
- alert: HostUnusualDiskReadRate
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
- alert: HostUnusualDiskWriteRate
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
......@@ -143,7 +127,7 @@ groups:
# 0B is so hot
# En pratique c'est mauvais de tourner des disques trop chauds
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
expr: node_hwmon_temp_celsius > 85
for: 5m
labels:
severity: warning
......@@ -205,20 +189,12 @@ groups:
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 0m
for: 1m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
- alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox slow probe (instance {{ $labels.instance }})
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 20
for: 0m
......@@ -304,6 +280,80 @@ groups:
annotations:
summary: La tension de sortie de {{ $labels.instance }} est de {{ $value }}V
#######
# iLO #
#######
# Every rule below uses the same pattern: select the series of a state metric
# by the label that carries the state name, and fire when that series equals 1
# for 3 minutes. This relies on the state-set encoding of the SNMP exporter
# (one series per state, named by a label identical to the metric name).
# Condition metrics ignore the "ok" and "other" states; the power supply rule
# fires on anything but "noError"; the override switch rule fires on "set".
- alert: IloResilientMemoryDegraded
expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
for: 3m
labels:
severity: warning
annotations:
summary: >-
La mémoire vive n'est plus résiliente
({{ $labels.cpqHeResilientMemCondition }}) sur {{ $labels.instance }}
- alert: IloBiosSelfTestDegraded
expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
for: 3m
labels:
severity: critical
annotations:
summary: >-
Une erreur a été détectée lors du POST du serveur
({{ $labels.cpqHeHWBiosCondition }}) sur {{ $labels.instance }}
- alert: IloBatteryDegraded
expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
for: 3m
labels:
severity: warning
annotations:
summary: >-
La batterie est dégradée
({{ $labels.cpqHeSysBatteryCondition }}) sur {{ $labels.instance }}
- alert: IloTemperatureSensorDegraded
expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
for: 3m
labels:
severity: critical
annotations:
summary: >-
Le capteur de température est dégradé
({{ $labels.cpqHeTemperatureCondition }}) sur {{ $labels.instance }}
- alert: IloFanDegraded
expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1
for: 3m
labels:
severity: critical
annotations:
summary: >-
Le ventilateur est dégradé
({{ $labels.cpqHeFltTolFanCondition }}) sur {{ $labels.instance }}
- alert: IloPowerSupplyDegraded
expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1
for: 3m
labels:
severity: critical
annotations:
summary: >-
L'alimentation est dégradée
({{ $labels.cpqHeFltTolPowerSupplyStatus }}) sur {{ $labels.instance }}
- alert: IloOverrideSwitchState
expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1
for: 3m
labels:
severity: critical
annotations:
summary: >-
Le switch de réinitialisation n'est pas à l'état d'origine,
l'authentification est bypassée sur {{ $labels.instance }}
#########
# Other #
#########
......@@ -316,8 +366,8 @@ groups:
annotations:
summary: "{{ $value }} paquet(s) APT sont inutile(s) sur {{ $labels.instance }}"
- alert: AptOrphans
expr: apt_orphans > 10
- alert: AptObsolete
expr: apt_obsolete > 10
for: 5m
labels:
severity: warning
......@@ -347,13 +397,4 @@ groups:
severity: warning
annotations:
summary: "{{ $labels.disk }} sur {{ $labels.instance }} a {{ $value }} secteurs réalloués"
- alert: TooManyUDPErrors
expr: irate(node_netstat_Udp_InErrors[5m]) > 100
for: 2m
labels:
severity: warning
annotations:
summary: "{{ $labels.instance }} a plus de {{ $value }} connexions UDP en erreur. Quelque chose spam!"
{% endraw %}
......@@ -3,7 +3,7 @@
global:
# scrape_interval is set to the global default (60s)
# evaluation_interval is set to the global default (60s)
# scrape_timeout is set to the global default (10s).
scrape_timeout: 30s # was 10s by default
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment