diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 41d909cddd62e94acc2fca9cb9900fb1fa13a17b..c8c43c173997b9bf927c89f77353eb172950cee0 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -18,9 +18,12 @@ - name: Configure Prometheus alert rules template: - src: prometheus/alert.rules.yml.j2 - dest: /etc/prometheus/alert.rules.yml + src: "prometheus/{{ item }}.j2" + dest: "/etc/prometheus/{{ item }}" notify: Restart Prometheus + loop: + - alert.rules.yml + - django.rules.yml # Doesn't work on Debian Stretch - name: Make Prometheus snmp-exporter listen on localhost only diff --git a/roles/prometheus/templates/prometheus/django.rules.yml.j2 b/roles/prometheus/templates/prometheus/django.rules.yml.j2 new file mode 100644 index 0000000000000000000000000000000000000000..fddd3985f49acfb097cd80d9c15f2de878fe7807 --- /dev/null +++ b/roles/prometheus/templates/prometheus/django.rules.yml.j2 @@ -0,0 +1,106 @@ +# {{ ansible_managed }} +{# As this is also Jinja2 it will conflict without a raw block #} +{% raw %} +groups: +- name: django.rules + rules: + - record: job:django_http_requests_before_middlewares_total:sum_rate30s + expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job) + - record: job:django_http_requests_unknown_latency_total:sum_rate30s + expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job) + - record: job:django_http_ajax_requests_total:sum_rate30s + expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job) + - record: job:django_http_responses_before_middlewares_total:sum_rate30s + expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job) + - record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s + expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s])) + BY (job) + - record: job:django_http_requests_body_total_bytes:sum_rate30s + expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job) + - record: job:django_http_responses_streaming_total:sum_rate30s + expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job) + - record: job:django_http_responses_body_total_bytes:sum_rate30s + expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job) + - record: job:django_http_requests_total:sum_rate30s + expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job) + - record: job:django_http_requests_total_by_method:sum_rate30s + expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method) + - record: job:django_http_requests_total_by_transport:sum_rate30s + expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport) + - record: job:django_http_requests_total_by_view:sum_rate30s + expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, + view) + - record: job:django_http_requests_total_by_view_transport_method:sum_rate30s + expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, + view, transport, method) + - record: job:django_http_responses_total_by_templatename:sum_rate30s + expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename) + - record: job:django_http_responses_total_by_status:sum_rate30s + expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status) + - record: job:django_http_responses_total_by_charset:sum_rate30s + expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset) + - record: job:django_http_exceptions_total_by_type:sum_rate30s + expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type) + - record: job:django_http_exceptions_total_by_view:sum_rate30s + expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view) + - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s + expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "50" + - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s + expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "95" + - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s + expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "99" + - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s + expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "99.9" + - record: job:django_http_requests_latency_seconds:quantile_rate30s + expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "50" + - record: job:django_http_requests_latency_seconds:quantile_rate30s + expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "95" + - record: job:django_http_requests_latency_seconds:quantile_rate30s + expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "99" + - record: job:django_http_requests_latency_seconds:quantile_rate30s + expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "99.9" + - record: job:django_model_inserts_total:sum_rate1m + expr: sum(rate(django_model_inserts_total[1m])) BY (job, model) + - record: job:django_model_updates_total:sum_rate1m + expr: sum(rate(django_model_updates_total[1m])) BY (job, model) + - record: job:django_model_deletes_total:sum_rate1m + expr: sum(rate(django_model_deletes_total[1m])) BY (job, model) + - record: job:django_db_new_connections_total:sum_rate30s + expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor) + - record: job:django_db_new_connection_errors_total:sum_rate30s + expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor) + - record: job:django_db_execute_total:sum_rate30s + expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor) + - record: job:django_db_execute_many_total:sum_rate30s + expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor) + - record: job:django_db_errors_total:sum_rate30s + expr: sum(rate(django_db_errors_total[30s])) BY (alias, vendor, type) + - record: job:django_migrations_applied_total:max + expr: max(django_migrations_applied_total) BY (job, connection) + - record: job:django_migrations_unapplied_total:max + expr: max(django_migrations_unapplied_total) BY (job, connection) +{% endraw %} diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2 index 0992ee53195ff547d5272ad0f340363d4bef2bc0..2831137f308b0e9821c0d64c51837ba2569f021a 100644 --- a/roles/prometheus/templates/prometheus/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2 @@ -19,6 +19,7 @@ alerting: # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. rule_files: - "alert.rules.yml" + - "django.rules.yml" # A scrape configuration containing exactly one endpoint to scrape: # Here it's Prometheus itself. @@ -43,3 +44,7 @@ scrape_configs: target_label: instance - target_label: __address__ replacement: 127.0.0.1:9116 + - job_name: django + scheme: https + static_configs: + - targets: ["portail-captif.crans.org:443"]