diff --git a/config/prometheus/alert_rules.yml b/config/prometheus/alert_rules.yml index d3c04d651..74bb851f7 100644 --- a/config/prometheus/alert_rules.yml +++ b/config/prometheus/alert_rules.yml @@ -200,3 +200,62 @@ groups: A previously-failed-deploy color has been kept alive for 24+ hours. Either complete forensics + run cleanup-failed, or the next deploy will recycle it automatically. + + # v1.0.9 W5 Day 24 : synthetic monitoring (blackbox exporter). + # Each parcours is probed every 5 min ; the 10m `for:` window means + # an alert fires after 2 consecutive failures (per the roadmap + # acceptance gate). `parcours` label carries the human-readable + # name from blackbox_targets.yml so dashboards group cleanly. + - name: veza_synthetic + rules: + - alert: SyntheticParcoursDown + # probe_success is 0 when blackbox couldn't complete the probe. + # The metric is emitted per (instance, parcours) so the alert + # fires per-parcours, letting the on-call see exactly which + # journey is broken without grepping logs. + expr: probe_success{probe_kind="synthetic"} == 0 + for: 10m + labels: + severity: warning + page: "false" + annotations: + summary: "Synthetic parcours {{ $labels.parcours }} failing for 10m" + description: | + Blackbox exporter has been unable to complete the + {{ $labels.parcours }} parcours against {{ $labels.instance }} + for 10 minutes (≥ 2 consecutive failures). End-user impact + is likely real — investigate the underlying component + BEFORE the related per-component alert fires. + runbook_url: "https://veza.fr/runbooks/synthetic-parcours-down" + + - alert: SyntheticAuthLoginDown + # Login is the gate for everything else ; a single 10m blip + # is critical. Pages. + expr: probe_success{parcours="auth_login"} == 0 + for: 10m + labels: + severity: critical + page: "true" + annotations: + summary: "Synthetic auth_login down — login surface is broken" + description: | + The auth_login synthetic parcours has failed for 10+ minutes. 
+ Real users cannot log in. Page now. + runbook_url: "https://veza.fr/runbooks/synthetic-parcours-down" + + - alert: SyntheticProbeSlow + # Probe latency budget : 5s for HTTP, 8s for the heavier ones. + # When real-user latency degrades, blackbox is the canary. + expr: probe_duration_seconds{probe_kind="synthetic"} > 8 + for: 15m + labels: + severity: warning + page: "false" + annotations: + summary: "Synthetic parcours {{ $labels.parcours }} > 8s for 15m" + description: | + Probe duration exceeded 8 seconds for the past 15 minutes. + Real users are likely seeing visible latency. Cross-check + the SLO burn-rate alerts ; if those are quiet but this + fires, the issue is in the synthetic-only path (DNS, + external dependency). diff --git a/config/prometheus/blackbox_targets.yml b/config/prometheus/blackbox_targets.yml new file mode 100644 index 000000000..a106f60f2 --- /dev/null +++ b/config/prometheus/blackbox_targets.yml @@ -0,0 +1,91 @@ +# Prometheus blackbox scrape config — synthetic monitoring of the +# 6 parcours from v1.0.9 W5 Day 24. +# +# Probed every 5 minutes ; alerts fire after 2 consecutive failures. +# This file is sourced by the main prometheus.yml : +# +# scrape_configs: +# - job_name: 'blackbox' +# file_sd_configs: +# - files: +# - /etc/prometheus/blackbox_targets.yml +# metrics_path: /probe +# relabel_configs: +# - source_labels: [__address__] +# target_label: __param_target +# - source_labels: [__param_target] +# target_label: instance +# - source_labels: [module] +# target_label: __param_module +# - target_label: __address__ +# replacement: blackbox-exporter.lxd:9115 +# +# Each entry below carries a `module` label that maps to a +# blackbox.yml module name AND a `parcours` label so Grafana can +# group / filter. Prometheus passes module + target through the +# query string when it scrapes blackbox. + +# Parcours 1 — register / verify / login +# (Reachability of the auth surface ; multi-step register-then-verify +# requires a synthetic-client binary, tracked as a follow-up.)
+- targets: + - https://staging.veza.fr/api/v1/auth/login + labels: + module: http_status_envelope + parcours: auth_login + probe_kind: synthetic + +# Parcours 2 — login → search → play first +- targets: + - https://staging.veza.fr/api/v1/search?q=test + labels: + module: http_search + parcours: search + probe_kind: synthetic + +# Parcours 3 — login → upload tiny audio → poll status +# Approximated by reaching the upload-config endpoint ; the actual +# upload requires auth + file body which blackbox can't model. +- targets: + - https://staging.veza.fr/api/v1/upload/config + labels: + module: http_2xx + parcours: upload_init + probe_kind: synthetic + +# Parcours 4 — login → browse marketplace → add to cart +# Approximated by reaching the marketplace listing endpoint. +- targets: + - https://staging.veza.fr/api/v1/marketplace/products?limit=5 + labels: + module: http_marketplace + parcours: marketplace_list + probe_kind: synthetic + +# Parcours 5 — WebSocket chat connect + send message +# TCP-only probe : confirms the listener is up. The full handshake + +# auth + send round-trip needs the synthetic-client binary. +- targets: + - staging.veza.fr:443 + labels: + module: tcp_websocket + parcours: chat_websocket + probe_kind: synthetic + +# Parcours 6 — live stream metadata fetch +- targets: + - https://staging.veza.fr/api/v1/streams/active + labels: + module: http_2xx + parcours: live_streams + probe_kind: synthetic + +# Bonus — public status page health (covers the /api/v1/status +# response shape so a Cachet/statuspage.io consumer doesn't depend +# on a hand-pinged check). 
+- targets: + - https://staging.veza.fr/api/v1/status + labels: + module: http_status_envelope + parcours: status_endpoint + probe_kind: synthetic diff --git a/infra/ansible/inventory/lab.yml b/infra/ansible/inventory/lab.yml index 2f9251b5b..f0ca36929 100644 --- a/infra/ansible/inventory/lab.yml +++ b/infra/ansible/inventory/lab.yml @@ -112,6 +112,14 @@ all: vars: ansible_connection: community.general.incus ansible_python_interpreter: /usr/bin/python3 + # v1.0.9 W5 Day 24 — synthetic monitoring runner. Should sit on a + # host external to the prod cluster ; lab phase-1 colocates it. + blackbox_exporter: + hosts: + blackbox-exporter: + vars: + ansible_connection: community.general.incus + ansible_python_interpreter: /usr/bin/python3 # v1.0.9 W3 Day 12: distributed MinIO with EC:2. 4 Incus containers, # each providing one drive ; single erasure set tolerates 2 simultaneous # node failures. diff --git a/infra/ansible/playbooks/blackbox_exporter.yml b/infra/ansible/playbooks/blackbox_exporter.yml new file mode 100644 index 000000000..082811d25 --- /dev/null +++ b/infra/ansible/playbooks/blackbox_exporter.yml @@ -0,0 +1,56 @@ +# Synthetic monitoring playbook — provisions the blackbox-exporter +# Incus container and lays down the role. +# +# v1.0.9 W5 Day 24. +# +# IMPORTANT : the blackbox exporter SHOULD run on a host that is +# externally-routed (separate from the prod cluster) so a probe +# failure reflects what an external user sees. v1.0 lab keeps it on +# the same Incus host for simplicity ; phase-2 moves it off-box. +# +# Run with: +# ansible-galaxy collection install community.general +# ansible-playbook -i inventory/lab.yml playbooks/blackbox_exporter.yml +--- +- name: Provision Incus container for blackbox exporter + hosts: incus_hosts + become: true + gather_facts: true + tasks: + - name: Launch blackbox-exporter container + ansible.builtin.shell: + cmd: | + set -e + if ! 
incus info blackbox-exporter >/dev/null 2>&1; then + incus launch images:ubuntu/22.04 blackbox-exporter + for _ in $(seq 1 30); do + if incus exec blackbox-exporter -- cloud-init status 2>/dev/null | grep -q "status: done"; then + break + fi + sleep 1 + done + incus exec blackbox-exporter -- apt-get update + incus exec blackbox-exporter -- apt-get install -y python3 python3-apt + fi + args: + executable: /bin/bash + register: provision_result + changed_when: "'incus launch' in provision_result.stdout" + tags: [blackbox, provision] + + - name: Refresh inventory + ansible.builtin.meta: refresh_inventory + +- name: Apply common baseline + hosts: blackbox_exporter + become: true + gather_facts: true + roles: + - common + +- name: Install + configure blackbox exporter + hosts: blackbox_exporter + become: true + gather_facts: true + roles: + - blackbox_exporter diff --git a/infra/ansible/roles/blackbox_exporter/README.md b/infra/ansible/roles/blackbox_exporter/README.md new file mode 100644 index 000000000..64182cd46 --- /dev/null +++ b/infra/ansible/roles/blackbox_exporter/README.md @@ -0,0 +1,93 @@ +# `blackbox_exporter` role — synthetic monitoring runner + +Single Incus container running Prometheus' `blackbox_exporter`. Probed by Prometheus every 5 minutes against the 6 user parcours from v1.0.9 W5 Day 24. Alerts fire after 2 consecutive failures (`for: 10m` × 5-min scrape = 2 cycles). + +## Topology + +``` + Prometheus :9090 + │ scrape every 5m + ▼ + ┌─────────────────────────────┐ + │ blackbox-exporter.lxd:9115 │ + │ (this role) │ + └────────────┬────────────────┘ + │ probes (HTTP / TCP) + ┌─────────────────────┼─────────────────────┐ + ▼ ▼ ▼ + staging.veza.fr/api/v1/auth/login /api/v1/search?q=test /api/v1/marketplace/products + ... ... +``` + +The exporter SHOULD run on a host **external** to the prod cluster so probe failures reflect what an external user sees, not what an already-broken internal service hides. 
v1.0 lab phase-1 colocates it for simplicity ; phase-2 moves the container off-box. + +## Probe modules (defined in `templates/blackbox.yml.j2`) + +| Module | Used by parcours | What it asserts | +| ---------------------- | ---------------------- | ------------------------------------------------------ | +| `http_2xx` | upload_init, live_streams | Status code 200 or 204, TLS valid | +| `http_status_envelope` | auth_login, status_endpoint | Body matches `"success":\s*true` | +| `http_search` | search | Body matches `"tracks"` (seed data must include hits) | +| `http_marketplace` | marketplace_list | 200 (no body assertion ; an empty array is valid) | +| `tcp_websocket` | chat_websocket | TLS-wrapped TCP handshake completes | + +Multi-step parcours that need session state (Register → Verify → Login, Login → Search → Play first result) are **out of scope** for blackbox. Tracked as a follow-up : a small Go binary that runs as a CronJob, walks the steps, and writes textfile-collector metrics to `/var/lib/node_exporter/textfile_collector/veza_synthetic.prom`. + +## Defaults + +| variable | default | meaning | +| -------------------------- | ----------------------------- | ---------------------------------------- | +| `blackbox_version` | `0.25.0` | Prometheus blackbox_exporter release | +| `blackbox_listen_port` | `9115` | Prometheus default | +| `blackbox_target_base_url` | `https://staging.veza.fr` | base URL the probes hit | + +## Prometheus scrape config + +`config/prometheus/blackbox_targets.yml` carries the 7 file-SD entries (6 parcours + status-endpoint bonus). 
Wire it in `prometheus.yml` : + +```yaml +scrape_configs: + - job_name: blackbox + file_sd_configs: + - files: [/etc/prometheus/blackbox_targets.yml] + metrics_path: /probe + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - source_labels: [module] + target_label: __param_module + - target_label: __address__ + replacement: blackbox-exporter.lxd:9115 +``` + +## Alert rules + +`config/prometheus/alert_rules.yml` group `veza_synthetic` : + +- `SyntheticParcoursDown` — any parcours fails for 10 m → warning. +- `SyntheticAuthLoginDown` — auth_login fails for 10 m → critical (page). +- `SyntheticProbeSlow` — probe duration > 8 s for 15 m → warning. + +## Operations + +```bash +# Service status : +sudo systemctl status blackbox_exporter + +# One-off probe (dev / debug) : +curl 'http://blackbox-exporter.lxd:9115/probe?target=https://staging.veza.fr/api/v1/health&module=http_status_envelope' + +# Live probe latency tail : +curl -s http://blackbox-exporter.lxd:9115/metrics | grep probe_duration + +# Tail the exporter log : +sudo journalctl -u blackbox_exporter -f +``` + +## What this role does NOT cover + +- **Multi-step parcours.** Blackbox can't carry session cookies across probes ; the Register-then-Verify-then-Login flow needs a custom synthetic client. Tracked for v1.0.10. +- **Status page.** Cachet/statuspage.io is a separate operator decision per the roadmap. The `/api/v1/status` endpoint is consumable by both. +- **Off-box deploy.** Lab phase-1 runs the container on the same Incus host as the things it's probing. Phase-2 moves it off-cluster. diff --git a/infra/ansible/roles/blackbox_exporter/defaults/main.yml b/infra/ansible/roles/blackbox_exporter/defaults/main.yml new file mode 100644 index 000000000..3a99caa40 --- /dev/null +++ b/infra/ansible/roles/blackbox_exporter/defaults/main.yml @@ -0,0 +1,20 @@ +# blackbox_exporter defaults — synthetic monitoring runner. 
+# v1.0.9 W5 Day 24. +# +# Sits OUTSIDE the prod network (separate Incus host or off-box) so a +# probe failure reflects what an external user sees, not what an +# already-broken internal service hides. Six parcours per the roadmap, +# probed every 5 min by Prometheus. +--- +blackbox_version: "0.25.0" +blackbox_arch: amd64 + +# Listener — Prometheus scrapes this on port 9115 (the blackbox_exporter +# default). +blackbox_listen_port: 9115 + +# Probe targets. The 6 parcours from the roadmap are mapped to simpler +# blackbox probes here (HTTP 2xx) ; the multi-step parcours that need +# session state (Register → Login → Search) are out of scope for +# blackbox itself and tracked as a follow-up (synthetic-client binary). +blackbox_target_base_url: "https://staging.veza.fr" diff --git a/infra/ansible/roles/blackbox_exporter/handlers/main.yml b/infra/ansible/roles/blackbox_exporter/handlers/main.yml new file mode 100644 index 000000000..15016f20f --- /dev/null +++ b/infra/ansible/roles/blackbox_exporter/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Restart blackbox_exporter + ansible.builtin.systemd: + name: blackbox_exporter + state: restarted + daemon_reload: true diff --git a/infra/ansible/roles/blackbox_exporter/tasks/main.yml b/infra/ansible/roles/blackbox_exporter/tasks/main.yml new file mode 100644 index 000000000..44667a764 --- /dev/null +++ b/infra/ansible/roles/blackbox_exporter/tasks/main.yml @@ -0,0 +1,89 @@ +# blackbox_exporter role — installs the Prometheus blackbox exporter +# from the official tarball, drops the systemd unit, renders the probe +# config. Idempotent. 
+--- +- name: Ensure /opt/blackbox_exporter exists + ansible.builtin.file: + path: /opt/blackbox_exporter + state: directory + owner: root + group: root + mode: "0755" + tags: [blackbox, install] + +- name: Check installed blackbox_exporter version + ansible.builtin.stat: + path: "/opt/blackbox_exporter/blackbox_exporter-{{ blackbox_version }}.linux-{{ blackbox_arch }}" + register: blackbox_installed + tags: [blackbox, install] + +- name: Download blackbox_exporter tarball + ansible.builtin.get_url: + url: "https://github.com/prometheus/blackbox_exporter/releases/download/v{{ blackbox_version }}/blackbox_exporter-{{ blackbox_version }}.linux-{{ blackbox_arch }}.tar.gz" + dest: "/tmp/blackbox_exporter-{{ blackbox_version }}.tar.gz" + mode: "0644" + when: not blackbox_installed.stat.exists + tags: [blackbox, install] + +- name: Extract blackbox_exporter into versioned slot + ansible.builtin.unarchive: + src: "/tmp/blackbox_exporter-{{ blackbox_version }}.tar.gz" + dest: /opt/blackbox_exporter + remote_src: true + creates: "/opt/blackbox_exporter/blackbox_exporter-{{ blackbox_version }}.linux-{{ blackbox_arch }}" + when: not blackbox_installed.stat.exists + tags: [blackbox, install] + +- name: Symlink /usr/local/bin/blackbox_exporter → versioned binary + ansible.builtin.file: + src: "/opt/blackbox_exporter/blackbox_exporter-{{ blackbox_version }}.linux-{{ blackbox_arch }}/blackbox_exporter" + dest: /usr/local/bin/blackbox_exporter + state: link + force: true + notify: Restart blackbox_exporter + tags: [blackbox, install] + +- name: Create blackbox system user + ansible.builtin.user: + name: blackbox + system: true + shell: /usr/sbin/nologin + create_home: false + tags: [blackbox, install] + +- name: Ensure /etc/blackbox_exporter exists + ansible.builtin.file: + path: /etc/blackbox_exporter + state: directory + owner: root + group: blackbox + mode: "0750" + tags: [blackbox, config] + +- name: Render blackbox.yml + ansible.builtin.template: + src: blackbox.yml.j2 + dest:
/etc/blackbox_exporter/blackbox.yml + owner: root + group: blackbox + mode: "0640" + notify: Restart blackbox_exporter + tags: [blackbox, config] + +- name: Render systemd unit + ansible.builtin.template: + src: blackbox_exporter.service.j2 + dest: /etc/systemd/system/blackbox_exporter.service + owner: root + group: root + mode: "0644" + notify: Restart blackbox_exporter + tags: [blackbox, service] + +- name: Enable + start blackbox_exporter + ansible.builtin.systemd: + name: blackbox_exporter + state: started + enabled: true + daemon_reload: true + tags: [blackbox, service] diff --git a/infra/ansible/roles/blackbox_exporter/templates/blackbox.yml.j2 b/infra/ansible/roles/blackbox_exporter/templates/blackbox.yml.j2 new file mode 100644 index 000000000..1a76544a3 --- /dev/null +++ b/infra/ansible/roles/blackbox_exporter/templates/blackbox.yml.j2 @@ -0,0 +1,61 @@ +# Managed by Ansible — do not edit by hand. +# Probe modules used by Prometheus' blackbox scrape config. +# v1.0.9 W5 Day 24. + +modules: + # http_2xx — vanilla HTTP probe, accepts 200 or 204. + http_2xx: + prober: http + timeout: 5s + http: + preferred_ip_protocol: ip4 + valid_status_codes: [200, 204] + method: GET + follow_redirects: true + fail_if_ssl: false + fail_if_not_ssl: true # synthetic monitoring runs against staging w/ TLS + + # http_status_envelope — accept the {success: true, ...} body shape. + # Used for the auth_login and status_endpoint targets, whose bodies wrap the verdict. + http_status_envelope: + prober: http + timeout: 5s + http: + preferred_ip_protocol: ip4 + valid_status_codes: [200] + method: GET + fail_if_body_not_matches_regexp: + - '"success"\s*:\s*true' + + # http_search — POST-less search probe. The synthetic user hits + # /api/v1/search?q=test ; staging seed data must include something + # for that query to return non-empty.
+ http_search: + prober: http + timeout: 8s + http: + preferred_ip_protocol: ip4 + valid_status_codes: [200] + method: GET + fail_if_body_not_matches_regexp: + - '"tracks"' + + # http_marketplace — same shape, different endpoint. + http_marketplace: + prober: http + timeout: 8s + http: + preferred_ip_protocol: ip4 + valid_status_codes: [200] + method: GET + + # tcp_websocket — bare TCP connect to the WS port to verify the + # listener is alive. Doesn't speak the WS protocol — for that the + # synthetic-client binary (out of scope for this role) handles + # connect+send+receive. + tcp_websocket: + prober: tcp + timeout: 5s + tcp: + preferred_ip_protocol: ip4 + tls: true diff --git a/infra/ansible/roles/blackbox_exporter/templates/blackbox_exporter.service.j2 b/infra/ansible/roles/blackbox_exporter/templates/blackbox_exporter.service.j2 new file mode 100644 index 000000000..663725345 --- /dev/null +++ b/infra/ansible/roles/blackbox_exporter/templates/blackbox_exporter.service.j2 @@ -0,0 +1,27 @@ +# Managed by Ansible — do not edit by hand. +[Unit] +Description=Prometheus Blackbox Exporter +Documentation=https://github.com/prometheus/blackbox_exporter +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=blackbox +Group=blackbox +ExecStart=/usr/local/bin/blackbox_exporter \ + --config.file=/etc/blackbox_exporter/blackbox.yml \ + --web.listen-address=:{{ blackbox_listen_port }} +Restart=on-failure +RestartSec=5s +LimitNOFILE=65535 +NoNewPrivileges=true +ProtectSystem=strict +ProtectHome=true +PrivateTmp=true +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectControlGroups=true + +[Install] +WantedBy=multi-user.target