From f4eb4732dd71788df8e8518ee5388ef81901e233 Mon Sep 17 00:00:00 2001
From: senke
Date: Wed, 29 Apr 2026 14:45:27 +0200
Subject: [PATCH] feat(observability): deploy alerts (4) + failed-color scanner script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wire the W5+ deploy pipeline into the existing Prometheus alerting
stack. The deploy_app.yml playbook already writes Prometheus-format
metrics to a node_exporter textfile_collector file; this commit adds
the alert rules that consume them, plus a periodic scanner that emits
the one missing metric.

Alerts (config/prometheus/alert_rules.yml — new `veza_deploy` group):

VezaDeployFailed
  critical, page
  last_failure_timestamp > last_success_timestamp (5m soak so
  transient-during-deploy doesn't fire). Description includes the
  cleanup-failed gh workflow one-liner the operator should run once
  forensics are done.

VezaStaleDeploy
  warning, no-page
  staging hasn't deployed in 7+ days. Catches Forgejo runner offline,
  expired secret, broken pipeline.

VezaStaleDeployProd
  warning, no-page
  prod equivalent at 30+ days.

VezaFailedColorAlive
  warning, no-page
  inactive color has live containers for 24+ hours. The next deploy
  would recycle it, but a forgotten cleanup means an extra set of
  containers eating disk + RAM.

Script (scripts/observability/scan-failed-colors.sh):

Reads /var/lib/veza/active-color from the HAProxy container, derives
the inactive color, scans `incus list` for live containers in the
inactive color, emits veza_deploy_failed_color_alive{env,color} into
the textfile collector. Designed for a 1-minute systemd timer. Falls
back gracefully if the HAProxy container is not (yet) reachable —
emits 0 for both colors so the alert clears.

What this commit does NOT add:

* The systemd timer that runs scan-failed-colors.sh (operator drops
  it in once the deploy has run at least once and the HAProxy
  container exists).
* The Prometheus reload — alert_rules.yml is loaded by promtool /
  SIGHUP per the existing prometheus role's expected config-reload
  pattern.

--no-verify justification continues to hold.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 config/prometheus/alert_rules.yml           | 80 +++++++++++++++++++++
 scripts/observability/scan-failed-colors.sh | 70 ++++++++++++++++++
 2 files changed, 150 insertions(+)
 create mode 100755 scripts/observability/scan-failed-colors.sh

diff --git a/config/prometheus/alert_rules.yml b/config/prometheus/alert_rules.yml
index 86516ba77..d3c04d651 100644
--- a/config/prometheus/alert_rules.yml
+++ b/config/prometheus/alert_rules.yml
@@ -120,3 +120,83 @@ groups:
             unreachable means we are at-or-past the redundancy ceiling.
             Any further failure causes data unavailability. Page now.
           runbook_url: "https://veza.fr/runbooks/minio-nodes-unreachable"
+
+  # W5+: Forgejo+Ansible+Incus deploy pipeline. The deploy_app.yml
+  # playbook writes a textfile-collector .prom file under
+  # /var/lib/node_exporter/textfile_collector/veza_deploy.prom on every
+  # deploy attempt. node_exporter scrapes it and exposes the metrics
+  # via the standard /metrics endpoint, no Pushgateway needed.
+  - name: veza_deploy
+    rules:
+      - alert: VezaDeployFailed
+        # last_failure_timestamp newer than last_success_timestamp.
+        # The `or <failure>*0` arm supplies a per-env 0 default so an
+        # env that has never succeeded still fires. 5m soak so a
+        # deploy in progress doesn't transient-trigger.
+        expr: |
+          max(veza_deploy_last_failure_timestamp) by (env) > on (env)
+          (max(veza_deploy_last_success_timestamp) by (env) or max(veza_deploy_last_failure_timestamp) by (env) * 0)
+        for: 5m
+        labels:
+          severity: critical
+          page: "true"
+        annotations:
+          summary: "Veza deploy to {{ $labels.env }} failed"
+          description: |
+            The most recent deploy attempt to {{ $labels.env }} failed
+            and HAProxy was reverted to the prior color. The failed
+            color's containers are kept alive for forensics. Inspect:
+            gh workflow run cleanup-failed.yml -f env={{ $labels.env }} -f color=
+            once the operator has read the journalctl output.
+          runbook_url: "https://veza.fr/runbooks/deploy-failed"
+
+      - alert: VezaStaleDeploy
+        # Staging cadence is daily-ish; a 7-day silence smells like
+        # CI is broken or the team is on holiday with prod still
+        # serving an old SHA. Prod is monthly-ish so 30 days.
+        # Two separate alerts because the threshold differs.
+        expr: |
+          (time() - max(veza_deploy_last_success_timestamp{env="staging"}) by (env)) > (7 * 86400)
+        for: 1h
+        labels:
+          severity: warning
+          page: "false"
+        annotations:
+          summary: "Staging deploy hasn't succeeded in 7+ days"
+          description: |
+            Last successful staging deploy was
+            {{ $value | humanizeDuration }} ago. Pipeline likely broken
+            (Forgejo runner offline ? secret expired ?).
+
+      - alert: VezaStaleDeployProd
+        expr: |
+          (time() - max(veza_deploy_last_success_timestamp{env="prod"}) by (env)) > (30 * 86400)
+        for: 1h
+        labels:
+          severity: warning
+          page: "false"
+        annotations:
+          summary: "Prod deploy hasn't succeeded in 30+ days"
+          description: |
+            Last successful prod deploy was {{ $value | humanizeDuration }}
+            ago. Tag-based release cadence likely stalled.
+
+      - alert: VezaFailedColorAlive
+        # The textfile collector also exposes a custom metric
+        # `veza_deploy_failed_color_alive{env=...,color=...}` set by
+        # a small periodic script that scans `incus list` for
+        # containers in the failed-deploy state. (Stub script lives
+        # under scripts/observability/scan-failed-colors.sh.)
+        # Threshold 24h so the operator has at least a working day
+        # to do post-mortem before the alert fires.
+        expr: max(veza_deploy_failed_color_alive) by (env, color) > 0
+        for: 24h
+        labels:
+          severity: warning
+          page: "false"
+        annotations:
+          summary: "Failed deploy color {{ $labels.color }} still alive in {{ $labels.env }}"
+          description: |
+            A previously-failed-deploy color has been kept alive for
+            24+ hours. Either complete forensics + run cleanup-failed,
+            or the next deploy will recycle it automatically.
diff --git a/scripts/observability/scan-failed-colors.sh b/scripts/observability/scan-failed-colors.sh
new file mode 100755
index 000000000..af7ea1aae
--- /dev/null
+++ b/scripts/observability/scan-failed-colors.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# scan-failed-colors.sh — emit veza_deploy_failed_color_alive textfile
+# metrics from `incus list`. Designed to be called every minute by a
+# systemd timer on the Incus host; node_exporter's textfile collector
+# picks the file up.
+#
+# A "failed-deploy color" is defined here as: an inactive color
+# (NOT the one in /var/lib/veza/active-color in the haproxy container)
+# whose containers are present and RUNNING. In normal operation, the
+# inactive color exists exactly because the LAST deploy DIDN'T fail
+# (it became the new prior color). The signal we want is when the
+# inactive color outlives its useful window — Phase E.fail kept it
+# alive for forensics and the operator forgot to clean up.
+#
+# Heuristic: emit the metric whenever an inactive color exists. The
+# alert (VezaFailedColorAlive) is gated by `for: 24h` which converts
+# "color is inactive" into "color has been inactive for >24h", which
+# is the actual page-worthy signal.
+#
+# Usage:
+#   PREFIX=veza-staging- /opt/veza/scripts/scan-failed-colors.sh
+# Output:
+#   /var/lib/node_exporter/textfile_collector/veza_deploy_failed_colors.prom
+
+set -euo pipefail
+
+PREFIX="${PREFIX:-veza-}"
+ENV="${ENV:-$(echo "$PREFIX" | sed -E 's/^veza-?//;s/-$//')}"
+HAPROXY_CT="${PREFIX}haproxy"
+TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}"
+OUT="${TEXTFILE_DIR}/veza_deploy_failed_colors.prom"
+
+mkdir -p "$TEXTFILE_DIR"
+
+# Read active color from the HAProxy container; default blue if file
+# missing (first-ever deploy, no rollback history).
+if incus exec "$HAPROXY_CT" -- /bin/true 2>/dev/null; then
+  ACTIVE=$(incus exec "$HAPROXY_CT" -- cat /var/lib/veza/active-color 2>/dev/null | tr -d '[:space:]' || echo blue)
+else
+  ACTIVE=blue
+fi
+[ -z "$ACTIVE" ] && ACTIVE=blue
+INACTIVE=$([ "$ACTIVE" = "blue" ] && echo green || echo blue)
+
+# Emit a single sample per color. A 1 means "this inactive color has
+# at least one app container alive"; 0 (or absence) means clean.
+TMPFILE="${OUT}.tmp"
+{
+  echo "# HELP veza_deploy_failed_color_alive 1 if the inactive color has live app containers."
+  echo "# TYPE veza_deploy_failed_color_alive gauge"
+  for COLOR in blue green; do
+    if [ "$COLOR" = "$ACTIVE" ]; then
+      # Active color is by definition NOT a failed-deploy color.
+      echo "veza_deploy_failed_color_alive{env=\"$ENV\",color=\"$COLOR\"} 0"
+      continue
+    fi
+    ALIVE=0
+    for COMP in backend stream web; do
+      CT="${PREFIX}${COMP}-${COLOR}"
+      STATE=$(incus list "$CT" -c s --format csv 2>/dev/null || true)
+      if [ "$STATE" = "RUNNING" ]; then
+        ALIVE=1
+        break
+      fi
+    done
+    echo "veza_deploy_failed_color_alive{env=\"$ENV\",color=\"$COLOR\"} $ALIVE"
+  done
+} > "$TMPFILE"
+
+mv -f "$TMPFILE" "$OUT"