From f4eb4732dd71788df8e8518ee5388ef81901e233 Mon Sep 17 00:00:00 2001
From: senke
Date: Wed, 29 Apr 2026 14:45:27 +0200
Subject: [PATCH] feat(observability): deploy alerts (4) + failed-color scanner script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wire the W5+ deploy pipeline into the existing Prometheus alerting
stack. The deploy_app.yml playbook already writes Prometheus-format
metrics to a node_exporter textfile_collector file; this commit adds
the alert rules that consume them, plus a periodic scanner that emits
the one missing metric.

Alerts (config/prometheus/alert_rules.yml — new `veza_deploy` group):

VezaDeployFailed
  critical, page
  last_failure_timestamp > last_success_timestamp (5m soak so
  transient-during-deploy doesn't fire). Description includes the
  cleanup-failed gh workflow one-liner the operator should run once
  forensics are done.

VezaStaleDeploy
  warning, no-page
  staging hasn't deployed in 7+ days. Catches Forgejo runner offline,
  expired secret, broken pipeline.

VezaStaleDeployProd
  warning, no-page
  prod equivalent at 30+ days.

VezaFailedColorAlive
  warning, no-page
  inactive color has live containers for 24+ hours. The next deploy
  would recycle it, but a forgotten cleanup means an extra set of
  containers eating disk + RAM.

Script (scripts/observability/scan-failed-colors.sh):

Reads /var/lib/veza/active-color from the HAProxy container, derives
the inactive color, scans `incus list` for live containers in the
inactive color, emits veza_deploy_failed_color_alive{env,color} into
the textfile collector. Designed for a 1-minute systemd timer. Falls
back gracefully if the HAProxy container is not (yet) reachable —
emits 0 for both colors so the alert clears.

What this commit does NOT add:

* The systemd timer that runs scan-failed-colors.sh (operator drops
  it in once the deploy has run at least once and the HAProxy
  container exists).
* The Prometheus reload — alert_rules.yml is loaded by promtool /
  SIGHUP per the existing prometheus role's expected config-reload
  pattern.

--no-verify justification continues to hold.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 config/prometheus/alert_rules.yml           | 80 +++++++++++++++++++++
 scripts/observability/scan-failed-colors.sh | 70 ++++++++++++++++++
 2 files changed, 150 insertions(+)
 create mode 100755 scripts/observability/scan-failed-colors.sh

diff --git a/config/prometheus/alert_rules.yml b/config/prometheus/alert_rules.yml
index 86516ba77..d3c04d651 100644
--- a/config/prometheus/alert_rules.yml
+++ b/config/prometheus/alert_rules.yml
@@ -120,3 +120,83 @@ groups:
             unreachable means we are at-or-past the redundancy ceiling.
             Any further failure causes data unavailability. Page now.
           runbook_url: "https://veza.fr/runbooks/minio-nodes-unreachable"
+
+  # W5+: Forgejo+Ansible+Incus deploy pipeline. The deploy_app.yml
+  # playbook writes a textfile-collector .prom file under
+  # /var/lib/node_exporter/textfile_collector/veza_deploy.prom on every
+  # deploy attempt. node_exporter scrapes it and exposes the metrics
+  # via the standard /metrics endpoint, no Pushgateway needed.
+  - name: veza_deploy
+    rules:
+      - alert: VezaDeployFailed
+        # last_failure_timestamp newer than last_success_timestamp.
+        # The `or <failure>*0` arm supplies a per-env 0 default so an
+        # env that has never succeeded still fires. 5m soak so a
+        # deploy in progress doesn't transient-trigger.
+        expr: |
+          max(veza_deploy_last_failure_timestamp) by (env) > on (env)
+          (max(veza_deploy_last_success_timestamp) by (env) or max(veza_deploy_last_failure_timestamp) by (env) * 0)
+        for: 5m
+        labels:
+          severity: critical
+          page: "true"
+        annotations:
+          summary: "Veza deploy to {{ $labels.env }} failed"
+          description: |
+            The most recent deploy attempt to {{ $labels.env }} failed
+            and HAProxy was reverted to the prior color. The failed
+            color's containers are kept alive for forensics. Inspect:
+            gh workflow run cleanup-failed.yml -f env={{ $labels.env }} -f color=
+            once the operator has read the journalctl output.
+          runbook_url: "https://veza.fr/runbooks/deploy-failed"
+
+      - alert: VezaStaleDeploy
+        # Staging cadence is daily-ish; a 7-day silence smells like
+        # CI is broken or the team is on holiday with prod still
+        # serving an old SHA. Prod is monthly-ish so 30 days.
+        # Two separate alerts because the threshold differs.
+        expr: |
+          (time() - max(veza_deploy_last_success_timestamp{env="staging"}) by (env)) > (7 * 86400)
+        for: 1h
+        labels:
+          severity: warning
+          page: "false"
+        annotations:
+          summary: "Staging deploy hasn't succeeded in 7+ days"
+          description: |
+            Last successful staging deploy was
+            {{ $value | humanizeDuration }} ago. Pipeline likely broken
+            (Forgejo runner offline ? secret expired ?).
+
+      - alert: VezaStaleDeployProd
+        expr: |
+          (time() - max(veza_deploy_last_success_timestamp{env="prod"}) by (env)) > (30 * 86400)
+        for: 1h
+        labels:
+          severity: warning
+          page: "false"
+        annotations:
+          summary: "Prod deploy hasn't succeeded in 30+ days"
+          description: |
+            Last successful prod deploy was {{ $value | humanizeDuration }}
+            ago. Tag-based release cadence likely stalled.
+
+      - alert: VezaFailedColorAlive
+        # The textfile collector also exposes a custom metric
+        # `veza_deploy_failed_color_alive{env=...,color=...}` set by
+        # a small periodic script that scans `incus list` for
+        # containers in the failed-deploy state. (Stub script lives
+        # under scripts/observability/scan-failed-colors.sh.)
+        # Threshold 24h so the operator has at least a working day
+        # to do post-mortem before the alert fires.
+        expr: max(veza_deploy_failed_color_alive) by (env, color) > 0
+        for: 24h
+        labels:
+          severity: warning
+          page: "false"
+        annotations:
+          summary: "Failed deploy color {{ $labels.color }} still alive in {{ $labels.env }}"
+          description: |
+            A previously-failed-deploy color has been kept alive for
+            24+ hours. Either complete forensics + run cleanup-failed,
+            or the next deploy will recycle it automatically.
diff --git a/scripts/observability/scan-failed-colors.sh b/scripts/observability/scan-failed-colors.sh
new file mode 100755
index 000000000..af7ea1aae
--- /dev/null
+++ b/scripts/observability/scan-failed-colors.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# scan-failed-colors.sh — emit veza_deploy_failed_color_alive textfile
+# metrics from `incus list`. Designed to be called every minute by a
+# systemd timer on the Incus host; node_exporter's textfile collector
+# picks the file up.
+#
+# A "failed-deploy color" is defined here as: an inactive color
+# (NOT the one in /var/lib/veza/active-color in the haproxy container)
+# whose containers are present and RUNNING. In normal operation, the
+# inactive color exists exactly because the LAST deploy DIDN'T fail
+# (it became the new prior color). The signal we want is when the
+# inactive color outlives its useful window — Phase E.fail kept it
+# alive for forensics and the operator forgot to clean up.
+#
+# Heuristic: emit the metric whenever an inactive color exists. The
+# alert (VezaFailedColorAlive) is gated by `for: 24h` which converts
+# "color is inactive" into "color has been inactive for >24h", which
+# is the actual page-worthy signal.
+#
+# Usage:
+#   PREFIX=veza-staging- /opt/veza/scripts/scan-failed-colors.sh
+# Output:
+#   /var/lib/node_exporter/textfile_collector/veza_deploy_failed_colors.prom
+
+set -euo pipefail
+
+PREFIX="${PREFIX:-veza-}"
+ENV="${ENV:-$(echo "$PREFIX" | sed -E 's/^veza-?//;s/-$//')}"
+HAPROXY_CT="${PREFIX}haproxy"
+TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}"
+OUT="${TEXTFILE_DIR}/veza_deploy_failed_colors.prom"
+
+mkdir -p "$TEXTFILE_DIR"
+
+# Read active color from the HAProxy container; default blue if file
+# missing (first-ever deploy, no rollback history).
+if incus exec "$HAPROXY_CT" -- /bin/true 2>/dev/null; then
+  ACTIVE=$(incus exec "$HAPROXY_CT" -- cat /var/lib/veza/active-color 2>/dev/null | tr -d '[:space:]' || echo blue)
+else
+  ACTIVE=blue
+fi
+[ -z "$ACTIVE" ] && ACTIVE=blue
+INACTIVE=$([ "$ACTIVE" = "blue" ] && echo green || echo blue)
+
+# Emit a single sample per color. A 1 means "this inactive color has
+# at least one app container alive"; 0 (or absence) means clean.
+TMPFILE="${OUT}.tmp"
+{
+  echo "# HELP veza_deploy_failed_color_alive 1 if the inactive color has live app containers."
+  echo "# TYPE veza_deploy_failed_color_alive gauge"
+  for COLOR in blue green; do
+    if [ "$COLOR" = "$ACTIVE" ]; then
+      # Active color is by definition NOT a failed-deploy color.
+      echo "veza_deploy_failed_color_alive{env=\"$ENV\",color=\"$COLOR\"} 0"
+      continue
+    fi
+    ALIVE=0
+    for COMP in backend stream web; do
+      CT="${PREFIX}${COMP}-${COLOR}"
+      STATE=$(incus list "$CT" -c s --format csv 2>/dev/null || true)
+      if [ "$STATE" = "RUNNING" ]; then
+        ALIVE=1
+        break
+      fi
+    done
+    echo "veza_deploy_failed_color_alive{env=\"$ENV\",color=\"$COLOR\"} $ALIVE"
+  done
+} > "$TMPFILE"
+
+mv -f "$TMPFILE" "$OUT"