#!/usr/bin/env bash
# scan-failed-colors.sh — emit veza_deploy_failed_color_alive textfile
# metrics from `incus list`. Designed to be called every minute by a
# systemd timer on the Incus host; node_exporter's textfile collector
# picks the file up.
#
# A "failed-deploy color" is defined here as: an inactive color
# (NOT the one in /var/lib/veza/active-color in the haproxy container)
# whose containers are present and RUNNING. In normal operation, the
# inactive color exists exactly because the LAST deploy DIDN'T fail
# (it became the new prior color). The signal we want is when the
# inactive color outlives its useful window — Phase E.fail kept it
# alive for forensics and the operator forgot to clean up.
#
# Heuristic: emit the metric whenever an inactive color exists. The
# alert (VezaFailedColorAlive) is gated by `for: 24h` which converts
# "color is inactive" into "color has been inactive for >24h", which
# is the actual page-worthy signal.
#
# Usage:
#   PREFIX=veza-staging- /opt/veza/scripts/scan-failed-colors.sh
# Environment (all optional):
#   PREFIX        container name prefix (default: veza-)
#   ENV           value of the `env` label (default: derived from PREFIX)
#   TEXTFILE_DIR  directory the .prom file is written to
# Output:
#   /var/lib/node_exporter/textfile_collector/veza_deploy_failed_colors.prom
set -euo pipefail

PREFIX="${PREFIX:-veza-}"
# Derive the env label from the prefix: drop a leading "veza" (plus an
# optional dash) and one trailing dash, e.g. "veza-staging-" -> "staging".
# A here-string is used instead of `echo … |` so a PREFIX that looks like
# an echo option (e.g. "-n") is still passed through verbatim.
ENV="${ENV:-$(sed -E 's/^veza-?//;s/-$//' <<<"$PREFIX")}"
HAPROXY_CT="${PREFIX}haproxy"
TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}"
OUT="${TEXTFILE_DIR}/veza_deploy_failed_colors.prom"

mkdir -p -- "$TEXTFILE_DIR"

# Read active color from the HAProxy container; default blue if the file
# is missing (first-ever deploy, no rollback history) or the container
# cannot be reached at all.
if incus exec "$HAPROXY_CT" -- /bin/true 2>/dev/null; then
  # With pipefail a failing `cat` fails the whole pipeline, so the
  # `|| echo blue` fallback also covers an unreadable file.
  ACTIVE=$(
    incus exec "$HAPROXY_CT" -- cat /var/lib/veza/active-color 2>/dev/null \
      | tr -d '[:space:]' || echo blue
  )
else
  ACTIVE=blue
fi
# An empty (whitespace-only) file also falls back to blue.
[[ -n "$ACTIVE" ]] || ACTIVE=blue

# Counterpart of ACTIVE. Written as an explicit if/else rather than
# `test && echo a || echo b`, whose `||` arm would also run if the
# `&&` arm failed.
if [[ "$ACTIVE" == "blue" ]]; then
  INACTIVE=green
else
  INACTIVE=blue
fi

# Emit a single sample per color. A 1 means "this inactive color has
# at least one app container alive"; 0 (or absence) means clean.
#######################################
# Tell whether the instance named exactly $1 is RUNNING.
# Globals:   none
# Arguments: $1 - full instance name
# Returns:   0 if a RUNNING state line is reported, 1 otherwise
#            (including when the incus call itself fails)
#######################################
instance_is_running() {
  local name=$1
  local states
  # `incus list <word>` treats a bare word as a name prefix / regex, so
  # an unanchored name can match sibling instances and yield several
  # lines. Anchor the filter to match this one instance only.
  # Errors are deliberately swallowed: a missing instance or an
  # unreachable daemon is reported as "not running".
  states=$(incus list "^${name}\$" -c s --format csv 2>/dev/null || true)
  # Match whole lines so multi-line output cannot hide a RUNNING entry.
  grep -qxF 'RUNNING' <<<"$states"
}

#######################################
# Tell whether any app container of the given color is RUNNING.
# Globals:   PREFIX (read)
# Arguments: $1 - color name (blue or green)
# Returns:   0 if backend, stream or web of that color is RUNNING
#######################################
color_has_live_app() {
  local color=$1
  local comp
  for comp in backend stream web; do
    if instance_is_running "${PREFIX}${comp}-${color}"; then
      return 0
    fi
  done
  return 1
}

#######################################
# Print the veza_deploy_failed_color_alive exposition text.
# Globals:   ACTIVE (read), ENV (read), PREFIX (read, via helper)
# Arguments: none
# Outputs:   HELP/TYPE header plus one gauge sample per color on stdout
#######################################
emit_failed_color_metrics() {
  local color alive
  printf '%s\n' \
    '# HELP veza_deploy_failed_color_alive 1 if the inactive color has live app containers.' \
    '# TYPE veza_deploy_failed_color_alive gauge'
  for color in blue green; do
    alive=0
    # The active color is by definition NOT a failed-deploy color, so it
    # is reported as 0 without querying incus.
    if [[ "$color" != "$ACTIVE" ]] && color_has_live_app "$color"; then
      alive=1
    fi
    printf 'veza_deploy_failed_color_alive{env="%s",color="%s"} %s\n' \
      "$ENV" "$color" "$alive"
  done
}

# Write to a sibling temp file and rename it into place so a reader of
# $OUT does not observe a half-written file. The EXIT trap removes the
# temp file if the script aborts before the rename; after a successful
# rename the rm is a no-op.
TMPFILE="${OUT}.tmp"
trap 'rm -f -- "$TMPFILE"' EXIT
emit_failed_color_metrics >"$TMPFILE"
mv -f -- "$TMPFILE" "$OUT"