#!/usr/bin/env bash
# scan-failed-colors.sh — emit veza_deploy_failed_color_alive textfile
# metrics from `incus list`. Designed to be called every minute by a
# systemd timer on the Incus host; node_exporter's textfile collector
# picks the file up.
#
# A "failed-deploy color" is defined here as: an inactive color
# (NOT the one in /var/lib/veza/active-color in the haproxy container)
# whose containers are present and RUNNING. In normal operation, the
# inactive color exists exactly because the LAST deploy DIDN'T fail
# (it became the new prior color). The signal we want is when the
# inactive color outlives its useful window — Phase E.fail kept it
# alive for forensics and the operator forgot to clean up.
#
# Heuristic: emit the metric whenever an inactive color exists. The
# alert (VezaFailedColorAlive) is gated by `for: 24h` which converts
# "color is inactive" into "color has been inactive for >24h", which
# is the actual page-worthy signal.
#
# Environment (all optional):
#   PREFIX        container name prefix (default: veza-)
#   ENV           value of the `env` label (default: PREFIX with the
#                 leading "veza-" and trailing "-" stripped, e.g.
#                 "veza-staging-" -> "staging", "veza-" -> "")
#   TEXTFILE_DIR  textfile collector directory
#                 (default: /var/lib/node_exporter/textfile_collector)
#
# Usage:
#   PREFIX=veza-staging- /opt/veza/scripts/scan-failed-colors.sh
# Output:
#   /var/lib/node_exporter/textfile_collector/veza_deploy_failed_colors.prom

set -euo pipefail

PREFIX="${PREFIX:-veza-}"
ENV="${ENV:-$(printf '%s\n' "$PREFIX" | sed -E 's/^veza-?//;s/-$//')}"
HAPROXY_CT="${PREFIX}haproxy"
TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}"
OUT="${TEXTFILE_DIR}/veza_deploy_failed_colors.prom"
# Written first, then renamed over OUT so a reader never sees a partial file.
TMPFILE="${OUT}.tmp"

# Print a diagnostic on stderr; stdout is reserved for data.
warn() {
  printf 'scan-failed-colors: %s\n' "$*" >&2
}

# Print the active color on stdout.
# Falls back to "blue" when the HAProxy container cannot be exec'd into,
# or when the state file is missing or empty (first-ever deploy, no
# rollback history).
read_active_color() {
  local raw=""
  if incus exec "$HAPROXY_CT" -- /bin/true 2>/dev/null; then
    raw=$(incus exec "$HAPROXY_CT" -- cat /var/lib/veza/active-color 2>/dev/null) || raw=""
    # Drop all whitespace (trailing newline, stray CR, ...).
    raw=${raw//[[:space:]]/}
  fi
  if [[ -z "$raw" ]]; then
    raw=blue
  fi
  case "$raw" in
    blue | green) ;;
    *)
      # Value is passed through unchanged: neither color will compare equal
      # to it, so both get scanned as inactive. Make that visible in the log.
      warn "unexpected active color '${raw}' read from ${HAPROXY_CT}; both colors will be scanned as inactive"
      ;;
  esac
  printf '%s\n' "$raw"
}

# Print the state (e.g. RUNNING) of the container named exactly $1, or
# nothing when it does not exist or the listing fails.
# Arguments: $1 - full container name
container_state() {
  local name=$1
  local listing
  # Best effort: a failed listing counts as "not running", but is reported.
  if ! listing=$(incus list "$name" -c ns --format csv 2>/dev/null); then
    warn "incus list failed for ${name}; treating it as not running"
    return 0
  fi
  # `incus list NAME` also returns instances whose name merely starts with
  # NAME, so select the exact row instead of trusting the filter.
  awk -F, -v want="$name" '$1 == want { print $2; exit }' <<<"$listing"
}

# Write the metric family to stdout: one sample per color. A 1 means "this
# inactive color has at least one app container alive"; 0 means clean. The
# active color is by definition NOT a failed-deploy color and always gets 0.
# Arguments: $1 - active color
render_metrics() {
  local active=$1
  local color comp alive
  printf '%s\n' "# HELP veza_deploy_failed_color_alive 1 if the inactive color has live app containers."
  printf '%s\n' "# TYPE veza_deploy_failed_color_alive gauge"
  for color in blue green; do
    alive=0
    if [[ "$color" != "$active" ]]; then
      for comp in backend stream web; do
        if [[ "$(container_state "${PREFIX}${comp}-${color}")" == "RUNNING" ]]; then
          alive=1
          break
        fi
      done
    fi
    printf 'veza_deploy_failed_color_alive{env="%s",color="%s"} %s\n' \
      "$ENV" "$color" "$alive"
  done
}

main() {
  local active
  mkdir -p "$TEXTFILE_DIR"
  # Never leave a half-written temp file behind; after a successful rename
  # the file no longer exists and rm -f is a no-op.
  trap 'rm -f -- "$TMPFILE"' EXIT
  active=$(read_active_color)
  render_metrics "$active" >"$TMPFILE"
  mv -f "$TMPFILE" "$OUT"
}

main "$@"