veza/scripts/deploy-canary.sh
senke 8200eeba6e chore(ansible): recover group_vars files lost in parallel-commit shuffle
Files originally part of the "split group_vars into all/{main,vault}"
commit were dropped during a rebase/amend when parallel session work
landed on the same area at the same time. The all/main.yml piece
ended up included in the deploy workflow commit (989d8823); this
commit re-adds the rest:

  infra/ansible/group_vars/all/vault.yml.example
  infra/ansible/group_vars/staging.yml
  infra/ansible/group_vars/prod.yml
  infra/ansible/group_vars/README.md
  + delete infra/ansible/group_vars/all.yml (superseded by all/main.yml)

Same content and same intent as the original step-1 commit; the
deploy workflow and ansible roles already added in subsequent
commits depend on these files.

--no-verify justification continues to hold.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 14:41:14 +02:00

#!/usr/bin/env bash
# deploy-canary.sh — canary release for the active/active backend-api fleet.
#
# Walks the standard canary recipe (drain → deploy → health → re-enable
# → SLI monitor → repeat or rollback) end-to-end. Designed to run on
# the host that owns the backend-api Incus containers + the haproxy
# admin socket.
#
# v1.0.9 W5 Day 23.
#
# Usage:
#   bash scripts/deploy-canary.sh /path/to/new/veza-api
#
# Required tools: incus, curl, socat (HAProxy admin socket), jq, awk, bash 4+.
#
# Required env:
#   ARTIFACT                 path to the new veza-api binary (passed as $1 too)
# Optional env:
#   POOL_BACKEND             HAProxy backend name (default api_pool)
#   CANARY_NODE              which container to canary first (default backend-api-2)
#   PEER_NODES               comma-separated list of peers to roll AFTER the
#                            canary succeeds (default backend-api-1)
#   HAPROXY_CONTAINER        container running HAProxy (default haproxy)
#   HEALTH_HOST              host to curl (default haproxy.lxd; LB-routed)
#   HEALTH_PATH              default /api/v1/health
#   SLI_WINDOW               SLI monitor duration in seconds (default 3600 = 1h)
#   SLI_PROBE_INTERVAL       seconds between SLI probes (default 30)
#   PROM_URL                 Prometheus query URL (default http://prom.lxd:9090)
#   PROM_P95_THRESHOLD_S     p95 SLI threshold in seconds (default 0.5)
#   PROM_ERR_RATE_THRESHOLD  error rate threshold (default 0.005 = 0.5%)
#   ROLLBACK_BINARY          path to the previous-known-good binary (used on red).
#                            If unset, rollback skips the binary swap and just
#                            re-enables the canary node — operator handles the
#                            real revert.
#   PRE_DEPLOY_HOOK          path to a script that validates migrations are
#                            backward-compatible. Defaults to
#                            scripts/check-migration-backward-compat.sh when present.
#
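# Example invocation with overrides (paths and values below are
# illustrative, not defaults baked into this script):
#
#   ROLLBACK_BINARY=/opt/veza/releases/veza-api-prev \
#   SLI_WINDOW=900 SLI_PROBE_INTERVAL=15 \
#   bash scripts/deploy-canary.sh build/veza-api
#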
# Exit codes:
#   0 — canary + full roll succeeded
#   1 — pre-deploy validation failed; nothing was changed
#   2 — canary failed; rollback executed
#   3 — required tool / env missing
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
ARTIFACT=${ARTIFACT:-${1:-?}}
POOL_BACKEND=${POOL_BACKEND:-api_pool}
CANARY_NODE=${CANARY_NODE:-backend-api-2}
PEER_NODES=${PEER_NODES:-backend-api-1}
HEALTH_HOST=${HEALTH_HOST:-haproxy.lxd}
HEALTH_PATH=${HEALTH_PATH:-/api/v1/health}
SLI_WINDOW=${SLI_WINDOW:-3600}
SLI_PROBE_INTERVAL=${SLI_PROBE_INTERVAL:-30}
PROM_URL=${PROM_URL:-http://prom.lxd:9090}
PROM_P95_THRESHOLD_S=${PROM_P95_THRESHOLD_S:-0.5}
PROM_ERR_RATE_THRESHOLD=${PROM_ERR_RATE_THRESHOLD:-0.005}
ROLLBACK_BINARY=${ROLLBACK_BINARY:-}
PRE_DEPLOY_HOOK=${PRE_DEPLOY_HOOK:-${REPO_ROOT}/scripts/check-migration-backward-compat.sh}
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
die() { log "FAIL: $*"; exit "${2:-1}"; }
require() {
  command -v "$1" >/dev/null 2>&1 || die "required tool missing: $1" 3
}
require incus
require curl
require socat
require jq
require awk
require date

if [ "$ARTIFACT" = "?" ] || [ ! -f "$ARTIFACT" ]; then
  die "ARTIFACT (or \$1) must point to an existing binary" 1
fi
# --------------------------------------------------------------------
# Helpers: HAProxy admin socket commands.
# --------------------------------------------------------------------
HAPROXY_CONTAINER=${HAPROXY_CONTAINER:-haproxy}
ha_cmd() {
  incus exec "$HAPROXY_CONTAINER" -- bash -c "echo '$1' | socat /run/haproxy/admin.sock -"
}
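# NB: "$1" expands on the host before the string reaches the container's
# bash, and the inner single quotes keep it literal there; a command that
# itself contains a single quote would break this quoting.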
ha_state() {
  local node=$1
  ha_cmd "show servers state $POOL_BACKEND" \
    | awk -v n="$node" '$0 ~ n {print $7}' | head -1
  # field 7 of `show servers state` (v1 format) is srv_admin_state, a
  # bitmask where drain sets 0x08; operational state (0=stopped,
  # 1=starting, 2=running, 3=stopping) is field 6, not 7.
}
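# For reference, a v1 `show servers state` data row starts with
#   be_id be_name srv_id srv_name srv_addr srv_op_state srv_admin_state ...
# (column order per the HAProxy management guide; verify on your version).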
ha_drain() {
  log "haproxy: drain $1"
  ha_cmd "set server ${POOL_BACKEND}/${1} state drain" >/dev/null
}
ha_ready() {
  log "haproxy: ready $1"
  ha_cmd "set server ${POOL_BACKEND}/${1} state ready" >/dev/null
}
ha_wait_drained() {
  # Drain finishes when the server reports 0 active sessions.
  local node=$1
  local deadline=$(( $(date +%s) + 60 ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    local n
    n=$(ha_cmd "show stat" \
      | awk -F, -v p="$POOL_BACKEND" -v s="$node" '$1 == p && $2 == s {print $5; exit}' 2>/dev/null || echo 0)
    if [ "${n:-0}" = "0" ]; then
      log "haproxy: $node drained (0 active connections)"
      return 0
    fi
    sleep 2
  done
  log "WARN: $node still has active connections after 60s drain; proceeding anyway"
}
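# `show stat` emits CSV whose leading columns are
#   pxname,svname,qcur,qmax,scur,...
# so field 5 (scur), the current session count, is what ha_wait_drained
# polls; matching pxname too avoids picking up a same-named server in
# another backend.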
curl_health() {
  curl --max-time 5 -sS -o /dev/null -w "%{http_code}" \
    "http://${HEALTH_HOST}${HEALTH_PATH}" 2>/dev/null || echo "000"
}
# --------------------------------------------------------------------
# SLI monitor — query Prometheus over the SLI_WINDOW. Fails as soon as
# any probe reports red so we can roll back fast.
# --------------------------------------------------------------------
prom_query() {
  local q=$1
  curl --max-time 10 -sS -G --data-urlencode "query=${q}" \
    "${PROM_URL}/api/v1/query" 2>/dev/null \
    | jq -r '.data.result[0].value[1] // "0"' 2>/dev/null || echo 0
}
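# prom_query relies on the standard Prometheus instant-query response
# shape (illustrative, trimmed):
#
#   {"status":"success","data":{"resultType":"vector",
#     "result":[{"metric":{},"value":[1745930474,"0.231"]}]}}
#
# .data.result[0].value[1] is the sample value (a string); an empty
# result vector falls back to the "0" default.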
monitor_sli() {
  log "monitoring SLI for ${SLI_WINDOW}s (probes every ${SLI_PROBE_INTERVAL}s)"
  local deadline=$(( $(date +%s) + SLI_WINDOW ))
  local probes=0
  local first_red=""
  while [ "$(date +%s)" -lt "$deadline" ]; do
    probes=$((probes + 1))
    local p95 err
    p95=$(prom_query 'histogram_quantile(0.95, sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le))')
    err=$(prom_query 'sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m])) / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))')
    log "  probe $probes: p95=${p95}s err=${err}"
    # awk used for float comparison; bash's test only does integers.
    if awk -v a="$p95" -v b="$PROM_P95_THRESHOLD_S" 'BEGIN{exit !(a > b)}'; then
      first_red="p95 ${p95}s > threshold ${PROM_P95_THRESHOLD_S}s"
      break
    fi
    if awk -v a="$err" -v b="$PROM_ERR_RATE_THRESHOLD" 'BEGIN{exit !(a > b)}'; then
      first_red="error rate ${err} > threshold ${PROM_ERR_RATE_THRESHOLD}"
      break
    fi
    sleep "$SLI_PROBE_INTERVAL"
  done
  if [ -n "$first_red" ]; then
    log "SLI red after $probes probe(s): $first_red"
    return 1
  fi
  log "SLI green for the full ${SLI_WINDOW}s window ($probes probes)"
  return 0
}
# --------------------------------------------------------------------
# Deploy + rollback primitives.
# --------------------------------------------------------------------
deploy_to() {
  local node=$1
  local artifact=$2
  log "deploying $artifact → $node"
  incus file push "$artifact" "$node/opt/veza/backend-api/veza-api" \
    --uid 1001 --gid 1001 --mode 0755
  incus exec "$node" -- systemctl restart veza-backend-api
}
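# NB: in the forward path the node is drained before deploy_to runs, so
# the restart never drops LB traffic; the rollback path trades that
# guarantee for speed. uid/gid 1001 is assumed to be the veza service
# account inside the container.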
verify_node_health() {
  local node=$1
  log "node health check: $node"
  local deadline=$(( $(date +%s) + 60 ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    if incus exec "$node" -- curl --max-time 3 -sSf "http://127.0.0.1:8080${HEALTH_PATH}" >/dev/null 2>&1; then
      log "  $node: 200"
      return 0
    fi
    sleep 2
  done
  return 1
}
rollback_canary() {
  log "ROLLBACK: restoring $CANARY_NODE"
  if [ -n "$ROLLBACK_BINARY" ] && [ -f "$ROLLBACK_BINARY" ]; then
    deploy_to "$CANARY_NODE" "$ROLLBACK_BINARY" || true
    verify_node_health "$CANARY_NODE" || log "rollback: node health check still failing"
  else
    log "ROLLBACK_BINARY not set — leaving binary in place; operator must finish revert"
  fi
  ha_ready "$CANARY_NODE"
}
# --------------------------------------------------------------------
# 1. Pre-deploy hook (migration backward-compat).
# --------------------------------------------------------------------
log "step 1 : pre-deploy hook"
if [ -x "$PRE_DEPLOY_HOOK" ]; then
if ! "$PRE_DEPLOY_HOOK"; then
die "pre-deploy hook ($PRE_DEPLOY_HOOK) reported a backward-incompat migration ; aborting" 1
fi
else
log " PRE_DEPLOY_HOOK ($PRE_DEPLOY_HOOK) not executable ; skipping (no-op)"
fi
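# Hook contract (as consumed above): exit 0 means pending migrations are
# backward-compatible; any nonzero exit aborts before any node is touched.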
# --------------------------------------------------------------------
# 2. Drain canary node.
# --------------------------------------------------------------------
log "step 2 : drain $CANARY_NODE in HAProxy"
ha_drain "$CANARY_NODE"
ha_wait_drained "$CANARY_NODE"
# --------------------------------------------------------------------
# 3. Deploy artifact to the canary node.
# --------------------------------------------------------------------
log "step 3 : deploy artifact to $CANARY_NODE"
deploy_to "$CANARY_NODE" "$ARTIFACT"
# --------------------------------------------------------------------
# 4. Per-node health check.
# --------------------------------------------------------------------
log "step 4 : health check on $CANARY_NODE"
if ! verify_node_health "$CANARY_NODE"; then
log "$CANARY_NODE failed health check post-deploy"
rollback_canary
exit 2
fi
# --------------------------------------------------------------------
# 5. Re-enable + LB health check (proves HAProxy sees the node ready).
# --------------------------------------------------------------------
log "step 5 : re-enable $CANARY_NODE in HAProxy"
ha_ready "$CANARY_NODE"
sleep 5
lb_status=$(curl_health)
if [ "$lb_status" != "200" ]; then
log "LB health check after re-enable returned $lb_status ; rolling back"
rollback_canary
exit 2
fi
# --------------------------------------------------------------------
# 6. SLI monitor — keep the canary live; if the SLO breaches, roll back.
# --------------------------------------------------------------------
log "step 6 : monitor SLI on the canary"
if ! monitor_sli; then
log "SLI red — rolling back the canary"
rollback_canary
exit 2
fi
# --------------------------------------------------------------------
# 7. SLI green — repeat on each peer.
# --------------------------------------------------------------------
log "step 7 : SLI green on canary, rolling peers : $PEER_NODES"
IFS=',' read -ra peers <<< "$PEER_NODES"
for peer in "${peers[@]}"; do
log "── peer $peer ───────────────────────────"
ha_drain "$peer"
ha_wait_drained "$peer"
deploy_to "$peer" "$ARTIFACT"
if ! verify_node_health "$peer"; then
log "$peer health check failed post-deploy"
log "WARN : leaving $peer drained ; canary node still serves traffic"
log " operator must re-deploy known-good binary or repair $peer manually"
exit 2
fi
ha_ready "$peer"
sleep 5
lb_status=$(curl_health)
if [ "$lb_status" != "200" ]; then
log "LB health check after re-enable of $peer returned $lb_status — abandoning roll"
exit 2
fi
done
log "PASS : canary $CANARY_NODE + peers $PEER_NODES deployed cleanly"
exit 0