#!/usr/bin/env bash
# deploy-canary.sh — canary release for the active/active backend-api fleet.
#
# Walks the standard canary recipe (drain → deploy → health → re-enable
# → SLI monitor → repeat or rollback) end-to-end. Designed to run on
# the host that owns the backend-api Incus containers + the haproxy
# admin socket.
#
# v1.0.9 W5 Day 23.
#
# Usage :
#   bash scripts/deploy-canary.sh /path/to/new/veza-api
#
# Required tools : incus, curl, socat (HAProxy admin socket), jq, bash 4+.
#
# Required env :
#   ARTIFACT                path to the new veza-api binary (passed as $1 too)
# Optional env :
#   POOL_BACKEND            HAProxy backend name (default api_pool)
#   CANARY_NODE             which container to canary first (default backend-api-2)
#   PEER_NODES              comma-separated list of peers to roll AFTER canary
#                           succeeds (default backend-api-1)
#   HEALTH_HOST             host to curl (default haproxy.lxd ; LB-routed)
#   HEALTH_PATH             default /api/v1/health
#   SLI_WINDOW              SLI monitor duration in seconds (default 3600 = 1h)
#   SLI_PROBE_INTERVAL      seconds between SLI probes (default 30)
#   PROM_URL                Prometheus query URL (default http://prom.lxd:9090)
#   PROM_P95_THRESHOLD_S    p95 SLI threshold in seconds (default 0.5)
#   PROM_ERR_RATE_THRESHOLD error rate threshold (default 0.005 = 0.5%)
#   ROLLBACK_BINARY         path to the previous-known-good binary (used on red)
#                           If unset, rollback skips the binary swap and just
#                           re-enables the canary node — operator handles the
#                           real revert.
#   PRE_DEPLOY_HOOK         path to script that validates migrations are
#                           backward-compat. Defaults to scripts/check-migration-backward-compat.sh
#                           when present.
#
# Exit codes :
#   0 — canary + full roll succeeded
#   1 — pre-deploy validation failed ; nothing was changed
#   2 — canary failed (rollback executed) OR a peer roll failed
#       (peer left drained ; operator must finish the revert)
#   3 — required tool / env missing

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"

ARTIFACT=${ARTIFACT:-${1:-?}}
POOL_BACKEND=${POOL_BACKEND:-api_pool}
CANARY_NODE=${CANARY_NODE:-backend-api-2}
PEER_NODES=${PEER_NODES:-backend-api-1}
HEALTH_HOST=${HEALTH_HOST:-haproxy.lxd}
HEALTH_PATH=${HEALTH_PATH:-/api/v1/health}
SLI_WINDOW=${SLI_WINDOW:-3600}
SLI_PROBE_INTERVAL=${SLI_PROBE_INTERVAL:-30}
PROM_URL=${PROM_URL:-http://prom.lxd:9090}
PROM_P95_THRESHOLD_S=${PROM_P95_THRESHOLD_S:-0.5}
PROM_ERR_RATE_THRESHOLD=${PROM_ERR_RATE_THRESHOLD:-0.005}
ROLLBACK_BINARY=${ROLLBACK_BINARY:-}
PRE_DEPLOY_HOOK=${PRE_DEPLOY_HOOK:-${REPO_ROOT}/scripts/check-migration-backward-compat.sh}

# All diagnostics go to stderr so stdout stays clean for tooling.
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }

# die MESSAGE [EXIT_CODE] — log the message only (NOT the exit code; logging
# "$*" here used to leak the numeric code into the FAIL line) and exit.
die() {
  log "FAIL: $1"
  exit "${2:-1}"
}

require() {
  command -v "$1" >/dev/null 2>&1 || die "required tool missing: $1" 3
}

require incus
require curl
require socat
require date
# jq is mandatory: prom_query parses the Prometheus JSON with it. Without
# this check a missing jq made every probe read 0 and SLI could never go red.
require jq

if [ "$ARTIFACT" = "?" ] || [ ! -f "$ARTIFACT" ]; then
  die "ARTIFACT (or \$1) must point to an existing binary" 1
fi

# --------------------------------------------------------------------
# Helpers : HAProxy admin socket commands.
# --------------------------------------------------------------------
HAPROXY_CONTAINER=${HAPROXY_CONTAINER:-haproxy}

# ha_cmd COMMAND — run one HAProxy runtime-API command via the admin socket.
# The command is passed through argv ("$1" of the inner shell) instead of
# being interpolated into the -c string, so quotes in it cannot break out.
ha_cmd() {
  incus exec "$HAPROXY_CONTAINER" -- \
    bash -c 'printf "%s\n" "$1" | socat /run/haproxy/admin.sock -' _ "$1"
}

# ha_state NODE — print the srv_op_state of NODE in $POOL_BACKEND.
# `show servers state` columns: 1=be_id 2=be_name 3=srv_id 4=srv_name
# 5=srv_addr 6=srv_op_state (0=stopped 1=starting 2=running 3=stopping)
# 7=srv_admin_state. Exact match on srv_name so e.g. backend-api-1 does
# not also match backend-api-10. (Currently unused; kept for debugging.)
ha_state() {
  local node=$1
  ha_cmd "show servers state $POOL_BACKEND" \
    | awk -v n="$node" '$4 == n {print $6; exit}'
}

ha_drain() {
  log "haproxy : drain $1"
  ha_cmd "set server ${POOL_BACKEND}/${1} state drain" >/dev/null
}

ha_ready() {
  log "haproxy : ready $1"
  ha_cmd "set server ${POOL_BACKEND}/${1} state ready" >/dev/null
}

# ha_wait_drained NODE — poll until NODE reports 0 active sessions, for at
# most 60s. `show stat` CSV columns: 1=pxname 2=svname 3=qcur 4=scur 5=smax;
# we must read scur ($4) — the previous $5 watched smax, the high-water mark,
# which never returns to 0, so the drain wait always timed out.
ha_wait_drained() {
  local node=$1
  local deadline=$(( $(date +%s) + 60 ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    local n
    n=$(ha_cmd "show stat" \
      | awk -F, -v b="$POOL_BACKEND" -v s="$node" \
          '$1 == b && $2 == s {print $4; exit}' 2>/dev/null || echo 0)
    if [ "${n:-0}" = "0" ]; then
      log "haproxy : $node drained (0 active connections)"
      return 0
    fi
    sleep 2
  done
  log "WARN : $node still has active connections after 60s drain ; proceeding anyway"
}

# curl_health — probe the LB-routed health endpoint; print the HTTP status
# code, or "000" on any transport failure (best-effort by design).
curl_health() {
  curl --max-time 5 -sS -o /dev/null -w "%{http_code}" \
    "http://${HEALTH_HOST}${HEALTH_PATH}" 2>/dev/null || echo "000"
}

# --------------------------------------------------------------------
# SLI monitor — query Prometheus over the SLI_WINDOW. Fails as soon as
# any probe reports red so we can rollback fast.
# --------------------------------------------------------------------

# prom_query PROMQL — print the first scalar result of an instant query,
# or 0 when Prometheus is unreachable / returns no sample (best-effort).
prom_query() {
  local q=$1
  curl --max-time 10 -sS -G --data-urlencode "query=${q}" \
    "${PROM_URL}/api/v1/query" 2>/dev/null \
    | jq -r '.data.result[0].value[1] // "0"' 2>/dev/null || echo 0
}

# monitor_sli — probe p95 latency and 5xx error rate every
# SLI_PROBE_INTERVAL seconds for SLI_WINDOW seconds. Returns 0 when the
# whole window stays green, 1 on the first red probe.
monitor_sli() {
  log "monitoring SLI for ${SLI_WINDOW}s (probes every ${SLI_PROBE_INTERVAL}s)"
  local deadline=$(( $(date +%s) + SLI_WINDOW ))
  local probes=0
  local first_red=""
  while [ "$(date +%s)" -lt "$deadline" ]; do
    probes=$((probes + 1))
    local p95 err
    p95=$(prom_query 'histogram_quantile(0.95, sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le))')
    err=$(prom_query 'sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m])) / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))')
    log " probe $probes : p95=${p95}s err=${err}"
    # awk used for float comparison ; bash test only does integers.
    if awk -v a="$p95" -v b="$PROM_P95_THRESHOLD_S" 'BEGIN{exit !(a > b)}'; then
      first_red="p95 ${p95}s > threshold ${PROM_P95_THRESHOLD_S}s"
      break
    fi
    if awk -v a="$err" -v b="$PROM_ERR_RATE_THRESHOLD" 'BEGIN{exit !(a > b)}'; then
      first_red="error rate ${err} > threshold ${PROM_ERR_RATE_THRESHOLD}"
      break
    fi
    sleep "$SLI_PROBE_INTERVAL"
  done
  if [ -n "$first_red" ]; then
    log "SLI red after $probes probe(s) : $first_red"
    return 1
  fi
  log "SLI green for the full ${SLI_WINDOW}s window ($probes probes)"
  return 0
}

# --------------------------------------------------------------------
# Deploy + rollback primitives.
# --------------------------------------------------------------------

# deploy_to NODE ARTIFACT — push the binary into the container and restart
# the service. uid/gid 1001 is the veza service account inside the container.
deploy_to() {
  local node=$1
  local artifact=$2
  log "deploying $artifact → $node"
  incus file push "$artifact" "$node/opt/veza/backend-api/veza-api" \
    --uid 1001 --gid 1001 --mode 0755
  incus exec "$node" -- systemctl restart veza-backend-api
}

# verify_node_health NODE — curl the app directly inside the container
# (bypassing the LB) until it answers 2xx, for at most 60s.
verify_node_health() {
  local node=$1
  log "node health check : $node"
  local deadline=$(( $(date +%s) + 60 ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    if incus exec "$node" -- curl --max-time 3 -sSf \
        "http://127.0.0.1:8080${HEALTH_PATH}" >/dev/null 2>&1; then
      log " $node : 200"
      return 0
    fi
    sleep 2
  done
  return 1
}

# rollback_canary — best-effort restore of the canary node: swap back the
# known-good binary when we have one, then always re-enable the node in
# HAProxy so the fleet is not left a node short.
rollback_canary() {
  log "ROLLBACK : restoring $CANARY_NODE"
  if [ -n "$ROLLBACK_BINARY" ] && [ -f "$ROLLBACK_BINARY" ]; then
    # `|| true` : rollback must keep going even if the push/restart hiccups.
    deploy_to "$CANARY_NODE" "$ROLLBACK_BINARY" || true
    verify_node_health "$CANARY_NODE" || log "rollback : node health check still failing"
  else
    log "ROLLBACK_BINARY not set — leaving binary in place ; operator must finish revert"
  fi
  ha_ready "$CANARY_NODE"
}

# --------------------------------------------------------------------
# 1. Pre-deploy hook (migration backward-compat).
# --------------------------------------------------------------------
log "step 1 : pre-deploy hook"
if [ -x "$PRE_DEPLOY_HOOK" ]; then
  if ! "$PRE_DEPLOY_HOOK"; then
    die "pre-deploy hook ($PRE_DEPLOY_HOOK) reported a backward-incompat migration ; aborting" 1
  fi
else
  log " PRE_DEPLOY_HOOK ($PRE_DEPLOY_HOOK) not executable ; skipping (no-op)"
fi

# --------------------------------------------------------------------
# 2. Drain canary node.
# --------------------------------------------------------------------
log "step 2 : drain $CANARY_NODE in HAProxy"
ha_drain "$CANARY_NODE"
ha_wait_drained "$CANARY_NODE"

# --------------------------------------------------------------------
# 3. Deploy artifact to the canary node.
# --------------------------------------------------------------------
log "step 3 : deploy artifact to $CANARY_NODE"
deploy_to "$CANARY_NODE" "$ARTIFACT"

# --------------------------------------------------------------------
# 4. Per-node health check.
# --------------------------------------------------------------------
log "step 4 : health check on $CANARY_NODE"
if ! verify_node_health "$CANARY_NODE"; then
  log "$CANARY_NODE failed health check post-deploy"
  rollback_canary
  exit 2
fi

# --------------------------------------------------------------------
# 5. Re-enable + LB health check (proves HAProxy sees the node ready).
# --------------------------------------------------------------------
log "step 5 : re-enable $CANARY_NODE in HAProxy"
ha_ready "$CANARY_NODE"
sleep 5
lb_status=$(curl_health)
if [ "$lb_status" != "200" ]; then
  log "LB health check after re-enable returned $lb_status ; rolling back"
  rollback_canary
  exit 2
fi

# --------------------------------------------------------------------
# 6. SLI monitor — keep the canary alive ; if SLO breaches, rollback.
# --------------------------------------------------------------------
log "step 6 : monitor SLI on the canary"
if ! monitor_sli; then
  log "SLI red — rolling back the canary"
  rollback_canary
  exit 2
fi

# --------------------------------------------------------------------
# 7. SLI green — repeat on each peer.
# --------------------------------------------------------------------
log "step 7 : SLI green on canary, rolling peers : $PEER_NODES"
IFS=',' read -ra peers <<< "$PEER_NODES"
for peer in "${peers[@]}"; do
  log "── peer $peer ───────────────────────────"
  ha_drain "$peer"
  ha_wait_drained "$peer"
  deploy_to "$peer" "$ARTIFACT"
  if ! verify_node_health "$peer"; then
    # Deliberately no automatic rollback here: the canary (on the new
    # binary) is healthy and serving, so leaving the broken peer drained
    # is the safest state. The operator finishes the revert.
    log "$peer health check failed post-deploy"
    log "WARN : leaving $peer drained ; canary node still serves traffic"
    log " operator must re-deploy known-good binary or repair $peer manually"
    exit 2
  fi
  ha_ready "$peer"
  sleep 5
  lb_status=$(curl_health)
  if [ "$lb_status" != "200" ]; then
    log "LB health check after re-enable of $peer returned $lb_status — abandoning roll"
    exit 2
  fi
done

log "PASS : canary $CANARY_NODE + peers $PEER_NODES deployed cleanly"
exit 0