288 lines
10 KiB
Bash
288 lines
10 KiB
Bash
|
|
#!/usr/bin/env bash
# deploy-canary.sh — canary release for the active/active backend-api fleet.
#
# Walks the standard canary recipe (drain → deploy → health → re-enable
# → SLI monitor → repeat or rollback) end-to-end. Designed to run on
# the host that owns the backend-api Incus containers + the haproxy
# admin socket.
#
# v1.0.9 W5 Day 23.
#
# Usage :
#   bash scripts/deploy-canary.sh /path/to/new/veza-api
#
# Required tools : incus, curl, socat (HAProxy admin socket), bash 4+.
#   NOTE(review): prom_query() also shells out to jq, and several helpers
#   rely on awk — both must be installed on this host as well.
#
# Required env :
#   ARTIFACT                 path to the new veza-api binary (passed as $1 too)
# Optional env :
#   POOL_BACKEND             HAProxy backend name (default api_pool)
#   CANARY_NODE              which container to canary first (default backend-api-2)
#   PEER_NODES               comma-separated list of peers to roll AFTER canary
#                            succeeds (default backend-api-1)
#   HEALTH_HOST              host to curl (default haproxy.lxd ; LB-routed)
#   HEALTH_PATH              default /api/v1/health
#   SLI_WINDOW               SLI monitor duration in seconds (default 3600 = 1h)
#   SLI_PROBE_INTERVAL       seconds between SLI probes (default 30)
#   PROM_URL                 Prometheus query URL (default http://prom.lxd:9090)
#   PROM_P95_THRESHOLD_S     p95 SLI threshold in seconds (default 0.5)
#   PROM_ERR_RATE_THRESHOLD  error rate threshold (default 0.005 = 0.5%)
#   ROLLBACK_BINARY          path to the previous-known-good binary (used on red).
#                            If unset, rollback skips the binary swap and just
#                            re-enables the canary node — operator handles the
#                            real revert.
#   PRE_DEPLOY_HOOK          path to script that validates migrations are
#                            backward-compat. Defaults to
#                            scripts/check-migration-backward-compat.sh when present.
#
# Exit codes :
#   0 — canary + full roll succeeded
#   1 — pre-deploy validation failed ; nothing was changed
#   2 — canary failed ; rollback executed
#   3 — required tool / env missing

# Abort on any error, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail
|
||
|
|
|
||
|
|
# Repo root = parent of the directory this script lives in.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"

# Artifact comes from the ARTIFACT env var, falling back to $1, falling
# back to the "?" sentinel (validated right after the tool checks).
ARTIFACT=${ARTIFACT:-${1:-?}}

# Tunables — every value below can be overridden from the environment.
: "${POOL_BACKEND:=api_pool}"
: "${CANARY_NODE:=backend-api-2}"
: "${PEER_NODES:=backend-api-1}"
: "${HEALTH_HOST:=haproxy.lxd}"
: "${HEALTH_PATH:=/api/v1/health}"
: "${SLI_WINDOW:=3600}"
: "${SLI_PROBE_INTERVAL:=30}"
: "${PROM_URL:=http://prom.lxd:9090}"
: "${PROM_P95_THRESHOLD_S:=0.5}"
: "${PROM_ERR_RATE_THRESHOLD:=0.005}"
: "${ROLLBACK_BINARY:=}"
: "${PRE_DEPLOY_HOOK:=${REPO_ROOT}/scripts/check-migration-backward-compat.sh}"
|
||
|
|
|
||
|
|
# log MESSAGE... — timestamped line on stderr (stdout stays clean for data).
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }

# die MESSAGE [EXIT_CODE] — log a failure and exit (code defaults to 1).
# Bug fix: log only $1, not $* — previously the exit code passed as $2
# was appended to the logged message ("FAIL: msg 3").
die() { log "FAIL: ${1-}"; exit "${2:-1}"; }
|
||
|
|
|
||
|
|
# require TOOL — abort with exit code 3 unless TOOL is on PATH.
require() {
  local tool=$1
  if ! command -v "$tool" >/dev/null 2>&1; then
    die "required tool missing: $tool" 3
  fi
}
|
||
|
|
|
||
|
|
# Fail fast (exit 3) before touching anything if a dependency is absent.
require incus
require curl
require socat
require date
# Bug fix: jq (used by prom_query) and awk (used by the HAProxy helpers)
# were never checked, so a missing jq only surfaced mid-rollout.
require jq
require awk

# "?" is the sentinel left when neither ARTIFACT nor $1 was supplied.
if [ "$ARTIFACT" = "?" ] || [ ! -f "$ARTIFACT" ]; then
  die "ARTIFACT (or \$1) must point to an existing binary" 1
fi
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
|
||
|
|
# Helpers : HAProxy admin socket commands.
|
||
|
|
# --------------------------------------------------------------------
|
||
|
|
HAPROXY_CONTAINER=${HAPROXY_CONTAINER:-haproxy}

# ha_cmd COMMAND — send one command to the HAProxy admin socket inside
# the haproxy container and print the response on stdout.
# Bug fix: the command used to be spliced into the remote script text
# inside single quotes ("echo '$1' | …"), which broke — and allowed
# command injection — whenever the command contained a quote.  It is now
# handed to the inner shell as a positional argument.
ha_cmd() {
  incus exec "$HAPROXY_CONTAINER" -- \
    bash -c 'printf "%s\n" "$1" | socat /run/haproxy/admin.sock -' _ "$1"
}
|
||
|
|
|
||
|
|
# ha_state NODE — print one state field for NODE from `show servers state`.
ha_state() {
  local node=$1
  # Bug fix: exact match on the srv_name column (field 4) instead of a
  # regex match against the whole line — previously a node whose name is
  # a prefix of another (api-1 vs api-10) could return the wrong row.
  ha_cmd "show servers state $POOL_BACKEND" \
    | awk -v n="$node" '$4 == n {print $7; exit}'
  # NOTE(review): per the `show servers state` header, field 6 is
  # srv_op_state and field 7 is srv_admin_state.  The original comment
  # called field 7 "operational_state" — confirm which field callers
  # actually want before relying on this value.
}
|
||
|
|
|
||
|
|
# ha_drain NODE — tell HAProxy to stop routing new connections to NODE
# (existing connections are allowed to finish).
ha_drain() {
  local node=$1
  log "haproxy : drain $node"
  ha_cmd "set server ${POOL_BACKEND}/${node} state drain" >/dev/null
}
|
||
|
|
|
||
|
|
# ha_ready NODE — put NODE back into normal rotation in HAProxy.
ha_ready() {
  local node=$1
  log "haproxy : ready $node"
  ha_cmd "set server ${POOL_BACKEND}/${node} state ready" >/dev/null
}
|
||
|
|
|
||
|
|
# ha_wait_drained NODE — block (up to 60s) until NODE reports zero active
# sessions in `show stat`.  Logs a warning and returns anyway if the node
# never fully drains.
ha_wait_drained() {
  local node=$1
  local deadline=$(( $(date +%s) + 60 ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    local n
    # `show stat` CSV layout: $1=pxname, $2=svname, $3=qcur, $4=scur, $5=smax.
    # Bug fix: current sessions is scur ($4); the old code read smax ($5),
    # the historical maximum, which never drops back to 0 — so the drain
    # wait always timed out.  Also filter on the backend name so a server
    # name shared across backends cannot match the wrong row.
    n=$(ha_cmd "show stat" \
      | awk -F, -v px="$POOL_BACKEND" -v s="$node" \
          '$1 == px && $2 == s {print $4; exit}' 2>/dev/null || echo 0)
    if [ "${n:-0}" = "0" ]; then
      log "haproxy : $node drained (0 active connections)"
      return 0
    fi
    sleep 2
  done
  log "WARN : $node still has active connections after 60s drain ; proceeding anyway"
}
|
||
|
|
|
||
|
|
# curl_health — probe the LB-routed health endpoint and print the HTTP
# status code ("000" when the request itself fails).
curl_health() {
  local url="http://${HEALTH_HOST}${HEALTH_PATH}"
  curl --max-time 5 -sS -o /dev/null -w "%{http_code}" "$url" 2>/dev/null \
    || echo "000"
}
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
|
||
|
|
# SLI monitor — query Prometheus over the SLI_WINDOW. Fails as soon as
|
||
|
|
# any probe reports red so we can rollback fast.
|
||
|
|
# --------------------------------------------------------------------
|
||
|
|
# prom_query QUERY — run an instant PromQL query against PROM_URL and
# print the first result's scalar value.  Prints "0" when the query
# fails, returns nothing, or jq cannot parse the response.
prom_query() {
  local query=$1
  curl --max-time 10 -sS -G --data-urlencode "query=${query}" \
    "${PROM_URL}/api/v1/query" 2>/dev/null \
    | jq -r '.data.result[0].value[1] // "0"' 2>/dev/null || echo 0
}
|
||
|
|
|
||
|
|
# monitor_sli — probe Prometheus every SLI_PROBE_INTERVAL seconds for
# SLI_WINDOW seconds.  Returns 0 when every probe stays under both the
# p95 latency and the 5xx error-rate thresholds; returns 1 on the first
# breach so the caller can roll back immediately.
monitor_sli() {
  log "monitoring SLI for ${SLI_WINDOW}s (probes every ${SLI_PROBE_INTERVAL}s)"
  local stop_at=$(( $(date +%s) + SLI_WINDOW ))
  local probe_count=0
  local breach=""
  while [ "$(date +%s)" -lt "$stop_at" ]; do
    probe_count=$((probe_count + 1))
    local p95 err
    p95=$(prom_query 'histogram_quantile(0.95, sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le))')
    err=$(prom_query 'sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m])) / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))')
    log " probe $probe_count : p95=${p95}s err=${err}"

    # bash tests only compare integers ; awk does the float comparisons.
    if awk -v a="$p95" -v b="$PROM_P95_THRESHOLD_S" 'BEGIN{exit !(a > b)}'; then
      breach="p95 ${p95}s > threshold ${PROM_P95_THRESHOLD_S}s"
      break
    fi
    if awk -v a="$err" -v b="$PROM_ERR_RATE_THRESHOLD" 'BEGIN{exit !(a > b)}'; then
      breach="error rate ${err} > threshold ${PROM_ERR_RATE_THRESHOLD}"
      break
    fi
    sleep "$SLI_PROBE_INTERVAL"
  done
  if [ -n "$breach" ]; then
    log "SLI red after $probe_count probe(s) : $breach"
    return 1
  fi
  log "SLI green for the full ${SLI_WINDOW}s window ($probe_count probes)"
  return 0
}
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
|
||
|
|
# Deploy + rollback primitives.
|
||
|
|
# --------------------------------------------------------------------
|
||
|
|
# deploy_to NODE ARTIFACT — push the binary into the container (owner
# uid/gid 1001, mode 0755) and restart the backend-api service there.
deploy_to() {
  local target=$1
  local bin=$2
  log "deploying $bin → $target"
  incus file push "$bin" "$target/opt/veza/backend-api/veza-api" \
    --uid 1001 --gid 1001 --mode 0755
  incus exec "$target" -- systemctl restart veza-backend-api
}
|
||
|
|
|
||
|
|
# verify_node_health NODE — poll the service's local health endpoint from
# inside the container for up to 60s.  Returns 0 on the first successful
# (-f) response, 1 if the deadline passes without one.
verify_node_health() {
  local node=$1
  log "node health check : $node"
  local deadline=$(( $(date +%s) + 60 ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    # Bug fix: quote the URL — an unquoted ${HEALTH_PATH} is subject to
    # word-splitting and glob expansion.
    if incus exec "$node" -- curl --max-time 3 -sSf "http://127.0.0.1:8080${HEALTH_PATH}" >/dev/null 2>&1; then
      log " $node : 200"
      return 0
    fi
    sleep 2
  done
  return 1
}
|
||
|
|
|
||
|
|
# rollback_canary — best-effort restore of the canary node after a red
# signal: redeploy ROLLBACK_BINARY when it is set and exists, then put
# the node back into the ready state either way.
rollback_canary() {
  log "ROLLBACK : restoring $CANARY_NODE"
  if [ -z "$ROLLBACK_BINARY" ] || [ ! -f "$ROLLBACK_BINARY" ]; then
    log "ROLLBACK_BINARY not set — leaving binary in place ; operator must finish revert"
  else
    deploy_to "$CANARY_NODE" "$ROLLBACK_BINARY" || true
    verify_node_health "$CANARY_NODE" || log "rollback : node health check still failing"
  fi
  ha_ready "$CANARY_NODE"
}
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
# 1. Pre-deploy hook (migration backward-compat).  Nothing has been
#    touched yet, so a failure here aborts cleanly with exit 1.
# --------------------------------------------------------------------
log "step 1 : pre-deploy hook"
if [ ! -x "$PRE_DEPLOY_HOOK" ]; then
  log " PRE_DEPLOY_HOOK ($PRE_DEPLOY_HOOK) not executable ; skipping (no-op)"
elif ! "$PRE_DEPLOY_HOOK"; then
  die "pre-deploy hook ($PRE_DEPLOY_HOOK) reported a backward-incompat migration ; aborting" 1
fi
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
# 2. Drain the canary node so in-flight connections finish first.
# --------------------------------------------------------------------
log "step 2 : drain $CANARY_NODE in HAProxy"
ha_drain "$CANARY_NODE"
ha_wait_drained "$CANARY_NODE"

# --------------------------------------------------------------------
# 3. Push the new artifact to the canary node and restart the service.
# --------------------------------------------------------------------
log "step 3 : deploy artifact to $CANARY_NODE"
deploy_to "$CANARY_NODE" "$ARTIFACT"

# --------------------------------------------------------------------
# 4. Per-node health check — roll back immediately on failure (exit 2).
# --------------------------------------------------------------------
log "step 4 : health check on $CANARY_NODE"
verify_node_health "$CANARY_NODE" || {
  log "$CANARY_NODE failed health check post-deploy"
  rollback_canary
  exit 2
}
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
# 5. Re-enable the canary + LB health check (proves HAProxy routes to
#    the node again).  A non-200 through the LB triggers rollback.
# --------------------------------------------------------------------
log "step 5 : re-enable $CANARY_NODE in HAProxy"
ha_ready "$CANARY_NODE"
sleep 5
lb_status=$(curl_health)
if [ "$lb_status" != "200" ]; then
  log "LB health check after re-enable returned $lb_status ; rolling back"
  rollback_canary
  exit 2
fi

# --------------------------------------------------------------------
# 6. SLI monitor — keep the canary serving ; any SLO breach inside the
#    window rolls it back (exit 2).
# --------------------------------------------------------------------
log "step 6 : monitor SLI on the canary"
monitor_sli || {
  log "SLI red — rolling back the canary"
  rollback_canary
  exit 2
}
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
# 7. SLI green on the canary — repeat the drain/deploy/verify cycle on
#    every peer.  A peer failure leaves that peer drained (the canary
#    keeps serving) and exits 2 for the operator to repair.
# --------------------------------------------------------------------
log "step 7 : SLI green on canary, rolling peers : $PEER_NODES"
IFS=',' read -ra peer_list <<< "$PEER_NODES"
for node in "${peer_list[@]}"; do
  log "── peer $node ───────────────────────────"
  ha_drain "$node"
  ha_wait_drained "$node"
  deploy_to "$node" "$ARTIFACT"
  if ! verify_node_health "$node"; then
    log "$node health check failed post-deploy"
    log "WARN : leaving $node drained ; canary node still serves traffic"
    log " operator must re-deploy known-good binary or repair $node manually"
    exit 2
  fi
  ha_ready "$node"
  sleep 5
  lb_status=$(curl_health)
  if [ "$lb_status" != "200" ]; then
    log "LB health check after re-enable of $node returned $lb_status — abandoning roll"
    exit 2
  fi
done

log "PASS : canary $CANARY_NODE + peers $PEER_NODES deployed cleanly"
exit 0
|