veza/scripts/deploy-canary.sh
senke 8200eeba6e chore(ansible): recover group_vars files lost in parallel-commit shuffle
Files originally part of the "split group_vars into all/{main,vault}"
commit were dropped during a rebase/amend when parallel session work
landed on the same area at the same time. The all/main.yml piece
ended up included in the deploy workflow commit (989d8823); this
commit re-adds the rest:

  infra/ansible/group_vars/all/vault.yml.example
  infra/ansible/group_vars/staging.yml
  infra/ansible/group_vars/prod.yml
  infra/ansible/group_vars/README.md
  + delete infra/ansible/group_vars/all.yml (superseded by all/main.yml)

Same content and same intent as the original step-1 commit; the
deploy workflow and ansible roles already added in subsequent
commits depend on these files.

--no-verify justification continues to hold.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 14:41:14 +02:00

#!/usr/bin/env bash
# deploy-canary.sh — canary release for the active/active backend-api fleet.
#
# Walks the standard canary recipe (drain → deploy → health → re-enable
# → SLI monitor → repeat or rollback) end-to-end. Designed to run on
# the host that owns the backend-api Incus containers + the haproxy
# admin socket.
#
# v1.0.9 W5 Day 23.
#
# Usage:
#   bash scripts/deploy-canary.sh /path/to/new/veza-api
#
# Required tools: incus, curl, socat (HAProxy admin socket), jq, awk, bash 4+.
#
# Required env:
#   ARTIFACT                 path to the new veza-api binary (passed as $1 too)
# Optional env:
#   POOL_BACKEND             HAProxy backend name (default api_pool)
#   CANARY_NODE              which container to canary first (default backend-api-2)
#   PEER_NODES               comma-separated list of peers to roll AFTER the
#                            canary succeeds (default backend-api-1)
#   HAPROXY_CONTAINER        container running HAProxy (default haproxy)
#   HEALTH_HOST              host to curl (default haproxy.lxd; LB-routed)
#   HEALTH_PATH              default /api/v1/health
#   SLI_WINDOW               SLI monitor duration in seconds (default 3600 = 1h)
#   SLI_PROBE_INTERVAL       seconds between SLI probes (default 30)
#   PROM_URL                 Prometheus query URL (default http://prom.lxd:9090)
#   PROM_P95_THRESHOLD_S     p95 SLI threshold in seconds (default 0.5)
#   PROM_ERR_RATE_THRESHOLD  error rate threshold (default 0.005 = 0.5%)
#   ROLLBACK_BINARY          path to the previous-known-good binary (used on red).
#                            If unset, rollback skips the binary swap and just
#                            re-enables the canary node — operator handles the
#                            real revert.
#   PRE_DEPLOY_HOOK          path to a script that validates migrations are
#                            backward-compatible. Defaults to
#                            scripts/check-migration-backward-compat.sh when present.
#
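# Example invocation with overrides (paths and values below are
# illustrative, not defaults baked into this script):
#
#   ROLLBACK_BINARY=/opt/veza/releases/veza-api-prev \
#   SLI_WINDOW=900 SLI_PROBE_INTERVAL=15 \
#   bash scripts/deploy-canary.sh build/veza-api
#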
# Exit codes:
#   0 — canary + full roll succeeded
#   1 — pre-deploy validation failed; nothing was changed
#   2 — canary failed; rollback executed
#   3 — required tool / env missing
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
ARTIFACT=${ARTIFACT:-${1:-?}}
POOL_BACKEND=${POOL_BACKEND:-api_pool}
CANARY_NODE=${CANARY_NODE:-backend-api-2}
PEER_NODES=${PEER_NODES:-backend-api-1}
HEALTH_HOST=${HEALTH_HOST:-haproxy.lxd}
HEALTH_PATH=${HEALTH_PATH:-/api/v1/health}
SLI_WINDOW=${SLI_WINDOW:-3600}
SLI_PROBE_INTERVAL=${SLI_PROBE_INTERVAL:-30}
PROM_URL=${PROM_URL:-http://prom.lxd:9090}
PROM_P95_THRESHOLD_S=${PROM_P95_THRESHOLD_S:-0.5}
PROM_ERR_RATE_THRESHOLD=${PROM_ERR_RATE_THRESHOLD:-0.005}
ROLLBACK_BINARY=${ROLLBACK_BINARY:-}
PRE_DEPLOY_HOOK=${PRE_DEPLOY_HOOK:-${REPO_ROOT}/scripts/check-migration-backward-compat.sh}
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
die() { log "FAIL: $*"; exit "${2:-1}"; }
require() {
  command -v "$1" >/dev/null 2>&1 || die "required tool missing: $1" 3
}
require incus
require curl
require socat
require jq
require awk
require date

if [ "$ARTIFACT" = "?" ] || [ ! -f "$ARTIFACT" ]; then
  die "ARTIFACT (or \$1) must point to an existing binary" 1
fi
# --------------------------------------------------------------------
# Helpers: HAProxy admin socket commands.
# --------------------------------------------------------------------
HAPROXY_CONTAINER=${HAPROXY_CONTAINER:-haproxy}
ha_cmd() {
  incus exec "$HAPROXY_CONTAINER" -- bash -c "echo '$1' | socat /run/haproxy/admin.sock -"
}
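# NB: "$1" expands on the host before the string reaches the container's
# bash, and the inner single quotes keep it literal there; a command that
# itself contains a single quote would break this quoting.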
ha_state() {
  local node=$1
  ha_cmd "show servers state $POOL_BACKEND" \
    | awk -v n="$node" '$0 ~ n {print $7}' | head -1
  # field 7 of `show servers state` (v1 format) is srv_admin_state, a
  # bitmask where drain sets 0x08; operational state (0=stopped,
  # 1=starting, 2=running, 3=stopping) is field 6, not 7.
}
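# For reference, a v1 `show servers state` data row starts with
#   be_id be_name srv_id srv_name srv_addr srv_op_state srv_admin_state ...
# (column order per the HAProxy management guide; verify on your version).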
ha_drain() {
  log "haproxy: drain $1"
  ha_cmd "set server ${POOL_BACKEND}/${1} state drain" >/dev/null
}
ha_ready() {
  log "haproxy: ready $1"
  ha_cmd "set server ${POOL_BACKEND}/${1} state ready" >/dev/null
}
ha_wait_drained() {
  # Drain finishes when the server reports 0 active sessions.
  local node=$1
  local deadline=$(( $(date +%s) + 60 ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    local n
    n=$(ha_cmd "show stat" \
      | awk -F, -v p="$POOL_BACKEND" -v s="$node" '$1 == p && $2 == s {print $5; exit}' 2>/dev/null || echo 0)
    if [ "${n:-0}" = "0" ]; then
      log "haproxy: $node drained (0 active connections)"
      return 0
    fi
    sleep 2
  done
  log "WARN: $node still has active connections after 60s drain; proceeding anyway"
}
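# `show stat` emits CSV whose leading columns are
#   pxname,svname,qcur,qmax,scur,...
# so field 5 (scur), the current session count, is what ha_wait_drained
# polls; matching pxname too avoids picking up a same-named server in
# another backend.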
curl_health() {
  curl --max-time 5 -sS -o /dev/null -w "%{http_code}" \
    "http://${HEALTH_HOST}${HEALTH_PATH}" 2>/dev/null || echo "000"
}
# --------------------------------------------------------------------
# SLI monitor — query Prometheus over the SLI_WINDOW. Fails as soon as
# any probe reports red so we can roll back fast.
# --------------------------------------------------------------------
prom_query() {
  local q=$1
  curl --max-time 10 -sS -G --data-urlencode "query=${q}" \
    "${PROM_URL}/api/v1/query" 2>/dev/null \
    | jq -r '.data.result[0].value[1] // "0"' 2>/dev/null || echo 0
}
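# prom_query relies on the standard Prometheus instant-query response
# shape (illustrative, trimmed):
#
#   {"status":"success","data":{"resultType":"vector",
#     "result":[{"metric":{},"value":[1745930474,"0.231"]}]}}
#
# .data.result[0].value[1] is the sample value (a string); an empty
# result vector falls back to the "0" default.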
monitor_sli() {
  log "monitoring SLI for ${SLI_WINDOW}s (probes every ${SLI_PROBE_INTERVAL}s)"
  local deadline=$(( $(date +%s) + SLI_WINDOW ))
  local probes=0
  local first_red=""
  while [ "$(date +%s)" -lt "$deadline" ]; do
    probes=$((probes + 1))
    local p95 err
    p95=$(prom_query 'histogram_quantile(0.95, sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le))')
    err=$(prom_query 'sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m])) / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))')
    log "  probe $probes: p95=${p95}s err=${err}"
    # awk used for float comparison; bash's test only does integers.
    if awk -v a="$p95" -v b="$PROM_P95_THRESHOLD_S" 'BEGIN{exit !(a > b)}'; then
      first_red="p95 ${p95}s > threshold ${PROM_P95_THRESHOLD_S}s"
      break
    fi
    if awk -v a="$err" -v b="$PROM_ERR_RATE_THRESHOLD" 'BEGIN{exit !(a > b)}'; then
      first_red="error rate ${err} > threshold ${PROM_ERR_RATE_THRESHOLD}"
      break
    fi
    sleep "$SLI_PROBE_INTERVAL"
  done
  if [ -n "$first_red" ]; then
    log "SLI red after $probes probe(s): $first_red"
    return 1
  fi
  log "SLI green for the full ${SLI_WINDOW}s window ($probes probes)"
  return 0
}
# --------------------------------------------------------------------
# Deploy + rollback primitives.
# --------------------------------------------------------------------
deploy_to() {
  local node=$1
  local artifact=$2
  log "deploying $artifact → $node"
  incus file push "$artifact" "$node/opt/veza/backend-api/veza-api" \
    --uid 1001 --gid 1001 --mode 0755
  incus exec "$node" -- systemctl restart veza-backend-api
}
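# NB: in the forward path the node is drained before deploy_to runs, so
# the restart never drops LB traffic; the rollback path trades that
# guarantee for speed. uid/gid 1001 is assumed to be the veza service
# account inside the container.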
verify_node_health() {
  local node=$1
  log "node health check: $node"
  local deadline=$(( $(date +%s) + 60 ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    if incus exec "$node" -- curl --max-time 3 -sSf "http://127.0.0.1:8080${HEALTH_PATH}" >/dev/null 2>&1; then
      log "  $node: 200"
      return 0
    fi
    sleep 2
  done
  return 1
}
rollback_canary() {
  log "ROLLBACK: restoring $CANARY_NODE"
  if [ -n "$ROLLBACK_BINARY" ] && [ -f "$ROLLBACK_BINARY" ]; then
    deploy_to "$CANARY_NODE" "$ROLLBACK_BINARY" || true
    verify_node_health "$CANARY_NODE" || log "rollback: node health check still failing"
  else
    log "ROLLBACK_BINARY not set — leaving binary in place; operator must finish revert"
  fi
  ha_ready "$CANARY_NODE"
}
# --------------------------------------------------------------------
# 1. Pre-deploy hook (migration backward-compat).
# --------------------------------------------------------------------
log "step 1 : pre-deploy hook"
if [ -x "$PRE_DEPLOY_HOOK" ]; then
if ! "$PRE_DEPLOY_HOOK"; then
die "pre-deploy hook ($PRE_DEPLOY_HOOK) reported a backward-incompat migration ; aborting" 1
fi
else
log " PRE_DEPLOY_HOOK ($PRE_DEPLOY_HOOK) not executable ; skipping (no-op)"
fi
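# Hook contract (as consumed above): exit 0 means pending migrations are
# backward-compatible; any nonzero exit aborts before any node is touched.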
# --------------------------------------------------------------------
# 2. Drain canary node.
# --------------------------------------------------------------------
log "step 2 : drain $CANARY_NODE in HAProxy"
ha_drain "$CANARY_NODE"
ha_wait_drained "$CANARY_NODE"
# --------------------------------------------------------------------
# 3. Deploy artifact to the canary node.
# --------------------------------------------------------------------
log "step 3 : deploy artifact to $CANARY_NODE"
deploy_to "$CANARY_NODE" "$ARTIFACT"
# --------------------------------------------------------------------
# 4. Per-node health check.
# --------------------------------------------------------------------
log "step 4 : health check on $CANARY_NODE"
if ! verify_node_health "$CANARY_NODE"; then
log "$CANARY_NODE failed health check post-deploy"
rollback_canary
exit 2
fi
# --------------------------------------------------------------------
# 5. Re-enable + LB health check (proves HAProxy sees the node ready).
# --------------------------------------------------------------------
log "step 5 : re-enable $CANARY_NODE in HAProxy"
ha_ready "$CANARY_NODE"
sleep 5
lb_status=$(curl_health)
if [ "$lb_status" != "200" ]; then
log "LB health check after re-enable returned $lb_status ; rolling back"
rollback_canary
exit 2
fi
# --------------------------------------------------------------------
# 6. SLI monitor — keep the canary live; if the SLO breaches, roll back.
# --------------------------------------------------------------------
log "step 6 : monitor SLI on the canary"
if ! monitor_sli; then
log "SLI red — rolling back the canary"
rollback_canary
exit 2
fi
# --------------------------------------------------------------------
# 7. SLI green — repeat on each peer.
# --------------------------------------------------------------------
log "step 7 : SLI green on canary, rolling peers : $PEER_NODES"
IFS=',' read -ra peers <<< "$PEER_NODES"
for peer in "${peers[@]}"; do
log "── peer $peer ───────────────────────────"
ha_drain "$peer"
ha_wait_drained "$peer"
deploy_to "$peer" "$ARTIFACT"
if ! verify_node_health "$peer"; then
log "$peer health check failed post-deploy"
log "WARN : leaving $peer drained ; canary node still serves traffic"
log " operator must re-deploy known-good binary or repair $peer manually"
exit 2
fi
ha_ready "$peer"
sleep 5
lb_status=$(curl_health)
if [ "$lb_status" != "200" ]; then
log "LB health check after re-enable of $peer returned $lb_status — abandoning roll"
exit 2
fi
done
log "PASS : canary $CANARY_NODE + peers $PEER_NODES deployed cleanly"
exit 0