288 lines
10 KiB
Bash
288 lines
10 KiB
Bash
|
|
#!/usr/bin/env bash
# deploy-canary.sh — canary release for the active/active backend-api fleet.
#
# Walks the standard canary recipe (drain → deploy → health → re-enable
# → SLI monitor → repeat or rollback) end-to-end. Designed to run on
# the host that owns the backend-api Incus containers + the haproxy
# admin socket.
#
# v1.0.9 W5 Day 23.
#
# Usage :
#   bash scripts/deploy-canary.sh /path/to/new/veza-api
#
# Required tools : incus, curl, socat (HAProxy admin socket), bash 4+.
#   NOTE(review): prom_query() also shells out to jq, and several helpers
#   rely on awk — both must be installed on this host as well.
#
# Required env :
#   ARTIFACT                 path to the new veza-api binary (passed as $1 too)
# Optional env :
#   POOL_BACKEND             HAProxy backend name (default api_pool)
#   CANARY_NODE              which container to canary first (default backend-api-2)
#   PEER_NODES               comma-separated list of peers to roll AFTER canary
#                            succeeds (default backend-api-1)
#   HEALTH_HOST              host to curl (default haproxy.lxd ; LB-routed)
#   HEALTH_PATH              default /api/v1/health
#   SLI_WINDOW               SLI monitor duration in seconds (default 3600 = 1h)
#   SLI_PROBE_INTERVAL       seconds between SLI probes (default 30)
#   PROM_URL                 Prometheus query URL (default http://prom.lxd:9090)
#   PROM_P95_THRESHOLD_S     p95 SLI threshold in seconds (default 0.5)
#   PROM_ERR_RATE_THRESHOLD  error rate threshold (default 0.005 = 0.5%)
#   ROLLBACK_BINARY          path to the previous-known-good binary (used on red).
#                            If unset, rollback skips the binary swap and just
#                            re-enables the canary node — operator handles the
#                            real revert.
#   PRE_DEPLOY_HOOK          path to script that validates migrations are
#                            backward-compat. Defaults to
#                            scripts/check-migration-backward-compat.sh when present.
#
# Exit codes :
#   0 — canary + full roll succeeded
#   1 — pre-deploy validation failed ; nothing was changed
#   2 — canary failed ; rollback executed
#   3 — required tool / env missing

# Abort on any error, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail
|
||
|
|
|
||
|
|
# Repo root = parent of the directory this script lives in.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"

# Artifact comes from the ARTIFACT env var, falling back to $1, falling
# back to the "?" sentinel (validated right after the tool checks).
ARTIFACT=${ARTIFACT:-${1:-?}}

# Tunables — every value below can be overridden from the environment.
: "${POOL_BACKEND:=api_pool}"
: "${CANARY_NODE:=backend-api-2}"
: "${PEER_NODES:=backend-api-1}"
: "${HEALTH_HOST:=haproxy.lxd}"
: "${HEALTH_PATH:=/api/v1/health}"
: "${SLI_WINDOW:=3600}"
: "${SLI_PROBE_INTERVAL:=30}"
: "${PROM_URL:=http://prom.lxd:9090}"
: "${PROM_P95_THRESHOLD_S:=0.5}"
: "${PROM_ERR_RATE_THRESHOLD:=0.005}"
: "${ROLLBACK_BINARY:=}"
: "${PRE_DEPLOY_HOOK:=${REPO_ROOT}/scripts/check-migration-backward-compat.sh}"
|
||
|
|
|
||
|
|
# log MESSAGE... — timestamped line on stderr (stdout stays clean for data).
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }

# die MESSAGE [EXIT_CODE] — log a failure and exit (code defaults to 1).
# Bug fix: log only $1, not $* — previously the exit code passed as $2
# was appended to the logged message ("FAIL: msg 3").
die() { log "FAIL: ${1-}"; exit "${2:-1}"; }
|
||
|
|
|
||
|
|
# require TOOL — abort with exit code 3 unless TOOL is on PATH.
require() {
  local tool=$1
  if ! command -v "$tool" >/dev/null 2>&1; then
    die "required tool missing: $tool" 3
  fi
}
|
||
|
|
|
||
|
|
# Fail fast (exit 3) before touching anything if a dependency is absent.
require incus
require curl
require socat
require date
# Bug fix: jq (used by prom_query) and awk (used by the HAProxy helpers)
# were never checked, so a missing jq only surfaced mid-rollout.
require jq
require awk

# "?" is the sentinel left when neither ARTIFACT nor $1 was supplied.
if [ "$ARTIFACT" = "?" ] || [ ! -f "$ARTIFACT" ]; then
  die "ARTIFACT (or \$1) must point to an existing binary" 1
fi
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
|
||
|
|
# Helpers : HAProxy admin socket commands.
|
||
|
|
# --------------------------------------------------------------------
|
||
|
|
HAPROXY_CONTAINER=${HAPROXY_CONTAINER:-haproxy}

# ha_cmd COMMAND — send one command to the HAProxy admin socket inside
# the haproxy container and print the response on stdout.
# Bug fix: the command used to be spliced into the remote script text
# inside single quotes ("echo '$1' | …"), which broke — and allowed
# command injection — whenever the command contained a quote.  It is now
# handed to the inner shell as a positional argument.
ha_cmd() {
  incus exec "$HAPROXY_CONTAINER" -- \
    bash -c 'printf "%s\n" "$1" | socat /run/haproxy/admin.sock -' _ "$1"
}
|
||
|
|
|
||
|
|
# ha_state NODE — print one state field for NODE from `show servers state`.
ha_state() {
  local node=$1
  # Bug fix: exact match on the srv_name column (field 4) instead of a
  # regex match against the whole line — previously a node whose name is
  # a prefix of another (api-1 vs api-10) could return the wrong row.
  ha_cmd "show servers state $POOL_BACKEND" \
    | awk -v n="$node" '$4 == n {print $7; exit}'
  # NOTE(review): per the `show servers state` header, field 6 is
  # srv_op_state and field 7 is srv_admin_state.  The original comment
  # called field 7 "operational_state" — confirm which field callers
  # actually want before relying on this value.
}
|
||
|
|
|
||
|
|
# ha_drain NODE — tell HAProxy to stop routing new connections to NODE
# (existing connections are allowed to finish).
ha_drain() {
  local node=$1
  log "haproxy : drain $node"
  ha_cmd "set server ${POOL_BACKEND}/${node} state drain" >/dev/null
}
|
||
|
|
|
||
|
|
# ha_ready NODE — put NODE back into normal rotation in HAProxy.
ha_ready() {
  local node=$1
  log "haproxy : ready $node"
  ha_cmd "set server ${POOL_BACKEND}/${node} state ready" >/dev/null
}
|
||
|
|
|
||
|
|
# ha_wait_drained NODE — block (up to 60s) until NODE reports zero active
# sessions in `show stat`.  Logs a warning and returns anyway if the node
# never fully drains.
ha_wait_drained() {
  local node=$1
  local deadline=$(( $(date +%s) + 60 ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    local n
    # `show stat` CSV layout: $1=pxname, $2=svname, $3=qcur, $4=scur, $5=smax.
    # Bug fix: current sessions is scur ($4); the old code read smax ($5),
    # the historical maximum, which never drops back to 0 — so the drain
    # wait always timed out.  Also filter on the backend name so a server
    # name shared across backends cannot match the wrong row.
    n=$(ha_cmd "show stat" \
      | awk -F, -v px="$POOL_BACKEND" -v s="$node" \
          '$1 == px && $2 == s {print $4; exit}' 2>/dev/null || echo 0)
    if [ "${n:-0}" = "0" ]; then
      log "haproxy : $node drained (0 active connections)"
      return 0
    fi
    sleep 2
  done
  log "WARN : $node still has active connections after 60s drain ; proceeding anyway"
}
|
||
|
|
|
||
|
|
# curl_health — probe the LB-routed health endpoint and print the HTTP
# status code ("000" when the request itself fails).
curl_health() {
  local url="http://${HEALTH_HOST}${HEALTH_PATH}"
  curl --max-time 5 -sS -o /dev/null -w "%{http_code}" "$url" 2>/dev/null \
    || echo "000"
}
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
|
||
|
|
# SLI monitor — query Prometheus over the SLI_WINDOW. Fails as soon as
|
||
|
|
# any probe reports red so we can rollback fast.
|
||
|
|
# --------------------------------------------------------------------
|
||
|
|
# prom_query QUERY — run an instant PromQL query against PROM_URL and
# print the first result's scalar value.  Prints "0" when the query
# fails, returns nothing, or jq cannot parse the response.
prom_query() {
  local query=$1
  curl --max-time 10 -sS -G --data-urlencode "query=${query}" \
    "${PROM_URL}/api/v1/query" 2>/dev/null \
    | jq -r '.data.result[0].value[1] // "0"' 2>/dev/null || echo 0
}
|
||
|
|
|
||
|
|
# monitor_sli — probe Prometheus every SLI_PROBE_INTERVAL seconds for
# SLI_WINDOW seconds.  Returns 0 when every probe stays under both the
# p95 latency and the 5xx error-rate thresholds; returns 1 on the first
# breach so the caller can roll back immediately.
monitor_sli() {
  log "monitoring SLI for ${SLI_WINDOW}s (probes every ${SLI_PROBE_INTERVAL}s)"
  local stop_at=$(( $(date +%s) + SLI_WINDOW ))
  local probe_count=0
  local breach=""
  while [ "$(date +%s)" -lt "$stop_at" ]; do
    probe_count=$((probe_count + 1))
    local p95 err
    p95=$(prom_query 'histogram_quantile(0.95, sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le))')
    err=$(prom_query 'sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m])) / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))')
    log " probe $probe_count : p95=${p95}s err=${err}"

    # bash tests only compare integers ; awk does the float comparisons.
    if awk -v a="$p95" -v b="$PROM_P95_THRESHOLD_S" 'BEGIN{exit !(a > b)}'; then
      breach="p95 ${p95}s > threshold ${PROM_P95_THRESHOLD_S}s"
      break
    fi
    if awk -v a="$err" -v b="$PROM_ERR_RATE_THRESHOLD" 'BEGIN{exit !(a > b)}'; then
      breach="error rate ${err} > threshold ${PROM_ERR_RATE_THRESHOLD}"
      break
    fi
    sleep "$SLI_PROBE_INTERVAL"
  done
  if [ -n "$breach" ]; then
    log "SLI red after $probe_count probe(s) : $breach"
    return 1
  fi
  log "SLI green for the full ${SLI_WINDOW}s window ($probe_count probes)"
  return 0
}
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
|
||
|
|
# Deploy + rollback primitives.
|
||
|
|
# --------------------------------------------------------------------
|
||
|
|
# deploy_to NODE ARTIFACT — push the binary into the container (owner
# uid/gid 1001, mode 0755) and restart the backend-api service there.
deploy_to() {
  local target=$1
  local bin=$2
  log "deploying $bin → $target"
  incus file push "$bin" "$target/opt/veza/backend-api/veza-api" \
    --uid 1001 --gid 1001 --mode 0755
  incus exec "$target" -- systemctl restart veza-backend-api
}
|
||
|
|
|
||
|
|
# verify_node_health NODE — poll the service's local health endpoint from
# inside the container for up to 60s.  Returns 0 on the first successful
# (-f) response, 1 if the deadline passes without one.
verify_node_health() {
  local node=$1
  log "node health check : $node"
  local deadline=$(( $(date +%s) + 60 ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    # Bug fix: quote the URL — an unquoted ${HEALTH_PATH} is subject to
    # word-splitting and glob expansion.
    if incus exec "$node" -- curl --max-time 3 -sSf "http://127.0.0.1:8080${HEALTH_PATH}" >/dev/null 2>&1; then
      log " $node : 200"
      return 0
    fi
    sleep 2
  done
  return 1
}
|
||
|
|
|
||
|
|
# rollback_canary — best-effort restore of the canary node after a red
# signal: redeploy ROLLBACK_BINARY when it is set and exists, then put
# the node back into the ready state either way.
rollback_canary() {
  log "ROLLBACK : restoring $CANARY_NODE"
  if [ -z "$ROLLBACK_BINARY" ] || [ ! -f "$ROLLBACK_BINARY" ]; then
    log "ROLLBACK_BINARY not set — leaving binary in place ; operator must finish revert"
  else
    deploy_to "$CANARY_NODE" "$ROLLBACK_BINARY" || true
    verify_node_health "$CANARY_NODE" || log "rollback : node health check still failing"
  fi
  ha_ready "$CANARY_NODE"
}
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
# 1. Pre-deploy hook (migration backward-compat).  Nothing has been
#    touched yet, so a failure here aborts cleanly with exit 1.
# --------------------------------------------------------------------
log "step 1 : pre-deploy hook"
if [ ! -x "$PRE_DEPLOY_HOOK" ]; then
  log " PRE_DEPLOY_HOOK ($PRE_DEPLOY_HOOK) not executable ; skipping (no-op)"
elif ! "$PRE_DEPLOY_HOOK"; then
  die "pre-deploy hook ($PRE_DEPLOY_HOOK) reported a backward-incompat migration ; aborting" 1
fi
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
# 2. Drain the canary node so in-flight connections finish first.
# --------------------------------------------------------------------
log "step 2 : drain $CANARY_NODE in HAProxy"
ha_drain "$CANARY_NODE"
ha_wait_drained "$CANARY_NODE"

# --------------------------------------------------------------------
# 3. Push the new artifact to the canary node and restart the service.
# --------------------------------------------------------------------
log "step 3 : deploy artifact to $CANARY_NODE"
deploy_to "$CANARY_NODE" "$ARTIFACT"

# --------------------------------------------------------------------
# 4. Per-node health check — roll back immediately on failure (exit 2).
# --------------------------------------------------------------------
log "step 4 : health check on $CANARY_NODE"
verify_node_health "$CANARY_NODE" || {
  log "$CANARY_NODE failed health check post-deploy"
  rollback_canary
  exit 2
}
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
# 5. Re-enable the canary + LB health check (proves HAProxy routes to
#    the node again).  A non-200 through the LB triggers rollback.
# --------------------------------------------------------------------
log "step 5 : re-enable $CANARY_NODE in HAProxy"
ha_ready "$CANARY_NODE"
sleep 5
lb_status=$(curl_health)
if [ "$lb_status" != "200" ]; then
  log "LB health check after re-enable returned $lb_status ; rolling back"
  rollback_canary
  exit 2
fi

# --------------------------------------------------------------------
# 6. SLI monitor — keep the canary serving ; any SLO breach inside the
#    window rolls it back (exit 2).
# --------------------------------------------------------------------
log "step 6 : monitor SLI on the canary"
monitor_sli || {
  log "SLI red — rolling back the canary"
  rollback_canary
  exit 2
}
|
||
|
|
|
||
|
|
# --------------------------------------------------------------------
# 7. SLI green on the canary — repeat the drain/deploy/verify cycle on
#    every peer.  A peer failure leaves that peer drained (the canary
#    keeps serving) and exits 2 for the operator to repair.
# --------------------------------------------------------------------
log "step 7 : SLI green on canary, rolling peers : $PEER_NODES"
IFS=',' read -ra peer_list <<< "$PEER_NODES"
for node in "${peer_list[@]}"; do
  log "── peer $node ───────────────────────────"
  ha_drain "$node"
  ha_wait_drained "$node"
  deploy_to "$node" "$ARTIFACT"
  if ! verify_node_health "$node"; then
    log "$node health check failed post-deploy"
    log "WARN : leaving $node drained ; canary node still serves traffic"
    log " operator must re-deploy known-good binary or repair $node manually"
    exit 2
  fi
  ha_ready "$node"
  sleep 5
  lb_status=$(curl_health)
  if [ "$lb_status" != "200" ]; then
    log "LB health check after re-enable of $node returned $lb_status — abandoning roll"
    exit 2
  fi
done

log "PASS : canary $CANARY_NODE + peers $PEER_NODES deployed cleanly"
exit 0
|