veza/infra/ansible/tests/test_backend_failover.sh
senke a9541f517b
feat(infra): haproxy sticky WS + backend_api multi-instance scaffold (W4 Day 19)
Phase-1 of the active/active backend story. HAProxy sits in front of two
backend-api containers and two stream-server containers; a sticky cookie
pins WS sessions to one backend, and URI hashing routes each track_id to
one streamer for HLS cache locality.

Day 19 acceptance: kill backend-api-1, HAProxy fails over, and WS
sessions reconnect to backend-api-2 without loss. The smoke test wires
that gate; phase-2 (W5) will add keepalived for an LB pair.

- infra/ansible/roles/haproxy/
  * Install HAProxy + render haproxy.cfg with frontend (HTTP, optional
    HTTPS via haproxy_tls_cert_path), api_pool (round-robin + sticky
    cookie SERVERID), stream_pool (URI-hash + consistent jump-hash).
  * Active health check GET /api/v1/health every 5s; fall=3, rise=2;
    on-marked-down shutdown-sessions, slowstart 30s on recovery.
  * Stats socket bound to 127.0.0.1:9100 for the future prometheus
    haproxy_exporter sidecar.
  * Mozilla Intermediate TLS cipher list; only effective when a cert
    is mounted.
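
  The rendered pools might look roughly like this (a sketch only:
  server names, ports, and the .lxd addresses are illustrative, not the
  template's actual output):

```
backend api_pool
    balance roundrobin
    # Sticky cookie: pins a WS session to one backend until it dies;
    # then the cookie is ignored and the request rebalances.
    cookie SERVERID insert indirect nocache
    option httpchk GET /api/v1/health
    default-server inter 5s fall 3 rise 2 slowstart 30s on-marked-down shutdown-sessions
    server backend-api-1 backend-api-1.lxd:8080 check cookie backend-api-1
    server backend-api-2 backend-api-2.lxd:8080 check cookie backend-api-2

backend stream_pool
    # URI hash + consistent hashing: a given track_id keeps landing on
    # the same streamer, preserving HLS cache locality.
    balance uri
    hash-type consistent
    server stream-server-1 stream-server-1.lxd:8081 check
    server stream-server-2 stream-server-2.lxd:8081 check
```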

- infra/ansible/roles/backend_api/
  * Scaffolding for the multi-instance Go API. Creates veza-api
    system user, /opt/veza/backend-api dir, /etc/veza env dir,
    /var/log/veza, and a hardened systemd unit pointing at the binary.
  * Binary deployment is OUT of scope (documented in README) — the
    Go binary is built outside Ansible (Makefile target) and pushed
    via incus file push. CI → ansible-pull integration lands in W5+.
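
  The hardened unit could look roughly like this (unit name, env file
  name, and the exact hardening directives are illustrative; only the
  user and paths come from the role description above):

```
# /etc/systemd/system/veza-api.service — illustrative sketch
[Unit]
Description=Veza backend API
After=network-online.target
Wants=network-online.target

[Service]
User=veza-api
Group=veza-api
EnvironmentFile=-/etc/veza/backend-api.env
ExecStart=/opt/veza/backend-api/backend-api
Restart=on-failure
RestartSec=2
# Hardening (illustrative subset)
NoNewPrivileges=yes
ProtectSystem=strict
ProtectHome=yes
PrivateTmp=yes
ReadWritePaths=/var/log/veza

[Install]
WantedBy=multi-user.target
```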

- infra/ansible/playbooks/haproxy.yml: provisions the haproxy Incus
  container, then applies the common baseline and the role.

- infra/ansible/inventory/lab.yml: 3 new groups:
  * haproxy (single LB node)
  * backend_api_instances (backend-api-{1,2})
  * stream_server_instances (stream-server-{1,2})
  The HAProxy template reads these groups directly to populate its
  upstream blocks; it falls back to the static haproxy_backend_api_fallback
  list if the group is missing (for in-isolation tests).
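
  A sketch of the three groups in lab.yml (hostnames match the lab
  containers named above; any connection vars are omitted and the exact
  layout of the real inventory may differ):

```
haproxy:
  hosts:
    haproxy:
backend_api_instances:
  hosts:
    backend-api-1:
    backend-api-2:
stream_server_instances:
  hosts:
    stream-server-1:
    stream-server-2:
```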

- infra/ansible/tests/test_backend_failover.sh
  * step 0: pre-flight — both backends UP per HAProxy stats socket.
  * step 1: 5 baseline GET /api/v1/health through the LB → all 200.
  * step 2: incus stop --force backend-api-1; record t0.
  * step 3: poll HAProxy stats until backend-api-1 is DOWN
    (timeout 30s; expected ~15s = fall × interval).
  * step 4: 5 GET requests during the down window — all must return 200
    (served by backend-api-2). Fails if any returns non-200.
  * step 5: incus start backend-api-1; poll until UP again.
Acceptance (Day 19): smoke test passes; HAProxy's sticky cookie
keeps WS sessions on the same backend until that backend dies, at
which point the cookie is ignored and the request rebalances.

W4 progress : Day 16 done · Day 17 done · Day 18 done · Day 19 done ·
Day 20 (k6 nightly load test) pending.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 11:32:48 +02:00


#!/usr/bin/env bash
# test_backend_failover.sh — verify HAProxy fails over from backend-api-1
# to backend-api-2 when the first instance dies, with no client-visible
# error window beyond the health-check fall.
#
# Sequence:
#   1. Pre-flight: both backends UP per HAProxy stats.
#   2. Issue 5 GET /api/v1/health through HAProxy; all should return 200.
#      Capture the SERVERID cookie to know which backend was chosen.
#   3. incus stop --force backend-api-1 (or whichever backend the cookie pinned).
#   4. Poll HAProxy stats until the killed server is marked DOWN
#      (typically within fall × interval = 3 × 5 s = 15 s).
#   5. Issue another 5 GET /api/v1/health; all must return 200, served
#      by the surviving backend.
#   6. incus start backend-api-1; poll until UP again.
#
# v1.0.9 W4 Day 19 — acceptance for the verification gate.
#
# Usage:
#   bash infra/ansible/tests/test_backend_failover.sh
#
# Exit codes:
#   0 — failover happened, no errors during the window
#   1 — pool not healthy at start
#   2 — failover took too long OR errors observed during the window
#   3 — required tool missing
set -euo pipefail
HAPROXY_HOST=${HAPROXY_HOST:-haproxy.lxd}
HAPROXY_PORT=${HAPROXY_PORT:-80}
KILL_BACKEND=${KILL_BACKEND:-backend-api-1}
SURVIVING_BACKEND=${SURVIVING_BACKEND:-backend-api-2}
HEALTH_PATH=${HEALTH_PATH:-/api/v1/health}
DOWN_TIMEOUT_SECONDS=${DOWN_TIMEOUT_SECONDS:-30}
UP_TIMEOUT_SECONDS=${UP_TIMEOUT_SECONDS:-60}
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
fail() { log "FAIL: $*"; exit "${2:-2}"; }
require() {
  command -v "$1" >/dev/null 2>&1 || fail "required tool missing on host: $1" 3
}
require incus
require curl
require date
# -----------------------------------------------------------------------------
# Helper: ask the HAProxy admin socket for a server's status (UP / DOWN /
# DRAIN / MAINT). The socket is bound to loopback inside the haproxy container.
# -----------------------------------------------------------------------------
server_status() {
  local server=$1
  incus exec haproxy -- bash -c \
    "echo 'show stat' | socat /run/haproxy/admin.sock - \
    | awk -F, -v s=\"$server\" '\$2 == s {print \$18; exit}'"
}
# GET the health endpoint through the LB; echo the HTTP status code,
# or 000 if curl itself failed (timeout, connection refused, ...).
curl_via_lb() {
  local code
  code=$(curl --max-time 5 -sS -o /dev/null -w "%{http_code}" \
    "http://${HAPROXY_HOST}:${HAPROXY_PORT}${HEALTH_PATH}" || echo 000)
  echo "$code"
}
# -----------------------------------------------------------------------------
# 1. Pre-flight — both backends must be UP.
# -----------------------------------------------------------------------------
log "step 0: pre-flight — querying HAProxy admin socket"
status_kill=$(server_status "$KILL_BACKEND")
status_survive=$(server_status "$SURVIVING_BACKEND")
log "  $KILL_BACKEND : $status_kill"
log "  $SURVIVING_BACKEND : $status_survive"
if [ "$status_kill" != "UP" ] || [ "$status_survive" != "UP" ]; then
  fail "pool not fully UP at start — refusing to test from a degraded baseline" 1
fi
# -----------------------------------------------------------------------------
# 2. Sanity — 5 successful requests through the LB.
# -----------------------------------------------------------------------------
log "step 1: 5 baseline requests through HAProxy"
for i in 1 2 3 4 5; do
  code=$(curl_via_lb)
  log "  request $i → HTTP $code"
  if [ "$code" != "200" ]; then
    fail "baseline request $i returned HTTP $code, want 200" 1
  fi
done
# -----------------------------------------------------------------------------
# 3. Kill the backend container.
# -----------------------------------------------------------------------------
log "step 2: stopping $KILL_BACKEND — start failover timer"
t0=$(date +%s)
incus stop --force "$KILL_BACKEND"
# -----------------------------------------------------------------------------
# 4. Poll until HAProxy marks the killed server DOWN.
# -----------------------------------------------------------------------------
log "step 3: polling HAProxy until $KILL_BACKEND is DOWN (timeout ${DOWN_TIMEOUT_SECONDS}s)"
deadline=$((t0 + DOWN_TIMEOUT_SECONDS))
killed_down=0
while [ "$(date +%s)" -lt "$deadline" ]; do
  s=$(server_status "$KILL_BACKEND")
  if [ "$s" = "DOWN" ] || [ "$s" = "MAINT" ]; then
    killed_down=1
    break
  fi
  sleep 1
done
elapsed=$(( $(date +%s) - t0 ))
if [ "$killed_down" -eq 0 ]; then
  fail "$KILL_BACKEND not marked DOWN within ${DOWN_TIMEOUT_SECONDS}s" 2
fi
log " $KILL_BACKEND went DOWN in ${elapsed}s"
# -----------------------------------------------------------------------------
# 5. 5 requests through the LB — all must succeed via the surviving backend.
# -----------------------------------------------------------------------------
log "step 4: 5 requests through HAProxy with $KILL_BACKEND down"
errors=0
for i in 1 2 3 4 5; do
  code=$(curl_via_lb)
  log "  request $i → HTTP $code"
  if [ "$code" != "200" ]; then
    errors=$((errors + 1))
  fi
done
if [ "$errors" -gt 0 ]; then
  fail "$errors of 5 requests failed during failover — survivor isn't catching all traffic" 2
fi
# -----------------------------------------------------------------------------
# 6. Restart the killed backend and confirm it rejoins as UP.
# -----------------------------------------------------------------------------
log "step 5: restarting $KILL_BACKEND"
incus start "$KILL_BACKEND" || true
log " polling until $KILL_BACKEND is UP again (timeout ${UP_TIMEOUT_SECONDS}s)"
deadline=$(( $(date +%s) + UP_TIMEOUT_SECONDS ))
recovered=0
while [ "$(date +%s)" -lt "$deadline" ]; do
  s=$(server_status "$KILL_BACKEND")
  if [ "$s" = "UP" ]; then
    recovered=1
    break
  fi
  sleep 2
done
if [ "$recovered" -eq 0 ]; then
  log "WARN: $KILL_BACKEND did not return to UP within ${UP_TIMEOUT_SECONDS}s — manual check needed"
else
  log "  $KILL_BACKEND back UP"
fi
log "PASS: HAProxy fail-over OK ($KILL_BACKEND down in ${elapsed}s, no client-visible errors during the window)"
exit 0