#!/usr/bin/env bash
# test_backend_failover.sh — verify HAProxy fails over from backend-api-1
# to backend-api-2 when the first instance dies, with no client-visible
# error window beyond the health-check fall.
#
# Sequence:
#   1. Pre-flight: both backends UP per HAProxy stats.
#   2. Issue 5 GET /api/v1/health through HAProxy; all should return 200.
#      (The SERVERID cookie is not inspected; the backend to kill is
#      selected with KILL_BACKEND.)
#   3. incus stop --force $KILL_BACKEND (default backend-api-1).
#   4. Poll HAProxy stats until the killed server is marked DOWN
#      (typically within fall × interval = 3 × 5 s = 15 s).
#   5. Issue another 5 GET /api/v1/health; all must return 200, served
#      by the surviving backend.
#   6. incus start $KILL_BACKEND; poll until UP again.
#
# v1.0.9 W4 Day 19 — acceptance for the verification gate.
#
# Usage:
#   bash infra/ansible/tests/test_backend_failover.sh
#
# Environment overrides (defaults in parentheses):
#   HAPROXY_HOST (haproxy.lxd), HAPROXY_PORT (80),
#   KILL_BACKEND (backend-api-1), SURVIVING_BACKEND (backend-api-2),
#   HEALTH_PATH (/api/v1/health),
#   DOWN_TIMEOUT_SECONDS (30), UP_TIMEOUT_SECONDS (60)
#
# Exit codes:
#   0 — failover happened, no errors during the window
#   1 — pool not healthy at start (a backend is not UP, the admin socket
#       could not be queried, or a baseline request failed)
#   2 — failover took too long OR errors observed during the window
#   3 — required tool missing
set -euo pipefail

HAPROXY_HOST=${HAPROXY_HOST:-haproxy.lxd}
HAPROXY_PORT=${HAPROXY_PORT:-80}
KILL_BACKEND=${KILL_BACKEND:-backend-api-1}
SURVIVING_BACKEND=${SURVIVING_BACKEND:-backend-api-2}
HEALTH_PATH=${HEALTH_PATH:-/api/v1/health}
DOWN_TIMEOUT_SECONDS=${DOWN_TIMEOUT_SECONDS:-30}
UP_TIMEOUT_SECONDS=${UP_TIMEOUT_SECONDS:-60}

# log MESSAGE... — timestamped diagnostic line on stderr.
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }

# fail MESSAGE [EXIT_CODE] — log MESSAGE and exit (default exit code 2).
# Only $1 is logged so the numeric exit code does not leak into the message.
fail() {
  log "FAIL: $1"
  exit "${2:-2}"
}

# require TOOL — exit 3 unless TOOL is available on the host.
require() {
  command -v "$1" >/dev/null 2>&1 ||
    fail "required tool missing on host: $1" 3
}

require incus
require curl
require date

# -----------------------------------------------------------------------------
# Helper: ask the HAProxy admin socket for a server's status (UP / DOWN /
# DRAIN / MAINT). The socket lives inside the haproxy container, so the query
# runs there through `incus exec`.
#
# Arguments: $1 — server name as it appears in column 2 of `show stat`.
# Outputs:   column 18 of the first matching CSV row on stdout
#            (empty when no row matches).
# Returns:   exit status of `incus exec`.
# -----------------------------------------------------------------------------
server_status() {
  local server=$1
  # The server name is handed to the inner shell as $1 and on to awk via -v,
  # rather than being spliced into the command string.
  # shellcheck disable=SC2016 # expansions are meant for the inner shell
  incus exec haproxy -- bash -c \
    'echo "show stat" | socat /run/haproxy/admin.sock - | awk -F, -v s="$1" "\$2 == s {print \$18; exit}"' \
    _ "$server"
}

# -----------------------------------------------------------------------------
# Helper: one GET of HEALTH_PATH through the load balancer.
#
# Outputs: the HTTP status code on stdout, or 000 when curl itself fails
#          (timeout, connection refused, ...).
# -----------------------------------------------------------------------------
curl_via_lb() {
  local code
  # Assign the fallback instead of echoing it: curl's -w already prints 000
  # on a transport failure, so appending another 000 would yield "000000".
  code=$(curl --max-time 5 -sS -o /dev/null -w '%{http_code}' \
    "http://${HAPROXY_HOST}:${HAPROXY_PORT}${HEALTH_PATH}") || code=000
  printf '%s\n' "$code"
}

# -----------------------------------------------------------------------------
# Cleanup: if the run aborts while the backend is stopped, start it again so
# the pool is not left degraded (the next run's pre-flight would refuse it).
# The trap does not call exit, so the script's exit status is preserved.
# -----------------------------------------------------------------------------
backend_stopped=0
restore_backend() {
  if [ "$backend_stopped" -eq 1 ]; then
    log "cleanup: restarting $KILL_BACKEND after aborted run"
    # Best-effort: nothing more can be done here if the start fails.
    incus start "$KILL_BACKEND" >/dev/null 2>&1 || true
  fi
}
trap restore_backend EXIT

# -----------------------------------------------------------------------------
# 1. Pre-flight — both backends must be UP.
# -----------------------------------------------------------------------------
log "step 0: pre-flight — querying HAProxy admin socket"
status_kill=$(server_status "$KILL_BACKEND") ||
  fail "could not query HAProxy admin socket for $KILL_BACKEND" 1
status_survive=$(server_status "$SURVIVING_BACKEND") ||
  fail "could not query HAProxy admin socket for $SURVIVING_BACKEND" 1
log " $KILL_BACKEND : $status_kill"
log " $SURVIVING_BACKEND : $status_survive"
if [ "$status_kill" != "UP" ] || [ "$status_survive" != "UP" ]; then
  fail "pool not fully UP at start — refusing to test from a degraded baseline" 1
fi

# -----------------------------------------------------------------------------
# 2. Sanity — 5 successful requests through the LB.
# -----------------------------------------------------------------------------
log "step 1: 5 baseline requests through HAProxy"
for i in 1 2 3 4 5; do
  code=$(curl_via_lb)
  log " request $i → HTTP $code"
  if [ "$code" != "200" ]; then
    fail "baseline request $i returned HTTP $code, want 200" 1
  fi
done

# -----------------------------------------------------------------------------
# 3. Kill the backend container.
# -----------------------------------------------------------------------------
log "step 2: stopping $KILL_BACKEND — start failover timer"
t0=$(date +%s)
# Arm the cleanup before stopping so an abort mid-stop is also covered.
backend_stopped=1
incus stop --force "$KILL_BACKEND"

# -----------------------------------------------------------------------------
# 4. Poll until HAProxy marks the killed server DOWN.
# -----------------------------------------------------------------------------
log "step 3: polling HAProxy until $KILL_BACKEND is DOWN (timeout ${DOWN_TIMEOUT_SECONDS}s)"
deadline=$((t0 + DOWN_TIMEOUT_SECONDS))
killed_down=0
while [ "$(date +%s)" -lt "$deadline" ]; do
  # A failed query counts as "status unknown": keep polling until the
  # deadline instead of letting set -e abort without a FAIL message.
  s=$(server_status "$KILL_BACKEND") || s=""
  if [ "$s" = "DOWN" ] || [ "$s" = "MAINT" ]; then
    killed_down=1
    break
  fi
  sleep 1
done
elapsed=$(($(date +%s) - t0))
if [ "$killed_down" -eq 0 ]; then
  fail "$KILL_BACKEND not marked DOWN within ${DOWN_TIMEOUT_SECONDS}s" 2
fi
log " $KILL_BACKEND went DOWN in ${elapsed}s"

# -----------------------------------------------------------------------------
# 5. 5 requests through the LB — all must succeed via the surviving backend.
# -----------------------------------------------------------------------------
log "step 4: 5 requests through HAProxy with $KILL_BACKEND down"
errors=0
for i in 1 2 3 4 5; do
  code=$(curl_via_lb)
  log " request $i → HTTP $code"
  if [ "$code" != "200" ]; then
    errors=$((errors + 1))
  fi
done
if [ "$errors" -gt 0 ]; then
  fail "$errors of 5 requests failed during failover — survivor isn't catching all traffic" 2
fi

# -----------------------------------------------------------------------------
# 6. Restart the killed backend and confirm it rejoins as UP.
# -----------------------------------------------------------------------------
log "step 5: restarting $KILL_BACKEND"
# Deliberately best-effort: a failed start surfaces as the WARN below.
incus start "$KILL_BACKEND" || true
# The restart has been attempted; the EXIT trap must not repeat it.
backend_stopped=0
log " polling until $KILL_BACKEND is UP again (timeout ${UP_TIMEOUT_SECONDS}s)"
deadline=$(($(date +%s) + UP_TIMEOUT_SECONDS))
recovered=0
while [ "$(date +%s)" -lt "$deadline" ]; do
  s=$(server_status "$KILL_BACKEND") || s=""
  if [ "$s" = "UP" ]; then
    recovered=1
    break
  fi
  sleep 2
done
if [ "$recovered" -eq 0 ]; then
  # Recovery is reported but does not fail the run: the failover itself passed.
  log "WARN: $KILL_BACKEND did not return to UP within ${UP_TIMEOUT_SECONDS}s — manual check needed"
else
  log " $KILL_BACKEND back UP"
fi

log "PASS: HAProxy fail-over OK ($KILL_BACKEND down in ${elapsed}s, no client-visible errors during the window)"
exit 0