#!/usr/bin/env bash
# test_pg_failover.sh — validate pg_auto_failover RTO < 60s.
#
# Run on the Incus host that owns the pgaf-* containers (typically
# the lab R720 with `incus list` showing all three). Assumes the
# postgres_ha playbook has been applied so the formation is healthy
# at script start — bails early otherwise.
#
# v1.0.9 Day 6 — acceptance for ROADMAP_V1.0_LAUNCH.md §Semaine 2
# day 6: kill primary, time the standby's promotion, fail when > 60s.
#
# Usage:
#   bash infra/ansible/tests/test_pg_failover.sh
#
# Environment overrides (all optional):
#   PRIMARY_CONTAINER, REPLICA_CONTAINER, MONITOR_CONTAINER,
#   RTO_TARGET_SECONDS, PG_AUTO_FAILOVER_PGDATA,
#   PG_AUTO_FAILOVER_MONITOR_PGDATA
#
# Exit codes:
#   0 — failover happened within the RTO target (acceptance met)
#   1 — formation not healthy at start
#   2 — failover did not happen within the RTO target
#   3 — required tool missing on the host
#   4 — invalid configuration (RTO_TARGET_SECONDS is not an integer)

set -euo pipefail

PRIMARY_CONTAINER=${PRIMARY_CONTAINER:-pgaf-primary}
# REPLICA_CONTAINER and PG_AUTO_FAILOVER_PGDATA are not read anywhere in this
# script; they are kept so existing invocations that set them keep working.
REPLICA_CONTAINER=${REPLICA_CONTAINER:-pgaf-replica}
MONITOR_CONTAINER=${MONITOR_CONTAINER:-pgaf-monitor}
RTO_TARGET_SECONDS=${RTO_TARGET_SECONDS:-60}
PG_AUTO_FAILOVER_PGDATA=${PG_AUTO_FAILOVER_PGDATA:-/var/lib/postgresql/16/pgaf/postgres}
PG_AUTO_FAILOVER_MONITOR_PGDATA=${PG_AUTO_FAILOVER_MONITOR_PGDATA:-/var/lib/postgresql/16/pgaf/monitor}

# log <words...> — timestamped diagnostic line on stderr.
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }

# fail <message> [exit_code] — log the message and exit (default code 2).
# Only $1 is logged so the exit code does not leak into the message.
fail() {
  log "FAIL: $1"
  exit "${2:-2}"
}

# require <tool> — exit 3 unless <tool> is on PATH.
require() {
  command -v "$1" >/dev/null 2>&1 || fail "required tool missing on host: $1" 3
}

# log_indented <text> — echo a multi-line blob to stderr, indented.
log_indented() { sed 's/^/ /' <<<"$1" >&2; }

# show_state — print the monitor's `pg_autoctl show state` table
# (stdout and stderr merged). Never fails: while the formation is mid-failover
# the command may error, and the caller just polls again.
show_state() {
  incus exec "$MONITOR_CONTAINER" -- sudo -u postgres \
    pg_autoctl show state --pgdata "$PG_AUTO_FAILOVER_MONITOR_PGDATA" 2>&1 || true
}

# state_rows — read a `show state` table on stdin and print one
# "name<TAB>reported<TAB>assigned" line per node.
# The two state columns are taken as the LAST two '|'-separated fields so a
# node name that itself ends in "primary"/"secondary" cannot be mistaken for
# a state. NOTE(review): assumes reported/assigned state are the last two
# columns of the table — confirm against the deployed pg_autoctl version.
state_rows() {
  awk -F'|' '
    NF >= 3 {
      name = $1; reported = $(NF - 1); assigned = $NF
      gsub(/^[ \t\r]+|[ \t\r]+$/, "", name)
      gsub(/^[ \t\r]+|[ \t\r]+$/, "", reported)
      gsub(/^[ \t\r]+|[ \t\r]+$/, "", assigned)
      if (name == "" || name == "Name") next   # skip the header row
      printf "%s\t%s\t%s\n", name, reported, assigned
    }
  '
}

# first_node_in_state <rows> <reported> <assigned> — print the name of the
# first node whose reported AND assigned states match exactly (empty if none).
first_node_in_state() {
  awk -F'\t' -v r="$2" -v a="$3" \
    '$2 == r && $3 == a && !seen { print $1; seen = 1 }' <<<"$1"
}

# promoted_node <rows> <old_primary> — print the name of the first node OTHER
# than <old_primary> that reports itself as primary or wait_primary.
# The old primary's own row is deliberately not consulted: a stopped node
# presumably cannot report a new state, so its last reported state may stay
# "primary" on the monitor — TODO confirm against a live formation.
promoted_node() {
  awk -F'\t' -v old="$2" \
    '$1 != old && ($2 == "primary" || $2 == "wait_primary") && !seen { print $1; seen = 1 }' \
    <<<"$1"
}

# restore_primary — EXIT handler: if the primary container was stopped and
# not yet restarted (abort, Ctrl-C, unexpected error), bring it back so the
# lab is not left without its node. Best-effort; never changes the exit code.
primary_stopped=0
restore_primary() {
  if [[ "$primary_stopped" -eq 1 ]]; then
    primary_stopped=0
    if ! incus start "$PRIMARY_CONTAINER"; then
      log "WARN: could not restart $PRIMARY_CONTAINER — start it manually"
    fi
  fi
}
trap restore_primary EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

require incus
require date
require awk
require sed

# Validate before touching the formation: a bad value must not be discovered
# only after the primary has already been stopped.
[[ "$RTO_TARGET_SECONDS" =~ ^[0-9]+$ ]] \
  || fail "RTO_TARGET_SECONDS must be a non-negative integer (got: $RTO_TARGET_SECONDS)" 4
RTO_TARGET_SECONDS=$((10#$RTO_TARGET_SECONDS)) # strip leading zeros (avoid octal)

# -----------------------------------------------------------------------------
# 0. Sanity — formation must be healthy at start.
# -----------------------------------------------------------------------------
log "step 0: pre-flight — formation state via monitor"
state_before=$(show_state)
log "monitor state:"
log_indented "$state_before"

rows_before=$(state_rows <<<"$state_before")

primary_node=$(first_node_in_state "$rows_before" primary primary)
if [[ -z "$primary_node" ]]; then
  fail "no primary visible in formation state — refusing to test failover from a degraded baseline" 1
fi

secondary_node=$(first_node_in_state "$rows_before" secondary secondary)
if [[ -z "$secondary_node" ]]; then
  fail "no secondary visible — failover requires a hot standby ready to take over" 1
fi

# NOTE(review): nothing here verifies that $PRIMARY_CONTAINER actually hosts
# $primary_node. After an earlier run the roles may have swapped, in which
# case this would stop the standby instead — confirm before re-running.
log "current primary node: $primary_node (container: $PRIMARY_CONTAINER)"

# -----------------------------------------------------------------------------
# 1. Kill primary container — simulates a hardware/process death.
# -----------------------------------------------------------------------------
log "step 1: stopping primary container ($PRIMARY_CONTAINER) — start timer"
t0=$(date +%s)
incus stop --force "$PRIMARY_CONTAINER"
primary_stopped=1

# -----------------------------------------------------------------------------
# 2. Poll the monitor until the standby is promoted.
# -----------------------------------------------------------------------------
log "step 2: polling monitor for failover (target RTO ${RTO_TARGET_SECONDS}s)"
deadline=$((t0 + RTO_TARGET_SECONDS))
promoted=0
state_now="" # initialised so the verdict is safe under `set -u` if the loop never runs
new_primary=""
while (( $(date +%s) < deadline )); do
  state_now=$(show_state)
  new_primary=$(promoted_node "$(state_rows <<<"$state_now")" "$primary_node")
  if [[ -n "$new_primary" ]]; then
    promoted=1
    break
  fi
  sleep 1
done
t1=$(date +%s)
elapsed=$((t1 - t0))

# -----------------------------------------------------------------------------
# 3. Restart the killed container so the lab returns to a 2-node
#    formation for subsequent runs.
# -----------------------------------------------------------------------------
log "step 3: restarting $PRIMARY_CONTAINER (it'll come back as standby once it catches up)"
restore_primary

# -----------------------------------------------------------------------------
# 4. Verdict.
# -----------------------------------------------------------------------------
if [[ "$promoted" -eq 1 && "$elapsed" -le "$RTO_TARGET_SECONDS" ]]; then
  log "new primary node: $new_primary"
  log "PASS: failover completed in ${elapsed}s (target ${RTO_TARGET_SECONDS}s)"
  exit 0
fi

log "post-failover state:"
log_indented "$state_now"
fail "no standby promotion within ${RTO_TARGET_SECONDS}s (elapsed ${elapsed}s, promoted=${promoted})"