veza/infra/ansible/tests/test_pg_failover.sh

#!/usr/bin/env bash
# test_pg_failover.sh — validate pg_auto_failover RTO < 60s.
#
# Run on the Incus host that owns the pgaf-* containers (typically
# the lab R720 with `incus list` showing all three). Assumes the
# postgres_ha playbook has been applied so the formation is healthy
# at script start — bails early otherwise.
#
# v1.0.9 Day 6 — acceptance for ROADMAP_V1.0_LAUNCH.md §Semaine 2
# day 6: kill primary, time the standby's promotion, fail when > 60s.
#
# Usage:
#   bash infra/ansible/tests/test_pg_failover.sh
#
# Exit codes:
#   0  — failover completed within RTO_TARGET_SECONDS (default 60; acceptance met)
#   1  — formation not healthy at start
#   2  — no standby promotion within RTO_TARGET_SECONDS
#   3  — required tool missing on the host
# Strict mode: abort on unhandled errors, on unset variables, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Tunables. Each one keeps a non-empty value inherited from the environment
# and otherwise falls back to the default shown here.
: "${PRIMARY_CONTAINER:=pgaf-primary}"   # container stopped to trigger the failover
: "${REPLICA_CONTAINER:=pgaf-replica}"
: "${MONITOR_CONTAINER:=pgaf-monitor}"   # container where `pg_autoctl show state` runs
: "${RTO_TARGET_SECONDS:=60}"            # promotion budget, in seconds
: "${PG_AUTO_FAILOVER_PGDATA:=/var/lib/postgresql/16/pgaf/postgres}"
: "${PG_AUTO_FAILOVER_MONITOR_PGDATA:=/var/lib/postgresql/16/pgaf/monitor}"

# log WORDS...
# Writes one "[HH:MM:SS] WORDS" line to stderr, leaving stdout untouched.
log() {
  local message="$*"
  printf '[%s] %s\n' "$(date +%H:%M:%S)" "$message" 1>&2
}
# fail MESSAGE [EXIT_CODE]
# Logs "FAIL: MESSAGE" through log and terminates the script.
#   MESSAGE    text to report
#   EXIT_CODE  process exit status; defaults to 2
# Only MESSAGE is logged, so the exit code never leaks into the log line.
fail() {
  local message=${1-} code=${2:-2}
  log "FAIL: $message"
  exit "$code"
}

# require TOOL
# Does nothing when TOOL resolves via `command -v`; otherwise hands off to
# fail with exit code 3.
require() {
  local tool=$1
  if ! command -v "$tool" >/dev/null 2>&1; then
    fail "required tool missing on host: $tool" 3
  fi
}

# Check the host-side tools up front so that a missing binary ends the run
# with exit code 3 instead of surfacing later as a misleading verdict.
for tool in incus date awk grep sed sleep; do
  require "$tool"
done

# -----------------------------------------------------------------------------
# 0. Sanity — formation must be healthy at start.
# -----------------------------------------------------------------------------

# node_in_state STATE_TABLE WANTED
#   STATE_TABLE  captured output of `pg_autoctl show state`
#   WANTED       state name to look for, e.g. primary or secondary
# Prints the first column (node name) of the first row whose last two
# `|`-separated columns both equal WANTED; prints nothing when no row
# qualifies. Whole columns are compared so that a node name or host that
# merely contains the word (the default container is pgaf-primary) is never
# mistaken for a state.
# NOTE(review): assumes the last two columns of the table are the reported
# and assigned states — confirm against the installed pg_autoctl version.
node_in_state() {
  awk -F'|' -v wanted="$2" '
    NF >= 3 {
      name = $1; reported = $(NF - 1); assigned = $NF
      gsub(/^[ \t\r]+|[ \t\r]+$/, "", name)
      gsub(/^[ \t\r]+|[ \t\r]+$/, "", reported)
      gsub(/^[ \t\r]+|[ \t\r]+$/, "", assigned)
      if (reported == wanted && assigned == wanted) { print name; exit }
    }
  ' <<<"$1"
}

log "step 0: pre-flight — formation state via monitor"
# `|| true` keeps `set -e` from aborting when pg_autoctl fails: whatever it
# printed (stderr included) is still captured and simply fails the checks
# below.
state_before=$(incus exec "$MONITOR_CONTAINER" -- sudo -u postgres \
    pg_autoctl show state --pgdata "$PG_AUTO_FAILOVER_MONITOR_PGDATA" 2>&1 || true)
log "monitor state:"
printf '%s\n' "$state_before" | sed 's/^/    /' >&2

# The node currently acting as primary; remembered so that the poll loop can
# tell "some other node took over" apart from "the old primary is still listed".
primary_node=$(node_in_state "$state_before" primary)
if [ -z "$primary_node" ]; then
  fail "no primary visible in formation state — refusing to test failover from a degraded baseline" 1
fi
if [ -z "$(node_in_state "$state_before" secondary)" ]; then
  fail "no secondary visible — failover requires a hot standby ready to take over" 1
fi

log "current primary node: $primary_node (container: $PRIMARY_CONTAINER)"

# -----------------------------------------------------------------------------
# 1. Kill primary container — simulates a hardware/process death.
# -----------------------------------------------------------------------------
log "step 1: stopping primary container ($PRIMARY_CONTAINER) — start timer"
# The timer starts before the stop, so the time `incus stop` itself takes
# counts against the RTO budget.
t0=$(date +%s)
incus stop --force "$PRIMARY_CONTAINER"

# -----------------------------------------------------------------------------
# 2. Poll the monitor until the standby is promoted.
# -----------------------------------------------------------------------------

# standby_promoted STATE_TABLE OLD_PRIMARY
#   STATE_TABLE  captured output of `pg_autoctl show state`
#   OLD_PRIMARY  node name of the primary that was just stopped
# Returns 0 when a row for a node other than OLD_PRIMARY has `primary` or
# `wait_primary` in its second-to-last `|`-separated column, 1 otherwise.
# Matching is done per column and the old primary's own row is skipped:
# that row can still contain the word "primary" (in its host, since the
# default container is pgaf-primary, or in a state it can no longer update
# once stopped), which must not hide or fake a promotion.
# NOTE(review): assumes the second-to-last column is the node's reported
# state and that wait_primary means "promoted, no standby attached yet" —
# confirm against the installed pg_auto_failover version.
standby_promoted() {
  awk -F'|' -v old="$2" '
    NF >= 3 {
      name = $1; reported = $(NF - 1)
      gsub(/^[ \t\r]+|[ \t\r]+$/, "", name)
      gsub(/^[ \t\r]+|[ \t\r]+$/, "", reported)
      if (name != old && (reported == "primary" || reported == "wait_primary")) {
        found = 1
      }
    }
    END { exit !found }
  ' <<<"$1"
}

log "step 2: polling monitor for failover (target RTO ${RTO_TARGET_SECONDS}s)"
deadline=$((t0 + RTO_TARGET_SECONDS))
promoted=0
# Pre-set so later reporting has a value under `set -u` even when the
# deadline has already passed and the loop body never runs.
state_now="(monitor was never polled: deadline passed before the first check)"
while [ "$(date +%s)" -lt "$deadline" ]; do
  # `|| true`: a transient monitor error must not abort the poll loop.
  state_now=$(incus exec "$MONITOR_CONTAINER" -- sudo -u postgres \
    pg_autoctl show state --pgdata "$PG_AUTO_FAILOVER_MONITOR_PGDATA" 2>&1 || true)

  if standby_promoted "$state_now" "$primary_node"; then
    promoted=1
    break
  fi
  sleep 1
done

# Whole seconds from just before the stop to the first poll that saw the
# promotion (or to loop exit when the deadline was reached).
t1=$(date +%s)
elapsed=$((t1 - t0))

# -----------------------------------------------------------------------------
# 3. Restart the killed container so the lab returns to a 2-node
#    formation for subsequent runs.
# -----------------------------------------------------------------------------
log "step 3: restarting $PRIMARY_CONTAINER (it'll come back as standby once it catches up)"
# Best effort: a failed restart must not replace the verdict below, but it is
# reported so the operator knows the lab is still one node short.
incus start "$PRIMARY_CONTAINER" \
  || log "WARN: could not restart $PRIMARY_CONTAINER — start it manually before the next run"

# -----------------------------------------------------------------------------
# 4. Verdict.
# -----------------------------------------------------------------------------
# Anything other than "promoted, and within the RTO budget" is a failure:
# dump the last monitor snapshot for diagnosis, then leave through fail
# (which exits with its default code, 2).
if ! { [ "$promoted" -eq 1 ] && [ "$elapsed" -le "$RTO_TARGET_SECONDS" ]; }; then
  log "post-failover state:"
  sed 's/^/    /' <<<"$state_now" >&2
  fail "no standby promotion within ${RTO_TARGET_SECONDS}s (elapsed ${elapsed}s, promoted=${promoted})"
fi

log "PASS: failover completed in ${elapsed}s (target ${RTO_TARGET_SECONDS}s)"
exit 0