#!/usr/bin/env bash # game-day-driver.sh — orchestrate the W5 Day 22 game-day exercise. # # Walks the 5 failure scenarios in sequence, captures stdout/stderr + # exit code per scenario, writes a session report under # docs/runbooks/game-days/-game-day-driver.log, and prints a # summary table at the end. # # v1.0.9 W5 Day 22. # # Scenarios (mapped to existing smoke tests) : # A : test_pg_failover.sh — kill Postgres primary, RTO < 60s # B : test_backend_failover.sh — kill backend-api 1, HAProxy bascule # C : test_redis_failover.sh — kill Redis master, Sentinel promote # D : test_minio_resilience.sh — kill 2 MinIO nodes, EC:2 reconstructs # E : test_rabbitmq_outage.sh — stop RabbitMQ 60s, backend stays up # # Usage : # bash scripts/security/game-day-driver.sh # all scenarios on staging (default) # SKIP=DE bash scripts/security/game-day-driver.sh # skip D + E # ONLY=A bash scripts/security/game-day-driver.sh # only A # INVENTORY=prod CONFIRM_PROD=1 bash scripts/security/game-day-driver.sh # prod (gated) # # Required env (passed through to the underlying smoke tests) : # REDIS_PASS / SENTINEL_PASS for scenario C # MINIO_ROOT_USER / MINIO_ROOT_PASSWORD for scenario D # # v1.0.10 polish — production gating : # INVENTORY=prod must be paired with CONFIRM_PROD=1 or the script # refuses to run, so a stale shell-history line can't accidentally # kill prod Postgres on a Monday morning. The driver also runs a # backup-freshness pre-flight when targeting prod (most recent # pgBackRest backup must be < 24 h old). # # Exit codes : # 0 — every selected scenario passed # 1 — at least one scenario failed # 2 — runner pre-flight failed (script missing, prod safety guard tripped, stale backup, etc.) set -euo pipefail REPO_ROOT="$(cd "$(dirname "$0")/../.." 
&& pwd)" TESTS_DIR="$REPO_ROOT/infra/ansible/tests" LOGS_DIR="$REPO_ROOT/docs/runbooks/game-days" SESSION_DATE="$(date +%Y-%m-%d-%H%M)" SESSION_LOG="$LOGS_DIR/$SESSION_DATE-game-day-driver.log" mkdir -p "$LOGS_DIR" : > "$SESSION_LOG" ONLY=${ONLY:-} SKIP=${SKIP:-} INVENTORY=${INVENTORY:-staging} CONFIRM_PROD=${CONFIRM_PROD:-0} SKIP_BACKUP_FRESHNESS=${SKIP_BACKUP_FRESHNESS:-0} log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" | tee -a "$SESSION_LOG" >&2; } fail() { log "FAIL: $*"; exit "${2:-2}"; } declare -A SCENARIO_SCRIPT=( [A]="$TESTS_DIR/test_pg_failover.sh" [B]="$TESTS_DIR/test_backend_failover.sh" [C]="$TESTS_DIR/test_redis_failover.sh" [D]="$TESTS_DIR/test_minio_resilience.sh" [E]="$TESTS_DIR/test_rabbitmq_outage.sh" ) declare -A SCENARIO_DESC=( [A]="Postgres primary failover RTO < 60s" [B]="HAProxy backend-api 1 fail-over" [C]="Redis Sentinel master promotion" [D]="MinIO 2-node loss EC:2 reconstruction" [E]="RabbitMQ outage backend stays up" ) SCENARIOS=(A B C D E) want() { local s=$1 if [ -n "$ONLY" ] && [[ "$ONLY" != *"$s"* ]]; then return 1; fi if [ -n "$SKIP" ] && [[ "$SKIP" == *"$s"* ]]; then return 1; fi return 0 } # v1.0.10 polish — prod safety gate. INVENTORY=prod requires # CONFIRM_PROD=1 + an interactive type-the-word confirm. Anything else # defaults to staging so a forgotten env-var doesn't matter. case "$INVENTORY" in staging|stg|dev|local) ;; prod|production) if [ "$CONFIRM_PROD" != "1" ]; then cat >&2 < 24 h old. Recovery from a stale backup # can extend an outage from minutes to hours, so the cost of # postponing the game day is much less than the cost of compounded # data loss if scenario A fails to recover and we have to restore # from yesterday-but-one. 
# Backup-freshness pre-flight : refuse to start unless the most recent
# pgBackRest backup (stanza "veza") finished less than 24 h ago.
#   SKIP_BACKUP_FRESHNESS=1 bypasses the whole check.
#   A missing pgbackrest CLI only produces a warning.
#   An unreadable / empty backup list, or a backup older than 24 h,
#   aborts through fail (exit 2).
if [ "$SKIP_BACKUP_FRESHNESS" != "1" ]; then
  if command -v pgbackrest >/dev/null 2>&1; then
    # The Python helper prints the newest backup's stop timestamp (epoch
    # seconds), or 0 when the JSON is missing, empty or unparseable.
    #
    # `|| true` rather than `|| echo 0` : under `set -o pipefail` a failing
    # pgbackrest makes the whole pipeline fail even though Python has
    # already printed 0. A second "0" would turn the captured value into
    # "0<newline>0" and blow up the arithmetic below with a syntax error
    # instead of reaching the explicit fail message.
    last_backup_ts=$(pgbackrest --stanza=veza info --output=json 2>/dev/null \
      | python3 -c "
import json, sys
try:
    data = json.load(sys.stdin)
    backups = data[0]['backup'] if data else []
    if not backups:
        print(0); sys.exit(0)
    print(max(b['timestamp']['stop'] for b in backups))
except Exception:
    print(0)
" 2>/dev/null || true)

    # Validate BEFORE any arithmetic : empty output (python3 missing),
    # non-numeric output and the 0 sentinel all mean "could not determine
    # the last backup time".
    # fail is called without an explicit exit code : it defaults to 2 and
    # logs "$*", so passing the code would append it to the message.
    if ! [[ "$last_backup_ts" =~ ^[0-9]+$ ]] || [ "$last_backup_ts" -eq 0 ]; then
      fail "pgBackRest backup-freshness check failed : could not parse 'pgbackrest info'. Set SKIP_BACKUP_FRESHNESS=1 to override (only after manually verifying a recent backup exists)."
    fi

    now_ts=$(date +%s)
    age_seconds=$(( now_ts - last_backup_ts ))

    # 86400 s = 24 h freshness threshold.
    if [ "$age_seconds" -gt 86400 ]; then
      age_hours=$(( age_seconds / 3600 ))
      fail "pgBackRest most recent backup is ${age_hours}h old (threshold 24h). Run a backup before the game day, or set SKIP_BACKUP_FRESHNESS=1 if you've validated freshness another way."
    fi

    log "pre-flight : pgBackRest most recent backup is $(( age_seconds / 3600 ))h $(( (age_seconds % 3600) / 60 ))m old (< 24h threshold) — OK"
  else
    log "WARN : pgbackrest CLI not on \$PATH ; skipping backup-freshness check. Set SKIP_BACKUP_FRESHNESS=1 to silence this warning if intentional."
  fi
fi

# Final type-the-word confirm. Everything above can be set in env
# by mistake ; this last step requires a human at the keyboard.
cat >&2 <&1 | tee -a "$SESSION_LOG" rc=${PIPESTATUS[0]} set -e elapsed=$(( $(date +%s) - t0 )) SCENARIO_DURATION[$s]="${elapsed}s" if [ "$rc" -eq 0 ]; then SCENARIO_RESULT[$s]="PASS" log "scenario $s : PASS in ${elapsed}s" else SCENARIO_RESULT[$s]="FAIL (exit $rc)" log "scenario $s : FAIL (exit $rc) after ${elapsed}s" fi done log "" log "================================================================" log "Session summary" log "----------------------------------------------------------------" printf '%-3s | %-12s | %-8s | %s\n' "ID" "result" "duration" "scenario" | tee -a "$SESSION_LOG" >&2 printf '%-3s-+-%-12s-+-%-8s-+-%s\n' "---" "------------" "--------" "$(printf '%.0s-' {1..50})" | tee -a "$SESSION_LOG" >&2 overall=0 for s in "${SCENARIOS[@]}"; do result=${SCENARIO_RESULT[$s]} duration=${SCENARIO_DURATION[$s]} printf '%-3s | %-12s | %-8s | %s\n' "$s" "$result" "$duration" "${SCENARIO_DESC[$s]}" \ | tee -a "$SESSION_LOG" >&2 if [[ "$result" == "FAIL"* ]]; then overall=1; fi done log "================================================================" log "" log "Operator next steps :" log " 1. Open the runbook template :" log " docs/runbooks/game-days/$SESSION_DATE.md" log " (copy from docs/runbooks/game-days/TEMPLATE.md if missing)" log " 2. For each scenario, fill : timestamp, action, observation," log " runbook used, gap discovered." log " 3. File one PR per gap that needs a code or runbook fix." log "" if [ "$overall" -eq 0 ]; then log "PASS : every selected scenario passed." else log "FAIL : at least one scenario failed — review $SESSION_LOG." fi exit "$overall"