#!/usr/bin/env bash
# game-day-driver.sh — orchestrate the W5 Day 22 game-day exercise.
#
# Walks the 5 failure scenarios in sequence, captures stdout/stderr +
# exit code per scenario, writes a session report under
# docs/runbooks/game-days/<YYYY-MM-DD-HHMM>-game-day-driver.log, and
# prints a summary table at the end.
#
# v1.0.9 W5 Day 22.
#
# Scenarios (mapped to existing smoke tests) :
#   A : test_pg_failover.sh       — kill Postgres primary, RTO < 60s
#   B : test_backend_failover.sh  — kill backend-api 1, HAProxy fails over
#   C : test_redis_failover.sh    — kill Redis master, Sentinel promote
#   D : test_minio_resilience.sh  — kill 2 MinIO nodes, EC:2 reconstructs
#   E : test_rabbitmq_outage.sh   — stop RabbitMQ 60s, backend stays up
#
# Usage :
#   bash scripts/security/game-day-driver.sh           # run all scenarios
#   SKIP=DE bash scripts/security/game-day-driver.sh   # skip scenarios D + E
#   ONLY=A bash scripts/security/game-day-driver.sh    # only run scenario A
#
# ONLY and SKIP are matched as case-sensitive substrings against the
# scenario letters. A scenario named in both ONLY and SKIP is skipped.
#
# Required env (passed through to the underlying smoke tests) :
#   REDIS_PASS / SENTINEL_PASS             for scenario C
#   MINIO_ROOT_USER / MINIO_ROOT_PASSWORD  for scenario D
#
# Exit codes :
#   0 — every selected scenario passed
#   1 — at least one scenario failed
#   2 — runner pre-flight failed (script missing, etc.)

set -euo pipefail

# Associative arrays (declare -A) need Bash 4+; fail early with a clear
# message instead of a cryptic "declare: -A: invalid option".
if (( BASH_VERSINFO[0] < 4 )); then
  printf 'FAIL: bash >= 4 is required (found %s)\n' "$BASH_VERSION" >&2
  exit 2
fi

REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
TESTS_DIR="$REPO_ROOT/infra/ansible/tests"
LOGS_DIR="$REPO_ROOT/docs/runbooks/game-days"
SESSION_DATE="$(date +%Y-%m-%d-%H%M)"
SESSION_LOG="$LOGS_DIR/$SESSION_DATE-game-day-driver.log"

mkdir -p "$LOGS_DIR"
# Start from an empty log. SESSION_DATE has minute resolution, so a second
# run within the same minute truncates the previous run's log.
: > "$SESSION_LOG"

ONLY=${ONLY:-}
SKIP=${SKIP:-}

# log MESSAGE...
# Prefix the message with HH:MM:SS, print it on stderr and append it to
# the session log.
log() {
  printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" | tee -a "$SESSION_LOG" >&2
}

# fail MESSAGE [EXIT_CODE]
# Log "FAIL: MESSAGE" and exit with EXIT_CODE (default 2). Only the
# message is logged; the exit code is not part of the log line.
fail() {
  local message=${1:-}
  local code=${2:-2}
  log "FAIL: $message"
  exit "$code"
}

declare -A SCENARIO_SCRIPT=(
  [A]="$TESTS_DIR/test_pg_failover.sh"
  [B]="$TESTS_DIR/test_backend_failover.sh"
  [C]="$TESTS_DIR/test_redis_failover.sh"
  [D]="$TESTS_DIR/test_minio_resilience.sh"
  [E]="$TESTS_DIR/test_rabbitmq_outage.sh"
)

declare -A SCENARIO_DESC=(
  [A]="Postgres primary failover RTO < 60s"
  [B]="HAProxy backend-api 1 fail-over"
  [C]="Redis Sentinel master promotion"
  [D]="MinIO 2-node loss EC:2 reconstruction"
  [E]="RabbitMQ outage backend stays up"
)

SCENARIOS=(A B C D E)

# want SCENARIO_ID
# Return 0 when the scenario passes the ONLY / SKIP filters, 1 otherwise.
want() {
  local s=$1
  if [[ -n "$ONLY" && "$ONLY" != *"$s"* ]]; then
    return 1
  fi
  if [[ -n "$SKIP" && "$SKIP" == *"$s"* ]]; then
    return 1
  fi
  return 0
}

# summary_row ID RESULT DURATION DESCRIPTION
# Print one summary-table row on stderr and append it to the session log.
# The result column is 15 wide so that "FAIL (exit 255)" stays aligned.
summary_row() {
  printf '%-3s | %-15s | %-8s | %s\n' "$@" | tee -a "$SESSION_LOG" >&2
}

# Pre-flight : every selected scenario script must exist + be executable.
# Also collect the selected IDs so the banner reports what will really run.
selected=""
for s in "${SCENARIOS[@]}"; do
  if want "$s"; then
    script="${SCENARIO_SCRIPT[$s]}"
    if [[ ! -x "$script" ]]; then
      fail "scenario $s : script $script not found or not executable" 2
    fi
    selected+="${selected:+ }$s"
  fi
done

declare -A SCENARIO_RESULT
declare -A SCENARIO_DURATION

log "================================================================"
log "Game day session : $SESSION_DATE"
log "Session log : $SESSION_LOG"
log "Scenarios run : ${selected:-none}"
if [[ -n "$ONLY" ]]; then
  log "ONLY filter : $ONLY"
fi
if [[ -n "$SKIP" ]]; then
  log "SKIP filter : $SKIP"
fi
if [[ -z "$selected" ]]; then
  # A typo such as ONLY=a would otherwise end in a silent, vacuous PASS.
  log "WARN: ONLY/SKIP filters selected no scenario — nothing will be run."
fi
log "================================================================"

for s in "${SCENARIOS[@]}"; do
  if ! want "$s"; then
    SCENARIO_RESULT[$s]="SKIPPED"
    SCENARIO_DURATION[$s]="-"
    continue
  fi

  log ""
  log "── scenario $s : ${SCENARIO_DESC[$s]} ──────────────────────────"

  t0=$(date +%s)
  # A failing scenario must not abort the whole session: suspend errexit
  # around the pipeline and read the scenario's own status (not tee's)
  # from PIPESTATUS.
  set +e
  "${SCENARIO_SCRIPT[$s]}" 2>&1 | tee -a "$SESSION_LOG"
  rc=${PIPESTATUS[0]}
  set -e
  elapsed=$(( $(date +%s) - t0 ))

  SCENARIO_DURATION[$s]="${elapsed}s"
  if [[ "$rc" -eq 0 ]]; then
    SCENARIO_RESULT[$s]="PASS"
    log "scenario $s : PASS in ${elapsed}s"
  else
    SCENARIO_RESULT[$s]="FAIL (exit $rc)"
    log "scenario $s : FAIL (exit $rc) after ${elapsed}s"
  fi
done

log ""
log "================================================================"
log "Session summary"
log "----------------------------------------------------------------"
summary_row "ID" "result" "duration" "scenario"
printf '%s-+-%s-+-%s-+-%s\n' \
  "---" "---------------" "--------" "$(printf '%.0s-' {1..50})" \
  | tee -a "$SESSION_LOG" >&2

overall=0
for s in "${SCENARIOS[@]}"; do
  result=${SCENARIO_RESULT[$s]}
  duration=${SCENARIO_DURATION[$s]}
  summary_row "$s" "$result" "$duration" "${SCENARIO_DESC[$s]}"
  if [[ "$result" == "FAIL"* ]]; then
    overall=1
  fi
done
log "================================================================"

log ""
log "Operator next steps :"
log " 1. Open the runbook template :"
log " docs/runbooks/game-days/$SESSION_DATE.md"
log " (copy from docs/runbooks/game-days/TEMPLATE.md if missing)"
log " 2. For each scenario, fill : timestamp, action, observation,"
log " runbook used, gap discovered."
log " 3. File one PR per gap that needs a code or runbook fix."
log ""

if [[ "$overall" -eq 0 ]]; then
  log "PASS : every selected scenario passed."
else
  log "FAIL : at least one scenario failed — review $SESSION_LOG."
fi

exit "$overall"