#!/usr/bin/env bash
# dr-drill.sh — Postgres backup restore drill.
#
# Restores the most recent pgBackRest full+WAL into an ephemeral
# Incus container, runs a smoke query against the recovered DB,
# tears the container down, and writes a textfile metric for the
# Prometheus alert BackupRestoreDrillFailed.
#
# Acceptance for ROADMAP_V1.0_LAUNCH.md §Semaine 2 day 8.
#
# Usage:
#   bash scripts/dr-drill.sh [--keep]
#
# Env overrides:
#   PGBACKREST_STANZA     default: veza
#   PGBACKREST_CONF_HOST  default: /etc/pgbackrest/pgbackrest.conf
#                         (copied into the drill container so the same
#                         S3 creds + cipher pass apply). The older name
#                         PGBACKREST_SECRETS is honoured as a fallback.
#   POSTGRES_VERSION      default: 16
#   DRILL_CONTAINER       default: pg-restore-drill
#   DRILL_METRICS_FILE    default: /var/lib/node_exporter/textfile_collector/pgbackrest_drill.prom
#   MIN_USERS_EXPECTED    default: 1 ; set higher when the seed grows
#
# Exit codes:
#   0 — drill passed (restore + smoke query OK)
#   1 — drill failed (restore error, smoke query failure, or
#       short user count)
#   2 — environment problem (missing tool, no backups, can't
#       reach the Incus host)
set -euo pipefail

PGBACKREST_STANZA=${PGBACKREST_STANZA:-veza}
# The usage text historically advertised PGBACKREST_SECRETS while the code
# only read PGBACKREST_CONF_HOST; accept both, the latter taking precedence.
PGBACKREST_CONF_HOST=${PGBACKREST_CONF_HOST:-${PGBACKREST_SECRETS:-/etc/pgbackrest/pgbackrest.conf}}
POSTGRES_VERSION=${POSTGRES_VERSION:-16}
DRILL_CONTAINER=${DRILL_CONTAINER:-pg-restore-drill}
DRILL_METRICS_FILE=${DRILL_METRICS_FILE:-/var/lib/node_exporter/textfile_collector/pgbackrest_drill.prom}
DRILL_METRICS_TMP=${DRILL_METRICS_FILE}.tmp
MIN_USERS_EXPECTED=${MIN_USERS_EXPECTED:-1}

KEEP_CONTAINER=0
if [ "${1:-}" = "--keep" ]; then KEEP_CONTAINER=1; fi

# Set to 1 by write_metric once a verdict has been published, so the EXIT
# trap can tell "already reported" from "died before reporting".
METRIC_WRITTEN=0

# log MESSAGE... — timestamped diagnostic on stderr.
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }

# fail MESSAGE [EXIT_CODE] — log the failure, publish a failure metric and
# exit with EXIT_CODE (default 1). Only MESSAGE is logged and used as the
# metric reason; the exit code is not mixed into the text.
fail() {
  local message="${1:-failed}" code="${2:-1}"
  log "FAIL: $message"
  write_metric 0 "$message" "$SECONDS" \
    || log "WARN: could not write $DRILL_METRICS_FILE"
  exit "$code"
}

# require TOOL — exit 2 when TOOL is not on PATH.
require() {
  command -v "$1" >/dev/null 2>&1 || {
    log "missing tool: $1"
    exit 2
  }
}

# write_metric SUCCESS [REASON] [DURATION] — publish the drill verdict to
# DRILL_METRICS_FILE via a temp file + rename.
#   SUCCESS   1 or 0
#   REASON    short text, sanitised to a label-safe token (default: ok)
#   DURATION  elapsed seconds (default: 0)
# Returns non-zero if the file could not be written. Every step is checked
# explicitly because `set -e` is suspended when this runs on the left of `||`.
#
# NOTE(review): the heredoc body of this function was missing from the copy
# of the file under review. The metric names and label layout below are a
# reconstruction — confirm them against the BackupRestoreDrillFailed alert
# rule before merging.
write_metric() {
  local success="$1" reason="${2:-ok}" duration="${3:-0}"
  local now
  now=$(date +%s) || return 1
  # Free-text reasons may contain quotes or newlines; reduce them to a
  # bounded, label-safe token so the exposition file stays parseable.
  reason=${reason//[^A-Za-z0-9_.:-]/_}
  reason=${reason:0:64}
  mkdir -p "$(dirname "$DRILL_METRICS_FILE")" || return 1
  cat >"$DRILL_METRICS_TMP" <<EOF || return 1
# HELP pgbackrest_drill_last_success 1 if the most recent restore drill passed, 0 otherwise.
# TYPE pgbackrest_drill_last_success gauge
pgbackrest_drill_last_success{reason="${reason}"} ${success}
# HELP pgbackrest_drill_last_run_timestamp_seconds Unix time of the most recent restore drill.
# TYPE pgbackrest_drill_last_run_timestamp_seconds gauge
pgbackrest_drill_last_run_timestamp_seconds ${now}
# HELP pgbackrest_drill_last_duration_seconds Wall-clock duration of the most recent restore drill.
# TYPE pgbackrest_drill_last_duration_seconds gauge
pgbackrest_drill_last_duration_seconds ${duration}
EOF
  mv -f "$DRILL_METRICS_TMP" "$DRILL_METRICS_FILE" || return 1
  METRIC_WRITTEN=1
}

# cleanup — EXIT trap. Publishes a failure metric if the script is dying
# without having written one (e.g. a `set -e` abort), then removes the drill
# container unless --keep was given.
#
# NOTE(review): the first lines of this function were missing from the copy
# under review; the --keep branch and its log text are reconstructed from the
# otherwise-unused KEEP_CONTAINER flag — confirm.
cleanup() {
  local rc=$?
  if [ "$rc" -ne 0 ] && [ "$METRIC_WRITTEN" -eq 0 ]; then
    write_metric 0 "aborted" "$SECONDS" \
      || log "WARN: could not write $DRILL_METRICS_FILE"
  fi
  if [ "$KEEP_CONTAINER" -eq 1 ]; then
    log "--keep given, leaving $DRILL_CONTAINER in place"
    return 0
  fi
  if incus info "$DRILL_CONTAINER" >/dev/null 2>&1; then
    log "tearing down $DRILL_CONTAINER"
    incus delete --force "$DRILL_CONTAINER" || true
  fi
}
trap cleanup EXIT

# -----------------------------------------------------------------------------
# 0. Pre-flight.
# -----------------------------------------------------------------------------
require incus
require pgbackrest
require date

[ -f "$PGBACKREST_CONF_HOST" ] || fail "pgbackrest.conf not found at $PGBACKREST_CONF_HOST" 2

log "step 0: read latest backup metadata for stanza=$PGBACKREST_STANZA"
# `|| true`: a failing `info` is reported through the grep check below rather
# than through `set -e`.
backup_info=$(pgbackrest --stanza="$PGBACKREST_STANZA" --output=text info 2>&1 || true)
sed 's/^/ /' <<<"$backup_info" >&2
# Here-string instead of `echo | grep -q`: with pipefail, grep -q exiting
# early can SIGPIPE the writer and turn a match into a failure.
if ! grep -q "full backup:" <<<"$backup_info"; then
  fail "no full backup visible — has the stanza had time to run yet?" 2
fi

# -----------------------------------------------------------------------------
# 1. Provision the drill container.
# -----------------------------------------------------------------------------
log "step 1: launching $DRILL_CONTAINER (ephemeral Ubuntu 22.04)"
if incus info "$DRILL_CONTAINER" >/dev/null 2>&1; then
  log " pre-existing container, tearing it down for a clean run"
  incus delete --force "$DRILL_CONTAINER" \
    || fail "could not remove pre-existing drill container" 2
fi
incus launch images:ubuntu/22.04 "$DRILL_CONTAINER" -c security.privileged=true \
  || fail "could not launch drill container" 2

# Wait for cloud-init (best effort, up to 60 s). The script proceeds either
# way, exactly as before; a timeout is now logged instead of being silent.
cloud_init_done=0
for ((attempt = 0; attempt < 60; attempt++)); do
  cloud_init_status=$(incus exec "$DRILL_CONTAINER" -- cloud-init status 2>/dev/null || true)
  if [[ "$cloud_init_status" == *"status: done"* ]]; then
    cloud_init_done=1
    break
  fi
  sleep 1
done
if [ "$cloud_init_done" -eq 0 ]; then
  log " cloud-init did not report 'status: done' within 60s, continuing anyway"
fi

# -----------------------------------------------------------------------------
# 2. Install postgres + pgbackrest inside, push the same config in
#    (read-only against the bucket).
# -----------------------------------------------------------------------------
log "step 2: installing postgres + pgbackrest in $DRILL_CONTAINER"
# The in-container script is single-quoted and receives the Postgres major
# version as $1, so no host-side value is spliced into the code it runs.
# Failures go through fail() so a failure metric is published instead of the
# script silently aborting under `set -e`.
incus exec "$DRILL_CONTAINER" -- bash -c '
  set -e
  pg_version=$1
  data_dir=/var/lib/postgresql/${pg_version}/main
  # Applies to every apt-get call below, so none of them can prompt.
  export DEBIAN_FRONTEND=noninteractive
  apt-get update >/dev/null
  apt-get install -y curl ca-certificates gnupg lsb-release >/dev/null
  install -d -m 0755 /etc/apt/keyrings
  curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc -o /etc/apt/keyrings/postgresql.asc
  echo "deb [signed-by=/etc/apt/keyrings/postgresql.asc] https://apt.postgresql.org/pub/repos/apt jammy-pgdg main" \
    > /etc/apt/sources.list.d/pgdg.list
  apt-get update >/dev/null
  apt-get install -y \
    "postgresql-${pg_version}" \
    "postgresql-client-${pg_version}" \
    pgbackrest >/dev/null
  # The package may have created and started a default cluster; stop it and
  # replace its data directory with an empty one for the restore.
  systemctl stop "postgresql@${pg_version}-main" || true
  rm -rf -- "${data_dir:?}"
  install -d -o postgres -g postgres -m 0700 "$data_dir"
  install -d -o postgres -g postgres -m 0750 /etc/pgbackrest
  install -d -o postgres -g postgres -m 0750 /var/log/pgbackrest
' _ "$POSTGRES_VERSION" || fail "package install failed inside drill container" 2

incus file push "$PGBACKREST_CONF_HOST" "$DRILL_CONTAINER/etc/pgbackrest/pgbackrest.conf" \
  || fail "could not push pgbackrest.conf into drill container" 2
incus exec "$DRILL_CONTAINER" -- chown postgres:postgres /etc/pgbackrest/pgbackrest.conf \
  || fail "could not chown pgbackrest.conf inside drill container" 2

# Patch the conf so pg1-path points at the empty dir we just made, and add
# `delta = y` for resumable restores. Stanza name and S3 credentials carry
# over verbatim — the drill restores from the real prod repo.
# The pattern tolerates any whitespace around `=` (`pg1-path=/x` as well as
# `pg1-path = /x`).
incus exec "$DRILL_CONTAINER" -- bash -c '
  set -e
  conf=/etc/pgbackrest/pgbackrest.conf
  sed -i "s|^pg1-path[[:space:]]*=.*|pg1-path = /var/lib/postgresql/$1/main|" "$conf"
  echo "delta = y" >> "$conf"
' _ "$POSTGRES_VERSION" || fail "could not patch pgbackrest.conf inside drill container" 2

# -----------------------------------------------------------------------------
# 3. Restore.
# -----------------------------------------------------------------------------
# Validate the threshold before the expensive restore: with a non-numeric
# value `[ … -lt … ]` errors out, the error reads as "not less than", and the
# drill would report PASS without having checked anything.
if ! [[ "$MIN_USERS_EXPECTED" =~ ^[0-9]+$ ]]; then
  fail "MIN_USERS_EXPECTED must be a non-negative integer, got '$MIN_USERS_EXPECTED'" 2
fi

# Seconds to wait for the restored cluster to accept connections. Override
# with DRILL_READY_TIMEOUT when WAL replay takes longer than the default.
ready_timeout=${DRILL_READY_TIMEOUT:-30}
if ! [[ "$ready_timeout" =~ ^[0-9]+$ ]]; then
  fail "DRILL_READY_TIMEOUT must be a non-negative integer, got '$ready_timeout'" 2
fi

log "step 3: pgbackrest restore (latest backup, full WAL replay)"
# --archive-mode=off: the container carries the production repo credentials,
# so the restored cluster must not be able to push WAL into the real
# repository once it is started and promoted.
incus exec "$DRILL_CONTAINER" -- sudo -u postgres \
  pgbackrest --stanza="$PGBACKREST_STANZA" --log-level-console=info \
  --archive-mode=off restore \
  || fail "restore failed" 1

# -----------------------------------------------------------------------------
# 4. Start postgres + smoke query.
# -----------------------------------------------------------------------------
log "step 4: starting postgres + waiting for ready"
# $1 = Postgres major version, $2 = timeout in seconds. A failing
# `systemctl start` is not fatal here; the host-side pg_isready check below
# is what decides the outcome.
incus exec "$DRILL_CONTAINER" -- bash -c '
  systemctl start "postgresql@$1-main"
  for ((i = 0; i < $2; i++)); do
    if sudo -u postgres pg_isready -p 5432 >/dev/null 2>&1; then
      break
    fi
    sleep 1
  done
' _ "$POSTGRES_VERSION" "$ready_timeout" \
  || log " readiness wait inside $DRILL_CONTAINER exited non-zero, re-checking from the host"

if ! incus exec "$DRILL_CONTAINER" -- sudo -u postgres pg_isready -p 5432 >/dev/null 2>&1; then
  fail "postgres did not become ready inside drill container" 1
fi

log "step 5: smoke query — SELECT count(*) FROM users"
# stderr is folded into the capture on purpose: a psql error then shows up in
# the "not numeric" failure message below.
users_count=$(incus exec "$DRILL_CONTAINER" -- sudo -u postgres \
  psql -At -d veza -c 'select count(*) from users' 2>&1 || true)
log "users.count = $users_count (expecting >= $MIN_USERS_EXPECTED)"

if ! [[ "$users_count" =~ ^[0-9]+$ ]]; then
  fail "users count is not numeric: '$users_count' (table missing? wrong db?)" 1
fi
if [ "$users_count" -lt "$MIN_USERS_EXPECTED" ]; then
  fail "users count $users_count < expected $MIN_USERS_EXPECTED — backup may be broken" 1
fi

# -----------------------------------------------------------------------------
# 6. Verdict.
# -----------------------------------------------------------------------------
write_metric 1 "ok" "$SECONDS"
log "PASS: drill completed in ${SECONDS}s, users=$users_count"
exit 0