veza/infra/ansible/tests/test_pgbouncer_load.sh

#!/usr/bin/env bash
# test_pgbouncer_load.sh — exercise PgBouncer with 500 concurrent
# clients × 30s, fail unless every connection lands and stays
# under the query_wait_timeout ceiling.
#
# v1.0.9 Day 7 acceptance for ROADMAP_V1.0_LAUNCH.md §Semaine 2:
# "pgbench 500 clients × 30s sans erreur de connexion".
#
# Usage:
#   bash infra/ansible/tests/test_pgbouncer_load.sh
#
# Env overrides:
#   PGBOUNCER_HOST       default: pgaf-pgbouncer.lxd
#   PGBOUNCER_PORT       default: 6432
#   PGBOUNCER_DB         default: veza
#   PGBOUNCER_USER       default: veza
#   PGBENCH_CLIENTS      default: 500
#   PGBENCH_DURATION     default: 30
#   PGBENCH_THREADS      default: 8
#
# Exit codes:
#   0  — pgbench completed clean (no connection errors, no aborts)
#   1  — pgbench fixture init failed, or pgbench reported errors during the run
#   2  — pgbouncer not reachable
#   3  — required tool missing on host

# Strict mode: abort on unhandled errors, on unset variables, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Connection and load parameters. Each one keeps the caller's environment
# value when it is set and non-empty, and falls back to the default otherwise.
: "${PGBOUNCER_HOST:=pgaf-pgbouncer.lxd}"
: "${PGBOUNCER_PORT:=6432}"
: "${PGBOUNCER_DB:=veza}"
: "${PGBOUNCER_USER:=veza}"
: "${PGBENCH_CLIENTS:=500}"
: "${PGBENCH_DURATION:=30}"
: "${PGBENCH_THREADS:=8}"

# log WORDS... — write one "[HH:MM:SS] WORDS" line to stderr; the arguments
# are joined into a single message.
log() {
  local message="$*"
  printf '[%s] %s\n' "$(date +%H:%M:%S)" "${message}" 1>&2
}
# fail MESSAGE [EXIT_CODE] — log "FAIL: MESSAGE" and terminate the script
# with EXIT_CODE (default 1).
# Only $1 is logged: expanding "$*" here would append the exit code to the
# message whenever a caller passes one.
fail() {
  local msg=${1:-} code=${2:-1}
  log "FAIL: ${msg}"
  exit "${code}"
}
# require TOOL — do nothing when `command -v` can resolve TOOL; otherwise
# hand over to fail with exit code 3.
require() {
  if ! command -v "$1" >/dev/null 2>&1; then
    fail "missing tool: $1" 3
  fi
}

# Verify every external tool the script calls is available before doing any
# work; require exits with code 3 on the first one that is missing.
for needed_tool in pgbench psql awk; do
  require "${needed_tool}"
done

# 0. Reachability — PgBouncer alive on listen_addr:listen_port.
#    A trivial query is sent through PgBouncer with a 5 s connect timeout.
#    psql's stdout is discarded, but its stderr is captured and replayed on
#    failure so the operator sees the real cause (refused, auth, unknown
#    database, ...) rather than only the generic hint below.
#    -X keeps a personal ~/.psqlrc from influencing the probe.
log "step 0: probing pgbouncer at ${PGBOUNCER_HOST}:${PGBOUNCER_PORT}"
if ! probe_err=$(psql -X "host=${PGBOUNCER_HOST} port=${PGBOUNCER_PORT} dbname=${PGBOUNCER_DB} user=${PGBOUNCER_USER} connect_timeout=5" -c 'select 1' 2>&1 >/dev/null); then
  if [ -n "${probe_err}" ]; then
    printf '%s\n' "${probe_err}" | sed 's/^/    /' >&2
  fi
  fail "pgbouncer not reachable (or app db ${PGBOUNCER_DB} not provisioned). Check the pgbouncer service + the formation primary." 2
fi

# 1. pgbench fixture — initialise the standard pgbench tables ONCE
#    before the load run. The init connects through pgbouncer too,
#    which incidentally checks transaction-mode compatibility.
#    The database name is given as the trailing positional argument:
#    pgbench releases before PostgreSQL 17 parse -d as --debug rather than
#    --dbname, while the positional form works on every release.
#    Only the last 20 lines of init output are shown; pipefail (enabled at
#    the top of the script) lets a pgbench failure propagate through tail.
log "step 1: initialising pgbench fixture (scale=10)"
if ! pgbench -h "${PGBOUNCER_HOST}" -p "${PGBOUNCER_PORT}" -U "${PGBOUNCER_USER}" \
    -i -s 10 --no-vacuum "${PGBOUNCER_DB}" 2>&1 | tail -20 >&2; then
  fail "pgbench -i failed — check pgbouncer auth / pool_mode" 1
fi

# 2. Load run.
#    Runs pgbench for PGBENCH_DURATION seconds with PGBENCH_CLIENTS clients
#    over PGBENCH_THREADS threads, a progress line every 5 s (-P 5) and
#    per-statement latencies (-r). stdout and stderr are captured together
#    in $out for the checks that follow.
#    The database name is the trailing positional argument because pgbench
#    releases before PostgreSQL 17 parse -d as --debug, not --dbname.
#    The exit status is captured explicitly: under `set -e` a bare
#    out=$(...) would terminate the script on a non-zero pgbench exit before
#    the captured output is printed, and would leak pgbench's own status
#    (2 would collide with this script's "not reachable" code).
log "step 2: pgbench ${PGBENCH_CLIENTS} clients × ${PGBENCH_DURATION}s × ${PGBENCH_THREADS} threads"
pgbench_rc=0
out=$(pgbench \
    -h "${PGBOUNCER_HOST}" \
    -p "${PGBOUNCER_PORT}" \
    -U "${PGBOUNCER_USER}" \
    -c "${PGBENCH_CLIENTS}" \
    -j "${PGBENCH_THREADS}" \
    -T "${PGBENCH_DURATION}" \
    --no-vacuum \
    -P 5 \
    -r \
    "${PGBOUNCER_DB}" 2>&1) || pgbench_rc=$?

# Always replay the (indented) pgbench output, including on failure.
printf '%s\n' "$out" | sed 's/^/    /' >&2

if [ "${pgbench_rc}" -ne 0 ]; then
  fail "pgbench exited with status ${pgbench_rc} — see output above" 1
fi

# Verdict — scan the pgbench output captured in $out.
# pgbench reports "number of failed transactions: N (X.XX%)" — anything
# > 0 fails the test. Connection-level failures in the runner output are
# counted too. Timeouts are matched as "connection timed out",
# "timeout expired" and "query_wait_timeout" in addition to the literal
# "connection timeout", which on its own misses those spellings.
# NOTE(review): message spellings assumed from OS/libpq/PgBouncer output —
# confirm against the deployed versions.

# pgbench_failed_tx REPORT — print N from the first
# "number of failed transactions:" line in REPORT (commas and parentheses
# stripped), or 0 when no such line exists or the count is empty.
pgbench_failed_tx() {
  awk '
    /number of failed transactions:/ {
      n = $5
      gsub(/[,()]/, "", n)
      print (n == "" ? 0 : n)
      found = 1
      exit
    }
    END { if (!found) print 0 }
  ' <<<"$1"
}

# pgbench_conn_errors REPORT — print how many lines of REPORT mention a
# connection-level failure (case-insensitive).
pgbench_conn_errors() {
  # grep -c exits 1 when the count is 0; `|| true` keeps that from being
  # treated as a failure under set -e.
  grep -ciE 'connection (refused|reset|timeout|timed out)|timeout expired|query_wait_timeout' <<<"$1" || true
}

failed_tx=$(pgbench_failed_tx "$out")
conn_errors=$(pgbench_conn_errors "$out")

log "verdict: failed_tx=${failed_tx} conn_errors=${conn_errors}"
if [ "${failed_tx}" != "0" ] || [ "${conn_errors}" -gt 0 ]; then
  fail "pgbench surfaced errors — pool sizing, query_wait_timeout, or upstream is the bottleneck"
fi

# Reached only when none of the checks above called fail: report success
# through log (stderr) and exit 0.
log "PASS: pgbench ${PGBENCH_CLIENTS} clients × ${PGBENCH_DURATION}s clean"
exit 0