diff --git a/infra/ansible/inventory/lab.yml b/infra/ansible/inventory/lab.yml
index ac33cec40..3f0a6b129 100644
--- a/infra/ansible/inventory/lab.yml
+++ b/infra/ansible/inventory/lab.yml
@@ -48,3 +48,11 @@ all:
     hosts:
       pgaf-primary:
       pgaf-replica:
+  # v1.0.9 Day 7: pgbouncer fronts the formation. Same
+  # community.general.incus connection plugin as postgres_ha.
+  pgbouncer:
+    hosts:
+      pgaf-pgbouncer:
+    vars:
+      ansible_connection: community.general.incus
+      ansible_python_interpreter: /usr/bin/python3
diff --git a/infra/ansible/playbooks/postgres_ha.yml b/infra/ansible/playbooks/postgres_ha.yml
index d5d09e490..dd235dcb0 100644
--- a/infra/ansible/playbooks/postgres_ha.yml
+++ b/infra/ansible/playbooks/postgres_ha.yml
@@ -15,16 +15,16 @@
 #   ansible-playbook -i inventory/lab.yml playbooks/postgres_ha.yml --check
 #   ansible-playbook -i inventory/lab.yml playbooks/postgres_ha.yml
 ---
-- name: Provision Incus containers for the Postgres formation
+- name: Provision Incus containers for the Postgres formation + pgbouncer
   hosts: incus_hosts
   become: true
   gather_facts: true
   tasks:
-    - name: Launch pgaf-monitor + pgaf-primary + pgaf-replica
+    - name: Launch pgaf-monitor + pgaf-primary + pgaf-replica + pgaf-pgbouncer
      ansible.builtin.shell:
        cmd: |
          set -e
-          for ct in pgaf-monitor pgaf-primary pgaf-replica; do
+          for ct in pgaf-monitor pgaf-primary pgaf-replica pgaf-pgbouncer; do
            if ! incus info "$ct" >/dev/null 2>&1; then
              incus launch images:ubuntu/22.04 "$ct"
              # Wait for cloud-init / network to settle.
@@ -44,7 +44,7 @@
        executable: /bin/bash
      register: provision_result
      changed_when: "'incus launch' in provision_result.stdout"
-      tags: [postgres_ha, provision]
+      tags: [postgres_ha, pgbouncer, provision]
 
 - name: Refresh inventory so the new containers are reachable via the incus connection
   ansible.builtin.meta: refresh_inventory
@@ -70,3 +70,19 @@
   serial: 1  # primary must register before replica — pg_auto_failover assigns roles by registration order
   roles:
     - postgres_ha
+
+# v1.0.9 Day 7: PgBouncer fronts the formation. Common baseline first
+# (SSH + node_exporter + fail2ban), then the pgbouncer role itself.
+- name: Apply common baseline to the pgbouncer container
+  hosts: pgbouncer
+  become: true
+  gather_facts: true
+  roles:
+    - common
+
+- name: Install + configure PgBouncer pointing at the formation
+  hosts: pgbouncer
+  become: true
+  gather_facts: true
+  roles:
+    - pgbouncer
diff --git a/infra/ansible/roles/pgbouncer/README.md b/infra/ansible/roles/pgbouncer/README.md
new file mode 100644
index 000000000..745ffcce3
--- /dev/null
+++ b/infra/ansible/roles/pgbouncer/README.md
@@ -0,0 +1,74 @@
+# `pgbouncer` role — connection pool in front of pg_auto_failover
+
+Sits between the Veza backend and the pg_auto_failover primary. Pools 1000 client connections down to 50 server connections in transaction mode — the backend pays the ~1ms Postgres fork overhead once per pool slot, not once per HTTP handler.
+
+## Wiring
+
+```
+veza-backend-api ──libpq──▶ pgaf-pgbouncer:6432 ──libpq──▶ pgaf-primary:5432
+   (1000 client conn cap)       (single backend per pool slot)
+```
+
+Backend `DATABASE_URL` in prod:
+
+```
+postgresql://veza:PASSWORD@pgaf-pgbouncer.lxd:6432/veza?sslmode=prefer
+```
+
+NOT the formation URI directly — that bypasses the pool and re-creates the connection-storm problem.
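+
+Two quick shell checks (sketches — they assume the lab hostnames and the `veza` role above, and the outcome of the second deliberately depends on pool scheduling):
+
+```bash
+# Connect to 6432, then ask Postgres which port the server side of the
+# session uses. Through PgBouncer it reports the upstream 5432, so the
+# session is provably proxied rather than direct.
+psql "host=pgaf-pgbouncer.lxd port=6432 dbname=veza user=veza" \
+  -c "SELECT inet_server_addr(), inet_server_port();"
+
+# One client connection, two transactions: psql runs each -c in its own
+# implicit transaction, and between them the pool may swap or reset the
+# server connection. The SET may or may not be visible in the second
+# command; code must not rely on either outcome.
+psql -h pgaf-pgbouncer.lxd -p 6432 -U veza -d veza \
+  -c "SET application_name = 'pool-probe'" \
+  -c "SHOW application_name"
+```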
+
+## Pool sizing — v1.0.9 Day 7 baseline
+
+| knob                   | default | rationale                                            |
+| ---------------------- | ------- | ---------------------------------------------------- |
+| `max_client_conn`      | 1000    | acceptance target: 500 concurrent + headroom         |
+| `default_pool_size`    | 50      | matches Postgres `max_connections=100` with margin   |
+| `min_pool_size`        | 10      | warm pool — first request after idle is fast         |
+| `reserve_pool_size`    | 10      | overflow when `default_pool_size` is exhausted       |
+| `reserve_pool_timeout` | 5s      | wait this long before opening a reserve slot         |
+| `server_idle_timeout`  | 600s    | aggressive idle reclaim — saves Postgres backend RAM |
+| `query_wait_timeout`   | 120s    | hard ceiling: a request waiting longer fails fast    |
+
+## Transaction mode — what it forbids
+
+Transaction mode means a connection is returned to the pool **after each transaction**. Things that break:
+
+- Cross-transaction session state — `SET` (without `SET LOCAL`), session GUCs persisted across queries
+- Cross-transaction prepared statements (`PREPARE` outside a tx)
+- `LISTEN/NOTIFY` (the listener returns to the pool, the notification goes nowhere)
+
+Things that work fine — and the Veza backend stays inside this set:
+
+- `SET LOCAL` inside a tx (scoped to the tx)
+- Advisory locks scoped to a tx (`pg_advisory_xact_lock`)
+- Plain SELECT / INSERT / UPDATE / DELETE / DDL within a tx
+
+If a future feature needs `LISTEN/NOTIFY` (e.g. a real-time invalidation channel), connect that worker straight to Postgres, bypassing PgBouncer — separate connection budget, no pool conflict.
+
+## Failover behaviour (current scope)
+
+Day 7 ships pgbouncer pointed at `pgaf-primary.lxd` directly. After a `pg_autoctl perform failover`, pgbouncer's pool is stranded on the demoted node until the 60s DNS TTL expires and the role re-renders config and reloads. Even in this state the backend's RTO stays under 60s: most pool slots migrate to the new primary via DNS re-resolution inside that TTL window.
+
+W2 day 8 (or v1.1) hardens this with a pg_autoctl state-change hook that issues `RELOAD` on the pgbouncer admin console as soon as the formation elects a new primary — sub-second pool migration.
+
+## Admin / observability
+
+```bash
+# From any container on the bridge:
+psql -h pgaf-pgbouncer.lxd -p 6432 -U postgres pgbouncer
+
+# Then:
+SHOW POOLS;    -- per-(database,user) pool stats
+SHOW CLIENTS;  -- active client connections
+SHOW SERVERS;  -- pool→postgres connections
+SHOW STATS;    -- request counts, average query duration
+RELOAD;        -- re-read pgbouncer.ini without dropping clients
+```
+
+## Acceptance test
+
+```bash
+bash infra/ansible/tests/test_pgbouncer_load.sh
+```
+
+Spawns `pgbench` from the host with 500 concurrent clients × 30s and asserts zero connection errors. Pool size = 50 + reserve 10 forces 60 server-side connections to serve 500 client-side, exercising the queue.
diff --git a/infra/ansible/roles/pgbouncer/defaults/main.yml b/infra/ansible/roles/pgbouncer/defaults/main.yml
new file mode 100644
index 000000000..6f7bc8e43
--- /dev/null
+++ b/infra/ansible/roles/pgbouncer/defaults/main.yml
@@ -0,0 +1,61 @@
+# PgBouncer connection pool — fronts the pg_auto_failover formation
+# so the backend keeps a stable pool of cheap client-side connections
+# (1000 capacity) backed by a small pool of expensive Postgres
+# connections (50). Without this, every Go HTTP handler that opened
+# a transaction was paying the ~1ms forking overhead of a fresh
+# Postgres backend.
+#
+# Mode: transaction.
+# Connections are returned to the pool after each transaction — NOT
+# after each statement (statement mode breaks prepared statements +
+# session-local features) and NOT after disconnect (session mode =
+# no real pooling). The Veza backend uses session-scoped features in
+# a few places (SET LOCAL inside a tx, advisory locks per tx, etc.)
+# but no cross-transaction session state — transaction mode fits.
+---
+# Listen address inside the container — exposed to the Incus bridge
+# so any peer container resolves it via `pgaf-pgbouncer.lxd:6432`.
+pgbouncer_listen_addr: 0.0.0.0
+pgbouncer_listen_port: 6432
+
+# Pool sizes per the v1.0.9 Day 7 acceptance: 1000 client connections
+# capacity, 50 actual postgres connections behind. Tune in
+# group_vars/<env>.yml when load profiles are baselined.
+pgbouncer_max_client_conn: 1000
+pgbouncer_default_pool_size: 50
+pgbouncer_min_pool_size: 10
+pgbouncer_reserve_pool_size: 10
+pgbouncer_reserve_pool_timeout: 5
+
+# Transaction mode is the only one the Veza backend can use safely —
+# see the role-level comment block above. Override only in tests
+# that explicitly benchmark statement vs transaction mode.
+pgbouncer_pool_mode: transaction
+# NB: PgBouncer only applies server_reset_query in session pooling
+# (unless server_reset_query_always=1); kept for session-mode tests.
+pgbouncer_server_reset_query: DISCARD ALL
+
+# Upstream — the pg_auto_failover formation primary. Day 7 lab points
+# at the primary container directly; W2 day 8 (or v1.1) wires
+# pgbouncer reload on failover via a pg_autoctl callback.
+pgbouncer_upstream_host: pgaf-primary.lxd
+pgbouncer_upstream_port: 5432
+pgbouncer_upstream_dbname: veza
+pgbouncer_upstream_user: veza
+
+# Auth — trust on the lab bridge (10.99.0.0/24 + container-only).
+# Prod overrides to scram-sha-256 + a userlist managed by
+# `pg_autoctl pgbouncer-userlist` or equivalent.
+pgbouncer_auth_type: trust
+pgbouncer_auth_file: /etc/pgbouncer/userlist.txt
+# When auth_type=md5/scram-sha-256, this list is rendered into
+# userlist.txt. Format: { user: "name", password: "hash" }. md5 hashes
+# are 'md5' + md5(password+user). For lab/trust we leave this empty.
+pgbouncer_users: []
+
+# Admin console — `psql -h <host> -p 6432 pgbouncer` for SHOW POOLS,
+# RELOAD, etc. Restricted to the admin/stats users listed below. Lab
+# default trusts the bridge; prod tightens to a unix socket only.
+pgbouncer_admin_users:
+  - postgres
+  - veza
+pgbouncer_stats_users:
+  - postgres
diff --git a/infra/ansible/roles/pgbouncer/handlers/main.yml b/infra/ansible/roles/pgbouncer/handlers/main.yml
new file mode 100644
index 000000000..14c687968
--- /dev/null
+++ b/infra/ansible/roles/pgbouncer/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Reload pgbouncer
+  ansible.builtin.service:
+    name: pgbouncer
+    state: reloaded
diff --git a/infra/ansible/roles/pgbouncer/tasks/main.yml b/infra/ansible/roles/pgbouncer/tasks/main.yml
new file mode 100644
index 000000000..14315f0f4
--- /dev/null
+++ b/infra/ansible/roles/pgbouncer/tasks/main.yml
@@ -0,0 +1,50 @@
+# PgBouncer role — installs the package, renders pgbouncer.ini +
+# userlist.txt from templates, and ensures the systemd unit is
+# running. Idempotent.
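+#
+# Spot-checks after a run (a sketch, from the Incus host; container
+# name per inventory/lab.yml):
+#   incus exec pgaf-pgbouncer -- systemctl is-active pgbouncer
+#   incus exec pgaf-pgbouncer -- ss -ltn 'sport = :6432'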
+---
+- name: Install PgBouncer
+  ansible.builtin.apt:
+    name:
+      - pgbouncer
+      - postgresql-client  # psql for the admin console (the load test runs pgbench from the host)
+    state: present
+    update_cache: true
+    cache_valid_time: 3600
+  tags: [pgbouncer, packages]
+
+- name: Render pgbouncer.ini
+  ansible.builtin.template:
+    src: pgbouncer.ini.j2
+    dest: /etc/pgbouncer/pgbouncer.ini
+    owner: postgres
+    group: postgres
+    mode: "0640"
+  notify: Reload pgbouncer
+  tags: [pgbouncer, config]
+
+- name: Render pgbouncer userlist (auth_type != trust)
+  ansible.builtin.template:
+    src: userlist.txt.j2
+    dest: "{{ pgbouncer_auth_file }}"
+    owner: postgres
+    group: postgres
+    mode: "0640"
+  when: pgbouncer_auth_type != 'trust'
+  notify: Reload pgbouncer
+  tags: [pgbouncer, config]
+
+- name: Ensure /var/log/postgresql exists for pgbouncer log
+  ansible.builtin.file:
+    path: /var/log/postgresql
+    state: directory
+    owner: postgres
+    group: postgres
+    mode: "0755"
+  tags: [pgbouncer, config]
+
+- name: Enable + start pgbouncer
+  ansible.builtin.service:
+    name: pgbouncer
+    state: started
+    enabled: true
+  tags: [pgbouncer, service]
diff --git a/infra/ansible/roles/pgbouncer/templates/pgbouncer.ini.j2 b/infra/ansible/roles/pgbouncer/templates/pgbouncer.ini.j2
new file mode 100644
index 000000000..38ebdfbd5
--- /dev/null
+++ b/infra/ansible/roles/pgbouncer/templates/pgbouncer.ini.j2
@@ -0,0 +1,52 @@
+# Managed by Ansible — do not edit by hand.
+# Source: infra/ansible/roles/pgbouncer/templates/pgbouncer.ini.j2
+
+[databases]
+{{ pgbouncer_upstream_dbname }} = host={{ pgbouncer_upstream_host }} port={{ pgbouncer_upstream_port }} dbname={{ pgbouncer_upstream_dbname }} user={{ pgbouncer_upstream_user }}
+
+[pgbouncer]
+listen_addr = {{ pgbouncer_listen_addr }}
+listen_port = {{ pgbouncer_listen_port }}
+
+# Auth
+auth_type = {{ pgbouncer_auth_type }}
+{% if pgbouncer_auth_type != 'trust' %}
+auth_file = {{ pgbouncer_auth_file }}
+{% endif %}
+
+# Pooling — see role defaults for rationale.
+pool_mode = {{ pgbouncer_pool_mode }}
+max_client_conn = {{ pgbouncer_max_client_conn }}
+default_pool_size = {{ pgbouncer_default_pool_size }}
+min_pool_size = {{ pgbouncer_min_pool_size }}
+reserve_pool_size = {{ pgbouncer_reserve_pool_size }}
+reserve_pool_timeout = {{ pgbouncer_reserve_pool_timeout }}
+server_reset_query = {{ pgbouncer_server_reset_query }}
+
+# Admin / observability
+admin_users = {{ pgbouncer_admin_users | join(', ') }}
+stats_users = {{ pgbouncer_stats_users | join(', ') }}
+
+# Logs — systemd journal captures stderr; the role keeps the file
+# log too so post-mortems stay greppable without journalctl.
+logfile = /var/log/postgresql/pgbouncer.log
+pidfile = /var/run/postgresql/pgbouncer.pid
+log_connections = 0
+log_disconnections = 0
+log_pooler_errors = 1
+
+# Timeouts — defaults are conservative for a single-host lab. Prod
+# bumps client_login_timeout once SSL+SCRAM is wired (more round-trips).
+server_connect_timeout = 15
+server_idle_timeout = 600
+client_login_timeout = 60
+query_timeout = 0
+query_wait_timeout = 120
+client_idle_timeout = 0
+
+# DNS — pgbouncer caches DNS lookups, which matters once the
+# upstream host moves between containers (failover). Short TTL so
+# a `pg_autoctl perform failover` doesn't strand connections on
+# the demoted node for >2 minutes.
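+# (Sketch for failover drills: `SHOW DNS_HOSTS;` on the admin console
+# lists the cached lookups and their remaining TTLs, which confirms
+# the pool re-resolved pgaf-primary.lxd to the new primary.)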
+dns_max_ttl = 60
+dns_zone_check_period = 30
diff --git a/infra/ansible/roles/pgbouncer/templates/userlist.txt.j2 b/infra/ansible/roles/pgbouncer/templates/userlist.txt.j2
new file mode 100644
index 000000000..5fb51a3fd
--- /dev/null
+++ b/infra/ansible/roles/pgbouncer/templates/userlist.txt.j2
@@ -0,0 +1,12 @@
+# Managed by Ansible — do not edit by hand.
+# Source: infra/ansible/roles/pgbouncer/templates/userlist.txt.j2
+#
+# Format (PgBouncer userlist.txt): one entry per line,
+#   "username" "password-or-hash"
+# Hash format depends on auth_type:
+#   md5           → 'md5' + md5(password + username)
+#   scram-sha-256 → SCRAM verifier (rare to inline; prod uses
+#                   pg_autoctl pgbouncer-userlist or equivalent)
+{% for user in pgbouncer_users %}
+"{{ user.user }}" "{{ user.password }}"
+{% endfor %}
diff --git a/infra/ansible/tests/test_pgbouncer_load.sh b/infra/ansible/tests/test_pgbouncer_load.sh
new file mode 100755
index 000000000..e74290dfb
--- /dev/null
+++ b/infra/ansible/tests/test_pgbouncer_load.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# test_pgbouncer_load.sh — exercise PgBouncer with 500 concurrent
+# clients × 30s, fail unless every connection lands and stays
+# under the query_wait_timeout ceiling.
+#
+# v1.0.9 Day 7 acceptance for ROADMAP_V1.0_LAUNCH.md §Semaine 2:
+# "pgbench 500 clients × 30s with zero connection errors".
+#
+# Usage:
+#   bash infra/ansible/tests/test_pgbouncer_load.sh
+#
+# Env overrides:
+#   PGBOUNCER_HOST    default: pgaf-pgbouncer.lxd
+#   PGBOUNCER_PORT    default: 6432
+#   PGBOUNCER_DB      default: veza
+#   PGBOUNCER_USER    default: veza
+#   PGBENCH_CLIENTS   default: 500
+#   PGBENCH_DURATION  default: 30
+#   PGBENCH_THREADS   default: 8
+#
+# Exit codes:
+#   0 — pgbench completed clean (no connection errors, no aborts)
+#   1 — pgbench reported errors during the run
+#   2 — pgbouncer not reachable
+#   3 — required tool missing on host
+
+set -euo pipefail
+
+PGBOUNCER_HOST=${PGBOUNCER_HOST:-pgaf-pgbouncer.lxd}
+PGBOUNCER_PORT=${PGBOUNCER_PORT:-6432}
+PGBOUNCER_DB=${PGBOUNCER_DB:-veza}
+PGBOUNCER_USER=${PGBOUNCER_USER:-veza}
+PGBENCH_CLIENTS=${PGBENCH_CLIENTS:-500}
+PGBENCH_DURATION=${PGBENCH_DURATION:-30}
+PGBENCH_THREADS=${PGBENCH_THREADS:-8}
+
+log()     { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
+fail()    { log "FAIL: $*"; exit "${2:-1}"; }
+require() { command -v "$1" >/dev/null 2>&1 || fail "missing tool: $1" 3; }
+
+require pgbench
+require psql
+require awk
+
+# 0. Reachability — PgBouncer alive on listen_addr:listen_port.
+log "step 0: probing pgbouncer at ${PGBOUNCER_HOST}:${PGBOUNCER_PORT}"
+if ! psql "host=${PGBOUNCER_HOST} port=${PGBOUNCER_PORT} dbname=${PGBOUNCER_DB} user=${PGBOUNCER_USER} connect_timeout=5" -c 'select 1' >/dev/null 2>&1; then
+  fail "pgbouncer not reachable (or app db ${PGBOUNCER_DB} not provisioned). Check the pgbouncer service + the formation primary." 2
+fi
+
+# 1. pgbench fixture — initialise the standard pgbench tables ONCE
+#    before the load run. The init connects through pgbouncer too,
+#    which incidentally checks transaction-mode compatibility. The
+#    dbname is passed positionally: older pgbench releases treat -d
+#    as --debug, not --dbname.
+log "step 1: initialising pgbench fixture (scale=10)"
+if ! pgbench -h "${PGBOUNCER_HOST}" -p "${PGBOUNCER_PORT}" -U "${PGBOUNCER_USER}" -i -s 10 --no-vacuum "${PGBOUNCER_DB}" 2>&1 | tail -20 >&2; then
+  fail "pgbench -i failed — check pgbouncer auth / pool_mode" 1
+fi
+
+# 2. Load run.
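+#    -P 5 prints a progress line every 5s and -r reports per-statement
+#    latencies at the end; both help attribute stalls to pool queueing
+#    (query_wait_timeout) rather than to upstream Postgres.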
+log "step 2: pgbench ${PGBENCH_CLIENTS} clients × ${PGBENCH_DURATION}s × ${PGBENCH_THREADS} threads"
+# Guard the run: under `set -e` a bare out=$(pgbench ...) would abort
+# the script on a non-zero exit before the verdict logic runs.
+if ! out=$(pgbench \
+  -h "${PGBOUNCER_HOST}" \
+  -p "${PGBOUNCER_PORT}" \
+  -U "${PGBOUNCER_USER}" \
+  -c "${PGBENCH_CLIENTS}" \
+  -j "${PGBENCH_THREADS}" \
+  -T "${PGBENCH_DURATION}" \
+  --no-vacuum \
+  -P 5 \
+  -r \
+  "${PGBOUNCER_DB}" 2>&1); then
+  echo "$out" | sed 's/^/  /' >&2
+  fail "pgbench exited non-zero — see output above" 1
+fi
+
+echo "$out" | sed 's/^/  /' >&2
+
+# pgbench reports "number of failed transactions: N (X.XX%)" — anything
+# > 0 fails the test. Also catch outright "connection refused" errors
+# from the runner output.
+failed_tx=$(echo "$out" | awk '/number of failed transactions:/ { print $5; exit }' | tr -d ',()')
+failed_tx=${failed_tx:-0}
+conn_errors=$(echo "$out" | grep -ciE 'connection (refused|reset|timeout)' || true)
+
+log "verdict: failed_tx=${failed_tx} conn_errors=${conn_errors}"
+if [ "${failed_tx}" != "0" ] || [ "${conn_errors}" -gt 0 ]; then
+  fail "pgbench surfaced errors — pool sizing, query_wait_timeout, or upstream is the bottleneck"
+fi
+
+log "PASS: pgbench ${PGBENCH_CLIENTS} clients × ${PGBENCH_DURATION}s clean"
+exit 0
diff --git a/veza-backend-api/internal/config/config.go b/veza-backend-api/internal/config/config.go
index 50f840d2b..aa10ee7d5 100644
--- a/veza-backend-api/internal/config/config.go
+++ b/veza-backend-api/internal/config/config.go
@@ -346,6 +346,18 @@ func NewConfig() (*Config, error) {
 	if err != nil {
 		return nil, err
 	}
+	// v1.0.9 Day 7 — in staging/prod, DATABASE_URL points at the PgBouncer
+	// container fronting the pg_auto_failover formation, not at Postgres
+	// directly. Wiring:
+	//   postgresql://veza:PASSWORD@pgaf-pgbouncer.lxd:6432/veza?sslmode=prefer
+	// PgBouncer accepts up to 1000 client connections and funnels them,
+	// in transaction-mode pooling, into 50 server connections. Bypassing
+	// it (pointing at pgaf-primary directly) re-introduces the connection
+	// storm we just solved — the role's README documents the operational
+	// consequences. Dev/CI keeps direct Postgres because there's no pool
+	// advantage at that scale, and PgBouncer in transaction mode forbids
+	// LISTEN/NOTIFY + cross-tx prepared statements (none of which the
+	// backend uses, but tests sometimes lean on them).
 
 	// BE-SEC-014: Get RabbitMQ URL with environment-aware defaults
 	rabbitMQURL := getRabbitMQURL(env, appDomain)