veza/infra/ansible/playbooks/postgres_ha.yml
senke bf31a91ae6
feat(infra): pgbackrest role + dr-drill + Prometheus backup alerts (W2 Day 8)
ROADMAP_V1.0_LAUNCH.md §Semaine 2 day 8 deliverable:
  - Postgres backups land in MinIO via pgbackrest
  - dr-drill restores them weekly into an ephemeral Incus container
    and asserts the data round-trips
  - Prometheus alerts fire when the drill fails OR when the timer
    has stopped firing for >8 days

Cadence:
  full   — weekly  (Sun 02:00 UTC, systemd timer)
  diff   — daily   (Mon-Sat 02:00 UTC, systemd timer)
  WAL    — continuous (postgres archive_command, archive_timeout=60s)
  drill  — weekly  (Sun 04:00 UTC — runs 2h after the Sun full so
           the restore exercises fresh data)
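
A minimal sketch of how that cadence maps onto the role's timer
variables, as systemd OnCalendar expressions (only
pgbackrest_drill_schedule appears verbatim in this commit; the
other names are illustrative):

  pgbackrest_full_schedule: "Sun *-*-* 02:00:00"
  pgbackrest_diff_schedule: "Mon..Sat *-*-* 02:00:00"
  pgbackrest_drill_schedule: "Sun *-*-* 04:00:00"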

RPO ≈ 1 min (archive_timeout). RTO ≤ 30 min (drill measures actual
restore wall-clock).
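
A minimal sketch of the archiving knobs behind that RPO, as the
role is described to apply them via ALTER SYSTEM (variable names
are illustrative; the archive_command is the stock pgbackrest one,
with the stanza name defaulted in the playbook):

  pgbackrest_archive_timeout: 60   # seconds; caps how long a WAL segment waits before archiving
  pgbackrest_archive_command: "pgbackrest --stanza=veza archive-push %p"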

Files:
  infra/ansible/roles/pgbackrest/
    defaults/main.yml — repo1-* config (MinIO/S3, path-style,
      aes-256-cbc encryption, vault-backed creds), retention 4 full
      / 7 diff / 4 archive cycles, zstd@3 compression (repo shape
      sketched below). The role's first task asserts the placeholder
      secrets are gone — refuses to apply until the vault carries
      real keys.
    tasks/main.yml — install pgbackrest, render
      /etc/pgbackrest/pgbackrest.conf, set archive_command on the
      postgres instance via ALTER SYSTEM, detect role at runtime
      via `pg_autoctl show state --json`, stanza-create from primary
      only, render + enable systemd timers (full + diff + drill).
    templates/pgbackrest.conf.j2 — global + per-stanza sections;
      pg1-path defaults to the pg_auto_failover state dir so the
      role plugs straight into the Day 6 formation.
    templates/pgbackrest-{full,diff,drill}.{service,timer}.j2 —
      systemd units. Backup services run as `postgres`,
      drill service runs as `root` (needs `incus`).
      RandomizedDelaySec on every timer to absorb clock skew + node
      collision risk.
    README.md — RPO/RTO guarantees, vault setup, repo wiring,
      operational cheatsheet (info / check / manual backup),
      restore procedure documented separately as the dr-drill.
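
    A minimal sketch of the repo wiring those defaults describe
    (variable names and the endpoint are illustrative, not copied
    from defaults/main.yml; real creds stay in the vault):

      pgbackrest_repo1_type: s3
      pgbackrest_repo1_s3_uri_style: path               # MinIO needs path-style
      pgbackrest_repo1_s3_endpoint: minio.example.internal   # hypothetical
      pgbackrest_repo1_cipher_type: aes-256-cbc
      pgbackrest_retention_full: 4
      pgbackrest_retention_diff: 7
      pgbackrest_retention_archive: 4
      pgbackrest_compress_type: zst
      pgbackrest_compress_level: 3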

  scripts/dr-drill.sh
    Acceptance script for the day. Sequence:
      0. pre-flight: required tools, latest backup metadata visible
      1. launch ephemeral `pg-restore-drill` Incus container
      2. install postgres + pgbackrest inside, push the SAME
         pgbackrest.conf as the host (read-only against the bucket
         by pgbackrest semantics — the same s3 keys get reused so
         the drill exercises the production credential path)
      3. `pgbackrest restore` — full + WAL replay
      4. start postgres, wait for pg_isready
      5. smoke query: SELECT count(*) FROM users — must be ≥ MIN_USERS_EXPECTED
      6. write veza_backup_drill_* metrics to the textfile-collector
         (shape sketched after this sequence)
      7. teardown (or --keep for postmortem inspection)
    Exit codes 0/1/2 (pass / drill failure / env problem) so a
    Prometheus runner can plug in directly.
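
    Illustrative shape of the step-6 textfile output (metric names
    here are assumptions reused by the alert sketch below, not
    copied from the script):

      # 1 = last drill passed, 0 = failed
      veza_backup_drill_success 1
      # restore wall-clock, backs the RTO claim
      veza_backup_drill_duration_seconds 412
      # unix time of the last run, feeds the staleness alert
      veza_backup_drill_last_run_timestamp 1.7e+09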

  config/prometheus/alert_rules.yml — new `veza_backup` group:
    - BackupRestoreDrillFailed (critical, 5m): the last drill
      reported success=0. Pages because a backup we haven't proved
      restorable is technical debt waiting for a disaster.
    - BackupRestoreDrillStale (warning, pending 1h once the last
      run is >8 days old): the drill timer has stopped firing.
      Catches a broken cron / unit / runner before the failure-mode
      alert above ever sees data.
    Both annotations include a runbook_url stub
    (veza.fr/runbooks/...) — those land alongside W2 day 10's
    SLO runbook batch.
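
    A rough sketch of the group (slots into the existing groups:
    list; expressions and metric names follow the drill-metric
    assumptions above, the committed rules are authoritative):

      - name: veza_backup
        rules:
          - alert: BackupRestoreDrillFailed
            expr: veza_backup_drill_success == 0
            for: 5m
            labels: { severity: critical }
            annotations:
              summary: "Last pgBackRest restore drill failed"
              runbook_url: https://veza.fr/runbooks/...
          - alert: BackupRestoreDrillStale
            expr: (time() - veza_backup_drill_last_run_timestamp) > 8 * 86400
            for: 1h
            labels: { severity: warning }
            annotations:
              summary: "Restore drill has not run in over 8 days"
              runbook_url: https://veza.fr/runbooks/...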

  infra/ansible/playbooks/postgres_ha.yml
    Two new plays:
      6. apply pgbackrest role to postgres_ha_nodes (install +
         config + full/diff timers on every data node;
         pgbackrest's repo lock arbitrates collision)
      7. install dr-drill on the incus_hosts group (push
         /usr/local/bin/dr-drill.sh + render drill timer + ensure
         /var/lib/node_exporter/textfile_collector exists)

Acceptance verified locally:
  $ ansible-playbook -i inventory/lab.yml playbooks/postgres_ha.yml \
      --syntax-check
  playbook: playbooks/postgres_ha.yml          ← clean
  $ python3 -c "import yaml; yaml.safe_load(open('config/prometheus/alert_rules.yml'))"
  YAML OK
  $ bash -n scripts/dr-drill.sh
  syntax OK

Real apply + drill needs the lab R720 + a populated MinIO bucket
+ the secrets in vault — operator's call.

Out of scope (deferred per ROADMAP §2):
  - Off-site backup replica (B2 / Bunny.net) — v1.1+
  - Logical export pipeline for RGPD per-user dumps — separate
    feature track, not a backup-system concern
  - PITR admin UI — CLI-only via `--type=time` for v1.0
  - pgbackrest_exporter Prometheus integration — W2 day 9
    alongside the OTel collector

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 00:51:00 +02:00

# Postgres HA playbook — provisions 3 Incus containers on the
# `incus_hosts` group (lab/staging/prod) and lays down the
# pg_auto_failover formation across them.
#
# Topology:
# - pgaf-monitor — the state machine (single instance)
# - pgaf-primary — first data node, becomes primary at first boot
# - pgaf-replica — second data node, becomes hot-standby
#
# v1.0.9 Day 6 — single host (R720 lab) for now. W2 day 7+ moves
# the data nodes onto separate physical hosts when Hetzner standby
# is provisioned. The formation works the same either way.
#
# Run with:
# ansible-playbook -i inventory/lab.yml playbooks/postgres_ha.yml --check
# ansible-playbook -i inventory/lab.yml playbooks/postgres_ha.yml
---
- name: Provision Incus containers for the Postgres formation + pgbouncer
  hosts: incus_hosts
  become: true
  gather_facts: true
  tasks:
    - name: Launch pgaf-monitor + pgaf-primary + pgaf-replica + pgaf-pgbouncer
      ansible.builtin.shell:
        cmd: |
          set -e
          for ct in pgaf-monitor pgaf-primary pgaf-replica pgaf-pgbouncer; do
            if ! incus info "$ct" >/dev/null 2>&1; then
              # Echo a marker so changed_when below can detect a real launch.
              echo "incus launch $ct"
              incus launch images:ubuntu/22.04 "$ct"
              # Wait for cloud-init / network to settle.
              for _ in $(seq 1 30); do
                if incus exec "$ct" -- cloud-init status 2>/dev/null | grep -q "status: done"; then
                  break
                fi
                sleep 1
              done
              # Install python3 inside the container so Ansible can
              # speak to it via the incus connection plugin.
              incus exec "$ct" -- apt-get update
              incus exec "$ct" -- apt-get install -y python3 python3-apt
            fi
          done
      args:
        executable: /bin/bash
      register: provision_result
      changed_when: "'incus launch' in provision_result.stdout"
      tags: [postgres_ha, pgbouncer, provision]

    - name: Refresh inventory so the new containers are reachable via the incus connection
      ansible.builtin.meta: refresh_inventory
- name: Apply common baseline to the formation containers
  hosts: postgres_ha
  become: true
  gather_facts: true
  roles:
    - common

- name: Bring up the pg_auto_failover monitor first (formation depends on it)
  hosts: postgres_ha_monitor
  become: true
  gather_facts: true
  roles:
    - postgres_ha

- name: Bring up the data nodes (primary registers first, replica registers second)
  hosts: postgres_ha_nodes
  become: true
  gather_facts: true
  serial: 1  # primary must register before replica — pg_auto_failover assigns roles by registration order
  roles:
    - postgres_ha

# v1.0.9 Day 7: PgBouncer fronts the formation. Common baseline first
# (SSH + node_exporter + fail2ban), then the pgbouncer role itself.
- name: Apply common baseline to the pgbouncer container
  hosts: pgbouncer
  become: true
  gather_facts: true
  roles:
    - common

- name: Install + configure PgBouncer pointing at the formation
  hosts: pgbouncer
  become: true
  gather_facts: true
  roles:
    - pgbouncer
# v1.0.9 Day 8: pgBackRest on the data nodes (archive_command + full
# / diff timers + stanza-create from whoever is primary).
- name: Install + configure pgBackRest on the data nodes
  hosts: postgres_ha_nodes
  become: true
  gather_facts: true
  roles:
    - pgbackrest
# Drill installer — runs on the Incus host so it can `incus launch`
# the ephemeral restore container. Pushes dr-drill.sh to
# /usr/local/bin, ensures the textfile-collector dir exists for
# node_exporter, and wires the weekly drill timer.
- name: Install dr-drill on the Incus host
  hosts: incus_hosts
  become: true
  gather_facts: true
  tasks:
    - name: Push dr-drill.sh to /usr/local/bin
      ansible.builtin.copy:
        src: ../../../scripts/dr-drill.sh
        dest: /usr/local/bin/dr-drill.sh
        owner: root
        group: root
        mode: "0755"
      tags: [pgbackrest, drill]

    - name: Ensure node_exporter textfile collector dir
      ansible.builtin.file:
        path: /var/lib/node_exporter/textfile_collector
        state: directory
        owner: node_exporter
        group: node_exporter
        mode: "0755"
      tags: [pgbackrest, drill]

    - name: Render dr-drill systemd service + timer
      ansible.builtin.template:
        src: ../roles/pgbackrest/templates/{{ item.src }}
        dest: "{{ item.dest }}"
        owner: root
        group: root
        mode: "0644"
      loop:
        - { src: pgbackrest-drill.service.j2, dest: /etc/systemd/system/pgbackrest-drill.service }
        - { src: pgbackrest-drill.timer.j2, dest: /etc/systemd/system/pgbackrest-drill.timer }
      tags: [pgbackrest, drill]
      vars:
        pgbackrest_stanza: "{{ hostvars[groups['postgres_ha_nodes'][0]]['pgbackrest_stanza'] | default('veza') }}"
        pgbackrest_drill_schedule: "{{ hostvars[groups['postgres_ha_nodes'][0]]['pgbackrest_drill_schedule'] | default('Sun *-*-* 04:00:00') }}"

    - name: Enable + start drill timer
      ansible.builtin.systemd:
        name: pgbackrest-drill.timer
        state: started
        enabled: true
        daemon_reload: true
      tags: [pgbackrest, drill]