Compare commits

No commits in common. "172729bdffc716b386a5fb603f814dc4b59f1b44" and "70df3018235ad6302ec0fea34e5f7d41a6bda748" have entirely different histories.

25 changed files with 66 additions and 2744 deletions

@@ -1,79 +0,0 @@
# cleanup-failed.yml — workflow_dispatch only.
#
# Tears down the kept-alive failed-deploy color (the inactive one
# that survived a Phase D / Phase F failure for forensics).
# Operator triggers this once they have read the journalctl output.
#
# Hard safety in playbooks/cleanup_failed.yml: refuses to destroy
# the currently-active color.
name: Veza cleanup failed-deploy color
on:
workflow_dispatch:
inputs:
env:
description: "Environment to clean up"
required: true
type: choice
options: [staging, prod]
color:
description: "Color to destroy (must NOT be the active one)"
required: true
type: choice
options: [blue, green]
concurrency:
group: cleanup-${{ inputs.env }}
cancel-in-progress: false
jobs:
cleanup:
name: Destroy ${{ inputs.color }} app containers in ${{ inputs.env }}
runs-on: [self-hosted, incus]
timeout-minutes: 10
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
- name: Install ansible
run: |
sudo apt-get update -qq
sudo apt-get install -y ansible
ansible-galaxy collection install community.general
- name: Write vault password
env:
VAULT_PW: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
run: |
printf '%s' "$VAULT_PW" > "$RUNNER_TEMP/vault-pass"
chmod 0400 "$RUNNER_TEMP/vault-pass"
echo "VAULT_PASS_FILE=$RUNNER_TEMP/vault-pass" >> "$GITHUB_ENV"
- name: Run cleanup_failed.yml
working-directory: infra/ansible
env:
ANSIBLE_LOG_PATH: ${{ runner.temp }}/ansible-cleanup-${{ inputs.env }}-${{ inputs.color }}.log
ANSIBLE_HOST_KEY_CHECKING: "False"
run: |
ansible-playbook \
-i inventory/${{ inputs.env }}.yml \
playbooks/cleanup_failed.yml \
--vault-password-file "$VAULT_PASS_FILE" \
-e veza_env=${{ inputs.env }} \
-e target_color=${{ inputs.color }}
- name: Upload Ansible log
if: always()
uses: actions/upload-artifact@v4
with:
name: ansible-cleanup-${{ inputs.env }}-${{ inputs.color }}
path: ${{ runner.temp }}/ansible-cleanup-*.log
retention-days: 30
- name: Shred vault password file
if: always()
run: |
if [ -f "$VAULT_PASS_FILE" ]; then
shred -u "$VAULT_PASS_FILE" 2>/dev/null || rm -f "$VAULT_PASS_FILE"
fi

@@ -1,358 +0,0 @@
# Veza deploy pipeline.
#
# Triggers (intentionally narrow — see SECURITY note below):
# push:main → env=staging, sha=$GITHUB_SHA
# push:tags ['v*'] → env=prod, sha=$GITHUB_SHA (tag's pointee)
# workflow_dispatch → operator-supplied env + sha
#
# SECURITY: this workflow runs on a self-hosted runner with access to
# the Incus unix socket (effectively root on the host). DO NOT add
# `pull_request` or any fork-influenced trigger here — an attacker-
# controlled fork would be able to `incus exec` arbitrarily. The
# narrow trigger list above is the security boundary.
#
# Sequence: build (3 jobs in parallel) → upload artifacts → deploy.
name: Veza deploy
on:
push:
branches: [main]
tags: ['v*']
workflow_dispatch:
inputs:
env:
description: "Environment to deploy"
required: true
default: staging
type: choice
options: [staging, prod]
release_sha:
description: "Full git SHA to deploy (defaults to current HEAD if empty)"
required: false
type: string
concurrency:
# Only one deploy per env at a time. Newer pushes cancel older
# in-flight builds for the same env (the user almost always wants
# the newer commit). workflow_dispatch runs key on the chosen env.
group: deploy-${{ github.event_name == 'workflow_dispatch' && inputs.env || (github.ref_type == 'tag' && 'prod' || 'staging') }}
cancel-in-progress: true
env:
# Where build artefacts land. Set in Forgejo repo Variables:
# FORGEJO_REGISTRY_URL = https://forgejo.veza.fr/api/packages/talas/generic
REGISTRY_URL: ${{ vars.FORGEJO_REGISTRY_URL }}
jobs:
# =================================================================
# Resolve env + sha from the trigger.
# =================================================================
resolve:
name: Resolve env + SHA
runs-on: ubuntu-latest
outputs:
env: ${{ steps.r.outputs.env }}
sha: ${{ steps.r.outputs.sha }}
steps:
- name: Resolve
id: r
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
ENV="${{ inputs.env }}"
SHA="${{ inputs.release_sha || github.sha }}"
elif [ "${{ github.ref_type }}" = "tag" ]; then
ENV="prod"
SHA="${{ github.sha }}"
else
ENV="staging"
SHA="${{ github.sha }}"
fi
if ! echo "$SHA" | grep -Eq '^[0-9a-f]{40}$'; then
echo "SHA '$SHA' is not a 40-char git SHA"
exit 1
fi
echo "env=$ENV" >> "$GITHUB_OUTPUT"
echo "sha=$SHA" >> "$GITHUB_OUTPUT"
echo "Resolved env=$ENV sha=$SHA"
# =================================================================
# Build backend (Go).
# =================================================================
build-backend:
name: Build backend
needs: resolve
runs-on: ubuntu-latest
timeout-minutes: 20
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
ref: ${{ needs.resolve.outputs.sha }}
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: "1.25"
cache: true
cache-dependency-path: veza-backend-api/go.sum
- name: Test
working-directory: veza-backend-api
env:
VEZA_SKIP_INTEGRATION: "1"
run: go test ./... -short -count=1 -timeout 300s
- name: Build veza-api (CGO=0, static)
working-directory: veza-backend-api
env:
CGO_ENABLED: "0"
GOOS: linux
GOARCH: amd64
run: |
go build -trimpath -ldflags "-s -w" \
-o ./bin/veza-api ./cmd/api/main.go
go build -trimpath -ldflags "-s -w" \
-o ./bin/migrate_tool ./cmd/migrate_tool/main.go
- name: Stage tarball contents
working-directory: veza-backend-api
run: |
STAGE="$RUNNER_TEMP/veza-backend"
mkdir -p "$STAGE/migrations"
cp ./bin/veza-api ./bin/migrate_tool "$STAGE/"
cp -r ./migrations/* "$STAGE/migrations/" || true
echo "${{ needs.resolve.outputs.sha }}" > "$STAGE/VERSION"
- name: Pack tarball
run: |
cd "$RUNNER_TEMP"
tar --use-compress-program=zstd -cf \
"veza-backend-${{ needs.resolve.outputs.sha }}.tar.zst" \
-C "$RUNNER_TEMP/veza-backend" .
- name: Push to Forgejo Package Registry
env:
TOKEN: ${{ secrets.FORGEJO_REGISTRY_TOKEN }}
run: |
set -e
TARBALL="veza-backend-${{ needs.resolve.outputs.sha }}.tar.zst"
URL="${REGISTRY_URL}/veza-backend/${{ needs.resolve.outputs.sha }}/${TARBALL}"
echo "PUT → $URL"
curl -sSL --fail-with-body -X PUT \
-H "Authorization: token ${TOKEN}" \
--upload-file "$RUNNER_TEMP/${TARBALL}" \
"${URL}"
# =================================================================
# Build stream (Rust).
# =================================================================
build-stream:
name: Build stream
needs: resolve
runs-on: ubuntu-latest
timeout-minutes: 30
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
ref: ${{ needs.resolve.outputs.sha }}
- name: Set up Rust toolchain
run: |
command -v rustup >/dev/null || \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
source "$HOME/.cargo/env"
rustup target add x86_64-unknown-linux-musl
echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
sudo apt-get update -qq && sudo apt-get install -y musl-tools
- name: Cache cargo + target
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
~/.cargo/git
veza-stream-server/target
key: deploy-${{ runner.os }}-cargo-${{ hashFiles('veza-stream-server/Cargo.lock') }}
restore-keys: |
deploy-${{ runner.os }}-cargo-
- name: Test
working-directory: veza-stream-server
run: cargo test --workspace
- name: Build stream_server (musl static)
working-directory: veza-stream-server
run: |
cargo build --release --locked \
--target x86_64-unknown-linux-musl
- name: Stage tarball contents
working-directory: veza-stream-server
run: |
STAGE="$RUNNER_TEMP/veza-stream"
mkdir -p "$STAGE"
cp ./target/x86_64-unknown-linux-musl/release/stream_server "$STAGE/"
echo "${{ needs.resolve.outputs.sha }}" > "$STAGE/VERSION"
- name: Pack tarball
run: |
cd "$RUNNER_TEMP"
tar --use-compress-program=zstd -cf \
"veza-stream-${{ needs.resolve.outputs.sha }}.tar.zst" \
-C "$RUNNER_TEMP/veza-stream" .
- name: Push to Forgejo Package Registry
env:
TOKEN: ${{ secrets.FORGEJO_REGISTRY_TOKEN }}
run: |
set -e
TARBALL="veza-stream-${{ needs.resolve.outputs.sha }}.tar.zst"
URL="${REGISTRY_URL}/veza-stream/${{ needs.resolve.outputs.sha }}/${TARBALL}"
echo "PUT → $URL"
curl -sSL --fail-with-body -X PUT \
-H "Authorization: token ${TOKEN}" \
--upload-file "$RUNNER_TEMP/${TARBALL}" \
"${URL}"
# =================================================================
# Build web (React/Vite).
# =================================================================
build-web:
name: Build web
needs: resolve
runs-on: ubuntu-latest
timeout-minutes: 20
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
ref: ${{ needs.resolve.outputs.sha }}
- name: Use Node.js
uses: actions/setup-node@v4
with:
node-version: "20"
cache: "npm"
cache-dependency-path: package-lock.json
- name: Install dependencies
run: npm ci
- name: Build design tokens
run: npm run build:tokens --workspace=@veza/design-system
- name: Build SPA
working-directory: apps/web
env:
VITE_API_URL: /api/v1
VITE_DOMAIN: ${{ needs.resolve.outputs.env == 'prod' && 'veza.fr' || 'staging.veza.fr' }}
VITE_RELEASE_SHA: ${{ needs.resolve.outputs.sha }}
run: npm run build
- name: Stage tarball contents
run: |
STAGE="$RUNNER_TEMP/veza-web"
mkdir -p "$STAGE"
cp -r apps/web/dist/* "$STAGE/"
echo "${{ needs.resolve.outputs.sha }}" > "$STAGE/VERSION"
- name: Pack tarball
run: |
cd "$RUNNER_TEMP"
tar --use-compress-program=zstd -cf \
"veza-web-${{ needs.resolve.outputs.sha }}.tar.zst" \
-C "$RUNNER_TEMP/veza-web" .
- name: Push to Forgejo Package Registry
env:
TOKEN: ${{ secrets.FORGEJO_REGISTRY_TOKEN }}
run: |
set -e
TARBALL="veza-web-${{ needs.resolve.outputs.sha }}.tar.zst"
URL="${REGISTRY_URL}/veza-web/${{ needs.resolve.outputs.sha }}/${TARBALL}"
echo "PUT → $URL"
curl -sSL --fail-with-body -X PUT \
-H "Authorization: token ${TOKEN}" \
--upload-file "$RUNNER_TEMP/${TARBALL}" \
"${URL}"
# =================================================================
# Deploy via Ansible. Runs on the self-hosted runner that has
# Incus socket access (label `incus`). Requires Forgejo secrets:
# ANSIBLE_VAULT_PASSWORD — unlocks group_vars/all/vault.yml
# FORGEJO_REGISTRY_TOKEN — same token the build jobs use,
# passed to ansible-playbook so
# the data containers can fetch
# the tarballs they were just sent.
# =================================================================
deploy:
name: Deploy via Ansible
needs: [resolve, build-backend, build-stream, build-web]
runs-on: [self-hosted, incus]
timeout-minutes: 30
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
ref: ${{ needs.resolve.outputs.sha }}
- name: Install ansible + community.general + community.postgresql + community.rabbitmq
run: |
sudo apt-get update -qq
sudo apt-get install -y ansible python3-psycopg2 python3-pip
ansible-galaxy collection install \
community.general \
community.postgresql \
community.rabbitmq
- name: Write vault password to a tmpfile
env:
VAULT_PW: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
run: |
printf '%s' "$VAULT_PW" > "$RUNNER_TEMP/vault-pass"
chmod 0400 "$RUNNER_TEMP/vault-pass"
echo "VAULT_PASS_FILE=$RUNNER_TEMP/vault-pass" >> "$GITHUB_ENV"
- name: Run deploy_data.yml (idempotent provisioning + ZFS snapshot)
working-directory: infra/ansible
env:
ANSIBLE_LOG_PATH: ${{ runner.temp }}/ansible-data-${{ needs.resolve.outputs.env }}-${{ needs.resolve.outputs.sha }}.log
ANSIBLE_HOST_KEY_CHECKING: "False"
run: |
ansible-playbook \
-i inventory/${{ needs.resolve.outputs.env }}.yml \
playbooks/deploy_data.yml \
--vault-password-file "$VAULT_PASS_FILE" \
-e veza_env=${{ needs.resolve.outputs.env }} \
-e veza_release_sha=${{ needs.resolve.outputs.sha }} \
-e vault_forgejo_registry_token=${{ secrets.FORGEJO_REGISTRY_TOKEN }}
- name: Run deploy_app.yml (blue/green)
working-directory: infra/ansible
env:
ANSIBLE_LOG_PATH: ${{ runner.temp }}/ansible-app-${{ needs.resolve.outputs.env }}-${{ needs.resolve.outputs.sha }}.log
ANSIBLE_HOST_KEY_CHECKING: "False"
run: |
ansible-playbook \
-i inventory/${{ needs.resolve.outputs.env }}.yml \
playbooks/deploy_app.yml \
--vault-password-file "$VAULT_PASS_FILE" \
-e veza_env=${{ needs.resolve.outputs.env }} \
-e veza_release_sha=${{ needs.resolve.outputs.sha }} \
-e vault_forgejo_registry_token=${{ secrets.FORGEJO_REGISTRY_TOKEN }}
- name: Upload Ansible logs (for forensics)
if: always()
uses: actions/upload-artifact@v4
with:
name: ansible-logs-${{ needs.resolve.outputs.env }}-${{ needs.resolve.outputs.sha }}
path: ${{ runner.temp }}/ansible-*.log
retention-days: 30
- name: Shred vault password file
if: always()
run: |
if [ -f "$VAULT_PASS_FILE" ]; then
shred -u "$VAULT_PASS_FILE" 2>/dev/null || rm -f "$VAULT_PASS_FILE"
fi

@@ -1,118 +0,0 @@
# rollback.yml — workflow_dispatch only.
#
# Two modes:
# fast — flip HAProxy back to the previous color. ~5s. Requires
# the target color's containers to still be alive
# (i.e., no later deploy has recycled them).
# full — re-run deploy_app.yml with a specific (older) release_sha.
# ~5-10min. The artefact must still be in the Forgejo
# registry (default retention 30 SHA per component).
#
# See docs/RUNBOOK_ROLLBACK.md for decision criteria.
name: Veza rollback
on:
workflow_dispatch:
inputs:
env:
description: "Environment to rollback"
required: true
type: choice
options: [staging, prod]
mode:
description: "Rollback mode"
required: true
type: choice
options: [fast, full]
target_color:
description: "(mode=fast only) color to flip back TO (the prior active one)"
required: false
type: choice
options: [blue, green]
release_sha:
description: "(mode=full only) 40-char SHA of the release to redeploy"
required: false
type: string
concurrency:
group: rollback-${{ inputs.env }}
cancel-in-progress: false
jobs:
rollback:
name: Rollback ${{ inputs.env }} (${{ inputs.mode }})
runs-on: [self-hosted, incus]
timeout-minutes: 30
steps:
- name: Validate inputs
run: |
if [ "${{ inputs.mode }}" = "fast" ] && [ -z "${{ inputs.target_color }}" ]; then
echo "mode=fast requires target_color"
exit 1
fi
if [ "${{ inputs.mode }}" = "full" ]; then
if [ -z "${{ inputs.release_sha }}" ]; then
echo "mode=full requires release_sha"
exit 1
fi
if ! echo "${{ inputs.release_sha }}" | grep -Eq '^[0-9a-f]{40}$'; then
echo "release_sha is not a 40-char git SHA"
exit 1
fi
fi
- uses: actions/checkout@v4
with:
fetch-depth: 1
ref: ${{ inputs.mode == 'full' && inputs.release_sha || github.ref }}
- name: Install ansible + collections
run: |
sudo apt-get update -qq
sudo apt-get install -y ansible python3-psycopg2
ansible-galaxy collection install \
community.general \
community.postgresql \
community.rabbitmq
- name: Write vault password
env:
VAULT_PW: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
run: |
printf '%s' "$VAULT_PW" > "$RUNNER_TEMP/vault-pass"
chmod 0400 "$RUNNER_TEMP/vault-pass"
echo "VAULT_PASS_FILE=$RUNNER_TEMP/vault-pass" >> "$GITHUB_ENV"
- name: Run rollback.yml
working-directory: infra/ansible
env:
ANSIBLE_LOG_PATH: ${{ runner.temp }}/ansible-rollback-${{ inputs.env }}-${{ inputs.mode }}.log
ANSIBLE_HOST_KEY_CHECKING: "False"
run: |
EXTRA="-e veza_env=${{ inputs.env }} -e mode=${{ inputs.mode }}"
if [ "${{ inputs.mode }}" = "fast" ]; then
EXTRA="$EXTRA -e target_color=${{ inputs.target_color }}"
else
EXTRA="$EXTRA -e veza_release_sha=${{ inputs.release_sha }}"
EXTRA="$EXTRA -e vault_forgejo_registry_token=${{ secrets.FORGEJO_REGISTRY_TOKEN }}"
fi
ansible-playbook \
-i inventory/${{ inputs.env }}.yml \
playbooks/rollback.yml \
--vault-password-file "$VAULT_PASS_FILE" \
$EXTRA
- name: Upload Ansible log
if: always()
uses: actions/upload-artifact@v4
with:
name: ansible-rollback-${{ inputs.env }}-${{ inputs.mode }}
path: ${{ runner.temp }}/ansible-rollback-*.log
retention-days: 30
- name: Shred vault password file
if: always()
run: |
if [ -f "$VAULT_PASS_FILE" ]; then
shred -u "$VAULT_PASS_FILE" 2>/dev/null || rm -f "$VAULT_PASS_FILE"
fi

.gitignore

@@ -265,14 +0,3 @@ frontend_screenshots/
# Audit_remediation glob (supersedes J2's exact-match json)
apps/web/audit_remediation*
# ============================================================
# Ansible Vault — secrets at rest stay encrypted in vault.yml
# (committed). The vault password used to unlock them MUST NOT
# be committed; the Forgejo runner reads it from a repo secret.
# ============================================================
infra/ansible/.vault-pass
infra/ansible/.vault-pass.*
# Local copies devs sometimes drop next to the repo for editing
.vault-pass
.vault-pass.*

@@ -1,111 +0,0 @@
# Canary release — backend-api
> **Audience**: on-call engineer running a release.
> **Trigger**: a new backend-api binary signed off for prod.
> **Owner**: whoever's on the deploy rota that day.
The canary recipe ships the new binary to **one** backend at a time, watches the SLI for a window, and only continues to the next backend when the SLI stays green. If the SLI breaches at any point, the canary node rolls back automatically to the last-known-good binary.
## Trigger conditions
Run the canary script when one of these is true:
- A normal feature release. New code path, no schema migration that requires lockstep coordination.
- A hot-fix on a Sev-2 or below issue. Sev-1 (security or data-integrity) follows the all-stop rotate path documented in `docs/runbooks/INCIDENT_RESPONSE.md` instead.
## Pre-flight checklist
- [ ] **Migration backward-compat**: the latest schema migration is additive only — no `DROP COLUMN`, no `ALTER COLUMN ... TYPE`, no `ADD COLUMN ... NOT NULL` without `DEFAULT`. The script's pre-deploy hook (`scripts/check-migration-backward-compat.sh`) refuses to proceed when it finds one; bypass with `FORCE_MIGRATE=1` only after you've worked out how the migration would split into backward-compatible steps.
- [ ] **Last-known-good binary** is preserved. Either: (a) the previous release's `veza-api` is still on the host at `/opt/veza/backend-api/veza-api.previous`, OR (b) you have it locally and pass `ROLLBACK_BINARY=/path/to/old/veza-api` as env to the script.
- [ ] **Prometheus reachable** from the deploy host. The SLI monitor queries `${PROM_URL}` (default `http://prom.lxd:9090`) every `${SLI_PROBE_INTERVAL}` seconds for 1 hour.
- [ ] **HAProxy admin socket reachable**: the script execs into the haproxy Incus container to drive `set server ${POOL}/${NODE} state drain|ready` via socat (a minimal drain/ready sketch follows this checklist).
- [ ] **No game day in the same window.** Canary needs a quiet baseline; chaos drills will push the SLI red and trigger a false rollback.
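A minimal sketch of that drain/ready flip, hedged on an assumed socket path (`/run/haproxy/admin.sock`), an assumed container name (`haproxy` — substitute the real prefixed name), and the default pool/node names from the knobs table below; the script header stays the source of truth:
```bash
POOL=api_pool                   # POOL_BACKEND default
NODE=backend-api-2              # CANARY_NODE default
SOCK=/run/haproxy/admin.sock    # assumed admin socket path

# Send one admin-socket command to HAProxy from the deploy host.
hap() { incus exec haproxy -- sh -c "echo '$1' | socat stdio $SOCK"; }

hap "set server $POOL/$NODE state drain"

# 'show stat' CSV: column 1 = backend, 2 = server, 5 = current sessions (scur).
while :; do
  scur=$(hap 'show stat' | awk -F, -v p="$POOL" -v n="$NODE" '$1==p && $2==n {print $5}')
  [ "${scur:-0}" = "0" ] && break
  sleep 2
done

# ...push the binary and run the per-node health check, then put the node back:
hap "set server $POOL/$NODE state ready"
```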
## How
### One-shot via Make
```bash
make deploy-canary ARTIFACT=/tmp/veza-api-v1.0.10
```
The Make target wraps the script with reasonable defaults. Override any env (see the script header) by exporting before the `make` call.
### Direct script invocation
```bash
ARTIFACT=/tmp/veza-api-v1.0.10 \
ROLLBACK_BINARY=/opt/veza/backend-api/veza-api.previous \
SLI_WINDOW=3600 \
PROM_URL=http://prom.lxd:9090 \
bash scripts/deploy-canary.sh
```
The script is idempotent on the steps that matter: draining an already-drained server is a no-op; pushing the same binary twice is a no-op (file mtime invariant). Re-runs after a partial failure are safe.
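A sketch of what the push-side idempotence can look like — here with a checksum comparison rather than the mtime check the script uses, with paths and names taken from the defaults quoted in this runbook:
```bash
ARTIFACT=${ARTIFACT:?path to the new veza-api binary}
CT=${CANARY_NODE:-backend-api-2}
DEST=/opt/veza/backend-api/veza-api

new_sum=$(sha256sum "$ARTIFACT" | cut -d' ' -f1)
cur_sum=$(incus exec "$CT" -- sha256sum "$DEST" 2>/dev/null | cut -d' ' -f1 || true)

if [ "$new_sum" = "$cur_sum" ]; then
  echo "binary already deployed on $CT — skipping push"
else
  incus file push "$ARTIFACT" "$CT$DEST" --mode 0755
  incus exec "$CT" -- systemctl restart veza-backend-api
fi
```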
## What happens, in order
1. **Pre-deploy hook** runs `scripts/check-migration-backward-compat.sh` on the new-since-`origin/main` migration files. Forbidden patterns abort the deploy.
2. **Drain `CANARY_NODE`** (default `backend-api-2`) via the HAProxy admin socket. Wait until the node has 0 active connections.
3. **Push the binary** to `/opt/veza/backend-api/veza-api` on the canary container. `systemctl restart veza-backend-api`.
4. **Per-node health check**: `curl http://127.0.0.1:8080/api/v1/health` from inside the container. If the node doesn't return 200 within 60 s, rollback.
5. **Re-enable** the canary node in HAProxy.
6. **LB-side health check**: `curl http://haproxy.lxd${HEALTH_PATH}` returns 200 (proves HAProxy sees the node ready and routes through it).
7. **SLI monitor** for `SLI_WINDOW` seconds (default 3600 = 1h). Probes Prometheus every `SLI_PROBE_INTERVAL` (default 30 s) for:
- p95 of `veza_gin_http_request_duration_seconds_bucket` < `PROM_P95_THRESHOLD_S` (0.5 s)
- error rate (5xx ÷ total) < `PROM_ERR_RATE_THRESHOLD` (0.005 = 0.5%)
First red probe → rollback (a probe-loop sketch follows this list).
8. **Roll the peers**: for each `PEER_NODES` entry (default `backend-api-1`), repeat steps 2–6 (drain → deploy → health → re-enable → LB check). The peer roll skips the SLI monitor because the canary already proved the SLI; if a peer-specific failure happens (binary corrupt on push, container disk full), the script bails out.
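A sketch of the step-7 probe loop under the thresholds above. The latency histogram name comes from this runbook; the request-count series used for the error rate is an assumption, and `jq` is expected on the deploy host:
```bash
PROM_URL=${PROM_URL:-http://prom.lxd:9090}
SLI_WINDOW=${SLI_WINDOW:-3600}
SLI_PROBE_INTERVAL=${SLI_PROBE_INTERVAL:-30}
PROM_P95_THRESHOLD_S=${PROM_P95_THRESHOLD_S:-0.5}
PROM_ERR_RATE_THRESHOLD=${PROM_ERR_RATE_THRESHOLD:-0.005}

# Instant query against Prometheus; prints the scalar value or nothing.
prom() {
  curl -fsS "$PROM_URL/api/v1/query" --data-urlencode "query=$1" \
    | jq -r '.data.result[0].value[1] // empty'
}
gt() { awk -v a="${1:-0}" -v b="$2" 'BEGIN { exit !(a > b) }'; }   # numeric a > b

deadline=$(( $(date +%s) + SLI_WINDOW ))
while [ "$(date +%s)" -lt "$deadline" ]; do
  p95=$(prom "histogram_quantile(0.95, sum by (le) (rate(veza_gin_http_request_duration_seconds_bucket[5m])))")
  # NOTE: the series below is assumed; only the histogram is named in this runbook.
  err=$(prom "sum(rate(veza_gin_http_requests_total{status=~\"5..\"}[5m])) / sum(rate(veza_gin_http_requests_total[5m]))")
  if gt "$p95" "$PROM_P95_THRESHOLD_S" || gt "$err" "$PROM_ERR_RATE_THRESHOLD"; then
    echo "SLI breach (p95=${p95:-n/a}s err=${err:-n/a}) — rolling back" >&2
    exit 1   # the real script triggers the auto-rollback here
  fi
  sleep "$SLI_PROBE_INTERVAL"
done
echo "SLI stayed green for ${SLI_WINDOW}s"
```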
## Rollback path
The script handles the canary rollback automatically when:
- The pre-deploy hook fails. Nothing is changed; nothing to revert.
- The canary's health check fails after the deploy. Old binary restored from `ROLLBACK_BINARY`, canary re-enabled.
- The SLI breaches during the monitor window. Same as above.
The script does **NOT** roll back peers automatically — by the time peers are rolling, the canary has already accumulated a green-SLI window. A peer health failure is an artifact of the deploy step (corrupt push, container memory issue), not of the new binary itself, and re-running after fixing the local issue is safer than ping-ponging the binary.
## Manual rollback (full)
When the script doesn't catch the regression — say a slow leak that surfaces after the SLI window closes — the on-call manually drives:
```bash
# Find which backend is on the new binary:
incus exec backend-api-1 -- ls -la /opt/veza/backend-api/veza-api
incus exec backend-api-2 -- ls -la /opt/veza/backend-api/veza-api
# Rotate both back to the previous binary:
for ct in backend-api-1 backend-api-2; do
incus exec "$ct" -- mv /opt/veza/backend-api/veza-api.previous /opt/veza/backend-api/veza-api
incus exec "$ct" -- systemctl restart veza-backend-api
done
```
The previous binary is conventionally kept at `${INSTALL_DIR}/veza-api.previous`; the canary script does NOT copy the current binary there before overwriting (deliberate — that's a deploy-pipeline responsibility, not a per-canary responsibility).
## Configuration knobs
All of these are env vars — the script header is the source of truth for defaults. A worked override example follows the table.
| Knob | Default | When to change |
| ----------------------------- | ----------------------------- | ----------------------------------------------------- |
| `POOL_BACKEND` | `api_pool` | If you renamed the HAProxy backend |
| `CANARY_NODE` | `backend-api-2` | Toggle which node receives the canary first |
| `PEER_NODES` | `backend-api-1` | When the fleet grows beyond 2 nodes |
| `SLI_WINDOW` | `3600` (1 h) | Shorten for hot-fixes (300 = 5 min minimum) |
| `SLI_PROBE_INTERVAL` | `30` s | Tighter probes catch a leak faster but cost Prom load |
| `PROM_P95_THRESHOLD_S`        | `0.5`                         | Match the SLO; loosening it hides regressions          |
| `PROM_ERR_RATE_THRESHOLD` | `0.005` (0.5 %) | Match the SLO |
| `ROLLBACK_BINARY` | (unset) | Always set in a real run — auto-rollback can't work without it |
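For instance, a hot-fix canary with a shortened window might look like this (the version in the artifact name is illustrative, and this relies on make passing its environment down to the script, per the note under the Make target above):
```bash
SLI_WINDOW=300 SLI_PROBE_INTERVAL=10 \
  make deploy-canary ARTIFACT=/tmp/veza-api-v1.0.11
```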
## Acceptance bar (Day 23)
Per `docs/ROADMAP_V1.0_LAUNCH.md`: 3 canary deploys on staging, 2 normal + 1 with a deliberate rollback (e.g. push a binary that hardcodes a 500 on `/api/v1/health`). The rollback exercise verifies the script's auto-revert path actually fires.
## What this doesn't do
- **Cross-LB rolls**: single haproxy assumed. When phase-2 adds keepalived + a second LB, the canary script will need a `--lb-set` arg to roll the LB pair too.
- **Database migrations**: split-read-write migrations (e.g. dual-write during a rename) need a multi-step deploy that this script doesn't model. For now, only additive migrations are supported through the canary.
- **Stream-server canary**: the Rust streamer follows a separate playbook (URI-hash routing means a per-track-id affinity, not a per-session affinity). Same principles apply but the script is backend-api-specific.

@@ -1,67 -0,0 @@
# `group_vars/` layout
Three layers, in order of precedence (later wins — a check of the merged result follows the list):
1. `all/main.yml` — defaults shared across every inventory. Cross-cutting
values like SSH hardening, monitoring agent version, and the Veza
deploy contract (artifact URL, base image, ports, health probes).
2. `<env>.yml` — environment overrides. Today: `staging.yml`, `prod.yml`
(and `lab.yml` would live here too if `inventory/lab.yml` ever
referenced an `all/lab` group). Typical overrides pin the Incus host,
container prefix, public domain, log level, and feature flags.
3. `all/vault.yml` — encrypted secrets (Ansible Vault). All entries
prefixed `vault_*`. Plaintext template at `all/vault.yml.example`.
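A quick way to see what those layers resolve to for one host — the host name here is taken from the naming example in `group_vars/staging.yml` and `jq` is assumed; substitute a real inventory host:
```bash
cd infra/ansible
# --playbook-dir lets ansible-inventory pick up the playbook-adjacent group_vars/
# (not just inventory/group_vars/); the vault password file decrypts vault_* values.
ansible-inventory -i inventory/staging.yml --playbook-dir . \
  --vault-password-file .vault-pass \
  --host veza-staging-backend-blue | jq '{veza_log_level, veza_otel_sample_rate}'
```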
## Bootstrapping the vault
The vault file is **not** committed at first. To stand it up:
```bash
cd infra/ansible
cp group_vars/all/vault.yml.example group_vars/all/vault.yml
$EDITOR group_vars/all/vault.yml # fill in <TODO> placeholders
ansible-vault encrypt group_vars/all/vault.yml
echo "<your strong vault password>" > .vault-pass
chmod 0400 .vault-pass
```
`.vault-pass` is gitignored — never commit it. The Forgejo runner
gets the same password from the `ANSIBLE_VAULT_PASSWORD` repo secret
(see `.forgejo/workflows/deploy.yml`).
To edit later without decrypting on disk:
```bash
ansible-vault edit group_vars/all/vault.yml
```
To rotate the password (e.g., when an operator leaves):
```bash
ansible-vault rekey group_vars/all/vault.yml
echo "<new password>" > .vault-pass
# update Forgejo secret ANSIBLE_VAULT_PASSWORD to the new value
```
## How variables flow into containers
```
[Ansible runtime] [Container]
group_vars/all/main.yml ┐
group_vars/<env>.yml ├──→ roles/veza_app/templates/*.j2 ──→ /etc/veza/<component>.env
group_vars/all/vault.yml ┘ ──→ /etc/veza/secrets/jwt-private.pem
──→ systemd unit (EnvironmentFile=)
```
The systemd unit then reads `/etc/veza/<component>.env` at start time.
Reload semantics: a config change re-templates the env file and
notifies the systemd handler, which restarts the unit.
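A spot-check of that flow on a running container — the container, unit, and env-file names here follow the staging/backend examples used elsewhere in this repo and may differ from what the role actually installs:
```bash
# Which env file(s) the unit loads:
incus exec veza-staging-backend-blue -- systemctl cat veza-backend-api | grep -i EnvironmentFile
# What the template rendered:
incus exec veza-staging-backend-blue -- cat /etc/veza/backend.env
# Did the handler's restart actually pick it up?
incus exec veza-staging-backend-blue -- systemctl show -p ActiveState,ExecMainStartTimestamp veza-backend-api
```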
## What lives in `host_vars/`?
`host_vars/<host>.yml` for **per-host** overrides — typically when one
container in an HA group needs a slightly different config (e.g., the
postgres-primary needs `pg_auto_failover_role: node`, the monitor
needs `pg_auto_failover_role: monitor`). The lab inventory inlines
these as host-level vars; `host_vars/` exists for cases where they
shouldn't bloat the inventory file.

@@ -0,0 +1,40 @@
# Shared defaults across every inventory (lab/staging/prod). Override
# per-environment in `group_vars/<group>.yml` or per-host in
# `host_vars/<host>.yml`.
---
# Owner contact (used in some unattended-upgrades + monitoring agent configs).
veza_ops_email: ops@veza.fr
# v1.0.9 Day 5: SSH hardening surface that the `common` role enforces.
# Override these in production via group_vars/veza_prod.yml when the
# bastion's specific port / allowed users are decided. Defaults are
# safe for lab.
ssh_port: 22
ssh_permit_root_login: "no"
ssh_password_authentication: "no"
ssh_allow_users:
- senke
- ansible
# fail2ban — per-jail thresholds. The defaults are conservative for
# a self-hosted single-machine deployment; production may want
# lower findtime / higher bantime once Forgejo + Veza traffic is
# baselined.
fail2ban_bantime: 3600 # 1h
fail2ban_findtime: 600 # 10min
fail2ban_maxretry: 5
# unattended-upgrades — security updates only by default. The role
# never enables auto-reboot; ROADMAP_V1.0_LAUNCH.md §5 game day pins
# downtime windows to controlled cycles, not OS-driven reboots.
unattended_upgrades_origins:
- "${distro_id}:${distro_codename}-security"
- "${distro_id}ESMApps:${distro_codename}-apps-security"
- "${distro_id}ESM:${distro_codename}-infra-security"
unattended_upgrades_auto_reboot: false
# Monitoring agent: prometheus node_exporter is the bare-minimum
# host metrics surface (CPU / memory / disk / network). The
# observability stack (Tempo + Loki + Grafana) lands W2 in roadmap.
monitoring_node_exporter_version: "1.8.2"
monitoring_node_exporter_port: 9100

@@ -1,90 +0,0 @@
# Shared defaults across every inventory (lab/staging/prod). Override
# per-environment in `group_vars/<group>.yml` or per-host in
# `host_vars/<host>.yml`.
---
# Owner contact (used in some unattended-upgrades + monitoring agent configs).
veza_ops_email: ops@veza.fr
# v1.0.9 Day 5: SSH hardening surface that the `common` role enforces.
# Override these in production via group_vars/veza_prod.yml when the
# bastion's specific port / allowed users are decided. Defaults are
# safe for lab.
ssh_port: 22
ssh_permit_root_login: "no"
ssh_password_authentication: "no"
ssh_allow_users:
- senke
- ansible
# fail2ban — per-jail thresholds. The defaults are conservative for
# a self-hosted single-machine deployment; production may want
# lower findtime / higher bantime once Forgejo + Veza traffic is
# baselined.
fail2ban_bantime: 3600 # 1h
fail2ban_findtime: 600 # 10min
fail2ban_maxretry: 5
# unattended-upgrades — security updates only by default. The role
# never enables auto-reboot; ROADMAP_V1.0_LAUNCH.md §5 game day pins
# downtime windows to controlled cycles, not OS-driven reboots.
unattended_upgrades_origins:
- "${distro_id}:${distro_codename}-security"
- "${distro_id}ESMApps:${distro_codename}-apps-security"
- "${distro_id}ESM:${distro_codename}-infra-security"
unattended_upgrades_auto_reboot: false
# Monitoring agent: prometheus node_exporter is the bare-minimum
# host metrics surface (CPU / memory / disk / network). The
# observability stack (Tempo + Loki + Grafana) lands W2 in roadmap.
monitoring_node_exporter_version: "1.8.2"
monitoring_node_exporter_port: 9100
# ============================================================
# Veza app deploy — defaults shared by every environment.
# Each can be overridden in group_vars/{staging,prod}.yml.
# ============================================================
# Forgejo Package Registry where the deploy workflow pushes release
# tarballs. Forgejo's generic-package URL shape is:
# {base}/{owner}/generic/{package}/{version}/{filename}
# We treat each component as a separate package (`veza-backend`,
# `veza-stream`, `veza-web`), the SHA as the version, and the
# tarball name as the filename. Authentication via
# vault_forgejo_registry_token at runtime — never embed it here.
veza_artifact_base_url: "https://forgejo.veza.fr/api/packages/talas/generic"
# Container image used as the base for fresh app containers. The
# `veza_app` role apt-installs OS deps on top. Pinned tag keeps deploys
# reproducible across base-image updates.
veza_app_base_image: "images:debian/13"
# Per-component HTTP ports. Backend listens on `APP_PORT` env var;
# stream listens on `PORT` env var. Templates render these into env
# files; HAProxy reads them to wire backends.
veza_backend_port: 8080
veza_stream_port: 8082
veza_web_port: 80
# Health probe parameters — used by deploy_app's Phase D and by the
# rollback playbook when verifying a switched color.
veza_healthcheck_retries: 30
veza_healthcheck_delay_seconds: 2
veza_healthcheck_paths:
backend: /api/v1/health
stream: /health
web: /
# OS package set installed in every fresh app container. Component-
# specific extras live in roles/veza_app/vars/<component>.yml.
veza_common_os_packages:
- ca-certificates
- curl
- tzdata
- zstd # to decompress release tarballs
# Where artefacts land in-container. Per-SHA subdirs let multiple
# releases coexist for forensics without conflict.
veza_install_root: /opt/veza
veza_config_root: /etc/veza
veza_log_root: /var/log/veza
veza_state_root: /var/lib/veza

@@ -1,78 +0,0 @@
# Template for group_vars/all/vault.yml — the encrypted secrets store
# consumed by every playbook. Copy this file to vault.yml, fill in real
# values, then encrypt:
#
# cp vault.yml.example vault.yml
# $EDITOR vault.yml # fill in real values
# ansible-vault encrypt vault.yml # in place
# echo "<your strong password>" > ../../../.vault-pass # gitignored
# chmod 0400 ../../../.vault-pass
#
# After that, every `ansible-playbook` invocation needs:
# ansible-playbook --vault-password-file infra/ansible/.vault-pass ...
# The Forgejo deploy workflow handles this via the ANSIBLE_VAULT_PASSWORD
# repo secret (see .forgejo/workflows/deploy.yml).
#
# Naming: every secret is prefixed `vault_*` so it's grep-able and so
# `group_vars/all/main.yml` references like `postgres_password:
# "{{ vault_postgres_password }}"` are unambiguous.
---
# --- Database -----------------------------------------------------------
vault_postgres_password: "<TODO: 32+ char strong password for veza role>"
vault_postgres_replication_password: "<TODO: separate password for replication user>"
# --- Cache / queue ------------------------------------------------------
vault_redis_password: "<TODO>"
vault_rabbitmq_password: "<TODO>"
# --- Object storage (MinIO) ---------------------------------------------
vault_minio_root_user: "<TODO: only used to bootstrap the cluster>"
vault_minio_root_password: "<TODO: 16+ chars, MinIO refuses shorter>"
vault_minio_access_key: "<TODO: app-tier access key>"
vault_minio_secret_key: "<TODO: app-tier secret key>"
# --- JWT ----------------------------------------------------------------
# Backend prefers RS256 in prod. Generate with:
# openssl genrsa -out jwt-private.pem 4096
# openssl rsa -in jwt-private.pem -pubout -out jwt-public.pem
# Then base64 each:
# base64 -w0 jwt-private.pem
# base64 -w0 jwt-public.pem
vault_jwt_signing_key_b64: "<TODO: base64 of RS256 private PEM>"
vault_jwt_public_key_b64: "<TODO: base64 of RS256 public PEM>"
# Chat WebSocket signs its own short-lived tokens — must differ from the
# main JWT secret in production (defense in depth).
vault_chat_jwt_secret: "<TODO: 32+ chars, distinct from JWT signing key>"
# --- App-internal API keys ---------------------------------------------
# Backend ↔ stream-server shared secret. Both services must have the
# same value so /api/v1/internal/* requests authenticate.
vault_stream_internal_api_key: "<TODO: 32+ chars>"
# OAuth refresh tokens are encrypted at rest with this key.
vault_oauth_encryption_key: "<TODO: exactly 32 bytes, raw or hex>"
# --- Email --------------------------------------------------------------
vault_smtp_password: "<TODO>"
# --- Payments -----------------------------------------------------------
# Hyperswitch routes through Stripe Connect. Both keys are required if
# `HYPERSWITCH_ENABLED=true` in group_vars/<env>.yml.
vault_hyperswitch_api_key: "<TODO>"
vault_hyperswitch_webhook_secret: "<TODO>"
vault_stripe_secret_key: "<TODO: sk_live_… in prod, sk_test_… in staging>"
# --- OAuth providers ----------------------------------------------------
# Add only the providers you actually enable; keys consumed by
# templates/backend.env.j2 conditionally on truthiness.
vault_oauth_clients:
google:
id: "<TODO>"
secret: "<TODO>"
spotify:
id: "<TODO>"
secret: "<TODO>"
# --- Sentry / observability --------------------------------------------
vault_sentry_dsn: "<TODO: empty string disables Sentry>"

@@ -1,42 +0,0 @@
# Prod-specific overrides. Same R720 host as staging in v1.0; separate
# Incus network + container prefix prevents staging/prod from sharing
# any state. Phase-2 (post v1.1) is expected to move prod to a
# dedicated host, at which point only `veza_incus_host` flips.
---
veza_env: prod
veza_release_channel: prod
veza_incus_host: veza-prod
veza_incus_network: veza-net
veza_incus_subnet: 10.0.20.0/24
veza_container_prefix: "veza-" # production keeps the bare veza- prefix (no env infix) — the established convention
veza_incus_dns_suffix: lxd
haproxy_topology: blue-green
veza_public_host: veza.fr
veza_public_url: "https://veza.fr"
veza_cors_allowed_origins:
- "https://veza.fr"
- "https://app.veza.fr"
# Prod is INFO so 99th-percentile log volume stays manageable. Bump to
# DEBUG for a window via `ansible-playbook -e veza_log_level=DEBUG` if
# triaging an incident.
veza_log_level: INFO
veza_otel_sample_rate: "0.05"
veza_feature_flags:
HYPERSWITCH_ENABLED: "true"
STRIPE_CONNECT_ENABLED: "true"
WEBAUTHN_ENABLED: "true"
# Larger retention than staging — prod rollback may need to reach a
# release from up to a month ago when the cause was latent.
veza_release_retention: 60
postgres_password: "{{ vault_postgres_password }}"
redis_password: "{{ vault_redis_password }}"
rabbitmq_password: "{{ vault_rabbitmq_password }}"

@@ -1,67 +0,0 @@
# Staging-specific overrides. Targets the local R720 Incus daemon (the
# same host the Forgejo runner lives on). Containers prefixed `veza-staging-*`
# share the `veza-staging-net` Incus bridge (10.0.21.0/24).
#
# Phase-1 simplification: staging and prod coexist on the same R720 but
# on separate Incus networks (veza-staging-net 10.0.21.0/24 vs
# veza-prod-net 10.0.20.0/24) and separate container name prefixes
# (veza-staging-* vs veza-prod-*). When prod migrates off-box (Hetzner
# or similar), this file's `veza_incus_host` flips to that target.
---
veza_env: staging
veza_release_channel: staging
# Where the Incus daemon lives. Used by the deploy workflow to decide
# which inventory host's `community.general.incus` connection plugin
# to drive containers from.
veza_incus_host: veza-staging
veza_incus_network: veza-staging-net
veza_incus_subnet: 10.0.21.0/24
# Container name prefix — every app/data container ends up named
# `<veza_container_prefix><component>[-<color>]`. e.g.
# veza-staging-backend-blue, veza-staging-postgres.
veza_container_prefix: "veza-staging-"
# DNS suffix Incus assigns to managed containers. The HAProxy template
# resolves backends as `<container>.<suffix>`. Default `.lxd` works
# with the stock Incus DNS resolver; override if you've renamed the
# managed network's DNS zone.
veza_incus_dns_suffix: lxd
# HAProxy strategy for the staging stack: blue/green, two app
# containers per component (active + standby). Differs from the lab
# inventory which uses an active/active multi-instance pattern.
haproxy_topology: blue-green
# Public-facing URLs — used by backend for OAuth redirects, email
# links, CSP origins, and by HAProxy ACLs.
veza_public_host: staging.veza.fr
veza_public_url: "https://staging.veza.fr"
veza_cors_allowed_origins:
- "https://staging.veza.fr"
- "https://staging-app.veza.fr"
# Logging — staging keeps DEBUG to make incident triage easy. Prod
# drops to INFO. Tracing sample rate stays at 100% in staging
# (low traffic) and 5% in prod (cost).
veza_log_level: DEBUG
veza_otel_sample_rate: "1.0"
# Feature flags exposed to the backend at boot. Keep this list small —
# the backend's own .env.template is the canonical reference.
veza_feature_flags:
HYPERSWITCH_ENABLED: "false"
STRIPE_CONNECT_ENABLED: "false"
WEBAUTHN_ENABLED: "true"
# How many recent release SHAs the rollback workflow can target. Older
# tarballs are pruned by the Forgejo registry retention policy (set
# externally). 30 deploys ≈ a working week given the staging cadence.
veza_release_retention: 30
# Postgres password the migrations job uses — references vault.yml so
# rotation is one ansible-vault edit + one redeploy.
postgres_password: "{{ vault_postgres_password }}"
redis_password: "{{ vault_redis_password }}"
rabbitmq_password: "{{ vault_rabbitmq_password }}"

@@ -1,83 +0,0 @@
# cleanup_failed.yml — destroy the app containers of a specific color.
# Used when a deploy_app.yml run failed Phase D or Phase F and the
# operator has finished forensics on the kept-alive failed color.
#
# Required extra-vars:
# veza_env staging | prod
# target_color blue | green (the color to tear down)
#
# Safety: refuses to destroy the CURRENTLY-ACTIVE color. Active color
# is read from the HAProxy container's /var/lib/veza/active-color.
#
# Caller (workflow_dispatch only):
# ansible-playbook -i inventory/{{env}}.yml playbooks/cleanup_failed.yml \
# -e veza_env={{env}} -e target_color={{color}}
---
- name: Validate inputs and refuse to nuke the active color
hosts: incus_hosts
become: true
gather_facts: false
tasks:
- name: Assert required vars
ansible.builtin.assert:
that:
- veza_env is defined
- veza_env in ['staging', 'prod']
- target_color is defined
- target_color in ['blue', 'green']
fail_msg: cleanup_failed.yml requires veza_env + target_color.
quiet: true
- name: Read active color from HAProxy container
ansible.builtin.shell: |
incus exec "{{ veza_container_prefix }}haproxy" -- \
cat /var/lib/veza/active-color 2>/dev/null | tr -d '[:space:]'
args:
executable: /bin/bash
register: active_color_raw
changed_when: false
failed_when: false
- name: Resolve current_active_color
ansible.builtin.set_fact:
current_active_color: "{{ active_color_raw.stdout if active_color_raw.stdout else 'blue' }}"
- name: Refuse if target_color matches the active color
ansible.builtin.fail:
msg: >-
target_color={{ target_color }} matches the currently-active
color in HAProxy. Refusing to destroy live containers.
Switch HAProxy first via rollback.yml or a re-deploy.
when: target_color == current_active_color
- name: Destroy the inactive-color app containers
hosts: incus_hosts
become: true
gather_facts: false
tasks:
- name: Force-delete each component container
ansible.builtin.shell: |
set -e
CT="{{ veza_container_prefix }}{{ item }}-{{ target_color }}"
if incus info "$CT" >/dev/null 2>&1; then
incus delete --force "$CT"
echo "Destroyed $CT"
else
echo "$CT does not exist, skip"
fi
args:
executable: /bin/bash
loop:
- backend
- stream
- web
register: cleanup_result
changed_when: "'Destroyed' in (cleanup_result.stdout | default(''))"
tags: [cleanup]
- name: Report what was destroyed
ansible.builtin.debug:
msg: |
Cleanup of color {{ target_color }} in env {{ veza_env }} complete.
Active color unchanged: {{ current_active_color }}.
Next deploy will recreate {{ target_color }} containers from scratch.

@@ -1,355 +0,0 @@
# deploy_app.yml — second-half of every deploy. Runs AFTER
# deploy_data.yml has snapshot + ensured data services up.
#
# Phases (mirror docs/RUNBOOK_ROLLBACK.md):
# A — Run migrations in an ephemeral tools container.
# B — Read /var/lib/veza/active-color in the HAProxy container,
# compute inactive_color (the color we are deploying TO).
# C — Destroy + relaunch the three app containers in inactive_color.
# Apply roles/veza_app per component (artefact install + health
# probe).
# D — Implicit in C: veza_app role's probe.yml runs. If any color's
# probe fails, the playbook errors and Phase E is skipped (HAProxy
# still pointing at the prior active color).
# E — Switch HAProxy via roles/veza_haproxy_switch (block/rescue
# guards prior cfg).
# F — External verification: curl through HAProxy, fail the playbook
# (and reverse-switch) if the public health endpoint doesn't return 200.
#
# Required extra-vars:
# veza_env staging | prod
# veza_release_sha 40-char git SHA
---
# =====================================================================
# Phase A — Migrations
# =====================================================================
- name: Phase A — apply database migrations
hosts: incus_hosts
become: true
gather_facts: true
tasks:
- name: Validate inputs
ansible.builtin.assert:
that:
- veza_env in ['staging', 'prod']
- veza_release_sha | length == 40
fail_msg: deploy_app.yml requires veza_env + veza_release_sha extra-vars.
quiet: true
- name: Ensure ephemeral tools container exists
ansible.builtin.shell: |
set -e
TOOLS="{{ veza_container_prefix }}backend-tools"
if ! incus info "$TOOLS" >/dev/null 2>&1; then
incus launch {{ veza_app_base_image }} "$TOOLS" \
--profile veza-app --profile veza-net \
--network "{{ veza_incus_network }}"
for i in $(seq 1 30); do
incus exec "$TOOLS" -- /bin/true 2>/dev/null && exit 0
sleep 1
done
echo "tools container did not become ready"
exit 1
fi
args:
executable: /bin/bash
register: tools_provision
changed_when: "'incus launch' in (tools_provision.stdout | default(''))"
tags: [phaseA, migrations]
- name: Refresh inventory so the tools container becomes reachable
ansible.builtin.meta: refresh_inventory
tags: [phaseA]
- name: Phase A — install backend artifact + run migrate_tool inside tools
hosts: "{{ veza_container_prefix + 'backend-tools' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
veza_component: backend
veza_target_color: tools # not blue/green — bypass color logic in name
tasks:
- name: Apt deps for tools container
ansible.builtin.apt:
name:
- ca-certificates
- curl
- postgresql-client
- libssl3
- zstd
state: present
update_cache: true
cache_valid_time: 3600
- name: Ensure migrate user
ansible.builtin.user:
name: veza-migrate
system: true
shell: /usr/sbin/nologin
- name: Ensure /opt/veza/migrate
ansible.builtin.file:
path: /opt/veza/migrate
state: directory
owner: veza-migrate
mode: "0755"
- name: Fetch backend tarball
ansible.builtin.get_url:
url: "{{ veza_artifact_base_url }}/veza-backend/{{ veza_release_sha }}/veza-backend-{{ veza_release_sha }}.tar.zst"
dest: "/tmp/veza-backend-{{ veza_release_sha }}.tar.zst"
mode: "0600"
headers:
Authorization: "token {{ vault_forgejo_registry_token | default('') }}"
force: false
- name: Extract tarball into /opt/veza/migrate
ansible.builtin.unarchive:
src: "/tmp/veza-backend-{{ veza_release_sha }}.tar.zst"
dest: "/opt/veza/migrate"
remote_src: true
owner: veza-migrate
creates: "/opt/veza/migrate/migrate_tool"
- name: Run migrate_tool
ansible.builtin.command: /opt/veza/migrate/migrate_tool --up
environment:
DATABASE_URL: "postgres://veza:{{ vault_postgres_password }}@{{ veza_container_prefix }}postgres.{{ veza_incus_dns_suffix }}:5432/veza?sslmode=disable"
register: migrate_result
changed_when: "'no changes' not in (migrate_result.stdout | default('').lower())"
no_log: true # DATABASE_URL contains the password
tags: [phaseA, migrations]
# =====================================================================
# Phase B — Determine inactive color
# =====================================================================
- name: Phase B — read active color, compute inactive_color
hosts: "{{ veza_container_prefix + 'haproxy' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
tasks:
- name: Read currently-active color
ansible.builtin.slurp:
src: /var/lib/veza/active-color
register: prior_color_raw
failed_when: false
- name: Resolve prior_active_color (default blue if no history)
ansible.builtin.set_fact:
prior_active_color: >-
{{ (prior_color_raw.content | b64decode | trim) if prior_color_raw.content is defined
else 'blue' }}
cacheable: true
- name: Compute inactive_color (the one we deploy TO)
ansible.builtin.set_fact:
inactive_color: "{{ 'green' if prior_active_color == 'blue' else 'blue' }}"
cacheable: true
- name: Show what we are switching to
ansible.builtin.debug:
msg: >-
Deploying SHA {{ veza_release_sha[:12] }} to color
{{ inactive_color }} (currently active: {{ prior_active_color }}).
# =====================================================================
# Phase C — destroy + relaunch the three app containers in inactive_color
# =====================================================================
- name: Phase C — recreate inactive-color app containers (host-side)
hosts: incus_hosts
become: true
gather_facts: false
vars:
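# inactive_color was set (cacheable) on the HAProxy container's inventory entry in
# Phase B; this play targets incus_hosts, so read it back through hostvars.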
inactive_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
tasks:
- name: Destroy + launch each component container
ansible.builtin.shell: |
set -e
CT="{{ veza_container_prefix }}{{ item }}-{{ inactive_color }}"
# Force-delete is fine — these are stateless app containers; the
# active color is untouched.
incus delete --force "$CT" 2>/dev/null || true
incus launch {{ veza_app_base_image }} "$CT" \
--profile veza-app \
--profile veza-net \
--network "{{ veza_incus_network }}"
for i in $(seq 1 {{ veza_app_container_ready_timeout | default(30) }}); do
if incus exec "$CT" -- /bin/true 2>/dev/null; then
exit 0
fi
sleep 1
done
echo "Container $CT did not become ready"
exit 1
args:
executable: /bin/bash
loop:
- backend
- stream
- web
changed_when: true
tags: [phaseC]
- name: Refresh inventory so freshly-launched containers become reachable
ansible.builtin.meta: refresh_inventory
tags: [phaseC]
- name: Phase C — provision backend (inactive color) via veza_app role
hosts: "{{ veza_container_prefix + 'backend-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
veza_component: backend
veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
roles:
- veza_app
tags: [phaseC, backend]
- name: Phase C — provision stream (inactive color)
hosts: "{{ veza_container_prefix + 'stream-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
veza_component: stream
veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
roles:
- veza_app
tags: [phaseC, stream]
- name: Phase C — provision web (inactive color)
hosts: "{{ veza_container_prefix + 'web-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
veza_component: web
veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
roles:
- veza_app
tags: [phaseC, web]
# =====================================================================
# Phase D — cross-container probes (in addition to in-container probes
# that veza_app already ran). This catches the case where the service
# is up locally but unreachable via Incus DNS.
# =====================================================================
- name: Phase D — probe each component via Incus DNS (cross-container)
hosts: "{{ veza_container_prefix + 'haproxy' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
tasks:
- name: Curl each component's health endpoint
ansible.builtin.uri:
url: "http://{{ veza_container_prefix }}{{ item.component }}-{{ inactive_color }}.{{ veza_incus_dns_suffix }}:{{ item.port }}{{ item.path }}"
method: GET
status_code: [200]
timeout: 5
register: cross_probe
retries: "{{ veza_healthcheck_retries }}"
delay: "{{ veza_healthcheck_delay_seconds }}"
until: cross_probe.status == 200
changed_when: false
loop:
- { component: backend, port: "{{ veza_backend_port }}", path: "{{ veza_healthcheck_paths.backend }}" }
- { component: stream, port: "{{ veza_stream_port }}", path: "{{ veza_healthcheck_paths.stream }}" }
- { component: web, port: "{{ veza_web_port }}", path: "{{ veza_healthcheck_paths.web }}" }
tags: [phaseD, probe]
# =====================================================================
# Phase E — switch HAProxy. roles/veza_haproxy_switch wraps render +
# validate + atomic-swap + HUP in a block/rescue that restores prior
# cfg on failure.
# =====================================================================
- name: Phase E — switch HAProxy to the new color
hosts: "{{ veza_container_prefix + 'haproxy' }}"
become: true
gather_facts: true # roles/veza_haproxy_switch wants ansible_date_time
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
veza_active_color: "{{ inactive_color }}" # the color we ARE switching TO
roles:
- veza_haproxy_switch
tags: [phaseE, switch]
# =====================================================================
# Phase F — Post-deploy verification (external curl through HAProxy).
# If this fails, we revert HAProxy to the prior color via a second run
# of veza_haproxy_switch and fail the playbook.
# =====================================================================
- name: Phase F — verify externally + record deploy state
hosts: incus_hosts
become: true
gather_facts: true
vars:
inactive_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
prior_active_color: "{{ hostvars[veza_container_prefix + 'haproxy']['prior_active_color'] }}"
tasks:
- name: Verify public health and record deploy state (revert HAProxy on failure)
  block:
- name: Curl public health endpoint via HAProxy
ansible.builtin.uri:
url: "{{ veza_public_url }}/api/v1/health"
method: GET
status_code: [200]
timeout: 10
validate_certs: "{{ veza_public_url.startswith('https://') }}"
register: public_health
retries: 10
delay: 3
until: public_health.status == 200
tags: [phaseF, verify]
- name: Write deploy-state.json (consumed by node-exporter textfile)
ansible.builtin.copy:
dest: /var/lib/node_exporter/textfile_collector/veza_deploy.prom
content: |
# HELP veza_deploy_active_color 0=blue, 1=green.
# TYPE veza_deploy_active_color gauge
veza_deploy_active_color{env="{{ veza_env }}"} {{ 0 if inactive_color == 'blue' else 1 }}
# HELP veza_deploy_release_sha info metric, label=sha.
# TYPE veza_deploy_release_sha gauge
veza_deploy_release_sha{env="{{ veza_env }}",sha="{{ veza_release_sha }}",color="{{ inactive_color }}"} 1
# HELP veza_deploy_last_success_timestamp unix epoch of last successful deploy.
# TYPE veza_deploy_last_success_timestamp gauge
veza_deploy_last_success_timestamp{env="{{ veza_env }}"} {{ ansible_date_time.epoch }}
mode: "0644"
tags: [phaseF, metrics]
rescue:
- name: Public health failed — record the failure timestamp
ansible.builtin.copy:
dest: /var/lib/node_exporter/textfile_collector/veza_deploy.prom
content: |
# HELP veza_deploy_last_failure_timestamp unix epoch of last failed deploy.
# TYPE veza_deploy_last_failure_timestamp gauge
veza_deploy_last_failure_timestamp{env="{{ veza_env }}",sha="{{ veza_release_sha }}",color="{{ inactive_color }}"} {{ ansible_date_time.epoch }}
mode: "0644"
failed_when: false
- name: Re-switch HAProxy back to the prior color
ansible.builtin.import_role:
name: veza_haproxy_switch
vars:
veza_active_color: "{{ prior_active_color }}"
delegate_to: "{{ veza_container_prefix + 'haproxy' }}"
- name: Fail the playbook
ansible.builtin.fail:
msg: >-
Public health probe via HAProxy failed after deploy of SHA
{{ veza_release_sha[:12] }} to color {{ inactive_color }}.
HAProxy reverted to the prior color ({{ prior_active_color }}).
The freshly-deployed {{ inactive_color }} containers are kept
alive for forensics — inspect with:
incus exec {{ veza_container_prefix }}backend-{{ inactive_color }} -- journalctl -u veza-backend -n 200

@@ -1,411 +0,0 @@
# deploy_data.yml — idempotent data-tier provisioning. Runs FIRST in
# every deploy. Three principles:
#
# 1. ZFS-snapshot every data container's dataset before doing
# anything else. The snapshot is the safety net for any later
# mistake in the same run.
# 2. Containers are created if absent, never destroyed. Volumes
# survive every deploy.
# 3. Service config drift is reconciled, but state-bearing things
# (data dirs, schema, MinIO buckets) are reload-not-restart
# where the daemon supports it.
#
# Required extra-vars:
# veza_env one of staging|prod (selects inventory group_vars)
# veza_release_sha git SHA of the release (snapshot label)
#
# Caller pattern in .forgejo/workflows/deploy.yml:
# ansible-playbook -i inventory/{{env}}.yml playbooks/deploy_data.yml \
# -e veza_env={{env}} -e veza_release_sha={{sha}}
---
- name: Pre-flight — validate inputs and resolve runtime context
hosts: incus_hosts
become: true
gather_facts: true
tasks:
- name: Assert required vars are set
ansible.builtin.assert:
that:
- veza_env is defined
- veza_env in ['staging', 'prod']
- veza_release_sha is defined
- veza_release_sha | length == 40
fail_msg: >-
deploy_data.yml requires veza_env (staging|prod) +
veza_release_sha (40-char SHA). Pass via -e on the
command line or via inventory group_vars.
- name: Compute the list of data containers we manage
ansible.builtin.set_fact:
veza_data_containers:
- name: "{{ veza_container_prefix }}postgres"
kind: postgres
- name: "{{ veza_container_prefix }}redis"
kind: redis
- name: "{{ veza_container_prefix }}rabbitmq"
kind: rabbitmq
- name: "{{ veza_container_prefix }}minio"
kind: minio
# -----------------------------------------------------------------------
# ZFS snapshot before mutation. A failed prune is logged but not fatal —
# safer to lose disk to retained snapshots than to skip the snapshot.
# -----------------------------------------------------------------------
- name: ZFS-snapshot every data container's dataset
hosts: incus_hosts
become: true
gather_facts: false
tasks:
- name: Snapshot per-container dataset
ansible.builtin.shell: |
set -e
# Best-effort dataset path resolution from `incus storage volume show`.
# If the container doesn't exist yet (first-ever deploy), skip — there's
# nothing to snapshot.
if ! incus info "{{ item.name }}" >/dev/null 2>&1; then
echo "Container {{ item.name }} does not yet exist, skip snapshot"
exit 0
fi
DATASET=$(zfs list -H -o name | grep -E "containers/{{ item.name }}$" | head -1 || true)
if [ -z "$DATASET" ]; then
echo "No ZFS dataset for {{ item.name }} — likely non-ZFS storage, skip"
exit 0
fi
SNAP_NAME="${DATASET}@pre-deploy-{{ veza_release_sha }}"
if zfs list -H -t snapshot "$SNAP_NAME" >/dev/null 2>&1; then
echo "Snapshot $SNAP_NAME already exists (idempotent rerun)"
exit 0
fi
zfs snapshot "$SNAP_NAME"
echo "Created $SNAP_NAME"
args:
executable: /bin/bash
loop: "{{ veza_data_containers }}"
register: snap_result
changed_when: "'Created' in (snap_result.stdout | default(''))"
tags: [data, zfs, snapshot]
- name: Prune ZFS snapshots beyond retention window
ansible.builtin.shell: |
set -e
# Keep the {{ veza_release_retention | default(30) }} most-recent
# pre-deploy snapshots per dataset ; delete the rest.
for dataset in $(zfs list -H -o name | grep -E "containers/{{ veza_container_prefix }}(postgres|redis|rabbitmq|minio)$"); do
zfs list -H -t snapshot -o name -s creation "$dataset" \
| grep "@pre-deploy-" \
| head -n -{{ veza_release_retention | default(30) }} \
| xargs -r -n1 zfs destroy -r || true
done
args:
executable: /bin/bash
changed_when: false
failed_when: false
tags: [data, zfs, prune]
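# -----------------------------------------------------------------------
# Consuming the safety net by hand — a sketch only ; pool, dataset path and
# container prefix are environment-specific. Stop the container first :
#   incus stop <prefix>postgres
#   zfs rollback -r <pool>/containers/<prefix>postgres@pre-deploy-<sha>
#   incus start <prefix>postgres
# -----------------------------------------------------------------------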
# -----------------------------------------------------------------------
# Provision (create-if-absent) each data container. We don't recreate
# existing ones — they own state.
# -----------------------------------------------------------------------
- name: Ensure data containers exist
hosts: incus_hosts
become: true
gather_facts: false
tasks:
- name: Launch container if absent
ansible.builtin.shell: |
set -e
if incus info "{{ item.name }}" >/dev/null 2>&1; then
echo "{{ item.name }} already exists"
exit 0
fi
incus launch {{ veza_app_base_image }} "{{ item.name }}" \
--profile veza-data \
--profile veza-net \
--network "{{ veza_incus_network }}"
# Wait until the container accepts `incus exec` before any subsequent
# task (apt, systemd) hits a half-up container.
for i in $(seq 1 {{ veza_app_container_ready_timeout | default(30) }}); do
if incus exec "{{ item.name }}" -- /bin/true 2>/dev/null; then
echo "Container {{ item.name }} ready"
exit 0
fi
sleep 1
done
echo "Container {{ item.name }} did not become ready within timeout"
exit 1
args:
executable: /bin/bash
loop: "{{ veza_data_containers }}"
register: launch_result
changed_when: "'Container' in (launch_result.stdout | default('')) and 'ready' in (launch_result.stdout | default(''))"
tags: [data, provision]
- name: Refresh inventory so the new containers become reachable
ansible.builtin.meta: refresh_inventory
tags: [data, provision]
# -----------------------------------------------------------------------
# Per-kind service config. Implemented inline rather than via roles so
# this playbook stays readable. When a kind grows, lift it into its own
# tasks/<kind>.yml or role.
# -----------------------------------------------------------------------
- name: Configure postgres
hosts: "{{ veza_container_prefix + 'postgres' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
tasks:
- name: Install postgresql-16
ansible.builtin.apt:
name:
- postgresql-16
- python3-psycopg2 # Required by Ansible's postgresql_user/db modules
state: present
update_cache: true
cache_valid_time: 3600
- name: Ensure postgres is enabled + started
ansible.builtin.systemd:
name: postgresql
state: started
enabled: true
- name: Wait for postgres ready
ansible.builtin.wait_for:
port: 5432
host: 127.0.0.1
timeout: 30
- name: Ensure veza role exists with the vault-stored password
community.postgresql.postgresql_user:
name: veza
password: "{{ vault_postgres_password }}"
role_attr_flags: LOGIN
become_user: postgres
no_log: true
- name: Ensure veza database exists owned by veza role
community.postgresql.postgresql_db:
name: veza
owner: veza
encoding: UTF8
lc_collate: C
lc_ctype: C
template: template0
become_user: postgres
tags: [data, postgres]
- name: Configure redis
hosts: "{{ veza_container_prefix + 'redis' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
tasks:
- name: Install redis-server
ansible.builtin.apt:
name: redis-server
state: present
update_cache: true
cache_valid_time: 3600
- name: Render redis.conf with password
ansible.builtin.copy:
content: |
bind 0.0.0.0
protected-mode yes
port 6379
requirepass {{ vault_redis_password }}
maxmemory 256mb
maxmemory-policy allkeys-lru
appendonly yes
appendfsync everysec
dir /var/lib/redis
dest: /etc/redis/redis.conf
owner: redis
group: redis
mode: "0640"
no_log: true
notify: Restart redis
- name: Ensure redis is enabled + started
ansible.builtin.systemd:
name: redis-server
state: started
enabled: true
- name: Wait for redis ready
ansible.builtin.wait_for:
port: 6379
host: 127.0.0.1
timeout: 30
handlers:
- name: Restart redis
ansible.builtin.systemd:
name: redis-server
state: restarted
tags: [data, redis]
- name: Configure rabbitmq
hosts: "{{ veza_container_prefix + 'rabbitmq' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
tasks:
- name: Install rabbitmq-server
ansible.builtin.apt:
name: rabbitmq-server
state: present
update_cache: true
cache_valid_time: 3600
- name: Ensure rabbitmq is enabled + started
ansible.builtin.systemd:
name: rabbitmq-server
state: started
enabled: true
- name: Wait for rabbitmq ready
ansible.builtin.wait_for:
port: 5672
host: 127.0.0.1
timeout: 60
- name: Ensure /veza vhost exists
community.rabbitmq.rabbitmq_vhost:
name: /veza
state: present
- name: Ensure veza user exists with vault password
community.rabbitmq.rabbitmq_user:
user: veza
password: "{{ vault_rabbitmq_password }}"
vhost: /veza
configure_priv: ".*"
read_priv: ".*"
write_priv: ".*"
state: present
update_password: always
no_log: true
tags: [data, rabbitmq]
- name: Configure minio
hosts: "{{ veza_container_prefix + 'minio' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
tasks:
- name: Install MinIO server + mc client binaries (direct download if absent)
ansible.builtin.shell: |
set -e
if ! command -v minio >/dev/null 2>&1; then
curl -fsSL https://dl.min.io/server/minio/release/linux-amd64/minio -o /usr/local/bin/minio
chmod 0755 /usr/local/bin/minio
fi
if ! command -v mc >/dev/null 2>&1; then
curl -fsSL https://dl.min.io/client/mc/release/linux-amd64/mc -o /usr/local/bin/mc
chmod 0755 /usr/local/bin/mc
fi
args:
executable: /bin/bash
changed_when: false
- name: Ensure minio system user
ansible.builtin.user:
name: minio
system: true
shell: /usr/sbin/nologin
home: /var/lib/minio
- name: Ensure minio data dir
ansible.builtin.file:
path: /var/lib/minio
state: directory
owner: minio
group: minio
mode: "0750"
- name: Render minio EnvironmentFile
ansible.builtin.copy:
content: |
MINIO_ROOT_USER={{ vault_minio_root_user }}
MINIO_ROOT_PASSWORD={{ vault_minio_root_password }}
MINIO_VOLUMES=/var/lib/minio
MINIO_OPTS="--address :9000 --console-address :9001"
dest: /etc/default/minio
owner: root
group: root
mode: "0640"
no_log: true
notify: Restart minio
- name: Render minio systemd unit
ansible.builtin.copy:
content: |
[Unit]
Description=MinIO
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User=minio
Group=minio
EnvironmentFile=/etc/default/minio
ExecStart=/usr/local/bin/minio server $MINIO_OPTS $MINIO_VOLUMES
Restart=on-failure
LimitNOFILE=65535
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/minio.service
mode: "0644"
notify:
- Reload systemd
- Restart minio
- name: Enable + start minio
ansible.builtin.systemd:
name: minio
state: started
enabled: true
daemon_reload: true
- name: Wait for minio ready
ansible.builtin.wait_for:
port: 9000
host: 127.0.0.1
timeout: 60
- name: Configure mc client alias
ansible.builtin.shell: |
set -e
mc alias set veza-local http://127.0.0.1:9000 \
"{{ vault_minio_root_user }}" "{{ vault_minio_root_password }}" >/dev/null
args:
executable: /bin/bash
changed_when: false
no_log: true
- name: Ensure veza-{{ veza_env }} bucket exists
ansible.builtin.shell: |
mc mb --ignore-existing veza-local/veza-{{ veza_env }}
args:
executable: /bin/bash
changed_when: false
handlers:
- name: Reload systemd
ansible.builtin.systemd:
daemon_reload: true
- name: Restart minio
ansible.builtin.systemd:
name: minio
state: restarted
tags: [data, minio]
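# Spot-check by hand — a sketch only ; prefix and env are environment-specific :
#   incus exec <prefix>minio -- mc ls veza-local/veza-<env>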

View file

@ -1,113 +0,0 @@
# rollback.yml — two modes :
#
# 1. fast : flip HAProxy back to the previous active color.
# Works only if those containers are still alive
# (i.e., the next deploy has NOT yet recycled them).
# Effect time : ~5 seconds.
#
# 2. full : redeploy a specific release_sha by re-running
# deploy_app.yml with that SHA. Works whenever the
# tarball is still in the Forgejo Registry. Effect
# time : ~5-10 minutes.
#
# Required extra-vars:
# env staging | prod
# mode fast | full
# target_color (mode=fast only) the color to flip TO
# release_sha (mode=full only) the SHA to redeploy
#
# Caller (workflow_dispatch only — see .forgejo/workflows/rollback.yml):
# ansible-playbook -i inventory/{{env}}.yml playbooks/rollback.yml \
# -e env={{env}} -e mode=fast -e target_color=blue
# ansible-playbook -i inventory/{{env}}.yml playbooks/rollback.yml \
# -e env={{env}} -e mode=full -e release_sha=<previous_sha>
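#
# mode=fast re-reads the previous SHA from the active-color history file
# maintained by the veza_haproxy_switch role — one entry per deploy, e.g.
# (illustrative timestamp) :
#   2025-06-01T12:34:56Z sha=<previous 40-char sha> color=green prior=blue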
---
- name: Validate inputs
hosts: incus_hosts
become: true
gather_facts: false
tasks:
- name: Assert env + mode
ansible.builtin.assert:
that:
- veza_env is defined
- veza_env in ['staging', 'prod']
- mode is defined
- mode in ['fast', 'full']
fail_msg: rollback.yml requires veza_env + mode (fast|full).
quiet: true
- name: Assert target_color when mode=fast
ansible.builtin.assert:
that:
- target_color is defined
- target_color in ['blue', 'green']
fail_msg: rollback.yml mode=fast requires target_color (blue|green).
quiet: true
when: mode == 'fast'
- name: Assert release_sha when mode=full
ansible.builtin.assert:
that:
- veza_release_sha is defined
- veza_release_sha | length == 40
fail_msg: rollback.yml mode=full requires release_sha (40-char SHA).
quiet: true
when: mode == 'full'
# ---------------------------------------------------------------------
# mode=fast → HAProxy flip only.
# ---------------------------------------------------------------------
- name: Fast rollback — verify target_color containers are alive
hosts: incus_hosts
become: true
gather_facts: false
tasks:
- name: Check each target-color container exists
ansible.builtin.shell: |
set -e
CT="{{ veza_container_prefix }}{{ item }}-{{ target_color }}"
if ! incus info "$CT" >/dev/null 2>&1; then
echo "MISSING $CT"
exit 1
fi
STATE=$(incus list "$CT" -c s --format csv)
if [ "$STATE" != "RUNNING" ]; then
echo "$CT is $STATE (not RUNNING)"
exit 1
fi
echo "OK $CT"
args:
executable: /bin/bash
loop:
- backend
- stream
- web
changed_when: false
register: alive_check
when: mode == 'fast'
tags: [rollback, fast]
- name: Fast rollback — flip HAProxy
hosts: "{{ veza_container_prefix + 'haproxy' }}"
become: true
gather_facts: true
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
veza_active_color: "{{ target_color }}"
# Fast rollback re-uses the previous SHA from the history file.
veza_release_sha: "{{ lookup('ansible.builtin.file', '/var/lib/veza/active-color.history', errors='ignore') | regex_search('sha=([0-9a-f]+)', '\\1') | default(['rollback'], true) | first }}"
roles:
- veza_haproxy_switch
when: mode == 'fast'
tags: [rollback, fast]
# ---------------------------------------------------------------------
# mode=full → re-import deploy_app.yml with the rollback SHA.
# Functionally identical to a fresh deploy of an older release.
# ---------------------------------------------------------------------
- name: Full rollback — delegate to deploy_app.yml with release_sha={{ veza_release_sha | default('') }}
ansible.builtin.import_playbook: deploy_app.yml
when: mode == 'full'
tags: [rollback, full]

View file

@ -1,16 +1,5 @@
# Managed by Ansible — do not edit by hand.
# v1.0.9 W4 Day 19 (multi-instance) → W5+ extended to blue/green.
# `haproxy_topology` (set in group_vars/<env>.yml) selects between:
#
# multi-instance (default, lab) — server list comes from inventory
# groups backend_api_instances, stream_server_instances ; sticky
# cookie load-balances across N peers.
# blue-green (staging, prod) — server list is exactly two:
# <prefix>backend-blue + <prefix>backend-green. veza_active_color
# picks which one is primary ; the other is `backup` (HAProxy
# routes to a backup server only when ALL primaries are down).
# The veza_haproxy_switch role re-renders this template with a
# new active_color, validates, atomic-swaps, and HUPs.
# v1.0.9 W4 Day 19.
global
log /dev/log local0
@ -21,7 +10,11 @@ global
user haproxy
group haproxy
daemon
# Avoid leaking the version banner in error pages.
server-state-file /var/lib/haproxy/server-state
# ssl-default-bind-* tightens TLS to modern ciphers ; lifted directly
# from the Mozilla Intermediate profile. Only effective when a TLS
# cert is mounted (see haproxy_tls_cert_path).
ssl-default-bind-options no-sslv3 no-tlsv10 no-tlsv11
ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305
@ -30,20 +23,22 @@ defaults
mode http
option httplog
option dontlognull
option forwardfor
option forwardfor # adds X-Forwarded-For so backend logs see the real IP
option http-server-close
timeout connect 5s
timeout client 60s
timeout server 60s
timeout tunnel 1h
timeout tunnel 1h # WS connections are long-lived ; bumped from default 1m
timeout client-fin 5s
timeout http-keep-alive 15s
timeout http-request 10s
# Restore previous server state on reload so health checks don't
# restart from scratch + the drain timer survives.
load-server-state-from-file global
# -----------------------------------------------------------------------
# Stats endpoint — bound to loopback only ; the Prometheus haproxy
# exporter sidecar scrapes it.
# Stats endpoint — bound to loopback only so the prometheus haproxy
# exporter (sidecar) can scrape it. Auth lives at the bridge layer.
# -----------------------------------------------------------------------
frontend stats
bind 127.0.0.1:{{ haproxy_listen_stats }}
@ -55,7 +50,8 @@ frontend stats
no log
# -----------------------------------------------------------------------
# Frontend — HTTP + (optionally) HTTPS. ACL-driven path routing.
# Frontend HTTP. v1.0 lab uses HTTP only ; uncomment the HTTPS bind
# when haproxy_tls_cert_path is non-empty (Mozilla intermediate).
# -----------------------------------------------------------------------
frontend veza_http_in
bind *:{{ haproxy_listen_http }}
@ -65,102 +61,23 @@ frontend veza_http_in
http-request redirect scheme https code 301 if !{ ssl_fc }
{% endif %}
acl is_api path_beg /api/v1
{% if haproxy_topology | default('multi-instance') == 'blue-green' %}
acl is_stream_seg path_beg /tracks/ path_end .m3u8
acl is_stream_seg path_beg /tracks/ path_end .ts
acl is_stream_seg path_beg /tracks/ path_end .m4s
acl is_stream_path path_beg /stream
acl is_stream_path path_beg /hls
use_backend backend_api if is_api
use_backend stream_pool if is_stream_seg
use_backend stream_pool if is_stream_path
default_backend web_pool
{% else %}
# Path-based routing :
# /api/v1/ws/* → backend api_pool (sticky cookie ; carries chat WS)
# /api/v1/* → backend api_pool (also sticky so 401 → /me roundtrips work)
# /tracks/*/hls → backend stream_pool (URI-hash for cache locality)
# else → backend api_pool (default)
acl is_stream path_beg /tracks/ path_end .m3u8
acl is_stream path_beg /tracks/ path_end .ts
acl is_stream path_beg /tracks/ path_end .m4s
use_backend stream_pool if is_stream
default_backend api_pool
{% endif %}
{% if haproxy_topology | default('multi-instance') == 'blue-green' %}
# =======================================================================
# BLUE / GREEN topology (staging, prod)
#
# active_color is the variable veza_haproxy_switch passes in. It selects
# which server gets `check` and which gets `check backup`. HAProxy only
# routes to a `backup` server when EVERY non-backup is marked down by
# its health check ; together with health-check fall=3 this gives us
# instant rollback to the prior color if the new one starts failing
# health checks (without re-running Ansible).
#
# Active color: {{ veza_active_color | default(haproxy_active_color | default('blue')) }}
# Container prefix: {{ veza_container_prefix }}
# DNS suffix: {{ veza_incus_dns_suffix }}
# =======================================================================
{% set _active = veza_active_color | default(haproxy_active_color | default('blue')) %}
# -----------------------------------------------------------------------
# Backend API pool — Go. Sticky cookie ; backup color sits idle.
# -----------------------------------------------------------------------
backend backend_api
balance roundrobin
option httpchk GET {{ veza_healthcheck_paths.backend | default('/api/v1/health') }}
http-check expect status 200
cookie {{ haproxy_sticky_cookie_name }} insert indirect nocache httponly secure
default-server check
inter {{ haproxy_health_check_interval_ms }}
fall {{ haproxy_health_check_fall }}
rise {{ haproxy_health_check_rise }}
on-marked-down shutdown-sessions
slowstart {{ haproxy_graceful_drain_seconds }}s
server backend_blue {{ veza_container_prefix }}backend-blue.{{ veza_incus_dns_suffix }}:{{ veza_backend_port }} cookie backend_blue {{ '' if _active == 'blue' else 'backup' }}
server backend_green {{ veza_container_prefix }}backend-green.{{ veza_incus_dns_suffix }}:{{ veza_backend_port }} cookie backend_green {{ '' if _active == 'green' else 'backup' }}
# -----------------------------------------------------------------------
# Stream pool — Rust Axum HLS. URI-hash for cache locality. Same
# blue/green pair, same backup-flag pattern.
# -----------------------------------------------------------------------
backend stream_pool
balance uri whole
hash-type consistent
option httpchk GET {{ veza_healthcheck_paths.stream | default('/health') }}
http-check expect status 200
timeout tunnel 1h
default-server check
inter {{ haproxy_health_check_interval_ms }}
fall {{ haproxy_health_check_fall }}
rise {{ haproxy_health_check_rise }}
on-marked-down shutdown-sessions
slowstart {{ haproxy_graceful_drain_seconds }}s
server stream_blue {{ veza_container_prefix }}stream-blue.{{ veza_incus_dns_suffix }}:{{ veza_stream_port }} {{ '' if _active == 'blue' else 'backup' }}
server stream_green {{ veza_container_prefix }}stream-green.{{ veza_incus_dns_suffix }}:{{ veza_stream_port }} {{ '' if _active == 'green' else 'backup' }}
# -----------------------------------------------------------------------
# Web pool — React SPA served by nginx. Same pair, same pattern.
# -----------------------------------------------------------------------
backend web_pool
balance roundrobin
option httpchk GET {{ veza_healthcheck_paths.web | default('/') }}
http-check expect status 200
default-server check
inter {{ haproxy_health_check_interval_ms }}
fall {{ haproxy_health_check_fall }}
rise {{ haproxy_health_check_rise }}
on-marked-down shutdown-sessions
slowstart {{ haproxy_graceful_drain_seconds }}s
server web_blue {{ veza_container_prefix }}web-blue.{{ veza_incus_dns_suffix }}:{{ veza_web_port }} {{ '' if _active == 'blue' else 'backup' }}
server web_green {{ veza_container_prefix }}web-green.{{ veza_incus_dns_suffix }}:{{ veza_web_port }} {{ '' if _active == 'green' else 'backup' }}
{% else %}
# =======================================================================
# MULTI-INSTANCE topology (lab, default)
# Server list comes from inventory groups ; sticky cookie load-balances.
# =======================================================================
# -----------------------------------------------------------------------
# Backend api_pool — Gin REST API. Sticky cookie + active health check.
# `cookie ... insert indirect nocache` : HAProxy sets the cookie on the
# first response, the browser sends it back, subsequent requests stick
# to the same server. WS upgrades inherit it.
# -----------------------------------------------------------------------
backend api_pool
balance roundrobin
@ -181,7 +98,9 @@ backend api_pool
# -----------------------------------------------------------------------
# Backend stream_pool — Rust Axum HLS. URI hash so the same track_id
# consistently lands on the same node.
# consistently lands on the same node, keeping the in-process HLS
# segment cache warm. `consistent` flag = jump-hash so adding/removing
# a node doesn't flush the entire pool.
# -----------------------------------------------------------------------
backend stream_pool
balance uri whole
@ -199,5 +118,3 @@ backend stream_pool
{% for host in stream_hosts %}
server {{ host }} {{ host }}.lxd:{{ haproxy_stream_server_port }}
{% endfor %}
{% endif %}

View file

@ -35,9 +35,7 @@ veza_app_binary_mode: "0755"
veza_app_container_name: "{{ veza_container_prefix }}{{ veza_component }}-{{ veza_target_color }}"
# URL to fetch the release tarball. Computed once per task chain.
# `veza-<component>` is the Forgejo package name (one package per
# component) ; SHA is the version ; tarball is the filename.
veza_app_artifact_url: "{{ veza_artifact_base_url }}/veza-{{ veza_component }}/{{ veza_release_sha }}/veza-{{ veza_component }}-{{ veza_release_sha }}.tar.zst"
veza_app_artifact_url: "{{ veza_artifact_base_url }}/{{ veza_component }}/{{ veza_release_sha }}/veza-{{ veza_component }}-{{ veza_release_sha }}.tar.zst"
# How long to wait for the container's network namespace to come up
# after `incus launch` before we start running tasks against it.

View file

@ -1,47 +0,0 @@
# `veza_haproxy_switch` role
Atomically swap HAProxy's active color. Runs against the
`{{ veza_container_prefix }}haproxy` container after `veza_app` has
recreated + health-probed all three components in the inactive color.
## Why a separate role from `haproxy`?
- `roles/haproxy` provisions a fresh HAProxy container — install
the package, lay down the *initial* config, enable the systemd
unit. It runs once when the staging/prod env is bootstrapped and
occasionally when the global config shape changes.
- `roles/veza_haproxy_switch` performs the *per-deploy* delta —
re-template the cfg with a new `veza_active_color`, validate,
swap, HUP. It runs once at the end of every successful deploy.
Splitting them keeps the per-deploy path narrow (no apt, no service
install) and lets `roles/haproxy` remain idempotent when the global
shape hasn't changed.
## Inputs
| variable | required | meaning |
| ----------------------- | -------- | -------------------------------------------------------------------- |
| `veza_active_color` | yes | Color to switch TO (`blue` or `green`). Becomes the new active. |
| `veza_release_sha` | yes | SHA being deployed. Logged in the active-color history file. |
| `veza_container_prefix` | inherit | From group_vars/<env>.yml. |
| `haproxy_topology` | inherit | Should be `blue-green` for this role to make sense. |
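## Example invocation
A minimal sketch of the per-deploy call site — variable values are
illustrative ; the deploy playbook is the real caller :
```yaml
- name: Switch HAProxy to the freshly health-probed color
  ansible.builtin.import_role:
    name: veza_haproxy_switch
  vars:
    veza_active_color: "{{ inactive_color }}"
    veza_release_sha: "{{ veza_release_sha }}"
  delegate_to: "{{ veza_container_prefix + 'haproxy' }}"
```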
## Failure semantics
The render → validate → atomic-swap → HUP sequence runs in an
Ansible `block:` with a `rescue:` that restores `haproxy.cfg.bak`
(captured before the swap) and re-HUPs. So an invalid config or a
HUP failure leaves HAProxy serving the *previous* active color
exactly as before — the deploy as a whole then fails on the playbook
level.
## What the role does NOT do
- It does not destroy or recreate the HAProxy container. That's a
one-time operation under `roles/haproxy`.
- It does not touch app containers — by the time this role runs,
blue/green app containers are both healthy.
- It does not remove the previously-active color's containers. They
survive (intentional) so a rollback can flip back instantly. The
next deploy naturally recycles them.

View file

@ -1,18 +0,0 @@
---
# These should be set by the caller — defaults here are guards that
# fail loud if the caller forgot to pass them.
veza_active_color: ""
veza_release_sha: ""
# Paths inside the HAProxy container.
haproxy_cfg_path: /etc/haproxy/haproxy.cfg
haproxy_cfg_new_path: /etc/haproxy/haproxy.cfg.new
haproxy_cfg_backup_path: /etc/haproxy/haproxy.cfg.bak
haproxy_state_dir: /var/lib/veza
haproxy_active_color_file: /var/lib/veza/active-color
haproxy_active_color_history: /var/lib/veza/active-color.history
# How many history entries to keep before pruning. The rollback role
# offers point-in-time switch within this window without redeploying
# the artefact.
haproxy_active_color_history_keep: 5

View file

@ -1,9 +0,0 @@
---
# HUP haproxy via systemd reload (graceful — drains old workers).
# Used both on success (after atomic swap) and on rescue (after
# restoring backup).
- name: Reload haproxy
ansible.builtin.systemd:
name: haproxy
state: reloaded
listen: "veza-haproxy reload"

View file

@ -1,16 +0,0 @@
---
galaxy_info:
role_name: veza_haproxy_switch
author: Veza Ops
description: >-
Atomically swap HAProxy's active color (blue/green) and persist
the new state. Runs once per deploy, after veza_app has health-
probed all components in the inactive color. Block/rescue
guarantees HAProxy never lands on a bad config.
license: proprietary
min_ansible_version: "2.15"
platforms:
- name: Debian
versions: ["13"]
dependencies: []

View file

@ -1,142 +0,0 @@
# Atomic blue/green switch. The HAProxy template lives in
# roles/haproxy/templates/haproxy.cfg.j2 — it reads veza_active_color
# to render the right `backup` directives. We re-template, validate,
# atomic-swap, HUP.
#
# Block/rescue: any failure in the four-step sequence restores
# haproxy.cfg from the backup we capture before touching anything.
# That way, an invalid template or a HUP error never leaves HAProxy
# serving from a stale or broken cfg — it stays on whatever was
# active when the role started.
---
- name: Validate inputs
ansible.builtin.assert:
that:
- veza_active_color in ['blue', 'green']
- veza_release_sha | length == 40
fail_msg: >-
veza_haproxy_switch role requires veza_active_color (blue|green)
and veza_release_sha (40-char git SHA). Got: color={{ veza_active_color }}
sha={{ veza_release_sha }}.
quiet: true
tags: [veza_haproxy_switch, always]
- name: Ensure veza state dir exists in HAProxy container
ansible.builtin.file:
path: "{{ haproxy_state_dir }}"
state: directory
owner: root
group: root
mode: "0755"
tags: [veza_haproxy_switch]
- name: Read currently-active color (if any)
ansible.builtin.slurp:
src: "{{ haproxy_active_color_file }}"
register: prior_color_raw
failed_when: false
changed_when: false
tags: [veza_haproxy_switch]
- name: Resolve prior_active_color (default blue if no history)
ansible.builtin.set_fact:
prior_active_color: >-
{{ (prior_color_raw.content | b64decode | trim) if prior_color_raw.content is defined
else 'blue' }}
tags: [veza_haproxy_switch]
- name: Switch sequence (block/rescue — restores cfg on any failure)
block:
- name: Backup current haproxy.cfg
ansible.builtin.copy:
src: "{{ haproxy_cfg_path }}"
dest: "{{ haproxy_cfg_backup_path }}"
remote_src: true
mode: "0640"
tags: [veza_haproxy_switch]
- name: Render fresh haproxy.cfg with new active_color
ansible.builtin.template:
src: "{{ playbook_dir }}/../roles/haproxy/templates/haproxy.cfg.j2"
dest: "{{ haproxy_cfg_new_path }}"
owner: root
group: haproxy
mode: "0640"
validate: "haproxy -f %s -c -q"
vars:
# Make absolutely sure the template sees the new color we are
# switching to — set both names because the older template
# used `veza_active_color` and a future revision might use
# `haproxy_active_color`.
haproxy_active_color: "{{ veza_active_color }}"
tags: [veza_haproxy_switch]
- name: Atomic swap — mv haproxy.cfg.new → haproxy.cfg
ansible.builtin.command: mv -f "{{ haproxy_cfg_new_path }}" "{{ haproxy_cfg_path }}"
changed_when: true
tags: [veza_haproxy_switch]
- name: HUP haproxy (graceful reload, no connection drop)
ansible.builtin.systemd:
name: haproxy
state: reloaded
tags: [veza_haproxy_switch]
rescue:
- name: Restore haproxy.cfg from backup
ansible.builtin.command: mv -f "{{ haproxy_cfg_backup_path }}" "{{ haproxy_cfg_path }}"
# Always attempt the restore ; tolerate failure in case the error
# happened before the backup was ever written.
failed_when: false
changed_when: true
tags: [veza_haproxy_switch]
- name: HUP haproxy back to the prior config
ansible.builtin.systemd:
name: haproxy
state: reloaded
failed_when: false
tags: [veza_haproxy_switch]
- name: Report the failure
ansible.builtin.fail:
msg: >-
HAProxy switch to color {{ veza_active_color }} (sha
{{ veza_release_sha[:12] }}) failed — config rolled back
to the prior state. HAProxy continues serving from
{{ prior_active_color }}. Inspect the validate step's
stderr in the playbook output above.
# Success path: persist new active color + history.
- name: Write new active color
ansible.builtin.copy:
dest: "{{ haproxy_active_color_file }}"
content: "{{ veza_active_color }}\n"
owner: root
group: root
mode: "0644"
tags: [veza_haproxy_switch]
- name: Append to active-color history
ansible.builtin.lineinfile:
path: "{{ haproxy_active_color_history }}"
line: "{{ ansible_date_time.iso8601 }} sha={{ veza_release_sha }} color={{ veza_active_color }} prior={{ prior_active_color }}"
create: true
insertbefore: BOF
mode: "0644"
tags: [veza_haproxy_switch]
- name: Prune history beyond keep limit
ansible.builtin.shell: |
set -e
if [ -f "{{ haproxy_active_color_history }}" ]; then
head -n {{ haproxy_active_color_history_keep }} "{{ haproxy_active_color_history }}" > "{{ haproxy_active_color_history }}.tmp"
mv -f "{{ haproxy_active_color_history }}.tmp" "{{ haproxy_active_color_history }}"
fi
args:
executable: /bin/bash
changed_when: false
tags: [veza_haproxy_switch]
- name: Drop the now-stale backup
ansible.builtin.file:
path: "{{ haproxy_cfg_backup_path }}"
state: absent
tags: [veza_haproxy_switch]
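# Post-switch spot-check (run by hand — a sketch ; the real container name
# depends on veza_container_prefix) :
#   incus exec <prefix>haproxy -- cat /var/lib/veza/active-color
#   incus exec <prefix>haproxy -- head -n 5 /var/lib/veza/active-color.history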

View file

@ -198,22 +198,3 @@ incus-logs: ## [LOW] Show logs from Incus container (usage: make incus-logs SERV
exit 1; \
fi
@incus exec veza-$(SERVICE) -- journalctl -f
# ==============================================================================
# CANARY RELEASE (W5 Day 23)
# ==============================================================================
.PHONY: deploy-canary
deploy-canary: ## [HIGH] Canary release : drain → deploy → SLI monitor → rollback on red. ARTIFACT=/path required. See docs/CANARY_RELEASE.md.
@if [ -z "$(ARTIFACT)" ]; then \
$(ECHO_CMD) "${RED}❌ ARTIFACT=/path/to/veza-api required${NC}"; \
$(ECHO_CMD) "${YELLOW} See docs/CANARY_RELEASE.md for the full env-var surface.${NC}"; \
exit 1; \
fi
@$(ECHO_CMD) "${BLUE}🚦 Canary deploy : $(ARTIFACT)${NC}"
@ARTIFACT="$(ARTIFACT)" \
ROLLBACK_BINARY="$(ROLLBACK_BINARY)" \
SLI_WINDOW="$(SLI_WINDOW)" \
PROM_URL="$(PROM_URL)" \
bash $(CURDIR)/scripts/deploy-canary.sh
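# Example (illustrative paths/values) :
#   make deploy-canary ARTIFACT=./bin/veza-api ROLLBACK_BINARY=./bin/veza-api.prev SLI_WINDOW=1800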

View file

@ -1,112 +0,0 @@
#!/usr/bin/env bash
# check-migration-backward-compat.sh — pre-deploy gate for canary releases.
#
# Refuses to deploy when the latest migration is NOT backward-compatible
# with the running schema. Backward-compat = the OLD code can still
# read/write against the NEW schema for at least one canary window
# (otherwise canary mode is meaningless ; the old node would crash on
# the first request that touches a removed column).
#
# Heuristic : reject migrations that contain any of these patterns :
# - DROP COLUMN
# - DROP TABLE
# - ALTER COLUMN ... TYPE (type change is rarely backward-compat)
# - ADD COLUMN ... NOT NULL (without DEFAULT — old code can't INSERT)
# - DROP CONSTRAINT
# - DROP INDEX UNIQUE (existing data may already violate)
#
# This is a STATIC check ; some patterns are false-positives (e.g.
# DROP COLUMN of a column that no code reads). When a real migration
# is flagged, the operator either :
# 1. Splits the migration : ship the additive part now, drop in v+1
# after old-version backends are decommissioned.
# 2. Bypasses with FORCE_MIGRATE=1 + a justification in the commit
# message of the migration file.
#
# v1.0.9 W5 Day 23.
#
# Usage :
# bash scripts/check-migration-backward-compat.sh
#
# Required env :
# MIGRATIONS_DIR default veza-backend-api/migrations
# GIT_RANGE default origin/main..HEAD ; the range to inspect for
# newly-added migration files
# Optional env :
# FORCE_MIGRATE=1 bypass with a logged warning. Use sparingly.
#
# Exit codes :
# 0 — all new migrations are backward-compat (or FORCE_MIGRATE=1)
# 1 — at least one migration carries a forbidden pattern
# 3 — required tool missing / config error
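#
# Example of the split pattern (option 1) — hypothetical columns, shown only
# to illustrate what the heuristic accepts vs flags :
#   release N   : ALTER TABLE tracks ADD COLUMN duration_ms BIGINT;  -- additive, passes the gate
#   release N+1 : ALTER TABLE tracks DROP COLUMN duration;           -- flagged ; ships after old backends are off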
set -euo pipefail
MIGRATIONS_DIR=${MIGRATIONS_DIR:-veza-backend-api/migrations}
GIT_RANGE=${GIT_RANGE:-origin/main..HEAD}
FORCE_MIGRATE=${FORCE_MIGRATE:-0}
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
fail() { log "FAIL: $*"; exit "${2:-1}"; }
require() {
command -v "$1" >/dev/null 2>&1 || fail "required tool missing: $1" 3
}
require git
require grep
require date
# Patterns that indicate non-backward-compat schema change.
# Single-quoted so the pipe characters stay literal ERE alternations for grep -E.
FORBIDDEN_PATTERNS='DROP COLUMN|DROP TABLE|ALTER COLUMN [A-Za-z_]+ TYPE|ADD COLUMN [A-Za-z_]+ [^,;]* NOT NULL[^,;]*(;|$)|DROP CONSTRAINT|DROP INDEX [A-Za-z_]*UNIQUE'
# Identify newly-added migration files in the current range.
new_migrations=$(git diff --name-only --diff-filter=A "$GIT_RANGE" -- "$MIGRATIONS_DIR" 2>/dev/null \
| grep -E "^${MIGRATIONS_DIR}/[0-9]+_.*\.sql$" || true)
if [ -z "$new_migrations" ]; then
log "no new migrations in $GIT_RANGE — nothing to check"
exit 0
fi
log "checking $(echo "$new_migrations" | wc -l) new migration(s) in $GIT_RANGE"
findings=0
for f in $new_migrations; do
log " scanning $f"
# -i case-insensitive ; -E extended regex ; -n line numbers
matches=$(grep -inE "$FORBIDDEN_PATTERNS" "$f" || true)
if [ -n "$matches" ]; then
findings=$((findings + 1))
log ""
log " ⚠ NON-BACKWARD-COMPAT pattern in $f :"
echo "$matches" | sed 's/^/ /' >&2
# Special case : ADD COLUMN ... NOT NULL ... DEFAULT <x> is fine.
# The regex above tries to exclude that but the match-then-filter
# approach is more reliable than a single regex. Suppress matches
# that include `DEFAULT` on the same line.
real=$(echo "$matches" | grep -ivE "DEFAULT" || true)
if [ -z "$real" ]; then
log " ↳ all matches include DEFAULT clause — actually backward-compat"
findings=$((findings - 1))
fi
fi
done
if [ "$findings" -gt 0 ]; then
log ""
log "$findings migration(s) flagged as potentially non-backward-compat."
if [ "$FORCE_MIGRATE" = "1" ]; then
log "FORCE_MIGRATE=1 set — proceeding anyway."
exit 0
fi
log ""
log "Options to proceed :"
log " 1. Split the migration : ship the additive part now, drop the"
log " non-compat part in v+1 after old backends are off."
log " 2. Set FORCE_MIGRATE=1 if you accept the risk + document the"
log " justification in the migration's commit message."
exit 1
fi
log "PASS : all new migrations are backward-compat"
exit 0

View file

@ -1,287 +0,0 @@
#!/usr/bin/env bash
# deploy-canary.sh — canary release for the active/active backend-api fleet.
#
# Walks the standard canary recipe (drain → deploy → health → re-enable
# → SLI monitor → repeat or rollback) end-to-end. Designed to run on
# the host that owns the backend-api Incus containers + the haproxy
# admin socket.
#
# v1.0.9 W5 Day 23.
#
# Usage :
# bash scripts/deploy-canary.sh /path/to/new/veza-api
#
# Required tools : incus, curl, socat (HAProxy admin socket), jq, bash 4+.
#
# Required env :
# ARTIFACT path to the new veza-api binary (passed as $1 too)
# Optional env :
# POOL_BACKEND HAProxy backend name (default api_pool)
# CANARY_NODE which container to canary first (default backend-api-2)
# PEER_NODES comma-separated list of peers to roll AFTER canary
# succeeds (default backend-api-1)
# HEALTH_HOST host to curl (default haproxy.lxd ; LB-routed)
# HEALTH_PATH default /api/v1/health
# SLI_WINDOW SLI monitor duration in seconds (default 3600 = 1h)
# SLI_PROBE_INTERVAL seconds between SLI probes (default 30)
# PROM_URL Prometheus query URL (default http://prom.lxd:9090)
# PROM_P95_THRESHOLD_S p95 SLI threshold in seconds (default 0.5)
# PROM_ERR_RATE_THRESHOLD error rate threshold (default 0.005 = 0.5%)
# ROLLBACK_BINARY path to the previous-known-good binary (used on red)
# If unset, rollback skips the binary swap and just
# re-enables the canary node — operator handles the
# real revert.
# PRE_DEPLOY_HOOK path to script that validates migrations are
# backward-compat. Defaults to scripts/check-migration-backward-compat.sh
# when present.
#
# Exit codes :
# 0 — canary + full roll succeeded
# 1 — pre-deploy validation failed ; nothing was changed
# 2 — canary failed ; rollback executed
# 3 — required tool / env missing
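#
# Example invocation (illustrative paths/values) :
#   ARTIFACT=./bin/veza-api ROLLBACK_BINARY=./bin/veza-api.prev \
#   SLI_WINDOW=1800 PROM_URL=http://prom.lxd:9090 \
#   bash scripts/deploy-canary.sh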
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
ARTIFACT=${ARTIFACT:-${1:-?}}
POOL_BACKEND=${POOL_BACKEND:-api_pool}
CANARY_NODE=${CANARY_NODE:-backend-api-2}
PEER_NODES=${PEER_NODES:-backend-api-1}
HEALTH_HOST=${HEALTH_HOST:-haproxy.lxd}
HEALTH_PATH=${HEALTH_PATH:-/api/v1/health}
SLI_WINDOW=${SLI_WINDOW:-3600}
SLI_PROBE_INTERVAL=${SLI_PROBE_INTERVAL:-30}
PROM_URL=${PROM_URL:-http://prom.lxd:9090}
PROM_P95_THRESHOLD_S=${PROM_P95_THRESHOLD_S:-0.5}
PROM_ERR_RATE_THRESHOLD=${PROM_ERR_RATE_THRESHOLD:-0.005}
ROLLBACK_BINARY=${ROLLBACK_BINARY:-}
PRE_DEPLOY_HOOK=${PRE_DEPLOY_HOOK:-${REPO_ROOT}/scripts/check-migration-backward-compat.sh}
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
die() { log "FAIL: $*"; exit "${2:-1}"; }
require() {
command -v "$1" >/dev/null 2>&1 || die "required tool missing: $1" 3
}
require incus
require curl
require socat
require date
require jq   # prom_query parses Prometheus JSON responses with jq
if [ "$ARTIFACT" = "?" ] || [ ! -f "$ARTIFACT" ]; then
die "ARTIFACT (or \$1) must point to an existing binary" 1
fi
# --------------------------------------------------------------------
# Helpers : HAProxy admin socket commands.
# --------------------------------------------------------------------
HAPROXY_CONTAINER=${HAPROXY_CONTAINER:-haproxy}
ha_cmd() {
incus exec "$HAPROXY_CONTAINER" -- bash -c "echo '$1' | socat /run/haproxy/admin.sock -"
}
ha_state() {
local node=$1
ha_cmd "show servers state $POOL_BACKEND" \
| awk -v n="$node" '$0 ~ n {print $7}' | head -1
# field 7 in `show servers state` is operational_state (0=stop, 1=run, 2=ready/drain)
}
ha_drain() {
log "haproxy : drain $1"
ha_cmd "set server ${POOL_BACKEND}/${1} state drain" >/dev/null
}
ha_ready() {
log "haproxy : ready $1"
ha_cmd "set server ${POOL_BACKEND}/${1} state ready" >/dev/null
}
ha_wait_drained() {
# Drain finishes when the server reports 0 active connections.
local node=$1
local deadline=$(( $(date +%s) + 60 ))
while [ "$(date +%s)" -lt "$deadline" ]; do
local n
n=$(ha_cmd "show stat" | awk -F, -v s="$node" '$2 == s {print $5; exit}' 2>/dev/null || echo 0)
if [ "${n:-0}" = "0" ]; then
log "haproxy : $node drained (0 active connections)"
return 0
fi
sleep 2
done
log "WARN : $node still has active connections after 60s drain ; proceeding anyway"
}
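# Manual equivalent from the Incus host, useful when debugging a stuck drain —
# a sketch ; backend/server names follow POOL_BACKEND/CANARY_NODE defaults :
#   incus exec haproxy -- bash -c \
#     "echo 'set server api_pool/backend-api-2 state drain' | socat /run/haproxy/admin.sock -"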
curl_health() {
curl --max-time 5 -sS -o /dev/null -w "%{http_code}" \
"http://${HEALTH_HOST}${HEALTH_PATH}" 2>/dev/null || echo "000"
}
# --------------------------------------------------------------------
# SLI monitor — query Prometheus over the SLI_WINDOW. Fails as soon as
# any probe reports red so we can rollback fast.
# --------------------------------------------------------------------
prom_query() {
local q=$1
curl --max-time 10 -sS -G --data-urlencode "query=${q}" \
"${PROM_URL}/api/v1/query" 2>/dev/null \
| jq -r '.data.result[0].value[1] // "0"' 2>/dev/null || echo 0
}
monitor_sli() {
log "monitoring SLI for ${SLI_WINDOW}s (probes every ${SLI_PROBE_INTERVAL}s)"
local deadline=$(( $(date +%s) + SLI_WINDOW ))
local probes=0
local first_red=""
while [ "$(date +%s)" -lt "$deadline" ]; do
probes=$((probes + 1))
local p95 err
p95=$(prom_query 'histogram_quantile(0.95, sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le))')
err=$(prom_query 'sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m])) / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))')
log " probe $probes : p95=${p95}s err=${err}"
# awk used for float comparison ; bash test only does integers.
if awk -v a="$p95" -v b="$PROM_P95_THRESHOLD_S" 'BEGIN{exit !(a > b)}'; then
first_red="p95 ${p95}s > threshold ${PROM_P95_THRESHOLD_S}s"
break
fi
if awk -v a="$err" -v b="$PROM_ERR_RATE_THRESHOLD" 'BEGIN{exit !(a > b)}'; then
first_red="error rate ${err} > threshold ${PROM_ERR_RATE_THRESHOLD}"
break
fi
sleep "$SLI_PROBE_INTERVAL"
done
if [ -n "$first_red" ]; then
log "SLI red after $probes probe(s) : $first_red"
return 1
fi
log "SLI green for the full ${SLI_WINDOW}s window ($probes probes)"
return 0
}
# --------------------------------------------------------------------
# Deploy + rollback primitives.
# --------------------------------------------------------------------
deploy_to() {
local node=$1
local artifact=$2
log "deploying $artifact$node"
incus file push "$artifact" "$node/opt/veza/backend-api/veza-api" \
--uid 1001 --gid 1001 --mode 0755
incus exec "$node" -- systemctl restart veza-backend-api
}
verify_node_health() {
local node=$1
log "node health check : $node"
local deadline=$(( $(date +%s) + 60 ))
while [ "$(date +%s)" -lt "$deadline" ]; do
if incus exec "$node" -- curl --max-time 3 -sSf http://127.0.0.1:8080${HEALTH_PATH} >/dev/null 2>&1; then
log " $node : 200"
return 0
fi
sleep 2
done
return 1
}
rollback_canary() {
log "ROLLBACK : restoring $CANARY_NODE"
if [ -n "$ROLLBACK_BINARY" ] && [ -f "$ROLLBACK_BINARY" ]; then
deploy_to "$CANARY_NODE" "$ROLLBACK_BINARY" || true
verify_node_health "$CANARY_NODE" || log "rollback : node health check still failing"
else
log "ROLLBACK_BINARY not set — leaving binary in place ; operator must finish revert"
fi
ha_ready "$CANARY_NODE"
}
# --------------------------------------------------------------------
# 1. Pre-deploy hook (migration backward-compat).
# --------------------------------------------------------------------
log "step 1 : pre-deploy hook"
if [ -x "$PRE_DEPLOY_HOOK" ]; then
if ! "$PRE_DEPLOY_HOOK"; then
die "pre-deploy hook ($PRE_DEPLOY_HOOK) reported a backward-incompat migration ; aborting" 1
fi
else
log " PRE_DEPLOY_HOOK ($PRE_DEPLOY_HOOK) not executable ; skipping (no-op)"
fi
# --------------------------------------------------------------------
# 2. Drain canary node.
# --------------------------------------------------------------------
log "step 2 : drain $CANARY_NODE in HAProxy"
ha_drain "$CANARY_NODE"
ha_wait_drained "$CANARY_NODE"
# --------------------------------------------------------------------
# 3. Deploy artifact to the canary node.
# --------------------------------------------------------------------
log "step 3 : deploy artifact to $CANARY_NODE"
deploy_to "$CANARY_NODE" "$ARTIFACT"
# --------------------------------------------------------------------
# 4. Per-node health check.
# --------------------------------------------------------------------
log "step 4 : health check on $CANARY_NODE"
if ! verify_node_health "$CANARY_NODE"; then
log "$CANARY_NODE failed health check post-deploy"
rollback_canary
exit 2
fi
# --------------------------------------------------------------------
# 5. Re-enable + LB health check (proves HAProxy sees the node ready).
# --------------------------------------------------------------------
log "step 5 : re-enable $CANARY_NODE in HAProxy"
ha_ready "$CANARY_NODE"
sleep 5
lb_status=$(curl_health)
if [ "$lb_status" != "200" ]; then
log "LB health check after re-enable returned $lb_status ; rolling back"
rollback_canary
exit 2
fi
# --------------------------------------------------------------------
# 6. SLI monitor — the canary keeps serving live traffic ; rollback if the SLO breaches.
# --------------------------------------------------------------------
log "step 6 : monitor SLI on the canary"
if ! monitor_sli; then
log "SLI red — rolling back the canary"
rollback_canary
exit 2
fi
# --------------------------------------------------------------------
# 7. SLI green — repeat on each peer.
# --------------------------------------------------------------------
log "step 7 : SLI green on canary, rolling peers : $PEER_NODES"
IFS=',' read -ra peers <<< "$PEER_NODES"
for peer in "${peers[@]}"; do
log "── peer $peer ───────────────────────────"
ha_drain "$peer"
ha_wait_drained "$peer"
deploy_to "$peer" "$ARTIFACT"
if ! verify_node_health "$peer"; then
log "$peer health check failed post-deploy"
log "WARN : leaving $peer drained ; canary node still serves traffic"
log " operator must re-deploy known-good binary or repair $peer manually"
exit 2
fi
ha_ready "$peer"
sleep 5
lb_status=$(curl_health)
if [ "$lb_status" != "200" ]; then
log "LB health check after re-enable of $peer returned $lb_status — abandoning roll"
exit 2
fi
done
log "PASS : canary $CANARY_NODE + peers $PEER_NODES deployed cleanly"
exit 0